From aa6014f20da6634515bba436ebee77f66c44a312 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Thu, 8 Feb 2024 05:22:50 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 90879 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 91274 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..efa176df --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-01-31T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.18070v1","updated":"2024-01-31T18:48:20Z","published":"2024-01-31T18:48:20Z","title":"Do Language Models Exhibit the Same Cognitive Biases in Problem Solving\n as Human Learners?","summary":" There is increasing interest in employing large language models (LLMs) as\ncognitive models. For such purposes, it is central to understand which\ncognitive properties are well-modeled by LLMs, and which are not. In this work,\nwe study the biases of LLMs in relation to those known in children when solving\narithmetic word problems. Surveying the learning science literature, we posit\nthat the problem-solving process can be split into three distinct steps: text\ncomprehension, solution planning and solution execution. We construct tests for\neach one in order to understand which parts of this process can be faithfully\nmodeled by current state-of-the-art LLMs. We generate a novel set of word\nproblems for each of these tests, using a neuro-symbolic method that enables\nfine-grained control over the problem features. We find evidence that LLMs,\nwith and without instruction-tuning, exhibit human-like biases in both the\ntext-comprehension and the solution-planning steps of the solving process, but\nnot during the final step which relies on the problem's arithmetic expressions\n(solution execution).\n","authors":["Andreas Opedal","Alessandro Stolfo","Haruki Shirakami","Ying Jiao","Ryan Cotterell","Bernhard Schölkopf","Abulhair Saparov","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2401.18070v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.18059v1","updated":"2024-01-31T18:30:21Z","published":"2024-01-31T18:30:21Z","title":"RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval","summary":" Retrieval-augmented language models can better adapt to changes in world\nstate and incorporate long-tail knowledge. However, most existing methods\nretrieve only short contiguous chunks from a retrieval corpus, limiting\nholistic understanding of the overall document context. We introduce the novel\napproach of recursively embedding, clustering, and summarizing chunks of text,\nconstructing a tree with differing levels of summarization from the bottom up.\nAt inference time, our RAPTOR model retrieves from this tree, integrating\ninformation across lengthy documents at different levels of abstraction.\nControlled experiments show that retrieval with recursive summaries offers\nsignificant improvements over traditional retrieval-augmented LMs on several\ntasks. 
On question-answering tasks that involve complex, multi-step reasoning,\nwe show state-of-the-art results; for example, by coupling RAPTOR retrieval\nwith the use of GPT-4, we can improve the best performance on the QuALITY\nbenchmark by 20% in absolute accuracy.\n","authors":["Parth Sarthi","Salman Abdullah","Aditi Tuli","Shubh Khanna","Anna Goldie","Christopher D. Manning"],"pdf_url":"https://arxiv.org/pdf/2401.18059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18058v1","updated":"2024-01-31T18:29:39Z","published":"2024-01-31T18:29:39Z","title":"LongAlign: A Recipe for Long Context Alignment of Large Language Models","summary":" Extending large language models to effectively handle long contexts requires\ninstruction fine-tuning on input sequences of similar length. To address this,\nwe present LongAlign -- a recipe of the instruction data, training, and\nevaluation for long context alignment. First, we construct a long\ninstruction-following dataset using Self-Instruct. To ensure the data\ndiversity, it covers a broad range of tasks from various long context sources.\nSecond, we adopt the packing and sorted batching strategies to speed up\nsupervised fine-tuning on data with varied length distributions. Additionally,\nwe develop a loss weighting method to balance the contribution to the loss\nacross different sequences during packing training. Third, we introduce the\nLongBench-Chat benchmark for evaluating instruction-following capabilities on\nqueries of 10k-100k in length. Experiments show that LongAlign outperforms\nexisting recipes for LLMs in long context tasks by up to 30\\%, while also\nmaintaining their proficiency in handling short, generic tasks. The code, data,\nand long-aligned models are open-sourced at https://github.com/THUDM/LongAlign.\n","authors":["Yushi Bai","Xin Lv","Jiajie Zhang","Yuze He","Ji Qi","Lei Hou","Jie Tang","Yuxiao Dong","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2401.18058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18046v1","updated":"2024-01-31T18:07:12Z","published":"2024-01-31T18:07:12Z","title":"Multipath parsing in the brain","summary":" Humans understand sentences word-by-word, in the order that they hear them.\nThis incrementality entails resolving temporary ambiguities about syntactic\nrelationships. We investigate how humans process these syntactic ambiguities by\ncorrelating predictions from incremental generative dependency parsers with\ntimecourse data from people undergoing functional neuroimaging while listening\nto an audiobook. In particular, we compare competing hypotheses regarding the\nnumber of developing syntactic analyses in play during word-by-word\ncomprehension: one vs more than one. This comparison involves evaluating\nsyntactic surprisal from a state-of-the-art dependency parser with LLM-adapted\nencodings against an existing fMRI dataset. In both English and Chinese data,\nwe find evidence for multipath parsing. Brain regions associated with this\nmultipath effect include bilateral superior temporal gyrus.\n","authors":["Berta Franzluebbers","Donald Dunagan","Miloš Stanojević","Jan Buys","John T. Hale"],"pdf_url":"https://arxiv.org/pdf/2401.18046v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2401.18045v1","updated":"2024-01-31T18:06:29Z","published":"2024-01-31T18:06:29Z","title":"SpeechComposer: Unifying Multiple Speech Tasks with Prompt Composition","summary":" Recent advancements in language models have significantly enhanced\nperformance in multiple speech-related tasks. 
Existing speech language models\ntypically utilize task-dependent prompt tokens to unify various speech tasks in\na single model. However, this design omits the intrinsic connections between\ndifferent speech tasks, which can potentially boost the performance of each\ntask. In this work, we propose a novel decoder-only speech language model,\nSpeechComposer, that can unify common speech tasks by composing a fixed set of\nprompt tokens. Built upon four primary tasks -- speech synthesis, speech\nrecognition, speech language modeling, and text language modeling --\nSpeechComposer can easily extend to more speech tasks via compositions of\nwell-designed prompt tokens, like voice conversion and speech enhancement. The\nunification of prompt tokens also makes it possible for knowledge sharing among\ndifferent speech tasks in a more structured manner. Experimental results\ndemonstrate that our proposed SpeechComposer can improve the performance of\nboth primary tasks and composite tasks, showing the effectiveness of the shared\nprompt tokens. Remarkably, the unified decoder-only model achieves comparable\nand even better performance than the baselines, which are expert models designed\nfor single tasks.\n","authors":["Yihan Wu","Soumi Maiti","Yifan Peng","Wangyou Zhang","Chenda Li","Yuyue Wang","Xihua Wang","Shinji Watanabe","Ruihua Song"],"pdf_url":"https://arxiv.org/pdf/2401.18045v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.18040v1","updated":"2024-01-31T18:03:39Z","published":"2024-01-31T18:03:39Z","title":"Enhancing End-to-End Multi-Task Dialogue Systems: A Study on Intrinsic\n Motivation Reinforcement Learning Algorithms for Improved Training and\n Adaptability","summary":" End-to-end multi-task dialogue systems are usually designed with separate\nmodules for the dialogue pipeline. Among these, the policy module is essential\nfor deciding what to do in response to user input. This policy is trained by\nreinforcement learning algorithms by taking advantage of an environment in\nwhich an agent receives feedback in the form of a reward signal. Current\ndialogue systems, however, only provide meagre and simplistic rewards.\nInvestigating intrinsic motivation reinforcement learning algorithms is the\ngoal of this study. By teaching the agent an internal incentive system, these\nalgorithms can accelerate training and improve the agent's capacity to judge\nthe quality of its actions. In particular, we adapt techniques for random\nnetwork distillation and curiosity-driven reinforcement learning to measure the\nfrequency of state visits and encourage exploration by using semantic\nsimilarity between utterances. Experimental results on MultiWOZ, a\nheterogeneous dataset, show that intrinsic motivation-based dialogue systems\noutperform policies that depend on extrinsic incentives. By adopting random\nnetwork distillation, for example, which is trained using semantic similarity\nbetween user-system dialogues, an astounding average success rate of 73% is\nachieved. This is a significant improvement over the baseline Proximal Policy\nOptimization (PPO), which has an average success rate of 60%. In addition,\nperformance indicators such as booking rates and completion rates show a 10%\nrise over the baseline. 
Furthermore, these intrinsic incentive models help\nimprove the resilience of the system's policy as the number of domains\nincreases. This implies that they could be useful in scaling up to settings\nthat cover a wider range of domains.\n","authors":["Navin Kamuni","Hardik Shah","Sathishkumar Chintala","Naveen Kunchakuri","Sujatha Alla Old Dominion"],"pdf_url":"https://arxiv.org/pdf/2401.18040v1.pdf","comment":"6 pages, 1 figure, 18th IEEE International Conference on Semantic\n Computing"},{"id":"http://arxiv.org/abs/2401.18034v1","updated":"2024-01-31T17:58:10Z","published":"2024-01-31T17:58:10Z","title":"Paramanu: A Family of Novel Efficient Indic Generative Foundation\n Language Models","summary":" We present Gyan AI Paramanu (\"atom\"), a family of novel language models for\nIndian languages. It is a collection of auto-regressive monolingual, bilingual,\nand multilingual Indic language models pretrained from scratch on a single GPU\nfor 10 Indian languages (Assamese, Bangla, Hindi, Konkani, Maithili, Marathi,\nOdia, Sanskrit, Tamil, Telugu) across 5 scripts (Bangla, Devanagari, Odia,\nTamil, Telugu) of varying sizes ranging from 13.29M to 367.5M. The models are\npretrained with a context size of 1024 on a single GPU. The models are very\nefficient, small, fast, and powerful. We have also developed an efficient and\nadvanced Indic tokenizer that can even tokenize unseen languages. In order to\navoid the \"curse of multi-linguality\" in our multilingual mParamanu model, we\npretrained on comparable corpora by typological grouping using the same script.\nWe performed human evaluation of our pretrained models for open-ended text\ngeneration on grammar, coherence, creativity, and factuality metrics for\nBangla, Hindi, and Sanskrit. Our Bangla, Hindi, and Sanskrit models\noutperformed GPT-3.5-Turbo (ChatGPT), Bloom 7B, LLaMa-2 7B, OPT 6.7B, GPT-J 6B,\nGPTNeo 1.3B, GPT2-XL large language models (LLMs) by a large margin despite\nbeing 20 to 66 times smaller than standard 7B LLMs. To run inference on our\npretrained models, a CPU is enough; a GPU is not needed. We also\ninstruction-tuned our pretrained Bangla, Hindi, Marathi, Tamil, and Telugu\nmodels on 23k instructions in the respective languages. Our pretrained and\ninstruction-tuned models, the first of their kind and among the most powerful\nand efficient small generative language models developed for Indic languages,\ntogether with the various results, lead to the conclusion that high-quality\ngenerative language models are possible without a high amount of compute power\nand a humongous number of parameters. We plan to release our models at\nhttps://www.bharatgpts.com.\n","authors":["Mitodru Niyogi","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2401.18034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18028v1","updated":"2024-01-31T17:43:04Z","published":"2024-01-31T17:43:04Z","title":"Supporting Anticipatory Governance using LLMs: Evaluating and Aligning\n Large Language Models with the News Media to Anticipate the Negative Impacts\n of AI","summary":" Anticipating the negative impacts of emerging AI technologies is a challenge,\nespecially in the early stages of development. An understudied approach to such\nanticipation is the use of LLMs to enhance and guide this process. 
Despite\nadvancements in LLMs and evaluation metrics to account for biases in generated\ntext, it is unclear how well these models perform in anticipatory tasks.\nSpecifically, the use of LLMs to anticipate AI impacts raises questions about\nthe quality and range of categories of negative impacts these models are\ncapable of generating. In this paper, we leverage news media, a diverse data\nsource that is rich with normative assessments of emerging technologies, to\nformulate a taxonomy of impacts that acts as a baseline for comparison. By\ncomputationally analyzing thousands of news articles published by hundreds of\nonline news domains around the world, we develop a taxonomy consisting of ten\ncategories of AI impacts. We then evaluate both instruction-based (GPT-4 and\nMistral-7B-Instruct) and fine-tuned completion models (Mistral-7B and GPT-3)\nusing a sample from this baseline. We find that the generated impacts using\nMistral-7B, fine-tuned on impacts from the news media, tend to be qualitatively\non par with impacts generated using a larger scale model such as GPT-4.\nMoreover, we find that these LLMs generate impacts that largely reflect the\ntaxonomy of negative impacts identified in the news media; however, the impacts\nproduced by instruction-based models had gaps in the production of certain\ncategories of impacts in comparison to fine-tuned models. This research\nhighlights a potential bias in state-of-the-art LLMs when used for anticipating\nimpacts and demonstrates the advantages of aligning smaller LLMs with a diverse\nrange of impacts, such as those reflected in the news media, to better reflect\nsuch impacts during anticipatory exercises.\n","authors":["Mowafak Allaham","Nicholas Diakopoulos"],"pdf_url":"https://arxiv.org/pdf/2401.18028v1.pdf","comment":"14 pages + research ethics and social impact statement, references,\n and appendix. Under conference review"},{"id":"http://arxiv.org/abs/2401.15496v2","updated":"2024-01-31T17:36:29Z","published":"2024-01-27T20:20:39Z","title":"Baichuan2-Sum: Instruction Finetune Baichuan2-7B Model for Dialogue\n Summarization","summary":" Large language models (LLMs) like Llama, Baichuan, and Bloom show\nremarkable ability with instruction fine-tuning in many natural language tasks.\nNevertheless, for the dialogue summarization task, which aims to generate\nsummaries for different roles in dialogue, most state-of-the-art methods are\nbuilt on small models (e.g., BART and BERT). Existing methods try to add\ntask-specific optimizations to small models, such as adding a global-local\ncentrality score. In this paper, we propose an instruction fine-tuning model:\nBaichuan2-Sum, for role-oriented dialogue summarization. By setting different\ninstructions for different roles, the model can learn from the dialogue\ninteractions and output the expected summaries. Furthermore, we applied the\nNEFTune technique to add suitable noise during training to improve the results.\nThe experiments demonstrate that the proposed model achieves new\nstate-of-the-art results on two public dialogue summarization datasets: CSDS\nand SAMSUM. 
We release our model and related code to facilitate future studies\non the dialogue summarization task.\n","authors":["Jianfei Xiao","Yancan Chen","Yimin Ou","Hanyi Yu","Yiyong Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.15496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18018v1","updated":"2024-01-31T17:28:24Z","published":"2024-01-31T17:28:24Z","title":"Prompt-Driven LLM Safeguarding via Directed Representation Optimization","summary":" Prepending model inputs with safety prompts is a common practice for\nsafeguarding large language models (LLMs) from complying with queries that\ncontain harmful intents. However, the working mechanisms of safety prompts have\nnot yet been fully understood, which hinders the potential for automatically\noptimizing them for improved LLM safety. Motivated by this problem, we\ninvestigate the impact of safety prompts from the perspective of model\nrepresentations. We find that in models' representation space, harmful and\nharmless queries can be largely distinguished, but this is not noticeably\nenhanced by safety prompts. Instead, the queries' representations are moved by\ndifferent safety prompts in similar directions, where models become more prone\nto refusal (i.e., refusing to provide assistance) even when the queries are\nharmless. Inspired by these findings, we propose a method called DRO (Directed\nRepresentation Optimization) for automatic safety prompt optimization. DRO\ntreats safety prompts as continuous, trainable embeddings and learns to move\nthe representations of harmful/harmless queries along/opposite the direction in\nwhich the model's refusal probability increases. We demonstrate that DRO\nremarkably improves the safeguarding performance of human-crafted safety\nprompts and outperforms strong baselines, as evaluated on out-of-domain\nbenchmarks, without compromising the general model capability.\n","authors":["Chujie Zheng","Fan Yin","Hao Zhou","Fandong Meng","Jie Zhou","Kai-Wei Chang","Minlie Huang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2401.18018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18001v1","updated":"2024-01-31T17:02:31Z","published":"2024-01-31T17:02:31Z","title":"Desiderata for the Context Use of Question Answering Systems","summary":" Prior work has uncovered a set of common problems in state-of-the-art\ncontext-based question answering (QA) systems: a lack of attention to the\ncontext when the latter conflicts with a model's parametric knowledge, little\nrobustness to noise, and a lack of consistency with their answers. However,\nmost prior work focuses on one or two of those problems in isolation, which\nmakes it difficult to see trends across them. We aim to close this gap by first\noutlining a set of -- previously discussed as well as novel -- desiderata for\nQA models. We then survey relevant analysis and methods papers to provide an\noverview of the state of the field. The second part of our work presents\nexperiments where we evaluate 15 QA systems on 5 datasets according to all\ndesiderata at once. We find many novel trends, including (1) systems that are\nless susceptible to noise are not necessarily more consistent with their\nanswers when given irrelevant context; (2) most systems that are more\nsusceptible to noise are more likely to correctly answer according to a context\nthat conflicts with their parametric knowledge; and (3) the combination of\nconflicting knowledge and noise can reduce system performance by up to 96%. 
As\nsuch, our desiderata help increase our understanding of how these models work\nand reveal potential avenues for improvement.\n","authors":["Sagi Shaier","Lawrence E Hunter","Katharina von der Wense"],"pdf_url":"https://arxiv.org/pdf/2401.18001v1.pdf","comment":"Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2401.17979v1","updated":"2024-01-31T16:34:10Z","published":"2024-01-31T16:34:10Z","title":"Entity Linking in the Job Market Domain","summary":" In Natural Language Processing, entity linking (EL) has centered around\nWikipedia, yet it remains underexplored in the job market domain.\nDisambiguating skill mentions can help us gain insight into current labor\nmarket demands. In this work, we are the first to explore EL in this domain,\nspecifically targeting the linkage of occupational skills to the ESCO taxonomy\n(le Vrang et al., 2014). Previous efforts linked coarse-grained (full)\nsentences to a corresponding ESCO skill. In this work, we link more\nfine-grained span-level mentions of skills. We tune two high-performing neural\nEL models, a bi-encoder (Wu et al., 2020) and an autoregressive model (Cao et\nal., 2021), on a synthetically generated mention--skill pair dataset and\nevaluate them on a human-annotated skill-linking benchmark. Our findings reveal\nthat both models are capable of linking implicit mentions of skills to their\ncorrect taxonomy counterparts. Empirically, BLINK outperforms GENRE in strict\nevaluation, but GENRE performs better in loose evaluation (accuracy@$k$).\n","authors":["Mike Zhang","Rob van der Goot","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2401.17979v1.pdf","comment":"Accepted at EACL 2024 Findings"},{"id":"http://arxiv.org/abs/2401.17974v1","updated":"2024-01-31T16:30:50Z","published":"2024-01-31T16:30:50Z","title":"GUMsley: Evaluating Entity Salience in Summarization for 12 English\n Genres","summary":" As NLP models become increasingly capable of understanding documents in terms\nof coherent entities rather than strings, obtaining the most salient entities\nfor each document is not only an important end task in itself but also vital\nfor Information Retrieval (IR) and other downstream applications such as\ncontrollable summarization. In this paper, we present and evaluate GUMsley, the\nfirst entity salience dataset covering all named and non-named salient entities\nfor 12 genres of English text, aligned with entity types, Wikification links\nand full coreference resolution annotations. We promote a strict definition of\nsalience using human summaries and demonstrate high inter-annotator agreement\nfor salience based on whether a source entity is mentioned in the summary. 
Our\nevaluation shows poor performance by pre-trained SOTA summarization models and\nzero-shot LLM prompting in capturing salient entities in generated summaries.\nWe also show that predicting or providing salient entities to several model\narchitectures enhances performance and helps derive higher-quality summaries by\nalleviating the entity hallucination problem in existing abstractive\nsummarization.\n","authors":["Jessica Lin","Amir Zeldes"],"pdf_url":"https://arxiv.org/pdf/2401.17974v1.pdf","comment":"Camera-ready for EACL 2024"},{"id":"http://arxiv.org/abs/2209.08316v2","updated":"2024-01-31T15:49:34Z","published":"2022-09-17T12:01:35Z","title":"An Empathetic AI Coach for Self-Attachment Therapy","summary":" In this work, we present a new dataset and a computational strategy for a\ndigital coach that aims to guide users in practicing the protocols of\nself-attachment therapy. Our framework augments a rule-based conversational\nagent with a deep-learning classifier for identifying the underlying emotion in\na user's text response, as well as a deep-learning assisted retrieval method\nfor producing novel, fluent and empathetic utterances. We also craft a set of\nhuman-like personas that users can choose to interact with. Our goal is to\nachieve a high level of engagement during virtual therapy sessions. We evaluate\nthe effectiveness of our framework in a non-clinical trial with N=16\nparticipants, all of whom have had at least four interactions with the agent\nover the course of five days. We find that our platform is consistently rated\nhigher for empathy, user engagement and usefulness than the simple rule-based\nframework. Finally, we provide guidelines to further improve the design and\nperformance of the application, in accordance with the feedback received.\n","authors":["Lisa Alazraki","Ali Ghachem","Neophytos Polydorou","Foaad Khosmood","Abbas Edalat"],"pdf_url":"https://arxiv.org/pdf/2209.08316v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17922v1","updated":"2024-01-31T15:35:21Z","published":"2024-01-31T15:35:21Z","title":"[Lions: 1] and [Tigers: 2] and [Bears: 3], Oh My! Literary Coreference\n Annotation with LLMs","summary":" Coreference annotation and resolution is a vital component of computational\nliterary studies. However, it has previously been difficult to build high\nquality systems for fiction. Coreference requires complicated structured\noutputs, and literary text involves subtle inferences and highly varied\nlanguage. New language-model-based seq2seq systems present the opportunity to\nsolve both these problems by learning to directly generate a copy of an input\nsentence with markdown-like annotations. We create, evaluate, and release\nseveral trained models for coreference, as well as a workflow for training new\nmodels.\n","authors":["Rebecca M. M. Hicke","David Mimno"],"pdf_url":"https://arxiv.org/pdf/2401.17922v1.pdf","comment":"Accepted to LaTeCH-CLfL 2024"},{"id":"http://arxiv.org/abs/2401.17919v1","updated":"2024-01-31T15:33:37Z","published":"2024-01-31T15:33:37Z","title":"LOCOST: State-Space Models for Long Document Abstractive Summarization","summary":" State-space models are a low-complexity alternative to transformers for\nencoding long sequences and capturing long-term dependencies. We propose\nLOCOST: an encoder-decoder architecture based on state-space models for\nconditional text generation with long context inputs. 
With a computational\ncomplexity of $O(L \\log L)$, this architecture can handle significantly longer\nsequences than state-of-the-art models that are based on sparse attention\npatterns. We evaluate our model on a series of long document abstractive\nsummarization tasks. The model reaches 93-96% of the performance of the\ntop-performing sparse transformers of the same size while saving up to 50%\nmemory during training and up to 87% during inference. Additionally, LOCOST\neffectively handles input texts exceeding 600K tokens at inference time,\nsetting new state-of-the-art results on full-book summarization\nand opening new perspectives for long input processing.\n","authors":["Florian Le Bronnec","Song Duong","Mathieu Ravaut","Alexandre Allauzen","Nancy F. Chen","Vincent Guigue","Alberto Lumbreras","Laure Soulier","Patrick Gallinari"],"pdf_url":"https://arxiv.org/pdf/2401.17919v1.pdf","comment":"9 pages, 5 figures, 7 tables, EACL 2024 conference"},{"id":"http://arxiv.org/abs/2401.17911v1","updated":"2024-01-31T15:16:25Z","published":"2024-01-31T15:16:25Z","title":"SNNLP: Energy-Efficient Natural Language Processing Using Spiking Neural\n Networks","summary":" As spiking neural networks receive more attention, we look toward\napplications of this computing paradigm in fields other than computer vision\nand signal processing. One major field, underexplored in the neuromorphic\nsetting, is Natural Language Processing (NLP), where most state-of-the-art\nsolutions still heavily rely on resource-consuming and power-hungry traditional\ndeep learning architectures. Therefore, it is compelling to design NLP models\nfor neuromorphic architectures due to their low energy requirements, with the\nadditional benefit of a more human-brain-like operating model for processing\ninformation. However, one of the biggest issues with bringing NLP to the\nneuromorphic setting is in properly encoding text into a spike train so that it\ncan be seamlessly handled by both current and future SNN architectures. In this\npaper, we compare various methods of encoding text as spikes and assess each\nmethod's performance in an associated SNN on a downstream NLP task, namely,\nsentiment analysis. Furthermore, we go on to propose a new method of encoding\ntext as spikes that outperforms a widely-used rate-coding technique, Poisson\nrate-coding, by around 13\\% on our benchmark NLP tasks. Subsequently, we\ndemonstrate the energy efficiency of SNNs implemented in hardware for the\nsentiment analysis task compared to traditional deep neural networks, observing\nan energy efficiency increase of more than 32x during inference and 60x during\ntraining while incurring the expected energy-performance tradeoff.\n","authors":["R. Alexander Knipper","Kaniz Mishty","Mehdi Sadi","Shubhra Kanti Karmaker Santu"],"pdf_url":"https://arxiv.org/pdf/2401.17911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17902v1","updated":"2024-01-31T15:06:34Z","published":"2024-01-31T15:06:34Z","title":"Revisiting speech segmentation and lexicon learning with better features","summary":" We revisit a self-supervised method that segments unlabelled speech into\nword-like segments. We start from the two-stage duration-penalised dynamic\nprogramming method that performs zero-resource segmentation without learning an\nexplicit lexicon. In the first acoustic unit discovery stage, we replace\ncontrastive predictive coding features with HuBERT. 
After word segmentation in\nthe second stage, we get an acoustic word embedding for each segment by\naveraging HuBERT features. These embeddings are clustered using K-means to get\na lexicon. The result is good full-coverage segmentation with a lexicon that\nachieves state-of-the-art performance on the ZeroSpeech benchmarks.\n","authors":["Herman Kamper","Benjamin van Niekerk"],"pdf_url":"https://arxiv.org/pdf/2401.17902v1.pdf","comment":"2 pages"},{"id":"http://arxiv.org/abs/2401.17897v1","updated":"2024-01-31T15:04:01Z","published":"2024-01-31T15:04:01Z","title":"Employing Label Models on ChatGPT Answers Improves Legal Text Entailment\n Performance","summary":" The objective of legal text entailment is to ascertain whether the assertions\nin a legal query logically follow from the information provided in one or\nmultiple legal articles. ChatGPT, a large language model, is robust in many\nnatural language processing tasks, including legal text entailment: when we set\nthe temperature = 0 (the ChatGPT answers are deterministic) and prompt the\nmodel, it achieves 70.64% accuracy on the COLIEE 2022 dataset, which outperforms\nthe previous SOTA of 67.89%. On the other hand, if the temperature is larger\nthan zero, ChatGPT answers are not deterministic, leading to inconsistent\nanswers and fluctuating results. We propose to leverage label models (a\nfundamental component of weak supervision techniques) to integrate the\nprovisional answers by ChatGPT into consolidated labels. In this way, we treat\nChatGPT's provisional answers as noisy predictions that can be consolidated by\nlabel models. The experimental results demonstrate that this approach can\nattain an accuracy of 76.15%, marking a significant improvement of 8.26% over\nthe prior state-of-the-art benchmark. Additionally, we perform an analysis of\nthe instances where ChatGPT produces incorrect answers and then classify the\nerrors, offering insights that could guide potential enhancements for future\nresearch endeavors.\n","authors":["Chau Nguyen","Le-Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2401.17897v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2401.17882v1","updated":"2024-01-31T14:41:23Z","published":"2024-01-31T14:41:23Z","title":"I Think, Therefore I am: Awareness in Large Language Models","summary":" Do large language models (LLMs) exhibit any forms of awareness similar to\nhumans? In this paper, we introduce the concept of awareness to LLMs, arguing\nthat awareness is an essential aspect of trustworthiness for LLMs to enhance\ntheir interaction with humans while ensuring ethical responses. We define\nawareness in LLMs as the ability to perceive and understand themselves as AI\nmodels and to exhibit social intelligence. We identify four key dimensions of\nawareness: capability, mission, emotion, and perspective. To assess LLMs on\nthese dimensions, we introduce a specialized dataset, the AwareLLM dataset. Our\nfindings reveal that LLMs demonstrate a decent degree of awareness, though they\nstill lack substantial capability awareness.\n","authors":["Yuan Li","Yue Huang","Yuli Lin","Siyuan Wu","Yao Wan","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2401.17882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08929v2","updated":"2024-01-31T14:25:15Z","published":"2023-09-16T08:54:30Z","title":"Leveraging Multi-lingual Positive Instances in Contrastive Learning to\n Improve Sentence Embedding","summary":" Learning multi-lingual sentence embeddings is a fundamental task in natural\nlanguage processing. 
Recent trends in learning both mono-lingual and\nmulti-lingual sentence embeddings are mainly based on contrastive learning (CL)\namong an anchor, one positive, and multiple negative instances. In this work,\nwe argue that leveraging multiple positives should be considered for\nmulti-lingual sentence embeddings because (1) positives in a diverse set of\nlanguages can benefit cross-lingual learning, and (2) transitive similarity\nacross multiple positives can provide reliable structural information for\nlearning. In order to investigate the impact of multiple positives in CL, we\npropose a novel approach, named MPCL, to effectively utilize multiple positive\ninstances to improve the learning of multi-lingual sentence embeddings.\nExperimental results on various backbone models and downstream tasks\ndemonstrate that MPCL leads to better retrieval, semantic similarity, and\nclassification performances compared to conventional CL. We also observe that\nin unseen languages, sentence embedding models trained on multiple positives\nshow better cross-lingual transfer performance than models trained on a single\npositive instance.\n","authors":["Kaiyan Zhao","Qiyu Wu","Xin-Qiang Cai","Yoshimasa Tsuruoka"],"pdf_url":"https://arxiv.org/pdf/2309.08929v2.pdf","comment":"Accepted to EACL 2024, main conference"},{"id":"http://arxiv.org/abs/2401.17858v1","updated":"2024-01-31T14:19:03Z","published":"2024-01-31T14:19:03Z","title":"Probing Language Models' Gesture Understanding for Enhanced Human-AI\n Interaction","summary":" The rise of Large Language Models (LLMs) has affected various disciplines\nthat go beyond mere text generation. Going beyond their textual nature, this\nproject proposal aims to investigate the interaction between LLMs and\nnon-verbal communication, specifically focusing on gestures. The proposal sets\nout a plan to examine the proficiency of LLMs in deciphering both explicit and\nimplicit non-verbal cues within textual prompts and their ability to associate\nthese gestures with various contextual factors. The research proposes to test\nestablished psycholinguistic study designs to construct a comprehensive dataset\nthat pairs textual prompts with detailed gesture descriptions, encompassing\ndiverse regional variations and semantic labels. To assess LLMs' comprehension\nof gestures, experiments are planned, evaluating their ability to simulate\nhuman behaviour in order to replicate psycholinguistic experiments. These\nexperiments consider cultural dimensions and measure the agreement between\nLLM-identified gestures and the dataset, shedding light on the models'\ncontextual interpretation of non-verbal cues (e.g. gestures).\n","authors":["Philipp Wicke"],"pdf_url":"https://arxiv.org/pdf/2401.17858v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.17839v1","updated":"2024-01-31T13:57:24Z","published":"2024-01-31T13:57:24Z","title":"Global-Liar: Factuality of LLMs over Time and Geographic Regions","summary":" The increasing reliance on AI-driven solutions, particularly Large Language\nModels (LLMs) like the GPT series, for information retrieval highlights the\ncritical need for their factuality and fairness, especially amidst the rampant\nspread of misinformation and disinformation online. 
Our study evaluates the\nfactual accuracy, stability, and biases in widely adopted GPT models, including\nGPT-3.5 and GPT-4, contributing to the reliability and integrity of AI-mediated\ninformation dissemination.\n We introduce 'Global-Liar,' a dataset uniquely balanced in terms of\ngeographic and temporal representation, facilitating a more nuanced evaluation\nof LLM biases. Our analysis reveals that newer iterations of GPT models do not\nalways equate to improved performance. Notably, the GPT-4 version from March\ndemonstrates higher factual accuracy than its subsequent June release.\nFurthermore, a concerning bias is observed, privileging statements from the\nGlobal North over the Global South, thus potentially exacerbating existing\ninformational inequities. Regions such as Africa and the Middle East are at a\ndisadvantage, with much lower factual accuracy. The performance fluctuations\nover time suggest that model updates may not consistently benefit all regions\nequally.\n Our study also offers insights into the impact of various LLM configuration\nsettings, such as binary decision forcing, model re-runs, and temperature, on a\nmodel's factuality. Models constrained to binary (true/false) choices exhibit\nreduced factuality compared to those allowing an 'unclear' option. Single\ninference at a low temperature setting matches the reliability of majority\nvoting across various configurations. The insights gained highlight the need\nfor culturally diverse and geographically inclusive model training and\nevaluation. This approach is key to achieving global equity in technology,\ndistributing AI benefits fairly worldwide.\n","authors":["Shujaat Mirza","Bruno Coelho","Yuyuan Cui","Christina Pöpper","Damon McCoy"],"pdf_url":"https://arxiv.org/pdf/2401.17839v1.pdf","comment":"24 pages, 12 figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.17827v1","updated":"2024-01-31T13:40:00Z","published":"2024-01-31T13:40:00Z","title":"Neural Machine Translation for Malayalam Paraphrase Generation","summary":" This study explores four methods of generating paraphrases in Malayalam,\nutilizing resources available for English paraphrasing and pre-trained Neural\nMachine Translation (NMT) models. We evaluate the resulting paraphrases using\nboth automated metrics, such as BLEU, METEOR, and cosine similarity, and\nhuman annotation. Our findings suggest that automated evaluation measures may\nnot be fully appropriate for Malayalam, as they do not consistently align with\nhuman judgment. This discrepancy underscores the need for more nuanced\nparaphrase evaluation approaches, especially for highly agglutinative languages.\n","authors":["Christeena Varghese","Sergey Koshelev","Ivan P. Yamshchikov"],"pdf_url":"https://arxiv.org/pdf/2401.17827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17824v1","updated":"2024-01-31T13:35:07Z","published":"2024-01-31T13:35:07Z","title":"A Survey of Pre-trained Language Models for Processing Scientific Text","summary":" The number of Language Models (LMs) dedicated to processing scientific text\nis on the rise. Keeping pace with the rapid growth of scientific LMs (SciLMs)\nhas become a daunting task for researchers. To date, no comprehensive surveys\non SciLMs have been undertaken, leaving this issue unaddressed. Given the\nconstant stream of new SciLMs, the state of the art and how SciLMs\ncompare to each other remain largely unknown. 
This work fills that gap and\nprovides a comprehensive review of SciLMs, including an extensive analysis of\ntheir effectiveness across different domains, tasks and datasets, and a\ndiscussion on the challenges that lie ahead.\n","authors":["Xanh Ho","Anh Khoa Duong Nguyen","An Tuan Dao","Junfeng Jiang","Yuki Chida","Kaito Sugimoto","Huy Quoc To","Florian Boudin","Akiko Aizawa"],"pdf_url":"https://arxiv.org/pdf/2401.17824v1.pdf","comment":"Resources are available at https://github.com/Alab-NII/Awesome-SciLM"},{"id":"http://arxiv.org/abs/2305.14205v2","updated":"2024-01-31T13:28:58Z","published":"2023-05-23T16:25:21Z","title":"$μ$PLAN: Summarizing using a Content Plan as Cross-Lingual Bridge","summary":" Cross-lingual summarization consists of generating a summary in one language\ngiven an input document in a different language, allowing for the dissemination\nof relevant content across speakers of other languages. The task is challenging\nmainly due to the paucity of cross-lingual datasets and the compounded\ndifficulty of summarizing and translating. This work presents $\\mu$PLAN, an\napproach to cross-lingual summarization that uses an intermediate planning step\nas a cross-lingual bridge. We formulate the plan as a sequence of entities\ncapturing the summary's content and the order in which it should be\ncommunicated. Importantly, our plans abstract from surface form: using a\nmultilingual knowledge base, we align entities to their canonical designation\nacross languages and generate the summary conditioned on this cross-lingual\nbridge and the input. Automatic and human evaluation on the XWikis dataset\n(across four language pairs) demonstrates that our planning objective achieves\nstate-of-the-art performance in terms of informativeness and faithfulness.\nMoreover, $\\mu$PLAN models improve the zero-shot transfer to new cross-lingual\nlanguage pairs compared to baselines without a planning component.\n","authors":["Fantine Huot","Joshua Maynez","Chris Alberti","Reinald Kim Amplayo","Priyanka Agrawal","Constanza Fierro","Shashi Narayan","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2305.14205v2.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2212.10558v2","updated":"2024-01-31T13:14:02Z","published":"2022-12-20T18:58:33Z","title":"On-the-fly Denoising for Data Augmentation in Natural Language\n Understanding","summary":" Data Augmentation (DA) is frequently used to provide additional training data\nwithout extra human annotation automatically. However, data augmentation may\nintroduce noisy data that impairs training. To guarantee the quality of\naugmented data, existing methods either assume no noise exists in the augmented\ndata and adopt consistency training or use simple heuristics such as training\nloss and diversity constraints to filter out \"noisy\" data. However, those\nfiltered examples may still contain useful information, and dropping them\ncompletely causes a loss of supervision signals. In this paper, based on the\nassumption that the original dataset is cleaner than the augmented data, we\npropose an on-the-fly denoising technique for data augmentation that learns\nfrom soft augmented labels provided by an organic teacher model trained on the\ncleaner original data. To further prevent overfitting on noisy labels, a simple\nself-regularization module is applied to force the model prediction to be\nconsistent across two distinct dropouts. 
Our method can be applied to general\naugmentation techniques and consistently improves the performance on both text\nclassification and question-answering tasks.\n","authors":["Tianqing Fang","Wenxuan Zhou","Fangyu Liu","Hongming Zhang","Yangqiu Song","Muhao Chen"],"pdf_url":"https://arxiv.org/pdf/2212.10558v2.pdf","comment":"Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2401.17809v1","updated":"2024-01-31T13:08:45Z","published":"2024-01-31T13:08:45Z","title":"SWEA: Changing Factual Knowledge in Large Language Models via Subject\n Word Embedding Altering","summary":" Model editing has recently gained widespread attention. Current model editing\nmethods primarily involve modifying model parameters or adding additional\nmodules to the existing model. However, the former causes irreversible damage\nto LLMs, while the latter incurs additional inference overhead, and fuzzy vector\nmatching is not always reliable. To address these issues, we propose an\nexpandable Subject Word Embedding Altering (SWEA) framework, which modifies the\nrepresentations of subjects and achieves the goal of editing knowledge during the\ninference stage. SWEA uses precise key matching outside the model and performs\nreliable subject word embedding altering, thus protecting the original weights\nof the model without increasing inference overhead. We then propose an\noptimizing-then-suppressing fusion method, which first optimizes the embedding\nvector for the editing target and then suppresses the Knowledge Embedding\nDimension (KED) to obtain the final fused embedding. We thus propose the SWEAOS\nmethod for editing factual knowledge in LLMs. We demonstrate the\nstate-of-the-art performance of SWEAOS on the COUNTERFACT and zsRE datasets. To\nfurther validate the reasoning ability of SWEAOS in editing knowledge, we\nevaluate it on the more complex RIPPLEEDITS benchmark. The results on two\nsubdatasets demonstrate that our SWEAOS possesses state-of-the-art reasoning\nability.\n","authors":["Xiaopeng Li","Shasha Li","Bin Ji","Shezheng Song","Xi Wang","Jun Ma","Jie Yu","Xiaodong Liu","Jing Wang","Weimin Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17809v1.pdf","comment":"Work in progress; Our code will be released"},{"id":"http://arxiv.org/abs/2307.15176v3","updated":"2024-01-31T12:40:40Z","published":"2023-07-27T20:11:07Z","title":"RCT Rejection Sampling for Causal Estimation Evaluation","summary":" Confounding is a significant obstacle to unbiased estimation of causal\neffects from observational data. For settings with high-dimensional covariates\n-- such as text data, genomics, or the behavioral social sciences --\nresearchers have proposed methods to adjust for confounding by adapting machine\nlearning methods to the goal of causal estimation. However, empirical\nevaluation of these adjustment methods has been challenging and limited. In\nthis work, we build on a promising empirical evaluation strategy that\nsimplifies evaluation design and uses real data: subsampling randomized\ncontrolled trials (RCTs) to create confounded observational datasets while\nusing the average causal effects from the RCTs as ground-truth. We contribute a\nnew sampling algorithm, which we call RCT rejection sampling, and provide\ntheoretical guarantees that causal identification holds in the observational\ndata to allow for valid comparisons to the ground-truth RCT. 
Using synthetic\ndata, we show our algorithm indeed results in low bias when oracle estimators\nare evaluated on the confounded samples, which is not always the case for a\npreviously proposed algorithm. In addition to this identification result, we\nhighlight several finite data considerations for evaluation designers who plan\nto use RCT rejection sampling on their own datasets. As a proof of concept, we\nimplement an example evaluation pipeline and walk through these finite data\nconsiderations with a novel, real-world RCT -- which we release publicly --\nconsisting of approximately 70k observations and text data as high-dimensional\ncovariates. Together, these contributions build towards a broader agenda of\nimproved empirical evaluation for causal estimation.\n","authors":["Katherine A. Keith","Sergey Feldman","David Jurgens","Jonathan Bragg","Rohit Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.15176v3.pdf","comment":"Code and data at https://github.com/kakeith/rct_rejection_sampling"},{"id":"http://arxiv.org/abs/2310.20703v2","updated":"2024-01-31T12:39:06Z","published":"2023-10-31T17:59:05Z","title":"Vanishing Gradients in Reinforcement Finetuning of Language Models","summary":" Pretrained language models are commonly aligned with human preferences and\ndownstream tasks via reinforcement finetuning (RFT), which refers to maximizing\na (possibly learned) reward function using policy gradient algorithms. This\nwork identifies a fundamental optimization obstacle in RFT: we prove that the\nexpected gradient for an input vanishes when its reward standard deviation\nunder the model is small, even if the expected reward is far from optimal.\nThrough experiments on an RFT benchmark and controlled environments, as well as\na theoretical analysis, we then demonstrate that vanishing gradients due to\nsmall reward standard deviation are prevalent and detrimental, leading to\nextremely slow reward maximization. Lastly, we explore ways to overcome\nvanishing gradients in RFT. We find the common practice of an initial\nsupervised finetuning (SFT) phase to be the most promising candidate, which\nsheds light on its importance in an RFT pipeline. Moreover, we show that a\nrelatively small number of SFT optimization steps on as few as 1% of the input\nsamples can suffice, indicating that the initial SFT phase need not be\nexpensive in terms of compute and data labeling efforts. Overall, our results\nemphasize that being mindful of inputs whose expected gradient vanishes, as\nmeasured by the reward standard deviation, is crucial for successful execution\nof RFT.\n","authors":["Noam Razin","Hattie Zhou","Omid Saremi","Vimal Thilak","Arwen Bradley","Preetum Nakkiran","Joshua Susskind","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2310.20703v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.15378v3","updated":"2024-01-31T12:39:06Z","published":"2024-01-27T10:50:11Z","title":"A RAG-based Question Answering System Proposal for Understanding Islam:\n MufassirQAS LLM","summary":" Learning and understanding religions is challenging owing to the complexity\nand depth of religious doctrines and teachings. Chatbots as question-answering\nsystems can help in solving these challenges. LLM chatbots use NLP techniques\nto establish connections between topics and accurately respond to complex\nquestions. These capabilities make them well suited for use as\nquestion-answering chatbots on religion. 
However,\nLLMs also have a tendency to generate false information, known as\nhallucination. The responses of the chatbots can include content that insults\npersonal religious beliefs or touches on interfaith conflicts and controversial\nor sensitive topics. The system needs to avoid such cases without promoting\nhate speech or offending certain groups of people or their beliefs. This study\nuses a vector database-based Retrieval Augmented Generation (RAG) approach to\nenhance the accuracy and transparency of LLMs. Our question-answering system is\ncalled \"MufassirQAS\". We created a vector database with several open-access\nbooks that provide Turkish context: Turkish translations of, and\ninterpretations on, Islam. We crafted the system prompts with care, ensuring\nthey provide instructions that prevent harmful, offensive, or disrespectful\nresponses. We also tested MufassirQAS and ChatGPT with sensitive questions and\nobtained better performance with our system. The study and enhancements are\nstill in progress; results and future work are presented.\n","authors":["Ahmet Yusuf Alan","Enis Karaarslan","Ömer Aydin"],"pdf_url":"https://arxiv.org/pdf/2401.15378v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17755v1","updated":"2024-01-31T11:30:24Z","published":"2024-01-31T11:30:24Z","title":"CauESC: A Causal Aware Model for Emotional Support Conversation","summary":" Emotional Support Conversation aims at reducing the seeker's emotional\ndistress through supportive responses. Existing approaches have two limitations:\n(1) They ignore the emotion causes of the distress, which is important for\nfine-grained emotion understanding; (2) They focus on the seeker's own mental\nstate rather than the emotional dynamics during interaction between speakers.\nTo address these issues, we propose a novel framework, CauESC, which first\nrecognizes the emotion causes of the distress, as well as the emotion effects\ntriggered by the causes, and then understands each strategy of verbal grooming\nindependently and integrates them skillfully. Experimental results on the\nbenchmark dataset demonstrate the effectiveness of our approach and show the\nbenefits of emotion understanding from cause to effect and\nindependent-integrated strategy modeling.\n","authors":["Wei Chen","Hengxu Lin","Qun Zhang","Xiaojin Zhang","Xiang Bai","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2401.17755v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03863v3","updated":"2024-01-31T11:29:40Z","published":"2023-12-06T19:18:42Z","title":"Efficient Large Language Models: A Survey","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nimportant tasks such as natural language understanding, language generation,\nand complex reasoning and have the potential to make a substantial impact on\nour society. Such capabilities, however, come with the considerable resources\nthey demand, highlighting the strong need to develop effective techniques for\naddressing their efficiency challenges. In this survey, we provide a systematic\nand comprehensive review of efficient LLMs research. We organize the literature\nin a taxonomy consisting of three main categories, covering distinct yet\ninterconnected efficient-LLM topics from model-centric, data-centric, and\nframework-centric perspectives, respectively. 
We have also created a GitHub\nrepository where we compile the papers featured in this survey at\nhttps://github.com/AIoT-MLSys-Lab/Efficient-LLMs-Survey, and will actively\nmaintain this repository and incorporate new research as it emerges. We hope\nour survey can serve as a valuable resource to help researchers and\npractitioners gain a systematic understanding of the research developments in\nefficient LLMs and inspire them to contribute to this important and exciting\nfield.\n","authors":["Zhongwei Wan","Xin Wang","Che Liu","Samiul Alam","Yu Zheng","Jiachen Liu","Zhongnan Qu","Shen Yan","Yi Zhu","Quanlu Zhang","Mosharaf Chowdhury","Mi Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03863v3.pdf","comment":"Version 3: Added more recent papers"},{"id":"http://arxiv.org/abs/2401.14440v2","updated":"2024-01-31T10:52:52Z","published":"2024-01-25T14:47:05Z","title":"Semantic Sensitivities and Inconsistent Predictions: Measuring the\n Fragility of NLI Models","summary":" Recent studies of the emergent capabilities of transformer-based Natural\nLanguage Understanding (NLU) models have indicated that they have an\nunderstanding of lexical and compositional semantics. We provide evidence that\nsuggests these claims should be taken with a grain of salt: we find that\nstate-of-the-art Natural Language Inference (NLI) models are sensitive to\nminor semantics-preserving surface-form variations, which lead to sizable\ninconsistent model decisions during inference. Notably, this behaviour differs\nfrom valid and in-depth comprehension of compositional semantics; however, it\nneither emerges when evaluating model accuracy on standard benchmarks nor when\nprobing for syntactic, monotonic, and logically robust reasoning. We propose a\nnovel framework to measure the extent of semantic sensitivity. To this end, we\nevaluate NLI models on adversarially generated examples containing minor\nsemantics-preserving surface-form input noise. This is achieved using\nconditional text generation, with the explicit condition that the NLI model\npredicts the relationship between the original and adversarial inputs as a\nsymmetric equivalence entailment. We systematically study the effects of the\nphenomenon across NLI models for $\\textbf{in-}$ and $\\textbf{out-of-}$ domain\nsettings. Our experiments show that semantic sensitivity causes performance\ndegradations of $12.92\\%$ and $23.71\\%$ on average over $\\textbf{in-}$ and\n$\\textbf{out-of-}$ domain settings, respectively. We further perform ablation\nstudies, analysing this phenomenon across models, datasets, and variations in\ninference and show that semantic sensitivity can lead to major inconsistency\nwithin model predictions.\n","authors":["Erik Arakelyan","Zhaoqi Liu","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2401.14440v2.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2401.17716v1","updated":"2024-01-31T10:20:01Z","published":"2024-01-31T10:20:01Z","title":"Enhancing Large Language Model with Decomposed Reasoning for Emotion\n Cause Pair Extraction","summary":" Emotion-Cause Pair Extraction (ECPE) involves extracting clause pairs\nrepresenting emotions and their causes in a document. Existing methods tend to\noverfit spurious correlations, such as positional bias in existing benchmark\ndatasets, rather than capturing semantic features. Inspired by recent work, we\nexplore leveraging a large language model (LLM) to address the ECPE task without\nadditional training. 
Despite strong capabilities, LLMs suffer from\nuncontrollable outputs, resulting in mediocre performance. To address this, we\nintroduce chain-of-thought reasoning to mimic the human cognitive process and\npropose the Decomposed Emotion-Cause Chain (DECC) framework. Combining inducing\ninference and logical pruning, DECC guides LLMs to tackle the ECPE task. We\nfurther enhance the framework by incorporating in-context learning. Experimental\nresults demonstrate the strength of DECC compared to state-of-the-art supervised\nfine-tuning methods. Finally, we analyze the effectiveness of each component\nand the robustness of the method in various scenarios, including different LLM\nbases, rebalanced datasets, and multi-pair extraction.\n","authors":["Jialiang Wu","Yi Shen","Ziheng Zhang","Longjun Cai"],"pdf_url":"https://arxiv.org/pdf/2401.17716v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.04645v2","updated":"2024-01-31T09:54:43Z","published":"2023-10-07T01:39:56Z","title":"Do self-supervised speech and language models extract similar\n representations as human brain?","summary":" Speech and language models trained through self-supervised learning (SSL)\ndemonstrate strong alignment with brain activity during speech and language\nperception. However, given their distinct training modalities, it remains\nunclear whether they correlate with the same neural aspects. We directly\naddress this question by evaluating the brain prediction performance of two\nrepresentative SSL models, Wav2Vec2.0 and GPT-2, designed for speech and\nlanguage tasks. Our findings reveal that both models accurately predict speech\nresponses in the auditory cortex, with a significant correlation between their\nbrain predictions. Notably, shared speech contextual information between\nWav2Vec2.0 and GPT-2 accounts for the majority of explained variance in brain\nactivity, surpassing static semantic and lower-level acoustic-phonetic\ninformation. These results underscore the convergence of speech contextual\nrepresentations in SSL models and their alignment with the neural network\nunderlying speech perception, offering valuable insights into both SSL models\nand the neural basis of speech and language processing.\n","authors":["Peili Chen","Linyang He","Li Fu","Lu Fan","Edward F. Chang","Yuanning Li"],"pdf_url":"https://arxiv.org/pdf/2310.04645v2.pdf","comment":"To appear in 2024 IEEE International Conference on Acoustics, Speech\n and Signal Processing"},{"id":"http://arxiv.org/abs/2401.17703v1","updated":"2024-01-31T09:49:22Z","published":"2024-01-31T09:49:22Z","title":"WSC+: Enhancing The Winograd Schema Challenge Using Tree-of-Experts","summary":" The Winograd Schema Challenge (WSC) serves as a prominent benchmark for\nevaluating machine understanding. While Large Language Models (LLMs) excel at\nanswering WSC questions, their ability to generate such questions remains less\nexplored. In this work, we propose Tree-of-Experts (ToE), a novel prompting\nmethod which enhances the generation of WSC instances (50% valid cases vs. 10%\nin recent methods). Using this approach, we introduce WSC+, a novel dataset\ncomprising 3,026 LLM-generated sentences. Notably, we extend the WSC framework\nby incorporating new 'ambiguous' and 'offensive' categories, providing a deeper\ninsight into model overconfidence and bias. Our analysis reveals nuances in\ngeneration-evaluation consistency, suggesting that LLMs are not always better\nat evaluating their own generated questions than those crafted by other models. 
On WSC+, GPT-4, the top-performing LLM, achieves an\naccuracy of 68.7%, significantly below the human benchmark of 95.1%.\n","authors":["Pardis Sadat Zahraei","Ali Emami"],"pdf_url":"https://arxiv.org/pdf/2401.17703v1.pdf","comment":"Accepted for publication in main proceedings of EACL 2024 conference,\n 22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2401.17692v1","updated":"2024-01-31T09:28:06Z","published":"2024-01-31T09:28:06Z","title":"Mitigating the Problem of Strong Priors in LMs with Context\n Extrapolation","summary":" Language models (LMs) have become important tools in a variety of\napplications, from data processing to the creation of instruction-following\nassistants. But despite their advantages, LMs have certain idiosyncratic\nlimitations such as the problem of `strong priors', where a model learns to\noutput typical continuations in response to certain, usually local, portions of\nthe input regardless of any earlier instructions. For example, prompt injection\nattacks can induce models to ignore explicit directives. In some cases, larger\nmodels have been shown to be more susceptible to these problems than similar\nsmaller models, an example of the phenomenon of `inverse scaling'. We develop a\nnew technique for mitigating the problem of strong priors: we take the original\nset of instructions, produce a weakened version of the original prompt that is\neven more susceptible to the strong priors problem, and then extrapolate the\ncontinuation away from the weakened prompt. This lets us infer how the model\nwould continue a hypothetical strengthened set of instructions. Our technique\nconceptualises LMs as mixture models which combine a family of data generation\nprocesses, reinforcing the desired elements of the mixture. Our approach works\nat inference time, removing any need for retraining. We apply it to eleven\nmodels including GPT-2, GPT-3, Llama 2, and Mistral on four tasks, and find\nimprovements in 41/44. Across all 44 combinations the median increase in\nproportion of tasks completed is 40%.\n","authors":["Raymond Douglas","Andis Draguns","Tomáš Gavenčiak"],"pdf_url":"https://arxiv.org/pdf/2401.17692v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.17686v1","updated":"2024-01-31T09:16:35Z","published":"2024-01-31T09:16:35Z","title":"Deductive Beam Search: Decoding Deducible Rationale for Chain-of-Thought\n Reasoning","summary":" Recent advancements have significantly augmented the reasoning capabilities\nof Large Language Models (LLMs) through various methodologies, especially\nchain-of-thought (CoT) reasoning. However, previous methods fail to address\nreasoning errors in intermediate steps, leading to accumulative errors. In this\npaper, we propose Deductive Beam Search (DBS), which seamlessly integrates CoT\nand deductive reasoning with step-wise beam search for LLMs. Our approach\ndeploys a verifier that verifies the deducibility of a reasoning step and its\npremises, thus alleviating error accumulation. Furthermore, we introduce a\nscalable and labor-free data construction method to amplify our model's\nverification capabilities. Extensive experiments demonstrate that our approach\nsignificantly enhances the base performance of LLMs of various scales (7B, 13B,\n70B, and ChatGPT) across 8 reasoning datasets from 3 diverse reasoning genres,\nincluding arithmetic, commonsense, and symbolic. 
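One plausible reading of the context-extrapolation technique above is logit arithmetic: score the next token under the original prompt and under the instruction-stripped (weakened) prompt, then move past the original in the direction away from the weakened one, approximating a "strengthened" set of instructions. The prompts and the linear combination rule below are our illustrative assumptions, not the paper's exact recipe:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def next_logits(prompt):
    # Next-token logits for the final position of the prompt.
    ids = tok(prompt, return_tensors="pt").input_ids
    with torch.no_grad():
        return model(ids).logits[0, -1]

original = ("Ignore the nursery rhyme and answer with a color: "
            "twinkle twinkle little")
weakened = "twinkle twinkle little"  # instructions stripped: the prior dominates

alpha = 1.0  # alpha > 0 extrapolates beyond the original instructions
extrapolated = (1 + alpha) * next_logits(original) - alpha * next_logits(weakened)
print(tok.decode(extrapolated.argmax()))
```

Because everything happens at inference time over two forward passes, no retraining is needed, which matches the paper's framing.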
Moreover, our analysis proves\nDBS's capability of detecting diverse and subtle reasoning errors and\nrobustness on different model scales.\n","authors":["Tinghui Zhu","Kai Zhang","Jian Xie","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2401.17686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17671v1","updated":"2024-01-31T08:48:35Z","published":"2024-01-31T08:48:35Z","title":"Contextual Feature Extraction Hierarchies Converge in Large Language\n Models and the Brain","summary":" Recent advancements in artificial intelligence have sparked interest in the\nparallels between large language models (LLMs) and human neural processing,\nparticularly in language comprehension. While prior research has established\nsimilarities in the representation of LLMs and the brain, the underlying\ncomputational principles that cause this convergence, especially in the context\nof evolving LLMs, remain elusive. Here, we examined a diverse selection of\nhigh-performance LLMs with similar parameter sizes to investigate the factors\ncontributing to their alignment with the brain's language processing\nmechanisms. We find that as LLMs achieve higher performance on benchmark tasks,\nthey not only become more brain-like as measured by higher performance when\npredicting neural responses from LLM embeddings, but also their hierarchical\nfeature extraction pathways map more closely onto the brain's while using fewer\nlayers to do the same encoding. We also compare the feature extraction pathways\nof the LLMs to each other and identify new ways in which high-performing models\nhave converged toward similar hierarchical processing mechanisms. Finally, we\nshow the importance of contextual information in improving model performance\nand brain similarity. Our findings reveal the converging aspects of language\nprocessing in the brain and LLMs and offer new directions for developing models\nthat align more closely with human cognitive processing.\n","authors":["Gavin Mischler","Yinghao Aaron Li","Stephan Bickel","Ashesh D. Mehta","Nima Mesgarani"],"pdf_url":"https://arxiv.org/pdf/2401.17671v1.pdf","comment":"19 pages, 5 figures and 4 supplementary figures"},{"id":"http://arxiv.org/abs/2401.16092v2","updated":"2024-01-31T08:33:37Z","published":"2024-01-29T12:02:28Z","title":"Multilingual Text-to-Image Generation Magnifies Gender Stereotypes and\n Prompt Engineering May Not Help You","summary":" Text-to-image generation models have recently achieved astonishing results in\nimage quality, flexibility, and text alignment and are consequently employed in\na fast-growing number of applications. Through improvements in multilingual\nabilities, a larger community now has access to this kind of technology. Yet,\nas we will show, multilingual models suffer similarly from (gender) biases as\nmonolingual models. Furthermore, the natural expectation is that these models\nwill provide similar results across languages, but this is not the case and\nthere are important differences between languages. Thus, we propose a novel\nbenchmark MAGBIG intending to foster research in multilingual models without\ngender bias. We investigate whether multilingual T2I models magnify gender bias\nwith MAGBIG. To this end, we use multilingual prompts requesting portrait\nimages of persons of a certain occupation or trait (using adjectives). Our\nresults show not only that models deviate from the normative assumption that\neach gender should be equally likely to be generated, but that there are also\nbig differences across languages. 
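The brain-alignment measure mentioned above ("predicting neural responses from LLM embeddings") is conventionally a regularised linear encoding model scored by correlation on held-out stimuli. A generic sketch with synthetic arrays standing in for real embeddings and recordings:

```python
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 768))  # LLM embeddings, one row per stimulus
# Synthetic "neural" channels: a noisy linear readout of the embeddings.
Y = X @ rng.normal(size=(768, 64)) * 0.1 + rng.normal(size=(1000, 64))

X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.2, random_state=0)
enc = RidgeCV(alphas=np.logspace(-2, 4, 13)).fit(X_tr, Y_tr)  # cross-validated ridge
pred = enc.predict(X_te)

# Encoding score: per-channel Pearson r between predicted and held-out responses.
r = [np.corrcoef(pred[:, i], Y_te[:, i])[0, 1] for i in range(Y.shape[1])]
print("mean encoding r:", float(np.mean(r)))
```

Repeating this per layer is what yields the hierarchical feature-extraction-pathway comparisons the abstract describes.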
Furthermore, we investigate prompt\nengineering strategies, i.e. the use of indirect, neutral formulations, as a\npossible remedy for these biases. Unfortunately, they help only to a limited\nextent and result in worse text-to-image alignment. Consequently, this work\ncalls for more research into diverse representations across languages in image\ngenerators.\n","authors":["Felix Friedrich","Katharina Hämmerl","Patrick Schramowski","Jindrich Libovicky","Kristian Kersting","Alexander Fraser"],"pdf_url":"https://arxiv.org/pdf/2401.16092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17658v1","updated":"2024-01-31T08:28:06Z","published":"2024-01-31T08:28:06Z","title":"Document Structure in Long Document Transformers","summary":" Long documents often exhibit structure with hierarchically organized elements\nof different functions, such as section headers and paragraphs. Despite the\nomnipresence of document structure, its role in natural language processing\n(NLP) remains opaque. Do long-document Transformer models acquire an internal\nrepresentation of document structure during pre-training? How can structural\ninformation be communicated to a model after pre-training, and how does it\ninfluence downstream performance? To answer these questions, we develop a novel\nsuite of probing tasks to assess structure-awareness of long-document\nTransformers, propose general-purpose structure infusion methods, and evaluate\nthe effects of structure infusion on QASPER and Evidence Inference, two\nchallenging long-document NLP tasks. Results on LED and LongT5 suggest that\nthey acquire implicit understanding of document structure during pre-training,\nwhich can be further enhanced by structure infusion, leading to improved\nend-task performance. To foster research on the role of document structure in\nNLP modeling, we make our data and code publicly available.\n","authors":["Jan Buchmann","Max Eichler","Jan-Micha Bodensohn","Ilia Kuznetsov","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2401.17658v1.pdf","comment":"Accepted at EACL 2024. Code and data:\n http://github.com/UKPLab/eacl2024-doc-structure"},{"id":"http://arxiv.org/abs/2401.16403v2","updated":"2024-01-31T07:59:16Z","published":"2024-01-29T18:41:39Z","title":"ViLexNorm: A Lexical Normalization Corpus for Vietnamese Social Media\n Text","summary":" Lexical normalization, a fundamental task in Natural Language Processing\n(NLP), involves the transformation of words into their canonical forms. This\nprocess has been proven to benefit various downstream NLP tasks greatly. In\nthis work, we introduce Vietnamese Lexical Normalization (ViLexNorm), the\nfirst-ever corpus developed for the Vietnamese lexical normalization task. The\ncorpus comprises over 10,000 pairs of sentences meticulously annotated by human\nannotators, sourced from public comments on Vietnam's most popular social media\nplatforms. Various methods were used to evaluate our corpus, and the\nbest-performing system achieved a result of 57.74% using the Error Reduction\nRate (ERR) metric (van der Goot, 2019a) with the Leave-As-Is (LAI) baseline.\nFor extrinsic evaluation, employing the model trained on ViLexNorm demonstrates\nthe positive impact of the Vietnamese lexical normalization task on other NLP\ntasks. 
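The ERR score used for ViLexNorm rewards accuracy gains over the Leave-As-Is baseline, normalised by that baseline's remaining headroom. A sketch of the metric as we understand van der Goot (2019a), with invented toy tokens:

```python
def error_reduction_rate(gold, system, raw):
    """ERR = (acc_system - acc_LAI) / (1 - acc_LAI), computed word-level.
    gold/system/raw are parallel token lists; LAI outputs raw tokens as-is."""
    n = len(gold)
    acc_sys = sum(g == s for g, s in zip(gold, system)) / n
    acc_lai = sum(g == r for g, r in zip(gold, raw)) / n
    return (acc_sys - acc_lai) / (1.0 - acc_lai)

# Toy example: two tokens need normalising, the system fixes one of them.
raw    = ["ko", "dc", "hello"]
gold   = ["không", "được", "hello"]
system = ["không", "dc", "hello"]
print(error_reduction_rate(gold, system, raw))  # 0.5
```

The normalisation matters because most social-media tokens are already canonical, so raw accuracy alone would flatter a do-nothing system.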
Our corpus is publicly available exclusively for research purposes.\n","authors":["Thanh-Nhi Nguyen","Thanh-Phong Le","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2401.16403v2.pdf","comment":"Accepted at the EACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2401.17633v1","updated":"2024-01-31T07:26:47Z","published":"2024-01-31T07:26:47Z","title":"Navigating the OverKill in Large Language Models","summary":" Large language models are meticulously aligned to be both helpful and\nharmless. However, recent research points to a potential for overkill, meaning\nthat models may refuse to answer benign queries. In this paper, we investigate\nthe factors behind overkill by exploring how models handle and determine the\nsafety of queries. Our findings reveal the presence of shortcuts within models,\nleading to over-attention to harmful words like 'kill'; prompts emphasizing\nsafety exacerbate this overkill. Based on these insights, we introduce\nSelf-Contrastive Decoding (Self-CD), a training-free and model-agnostic\nstrategy, to alleviate this phenomenon. We first extract such over-attention by\namplifying the difference in the model's output distributions when responding\nto system prompts that either include or omit an emphasis on safety. Then we\ndetermine the final next-token predictions by downplaying the over-attention\nfrom the model via contrastive decoding. Empirical results indicate that our\nmethod achieves an average 20\\% reduction in the refusal rate while having\nalmost no impact on safety.\n","authors":["Chenyu Shi","Xiao Wang","Qiming Ge","Songyang Gao","Xianjun Yang","Tao Gui","Qi Zhang","Xuanjing Huang","Xun Zhao","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2401.17633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17632v1","updated":"2024-01-31T07:23:22Z","published":"2024-01-31T07:23:22Z","title":"What Do Self-Supervised Speech and Speaker Models Learn? New Findings\n From a Cross Model Layer-Wise Analysis","summary":" Self-supervised learning (SSL) has attracted increased attention for learning\nmeaningful speech representations. Speech SSL models, such as WavLM, employ\nmasked prediction training to encode general-purpose representations. In\ncontrast, speaker SSL models, exemplified by DINO-based models, adopt\nutterance-level training objectives primarily for speaker representation.\nUnderstanding how these models represent information is essential for refining\nmodel efficiency and effectiveness. Unlike the various analyses of speech SSL,\nthere has been limited investigation into what information speaker SSL captures\nand how its representation differs from speech SSL or other fully-supervised\nspeaker models. This paper addresses these fundamental questions. We explore\nthe capacity to capture various speech properties by applying SUPERB evaluation\nprobing tasks to speech and speaker SSL models. We also examine which layers\nare predominantly utilized for each task to identify differences in how speech\nis represented. Furthermore, we conduct direct comparisons to measure the\nsimilarities between layers within and across models. 
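Layer-wise similarity comparisons like the one above are commonly computed with linear centered kernel alignment (CKA); the paper does not say which measure it uses, so treat CKA here as one reasonable choice:

```python
import numpy as np

def linear_cka(X, Y):
    """Linear CKA between two representation matrices (n_samples x dim)."""
    X = X - X.mean(0, keepdims=True)  # center features
    Y = Y - Y.mean(0, keepdims=True)
    num = np.linalg.norm(X.T @ Y, "fro") ** 2
    den = np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro")
    return num / den

rng = np.random.default_rng(0)
layer_a = rng.normal(size=(500, 768))            # e.g. features from one layer
layer_b = layer_a @ rng.normal(size=(768, 768))  # a linear transform of them
print(linear_cka(layer_a, layer_a))  # identical representations -> 1.0
print(linear_cka(layer_a, layer_b))  # related but not identical -> below 1.0
```

CKA is invariant to rotations and isotropic scaling of the feature space, which is why it is popular for comparing layers within and across models.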
Our analysis unveils that\n1) the capacity to represent content information is somewhat unrelated to\nenhanced speaker representation, 2) specific layers of speech SSL models are\npartly specialized in capturing linguistic information, and 3) speaker SSL\nmodels tend to disregard linguistic information but exhibit more sophisticated\nspeaker representation.\n","authors":["Takanori Ashihara","Marc Delcroix","Takafumi Moriya","Kohei Matsuura","Taichi Asami","Yusuke Ijima"],"pdf_url":"https://arxiv.org/pdf/2401.17632v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2311.15623v2","updated":"2024-01-31T06:58:51Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocesses lack transparency. This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nnecessitate annotations or additional training data. The injection of the\nextracted knowledge necessitates the addition of only simple neural modules. We\nemploy the Convex Polytopic Model (CPM) as a feature extraction tool for DST\ntasks and illustrate that the acquired features correlate with the syntactic\nand semantic patterns in the dialogues. This correlation facilitates a\ncomprehensive understanding of the linguistic features influencing the DST\nmodel's decision-making process. We benchmark this framework on various DST\ntasks and observe a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2401.17623v1","updated":"2024-01-31T06:49:36Z","published":"2024-01-31T06:49:36Z","title":"Neighboring Perturbations of Knowledge Editing on Large Language Models","summary":" Despite their exceptional capabilities, large language models (LLMs) are\nprone to generating unintended text due to false or outdated knowledge. Given\nthe resource-intensive nature of retraining LLMs, there has been a notable\nincrease in the development of knowledge editing. However, current approaches\nand evaluations rarely explore the perturbation of editing on neighboring\nknowledge. This paper studies whether updating LLMs with new knowledge perturbs\nthe neighboring knowledge encapsulated within them. Specifically, we seek to\ndetermine whether appending a new answer to the answer list of a factual\nquestion leads to catastrophic forgetting of original correct answers in this\nlist, as well as unintentional inclusion of incorrect answers. A metric of\nadditivity is introduced and a benchmark dubbed Perturbation Evaluation of\nAppending Knowledge (PEAK) is constructed to evaluate the degree of\nperturbation to neighboring knowledge when appending new knowledge. In addition,\na plug-and-play framework termed Appending via Preservation and Prevention (APP)\nis proposed to mitigate the neighboring perturbation by maintaining the\nintegrity of the answer list. 
Experiments demonstrate the effectiveness of APP\ncoupling with four editing methods on three LLMs.\n","authors":["Jun-Yu Ma","Jia-Chen Gu","Ningyu Zhang","Zhen-Hua Ling"],"pdf_url":"https://arxiv.org/pdf/2401.17623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12292v2","updated":"2024-01-31T06:44:42Z","published":"2024-01-22T19:00:08Z","title":"GRATH: Gradual Self-Truthifying for Large Language Models","summary":" Truthfulness is paramount for large language models (LLMs) as they are\nincreasingly deployed in real-world applications. However, existing LLMs still\nstruggle with generating truthful content, as evidenced by their modest\nperformance on benchmarks like TruthfulQA. To address this issue, we propose\nGRAdual self-truTHifying (GRATH), a novel post-processing method to enhance\ntruthfulness of LLMs. GRATH utilizes out-of-domain question prompts to generate\npairwise truthfulness training data with each pair containing a question and\nits correct and incorrect answers, and then optimizes the model via direct\npreference optimization (DPO) to learn from the truthfulness difference between\nanswer pairs. GRATH iteratively refines truthfulness data and updates the\nmodel, leading to a gradual improvement in model truthfulness in a\nself-supervised manner. Empirically, we evaluate GRATH using different 7B-LLMs\nand compare with LLMs with similar or even larger sizes on benchmark datasets.\nOur results show that GRATH effectively improves LLMs' truthfulness without\ncompromising other core capabilities. Notably, GRATH achieves state-of-the-art\nperformance on TruthfulQA, with MC1 accuracy of 54.71% and MC2 accuracy of\n69.10%, which even surpass those on 70B-LLMs.\n","authors":["Weixin Chen","Dawn Song","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2401.12292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07518v2","updated":"2024-01-31T06:20:32Z","published":"2024-01-15T07:48:42Z","title":"Survey of Natural Language Processing for Education: Taxonomy,\n Systematic Review, and Future Trends","summary":" Natural Language Processing (NLP) aims to analyze the text via techniques in\nthe computer science field. It serves the applications in healthcare, commerce,\nand education domains. Particularly, NLP has been applied to the education\ndomain to help teaching and learning. In this survey, we review recent advances\nin NLP with a focus on solving problems related to the education domain. In\ndetail, we begin with introducing the relevant background. Then, we present the\ntaxonomy of NLP in the education domain. Next, we illustrate the task\ndefinition, challenges, and corresponding techniques based on the above\ntaxonomy. After that, we showcase some off-the-shelf demonstrations in this\ndomain and conclude with future directions.\n","authors":["Yunshi Lan","Xinyuan Li","Hanyue Du","Xuesong Lu","Ming Gao","Weining Qian","Aoying Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.07518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17602v1","updated":"2024-01-31T05:11:00Z","published":"2024-01-31T05:11:00Z","title":"Assertion Detection Large Language Model In-context Learning LoRA\n Fine-tuning","summary":" In this study, we aim to address the task of assertion detection when\nextracting medical concepts from clinical notes, a key process in clinical\nnatural language processing (NLP). 
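GRATH's optimisation step above is standard DPO on truthful/untruthful answer pairs. A sketch of the pairwise loss over per-sequence log-probabilities (the tensor values are toy stand-ins for policy and frozen-reference scores):

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected,
             ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Direct Preference Optimization on truthful (chosen) vs untruthful
    (rejected) answers; logp_* are summed token log-probs per sequence."""
    chosen_reward = beta * (logp_chosen - ref_logp_chosen)
    rejected_reward = beta * (logp_rejected - ref_logp_rejected)
    # Maximise the margin between chosen and rejected implicit rewards.
    return -F.logsigmoid(chosen_reward - rejected_reward).mean()

lp_c, lp_r = torch.tensor([-12.0]), torch.tensor([-11.0])
ref_c, ref_r = torch.tensor([-12.5]), torch.tensor([-10.5])
print(dpo_loss(lp_c, lp_r, ref_c, ref_r))  # shrinks as the margin grows
```

Iterating this (regenerate pairs with the updated model, optimise again) is what makes GRATH "gradual" and self-supervised.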
Assertion detection in clinical NLP usually\ninvolves identifying assertion types for medical concepts in the clinical text,\nnamely certainty (whether the medical concept is positive, negated, possible,\nor hypothetical), temporality (whether the medical concept refers to the present\nor to past history), and experiencer (whether the medical concept is described\nfor the patient or a family member). These assertion types are essential for\nhealthcare professionals to quickly and clearly understand the context of\nmedical conditions from unstructured clinical texts, directly influencing the\nquality and outcomes of patient care. Although widely used, traditional\nmethods, particularly rule-based NLP systems and machine learning or deep\nlearning models, demand intensive manual efforts to create patterns and tend to\noverlook less common assertion types, leading to an incomplete understanding of\nthe context. To address this challenge, our research introduces a novel\nmethodology that utilizes Large Language Models (LLMs) pre-trained on a vast\narray of medical data for assertion detection. We enhanced the current method\nwith advanced reasoning techniques, including Tree of Thought (ToT), Chain of\nThought (CoT), and Self-Consistency (SC), and refined it further with Low-Rank\nAdaptation (LoRA) fine-tuning. We first evaluated the model on the i2b2 2010\nassertion dataset. Our method achieved a micro-averaged F-1 of 0.89, an\nimprovement of 0.11 over previous work. To further assess the generalizability of\nour approach, we extended our evaluation to a local dataset that focused on\nsleep concept extraction. Our approach achieved an F-1 of 0.74, which is 0.31\nhigher than the previous method.\n","authors":["Yuelyu Ji","Zeshui Yu","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00162v3","updated":"2024-01-31T05:00:25Z","published":"2023-06-30T22:36:41Z","title":"What Do Self-Supervised Speech Models Know About Words?","summary":" Many self-supervised speech models (S3Ms) have been introduced over the last\nfew years, improving performance and data efficiency on various speech tasks.\nHowever, these empirical successes alone do not give a complete picture of what\nis learned during pre-training. Recent work has begun analyzing how S3Ms encode\ncertain properties, such as phonetic and speaker information, but we still lack\na proper understanding of knowledge encoded at the word level and beyond. In\nthis work, we use lightweight analysis methods to study segment-level\nlinguistic properties -- word identity, boundaries, pronunciation, syntactic\nfeatures, and semantic features -- encoded in S3Ms. We present a comparative\nstudy of layer-wise representations from ten S3Ms and find that (i) the\nframe-level representations within each word segment are not all equally\ninformative, and (ii) the pre-training objective and model size heavily\ninfluence the accessibility and distribution of linguistic information across\nlayers. We also find that on several tasks -- word discrimination, word\nsegmentation, and semantic sentence similarity -- S3Ms trained with visual\ngrounding outperform their speech-only counterparts. 
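The LoRA fine-tuning used in the assertion-detection work above adds a trainable low-rank update to a frozen pretrained weight; a minimal torch sketch of the idea (rank and scaling values are illustrative):

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """y = base(x) + (alpha / r) * x A^T B^T, with base frozen, A and B trainable."""
    def __init__(self, base: nn.Linear, r=8, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False  # freeze the pretrained weight
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no-op at start
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(768, 768))
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # 12288 trainable
```

Only the small A and B matrices are updated, which is why LoRA makes fine-tuning large medical LLMs tractable on modest hardware.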
Finally, our task-based\nanalyses demonstrate improved performance on word segmentation and acoustic\nword discrimination while using simpler methods than prior work.\n","authors":["Ankita Pasad","Chung-Ming Chien","Shane Settle","Karen Livescu"],"pdf_url":"https://arxiv.org/pdf/2307.00162v3.pdf","comment":"Pre-MIT Press publication version"},{"id":"http://arxiv.org/abs/2401.17600v1","updated":"2024-01-31T04:57:12Z","published":"2024-01-31T04:57:12Z","title":"Good at captioning, bad at counting: Benchmarking GPT-4V on Earth\n observation data","summary":" Large Vision-Language Models (VLMs) have demonstrated impressive performance\non complex tasks involving visual input with natural language instructions.\nHowever, it remains unclear to what extent capabilities on natural images\ntransfer to Earth observation (EO) data, which are predominantly satellite and\naerial images less common in VLM training data. In this work, we propose a\ncomprehensive benchmark to gauge the progress of VLMs toward being useful tools\nfor EO data by assessing their abilities on scene understanding, localization\nand counting, and change detection tasks. Motivated by real-world applications,\nour benchmark includes scenarios like urban monitoring, disaster relief, land\nuse, and conservation. We discover that, although state-of-the-art VLMs like\nGPT-4V possess extensive world knowledge that leads to strong performance on\nopen-ended tasks like location understanding and image captioning, their poor\nspatial reasoning limits usefulness on object localization and counting tasks.\nOur benchmark will be made publicly available at https://vleo.danielz.ch/ and\non Hugging Face at\nhttps://huggingface.co/collections/mit-ei/vleo-benchmark-datasets-65b789b0466555489cce0d70\nfor easy model evaluation.\n","authors":["Chenhui Zhang","Sherrie Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17600v1.pdf","comment":"62 pages; work in progress"},{"id":"http://arxiv.org/abs/2401.17597v1","updated":"2024-01-31T04:50:00Z","published":"2024-01-31T04:50:00Z","title":"SPECTRUM: Speaker-Enhanced Pre-Training for Long Dialogue Summarization","summary":" Multi-turn dialogues are characterized by their extended length and the\npresence of turn-taking conversations. Traditional language models often\noverlook the distinct features of these dialogues by treating them as regular\ntext. In this paper, we propose a speaker-enhanced pre-training method for long\ndialogue summarization, which leverages the inherent structure of multiple-turn\ndialogues. To support our study, we curate a diverse dataset that includes\ntranscripts from real-world scenarios, movie or TV show transcripts, and\ndialogues generated by a Large Language Model. We then perform a pre-training,\nwhich encompasses the detection of speaker changes, and masked utterance\ngeneration. Experimental results of fine-tuned models demonstrate that our\nmodel achieves state-of-the-art performance on downstream benchmarks with long\ncontext, surpassing baseline models and highlighting the effectiveness of our\napproach. 
Our findings highlight the importance of curating pre-training\ndatasets that exhibit diversity and variations in length distribution to ensure\neffective alignment with downstream datasets.\n","authors":["Sangwoo Cho","Kaiqiang Song","Chao Zhao","Xiaoyang Wang","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2401.17597v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.17588v1","updated":"2024-01-31T04:19:22Z","published":"2024-01-31T04:19:22Z","title":"Local and Global Contexts for Conversation","summary":" In conversation, context is the dialog history, which is crucial for\nmulti-turn dialogue. Learning from the relevant contexts in dialog history for\ngrounded conversation is a challenging problem. Local context consists of the\nnearest utterances and is most sensitive to the subsequent response, while\nglobal context spans the whole conversation, far beyond neighboring utterances.\nCurrent pretrained transformer models for conversation struggle to capture the\ncorrelation and connection between local and global contexts. We introduce a\nlocal and global conversation model (LGCM) for general-purpose conversation in\nthe open domain. It is a local-global hierarchical transformer model that excels\nat accurately discerning and assimilating the relevant contexts necessary for\ngenerating responses. It employs a local encoder to grasp the local context at\nthe level of individual utterances and a global encoder to understand the broader\ncontext at the dialogue level. The seamless fusion of these locally and globally\ncontextualized encodings ensures a comprehensive comprehension of the\nconversation. Experiments on popular datasets show that LGCM outperforms\nexisting conversation models on automatic metrics by significant margins.\n","authors":["Zuoquan Lin","Xinyi Shen"],"pdf_url":"https://arxiv.org/pdf/2401.17588v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.17585v1","updated":"2024-01-31T04:12:59Z","published":"2024-01-31T04:12:59Z","title":"Propagation and Pitfalls: Reasoning-based Assessment of Knowledge\n Editing through Counterfactual Tasks","summary":" Current approaches to knowledge editing struggle to effectively propagate\nupdates to interconnected facts. In this work, we delve into the barriers that\nhinder the appropriate propagation of updated knowledge within these models for\naccurate reasoning. To support our analysis, we introduce a novel\nreasoning-based benchmark -- ReCoE (Reasoning-based Counterfactual Editing\ndataset) -- which covers six common reasoning schemes in the real world. We\nconduct a thorough analysis of existing knowledge editing techniques, including\ninput augmentation, finetuning, and locate-and-edit. We found that all model\nediting methods show notably low performance on this dataset, especially in\ncertain reasoning schemes. Our analysis over the chain-of-thought generation of\nedited models further uncovers key reasons behind the inadequacy of existing\nknowledge editing methods from a reasoning standpoint, involving fact-wise\nediting, fact recall ability, and coherence in generation. 
We will make our\nbenchmark publicly available.\n","authors":["Wenyue Hua","Jiang Guo","Mingwen Dong","Henghui Zhu","Patrick Ng","Zhiguo Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17585v1.pdf","comment":"22 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2212.10767v3","updated":"2024-01-31T04:10:30Z","published":"2022-12-21T05:01:01Z","title":"How Does Beam Search improve Span-Level Confidence Estimation in\n Generative Sequence Labeling?","summary":" Sequence labeling is a core task in text understanding for IE/IR systems.\nText generation models have increasingly become the go-to solution for such\ntasks (e.g., entity extraction and dialog slot filling). While most research\nhas focused on the labeling accuracy, a key aspect -- of vital practical\nimportance -- has slipped through the cracks: understanding model confidence.\nMore specifically, we lack a principled understanding of how to reliably gauge\nthe confidence of a model in its predictions for each labeled span. This paper\naims to provide some empirical insights on estimating model confidence for\ngenerative sequence labeling. Most notably, we find that simply using the\ndecoder's output probabilities \\textbf{is not} the best in realizing\nwell-calibrated confidence estimates. As verified over six public datasets of\ndifferent tasks, we show that our proposed approach -- which leverages\nstatistics from top-$k$ predictions by a beam search -- significantly reduces\ncalibration errors of the predictions of a generative sequence labeling model.\n","authors":["Kazuma Hashimoto","Iftekhar Naim","Karthik Raman"],"pdf_url":"https://arxiv.org/pdf/2212.10767v3.pdf","comment":"UncertaiNLP 2024 (an EACL 2024 workshop:\n https://uncertainlp.github.io/)"},{"id":"http://arxiv.org/abs/2305.15002v2","updated":"2024-01-31T03:56:22Z","published":"2023-05-24T10:41:24Z","title":"A RelEntLess Benchmark for Modelling Graded Relations between Named\n Entities","summary":" Relations such as \"is influenced by\", \"is known for\" or \"is a competitor of\"\nare inherently graded: we can rank entity pairs based on how well they satisfy\nthese relations, but it is hard to draw a line between those pairs that satisfy\nthem and those that do not. Such graded relations play a central role in many\napplications, yet they are typically not covered by existing Knowledge Graphs.\nIn this paper, we consider the possibility of using Large Language Models\n(LLMs) to fill this gap. To this end, we introduce a new benchmark, in which\nentity pairs have to be ranked according to how much they satisfy a given\ngraded relation. The task is formulated as a few-shot ranking problem, where\nmodels only have access to a description of the relation and five prototypical\ninstances. We use the proposed benchmark to evaluate state-of-the-art relation\nembedding strategies as well as several recent LLMs, covering both publicly\navailable LLMs and closed models such as GPT-4. Overall, we find a strong\ncorrelation between model size and performance, with smaller Language Models\nstruggling to outperform a naive baseline. 
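The beam-search confidence idea above can be made concrete: rather than trusting the decoder's probability for the single best output, aggregate over the top-k beams how much probability mass supports each labeled span. The aggregation rule below is a simplified stand-in for the paper's statistics:

```python
import math
from collections import defaultdict

def span_confidence(beams):
    """beams: list of (set_of_predicted_spans, sequence_log_prob).
    Confidence of a span = normalised probability mass of beams containing it."""
    weights = [math.exp(lp) for _, lp in beams]
    total = sum(weights)
    conf = defaultdict(float)
    for (spans, _), w in zip(beams, weights):
        for span in spans:
            conf[span] += w / total
    return dict(conf)

# Toy top-3 beams from a generative sequence labeler.
beams = [({("Paris", "LOC")}, -0.2),
         ({("Paris", "LOC"), ("Curie", "PER")}, -1.1),
         ({("Curie", "LOC")}, -2.3)]
print(span_confidence(beams))  # spans agreed on by heavy beams score highest
```

Spans that survive across many high-probability beams earn high confidence, which is the intuition behind the reported calibration gains.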
The results of the largest Flan-T5\nand OPT models are remarkably strong, although a clear gap with human\nperformance remains.\n","authors":["Asahi Ushio","Jose Camacho Collados","Steven Schockaert"],"pdf_url":"https://arxiv.org/pdf/2305.15002v2.pdf","comment":"EACL 2024 main conference"},{"id":"http://arxiv.org/abs/2401.11864v3","updated":"2024-01-31T03:50:07Z","published":"2024-01-22T11:37:18Z","title":"Improving Small Language Models' Mathematical Reasoning via\n Equation-of-Thought Distillation","summary":" This work addresses the challenge of democratizing advanced Large Language\nModels (LLMs) by compressing their mathematical reasoning capabilities into\nsub-billion parameter Small Language Models (SLMs) without compromising\nperformance. We introduce Equation-of-Thought Distillation (EoTD), a novel\ntechnique that encapsulates the reasoning process into equation-based\nrepresentations to construct an EoTD dataset for fine-tuning SLMs.\nAdditionally, we propose the Ensemble Thoughts Distillation (ETD) framework to\nenhance the reasoning performance of SLMs. This involves creating a reasoning\ndataset with multiple thought processes, including Chain-of-Thought (CoT),\nProgram-of-Thought (PoT), and Equation-of-Thought (EoT), and using it for\nfine-tuning. Our experimental findings demonstrate that EoTD significantly\nboosts the reasoning abilities of SLMs, while ETD enables these models to\nachieve state-of-the-art reasoning performance.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11864v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08958v2","updated":"2024-01-31T03:42:04Z","published":"2023-09-16T11:22:46Z","title":"Monolingual or Multilingual Instruction Tuning: Which Makes a Better\n Alpaca","summary":" Foundational large language models (LLMs) can be instruction-tuned to perform\nopen-domain question answering, facilitating applications like chat assistants.\nWhile such efforts are often carried out in a single language, we empirically\nanalyze cost-efficient strategies for multilingual scenarios. Our study employs\nthe Alpaca dataset and machine translations of it to form multilingual data,\nwhich is then used to tune LLMs through either low-rank adaptation or\nfull-parameter training. Under a controlled computation budget, comparisons\nshow that multilingual tuning is on par or better than tuning a model for each\nlanguage. Furthermore, multilingual tuning with downsampled data can be as\npowerful and more robust. Our findings serve as a guide for expanding language\nsupport through instruction tuning.\n","authors":["Pinzhen Chen","Shaoxiong Ji","Nikolay Bogoychev","Andrey Kutuzov","Barry Haddow","Kenneth Heafield"],"pdf_url":"https://arxiv.org/pdf/2309.08958v2.pdf","comment":"Accepted to Findings of ACL: EACL 2024. Added human evaluation and\n shortened writing"},{"id":"http://arxiv.org/abs/2401.17574v1","updated":"2024-01-31T03:39:07Z","published":"2024-01-31T03:39:07Z","title":"Scavenging Hyena: Distilling Transformers into Long Convolution Models","summary":" The rapid evolution of Large Language Models (LLMs), epitomized by\narchitectures like GPT-4, has reshaped the landscape of natural language\nprocessing. This paper introduces a pioneering approach to address the\nefficiency concerns associated with LLM pre-training, proposing the use of\nknowledge distillation for cross-architecture transfer. 
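Cross-architecture distillation of the kind proposed here for Hyena students typically minimises a temperature-scaled KL divergence between teacher and student token distributions; a generic sketch (the paper's exact objective may differ):

```python
import torch
import torch.nn.functional as F

def distill_loss(student_logits, teacher_logits, T=2.0):
    """Temperature-scaled KL over the vocabulary; shapes (batch, seq, vocab).
    The teacher is detached so gradients flow only into the student."""
    s = F.log_softmax(student_logits / T, dim=-1)
    t = F.softmax(teacher_logits.detach() / T, dim=-1)
    return F.kl_div(s, t, reduction="batchmean") * T * T

student = torch.randn(2, 5, 100, requires_grad=True)  # Hyena-based student
teacher = torch.randn(2, 5, 100)                      # frozen transformer teacher
distill_loss(student, teacher).backward()             # trains the student only
```

A higher temperature softens both distributions so the student also learns the teacher's ranking over unlikely tokens, not just its argmax.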
Leveraging insights\nfrom the efficient Hyena mechanism, our method replaces attention heads in\ntransformer models by Hyena, offering a cost-effective alternative to\ntraditional pre-training while confronting the challenge of processing long\ncontextual information, inherent in quadratic attention mechanisms. Unlike\nconventional compression-focused methods, our technique not only enhances\ninference speed but also surpasses pre-training in terms of both accuracy and\nefficiency. In the era of evolving LLMs, our work contributes to the pursuit of\nsustainable AI solutions, striking a balance between computational power and\nenvironmental impact.\n","authors":["Tokiniaina Raharison Ralambomihanta","Shahrad Mohammadzadeh","Mohammad Sami Nur Islam","Wassim Jabbour","Laurence Liang"],"pdf_url":"https://arxiv.org/pdf/2401.17574v1.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.08648v3","updated":"2024-01-31T02:36:48Z","published":"2023-09-15T13:15:54Z","title":"MAPLE: Mobile App Prediction Leveraging Large Language Model Embeddings","summary":" In recent years, predicting mobile app usage has become increasingly\nimportant for areas like app recommendation, user behaviour analysis, and\nmobile resource management. Existing models, however, struggle with the\nheterogeneous nature of contextual data and the user cold start problem. This\nstudy introduces a novel prediction model, Mobile App Prediction Leveraging\nLarge Language Model Embeddings (MAPLE), which employs Large Language Models\n(LLMs) and installed app similarity to overcome these challenges. MAPLE\nutilises the power of LLMs to process contextual data and discern intricate\nrelationships within it effectively. Additionally, we explore the use of\ninstalled app similarity to address the cold start problem, facilitating the\nmodelling of user preferences and habits, even for new users with limited\nhistorical data. In essence, our research presents MAPLE as a novel, potent,\nand practical approach to app usage prediction, making significant strides in\nresolving issues faced by existing models. MAPLE stands out as a comprehensive\nand effective solution, setting a new benchmark for more precise and\npersonalised app usage predictions. In tests on two real-world datasets, MAPLE\nsurpasses contemporary models in both standard and cold start scenarios. These\noutcomes validate MAPLE's capacity for precise app usage predictions and its\nresilience against the cold start problem. This enhanced performance stems from\nthe model's proficiency in capturing complex temporal patterns and leveraging\ncontextual information. As a result, MAPLE can potentially improve personalised\nmobile app usage predictions and user experiences markedly.\n","authors":["Yonchanok Khaokaew","Hao Xue","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2309.08648v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14341v2","updated":"2024-01-31T02:32:19Z","published":"2023-05-23T17:59:19Z","title":"APPLS: Evaluating Evaluation Metrics for Plain Language Summarization","summary":" While there has been significant development of models for Plain Language\nSummarization (PLS), evaluation remains a challenge. PLS lacks a dedicated\nassessment metric, and the suitability of text generation evaluation metrics is\nunclear due to the unique transformations involved (e.g., adding background\nexplanations, removing specialized terminology). 
To address these concerns, our\nstudy presents a granular meta-evaluation testbed, APPLS, designed to evaluate\nmetrics for PLS. We define a set of perturbations along four criteria, inspired\nby previous work, that a PLS metric should capture: informativeness,\nsimplification, coherence, and faithfulness. An analysis of metrics using our\ntestbed reveals that current metrics fail to capture simplification\nconsistently. In response, we introduce POMME, a new metric designed to assess\ntext simplification in PLS; the metric is calculated as the normalized\nperplexity difference between an in-domain and an out-of-domain language model.\nWe demonstrate POMME's correlation with fine-grained variations in simplification\nand validate its sensitivity across 4 text simplification datasets. This work\ncontributes the first meta-evaluation testbed for PLS and a comprehensive\nevaluation of existing metrics. The APPLS testbed and POMME are available at\nhttps://github.com/LinguisticAnomalies/APPLS.\n","authors":["Yue Guo","Tal August","Gondy Leroy","Trevor Cohen","Lucy Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2305.14341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17536v1","updated":"2024-01-31T01:37:33Z","published":"2024-01-31T01:37:33Z","title":"PipeNet: Question Answering with Semantic Pruning over Knowledge Graphs","summary":" It is well acknowledged that incorporating explicit knowledge graphs (KGs)\ncan benefit question answering. Existing approaches typically follow a\ngrounding-reasoning pipeline in which entity nodes are first grounded for the\nquery (question and candidate answers), and then a reasoning module reasons\nover the matched multi-hop subgraph for answer prediction. Although the\npipeline largely alleviates the issue of extracting essential information from\ngiant KGs, efficiency is still an open challenge when scaling up hops in\ngrounding the subgraphs. In this paper, we aim to find semantically related\nentity nodes in the subgraph to improve the efficiency of graph reasoning with\nKGs. We propose a grounding-pruning-reasoning pipeline to prune noisy nodes,\nremarkably reducing the computation cost and memory usage while also obtaining\ndecent subgraph representations. In detail, the pruning module first scores\nconcept nodes based on the dependency distance between matched spans and then\nprunes the nodes according to score ranks. To facilitate the evaluation of\npruned subgraphs, we also propose a graph attention network (GAT) based module\nto reason with the subgraph data. Experimental results on CommonsenseQA and\nOpenBookQA demonstrate the effectiveness of our method.\n","authors":["Ying Su","Jipeng Zhang","Yangqiu Song","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17536v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11143v3","updated":"2024-01-31T01:22:43Z","published":"2024-01-20T06:42:32Z","title":"Gaussian Adaptive Attention is All You Need: Robust Contextual\n Representations Across Multiple Modalities","summary":" We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a\nnovel probabilistic attention framework, and the Gaussian Adaptive Transformer\n(GAT), designed to enhance information aggregation across multiple modalities,\nincluding Speech, Text and Vision. GAAM integrates learnable mean and variance\ninto its attention mechanism, implemented in a Multi-Headed framework enabling\nit to collectively model any Probability Distribution for dynamic recalibration\nof feature significance. 
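POMME is described above as a normalized perplexity difference between an in-domain and an out-of-domain LM. A sketch of that arithmetic, with GPT-2 standing in for both models and the normalisation by the sum being our assumption rather than the paper's published formula:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def perplexity(text, model):
    # exp of the mean token negative log-likelihood under the model.
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = model(ids, labels=ids).loss
    return loss.exp().item()

def pomme(text, in_domain_lm, out_domain_lm):
    p_in = perplexity(text, in_domain_lm)
    p_out = perplexity(text, out_domain_lm)
    return (p_in - p_out) / (p_in + p_out)  # assumed normalisation

# With one model playing both roles the score is trivially 0.0.
print(pomme("The patient was given medicine to thin the blood.", lm, lm))
```

Intuitively, plainer text should look less surprising to a general-domain LM than to a technical in-domain LM, so the signed gap tracks simplification.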
This method demonstrates significant improvements,\nespecially with highly non-stationary data, surpassing the state-of-the-art\nattention techniques in model performance (up to approximately +20% in\naccuracy) by identifying key elements within the feature space. GAAM's\ncompatibility with dot-product-based attention models and relatively low number\nof parameters showcases its adaptability and potential to boost existing\nattention frameworks. Empirically, GAAM exhibits superior adaptability and\nefficacy across a diverse range of tasks, including emotion recognition in\nspeech, image classification, and text classification, thereby establishing its\nrobustness and versatility in handling multi-modal data. Furthermore, we\nintroduce the Importance Factor (IF), a new learning-based metric that enhances\nthe explainability of models trained with GAAM-based methods. Overall, GAAM\nrepresents an advancement towards development of better performing and more\nexplainable attention models across multiple modalities.\n","authors":["Georgios Ioannides","Aman Chadha","Aaron Elkins"],"pdf_url":"https://arxiv.org/pdf/2401.11143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17514v1","updated":"2024-01-31T00:15:34Z","published":"2024-01-31T00:15:34Z","title":"FEUDA: Frustratingly Easy Prompt Based Unsupervised Domain Adaptation","summary":" A major thread of unsupervised domain adaptation (UDA) methods uses unlabeled\ndata from both source and target domains to learn domain-invariant\nrepresentations for adaptation. However, these methods showcase certain\nlimitations, encouraging the use of self-supervised learning through continued\npre-training. The necessity of continued pre-training or learning\ndomain-invariant representations is still unclear in the prompt-based\nclassification framework, where an input example is modified by a template and\nthen fed into a language model (LM) to generate a label string. To examine this\nnew paradigm of UDA in the prompt-based setup, we propose a frustratingly easy\nUDA method (FEUDA) that trains an autoregressive LM on both unlabeled and\nlabeled examples using two different instruction-tuning tasks. Specifically,\nthe first task trains the LM on unlabeled texts from both domains via masked\nlanguage modeling (MLM), and the other uses supervised instruction-tuning on\nsource-labeled data for classification. We conduct extensive experiments on 24\nreal-world domain pairs to show the effectiveness of our method over strong\ndomain-invariant learning methods. Our analysis sheds light on why masked\nlanguage modeling improves target-domain classification performance in\nprompt-based UDA. We discover that MLM helps the model learn both semantic and\nbackground knowledge of a domain, which are both beneficial for downstream\nclassification.\n","authors":["Rheeya Uppaal","Yixuan Li","Junjie Hu"],"pdf_url":"https://arxiv.org/pdf/2401.17514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17511v1","updated":"2024-01-31T00:08:44Z","published":"2024-01-31T00:08:44Z","title":"Linguistically Communicating Uncertainty in Patient-Facing Risk\n Prediction Models","summary":" This paper addresses the unique challenges associated with uncertainty\nquantification in AI models when applied to patient-facing contexts within\nhealthcare. 
Unlike traditional eXplainable Artificial Intelligence (XAI)\nmethods tailored for model developers or domain experts, additional\nconsiderations are necessary: communicating in natural language, presenting the\ninformation appropriately, and evaluating understandability. We identify the\nchallenges in communicating model performance, confidence, reasoning and unknown\nknowns using natural language in the context of risk prediction. We propose a\ndesign aimed at addressing these challenges, focusing on the specific application\nof in-vitro fertilisation outcome prediction.\n","authors":["Adarsa Sivaprasad","Ehud Reiter"],"pdf_url":"https://arxiv.org/pdf/2401.17511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00235v1","updated":"2024-01-31T23:29:42Z","published":"2024-01-31T23:29:42Z","title":"Exploring the limits of decoder-only models trained on public speech\n recognition corpora","summary":" The emergence of industrial-scale speech recognition (ASR) models such as\nWhisper and USM, trained on 1M hours of weakly labelled data and 12M hours of\naudio-only proprietary data respectively, has led to a stronger need for large\nscale public ASR corpora and competitive open source pipelines. Unlike the said\nmodels, large language models are typically based on Transformer decoders, and\nit remains unclear if decoder-only models trained on public data alone can\ndeliver competitive performance. In this work, we investigate factors such as\nchoice of training datasets and modeling components necessary for obtaining the\nbest performance using public English ASR corpora alone. Our Decoder-Only\nTransformer for ASR (DOTA) model comprehensively outperforms the\nencoder-decoder open source replication of Whisper (OWSM) on nearly all English\nASR benchmarks and outperforms Whisper large-v3 on 7 out of 15 test sets. We\nrelease our codebase and model checkpoints under a permissive license.\n","authors":["Ankit Gupta","George Saon","Brian Kingsbury"],"pdf_url":"https://arxiv.org/pdf/2402.00235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07713v2","updated":"2024-01-31T23:27:26Z","published":"2023-10-11T17:59:05Z","title":"InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining","summary":" Pretraining auto-regressive large language models (LLMs) with retrieval\ndemonstrates better perplexity and factual accuracy by leveraging external\ndatabases. However, the size of existing pretrained retrieval-augmented LLMs is\nstill limited (e.g., Retro has 7.5B parameters), which limits the effectiveness\nof instruction tuning and zero-shot generalization. In this work, we introduce\nRetro 48B, the largest LLM pretrained with retrieval. Specifically, we continue\nto pretrain a 43B GPT model on an additional 100 billion tokens using the Retro\naugmentation method by retrieving from 1.2 trillion tokens. Notably, the\nobtained foundation model, Retro 48B, largely outperforms the counterpart GPT\n43B trained on 1.2T tokens in terms of perplexity with only 2.58% additional\nGPU hours, demonstrating the significant scaling potential of the method. After\ninstruction tuning on Retro, InstructRetro demonstrates significant improvement\nover the instruction-tuned GPT on a wide range of zero-shot tasks.\nSpecifically, the average improvement of InstructRetro is 7% over its GPT\ncounterpart across 8 short-form QA and reading comprehension tasks, 10% over\nGPT across 4 challenging long-form QA tasks, and 16% over GPT across 3\nsummarization tasks. 
Surprisingly, we find that one can ablate the encoder from\nInstructRetro architecture and directly use its decoder backbone, while\nachieving comparable results. Our results highlight the promising direction to\nobtain a better GPT decoder through continued pretraining with retrieval before\ninstruction tuning. Our code and checkpoints are publicly available at:\nhttps://github.com/NVIDIA/Megatron-LM/tree/InstructRetro/tools/retro.\n","authors":["Boxin Wang","Wei Ping","Lawrence McAfee","Peng Xu","Bo Li","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.07713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00234v1","updated":"2024-01-31T23:24:37Z","published":"2024-01-31T23:24:37Z","title":"Are Generative AI systems Capable of Supporting Information Needs of\n Patients?","summary":" Patients managing a complex illness such as cancer face a complex information\nchallenge where they not only must learn about their illness but also how to\nmanage it. Close interaction with healthcare experts (radiologists,\noncologists) can improve patient learning and thereby, their disease outcome.\nHowever, this approach is resource intensive and takes expert time away from\nother critical tasks. Given the recent advancements in Generative AI models\naimed at improving the healthcare system, our work investigates whether and how\ngenerative visual question answering systems can responsibly support patient\ninformation needs in the context of radiology imaging data. We conducted a\nformative need-finding study in which participants discussed chest computed\ntomography (CT) scans and associated radiology reports of a fictitious close\nrelative with a cardiothoracic radiologist. Using thematic analysis of the\nconversation between participants and medical experts, we identified commonly\noccurring themes across interactions, including clarifying medical terminology,\nlocating the problems mentioned in the report in the scanned image,\nunderstanding disease prognosis, discussing the next diagnostic steps, and\ncomparing treatment options. Based on these themes, we evaluated two\nstate-of-the-art generative visual language models against the radiologist's\nresponses. Our results reveal variability in the quality of responses generated\nby the models across various themes. We highlight the importance of\npatient-facing generative AI systems to accommodate a diverse range of\nconversational themes, catering to the real-world informational needs of\npatients.\n","authors":["Shreya Rajagopal","Subhashis Hazarika","Sookyung Kim","Yan-ming Chiou","Jae Ho Sohn","Hari Subramonyam","Shiwali Mohan"],"pdf_url":"https://arxiv.org/pdf/2402.00234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00179v1","updated":"2024-01-31T21:14:01Z","published":"2024-01-31T21:14:01Z","title":"De-identification is not always enough","summary":" For sharing privacy-sensitive data, de-identification is commonly regarded as\nadequate for safeguarding privacy. Synthetic data is also being considered as a\nprivacy-preserving alternative. Recent successes with numerical and tabular\ndata generative models and the breakthroughs in large generative language\nmodels raise the question of whether synthetically generated clinical notes\ncould be a viable alternative to real notes for research purposes. 
In this\nwork, we (i) demonstrated that de-identification of real clinical notes does\nnot protect records against a membership inference attack, (ii) proposed a\nnovel approach to generate synthetic clinical notes using the current\nstate-of-the-art large language models, (iii) evaluated the performance of the\nsynthetically generated notes in a clinical domain task, and (iv) proposed a\nway to mount a membership inference attack where the target model is trained\nwith synthetic data. We observed that when synthetically generated notes\nclosely match the performance of real data, they also exhibit similar privacy\nconcerns to the real data. Whether other approaches to synthetically generated\nclinical notes could offer better trade-offs and become a better alternative to\nsensitive real notes warrants further investigation.\n","authors":["Atiquer Rahman Sarkar","Yao-Shun Chuang","Noman Mohammed","Xiaoqian Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.00179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00160v1","updated":"2024-01-31T20:31:56Z","published":"2024-01-31T20:31:56Z","title":"Multimodal Clinical Pseudo-notes for Emergency Department Prediction\n Tasks using Multiple Embedding Model for EHR (MEME)","summary":" In this work, we introduce Multiple Embedding Model for EHR (MEME), an\napproach that views Electronic Health Records (EHR) as multimodal data. This\napproach incorporates \"pseudo-notes\", textual representations of tabular EHR\nconcepts such as diagnoses and medications, and allows us to effectively employ\nLarge Language Models (LLMs) for EHR representation. This framework also adopts\na multimodal approach, embedding each EHR modality separately. We demonstrate\nthe effectiveness of MEME by applying it to several tasks within the Emergency\nDepartment across multiple hospital systems. Our findings show that MEME\nsurpasses the performance of both single modality embedding methods and\ntraditional machine learning approaches. However, we also observe notable\nlimitations in generalizability across hospital institutions for all tested\nmodels.\n","authors":["Simon A. Lee","Sujay Jain","Alex Chen","Arabdha Biswas","Jennifer Fang","Akos Rudas","Jeffrey N. Chiang"],"pdf_url":"https://arxiv.org/pdf/2402.00160v1.pdf","comment":"ICML Submission. However it is under review until May"},{"id":"http://arxiv.org/abs/2402.00159v1","updated":"2024-01-31T20:29:50Z","published":"2024-01-31T20:29:50Z","title":"Dolma: an Open Corpus of Three Trillion Tokens for Language Model\n Pretraining Research","summary":" Language models have become a critical technology for tackling a wide range\nof natural language processing tasks, yet many details about how the\nbest-performing language models were developed are not reported. In particular,\ninformation about their pretraining corpora is seldom discussed: commercial\nlanguage models rarely provide any information about their data; even open\nmodels rarely release datasets they are trained on, or an exact recipe to\nreproduce them. As a result, it is challenging to conduct certain threads of\nlanguage modeling research, such as understanding how training data impacts\nmodel capabilities and shapes their limitations. To facilitate open research on\nlanguage model pretraining, we release Dolma, a three-trillion-token English\ncorpus, built from a diverse mixture of web content, scientific papers, code,\npublic-domain books, social media, and encyclopedic materials. 
In addition, we\nopen source our data curation toolkit to enable further experimentation and\nreproduction of our work. In this report, we document Dolma, including its\ndesign principles, details about its construction, and a summary of its\ncontents. We interleave this report with analyses and experimental results from\ntraining language models on intermediate states of Dolma to share what we have\nlearned about important data curation practices, including the role of content\nor quality filters, deduplication, and multi-source mixing. Dolma has been used\nto train OLMo, a state-of-the-art, open language model and framework designed\nto build and study the science of language modeling.\n","authors":["Luca Soldaini","Rodney Kinney","Akshita Bhagia","Dustin Schwenk","David Atkinson","Russell Authur","Ben Bogin","Khyathi Chandu","Jennifer Dumas","Yanai Elazar","Valentin Hofmann","Ananya Harsh Jha","Sachin Kumar","Li Lucy","Xinxi Lyu","Nathan Lambert","Ian Magnusson","Jacob Morrison","Niklas Muennighoff","Aakanksha Naik","Crystal Nam","Matthew E. Peters","Abhilasha Ravichander","Kyle Richardson","Zejiang Shen","Emma Strubell","Nishant Subramani","Oyvind Tafjord","Pete Walsh","Luke Zettlemoyer","Noah A. Smith","Hannaneh Hajishirzi","Iz Beltagy","Dirk Groeneveld","Jesse Dodge","Kyle Lo"],"pdf_url":"https://arxiv.org/pdf/2402.00159v1.pdf","comment":"Dataset available at: https://huggingface.co/datasets/allenai/dolma"},{"id":"http://arxiv.org/abs/2402.00157v1","updated":"2024-01-31T20:26:32Z","published":"2024-01-31T20:26:32Z","title":"Large Language Models for Mathematical Reasoning: Progresses and\n Challenges","summary":" Mathematical reasoning serves as a cornerstone for assessing the fundamental\ncognitive capabilities of human intelligence. In recent times, there has been a\nnotable surge in the development of Large Language Models (LLMs) geared towards\nthe automated resolution of mathematical problems. However, the landscape of\nmathematical problem types is vast and varied, with LLM-oriented techniques\nundergoing evaluation across diverse datasets and settings. This diversity\nmakes it challenging to discern the true advancements and obstacles within this\nburgeoning field. This survey endeavors to address four pivotal dimensions: i)\na comprehensive exploration of the various mathematical problems and their\ncorresponding datasets that have been investigated; ii) an examination of the\nspectrum of LLM-oriented techniques that have been proposed for mathematical\nproblem-solving; iii) an overview of factors and concerns affecting LLMs in\nsolving math; and iv) an elucidation of the persisting challenges within this\ndomain. To the best of our knowledge, this survey stands as one of the first\nextensive examinations of the landscape of LLMs in the realm of mathematics,\nproviding a holistic perspective on the current state, accomplishments, and\nfuture challenges in this rapidly evolving field.\n","authors":["Janice Ahn","Rishu Verma","Renze Lou","Di Liu","Rui Zhang","Wenpeng Yin"],"pdf_url":"https://arxiv.org/pdf/2402.00157v1.pdf","comment":"EACL 2024 Student Research Workshop, 8 pages"},{"id":"http://arxiv.org/abs/2402.00149v1","updated":"2024-01-31T20:07:43Z","published":"2024-01-31T20:07:43Z","title":"The Impact of Language Adapters in Cross-Lingual Transfer for NLU","summary":" Modular deep learning has been proposed for the efficient adaption of\npre-trained models to new tasks, domains and languages. 
In particular,\ncombining language adapters with task adapters has shown potential where no\nsupervised data exists for a language. In this paper, we explore the role of\nlanguage adapters in zero-shot cross-lingual transfer for natural language\nunderstanding (NLU) benchmarks. We study the effect of including a\ntarget-language adapter in detailed ablation studies with two multilingual\nmodels and three multilingual datasets. Our results show that the effect of\ntarget-language adapters is highly inconsistent across tasks, languages and\nmodels. Retaining the source-language adapter instead often leads to an\nequivalent, and sometimes to a better, performance. Removing the language\nadapter after training has only a weak negative effect, indicating that the\nlanguage adapters do not have a strong impact on the predictions.\n","authors":["Jenny Kunz","Oskar Holmström"],"pdf_url":"https://arxiv.org/pdf/2402.00149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00143v1","updated":"2024-01-31T19:48:58Z","published":"2024-01-31T19:48:58Z","title":"Making a Long Story Short in Conversation Modeling","summary":" Conversation systems accommodate diverse users with unique personalities and\ndistinct writing styles. Within the domain of multi-turn dialogue modeling,\nthis work studies the impact of varied utterance lengths on the quality of\nsubsequent responses generated by conversation models. Using GPT-3 as the base\nmodel, multiple dialogue datasets, and several metrics, we conduct a thorough\nexploration of this aspect of conversational models. Our analysis sheds light\non the complex relationship between utterance lengths and the quality of\nfollow-up responses generated by dialogue systems. Empirical findings suggest\nthat, for certain types of conversations, utterance lengths can be reduced by\nup to 72% without any noticeable difference in the quality of follow-up\nresponses.\n","authors":["Yufei Tao","Tiernan Mines","Ameeta Agrawal"],"pdf_url":"https://arxiv.org/pdf/2402.00143v1.pdf","comment":"This paper was accepted by TEICAI workshop at EACL 2024"},{"id":"http://arxiv.org/abs/2308.12950v3","updated":"2024-01-31T19:47:26Z","published":"2023-08-24T17:39:13Z","title":"Code Llama: Open Foundation Models for Code","summary":" We release Code Llama, a family of large language models for code based on\nLlama 2 providing state-of-the-art performance among open models, infilling\ncapabilities, support for large input contexts, and zero-shot instruction\nfollowing ability for programming tasks. We provide multiple flavors to cover a\nwide range of applications: foundation models (Code Llama), Python\nspecializations (Code Llama - Python), and instruction-following models (Code\nLlama - Instruct) with 7B, 13B, 34B and 70B parameters each. All models are\ntrained on sequences of 16k tokens and show improvements on inputs with up to\n100k tokens. 7B, 13B and 70B Code Llama and Code Llama - Instruct variants\nsupport infilling based on surrounding content. Code Llama reaches\nstate-of-the-art performance among open models on several code benchmarks, with\nscores of up to 67% and 65% on HumanEval and MBPP, respectively. Notably, Code\nLlama - Python 7B outperforms Llama 2 70B on HumanEval and MBPP, and all our\nmodels outperform every other publicly available model on MultiPL-E. 
We release\nCode Llama under a permissive license that allows for both research and\ncommercial use.\n","authors":["Baptiste Rozière","Jonas Gehring","Fabian Gloeckle","Sten Sootla","Itai Gat","Xiaoqing Ellen Tan","Yossi Adi","Jingyu Liu","Romain Sauvestre","Tal Remez","Jérémy Rapin","Artyom Kozhevnikov","Ivan Evtimov","Joanna Bitton","Manish Bhatt","Cristian Canton Ferrer","Aaron Grattafiori","Wenhan Xiong","Alexandre Défossez","Jade Copet","Faisal Azhar","Hugo Touvron","Louis Martin","Nicolas Usunier","Thomas Scialom","Gabriel Synnaeve"],"pdf_url":"https://arxiv.org/pdf/2308.12950v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05085v2","updated":"2024-01-31T19:17:00Z","published":"2023-11-09T01:04:44Z","title":"Characterizing Large Language Models as Rationalizers of\n Knowledge-intensive Tasks","summary":" Large language models (LLMs) are proficient at generating fluent text with\nminimal task-specific supervision. Yet, their ability to provide well-grounded\nrationalizations for knowledge-intensive tasks remains under-explored. Such\ntasks, like commonsense multiple-choice questions, require rationales based on\nworld knowledge to support predictions and refute alternate options. We\nconsider the task of generating knowledge-guided rationalization in natural\nlanguage by using expert-written examples in a few-shot manner. Surprisingly,\ncrowd-workers preferred knowledge-grounded rationales over crowdsourced\nrationalizations, citing their factuality, sufficiency, and comprehensive\nrefutations. Although LLM-generated rationales were preferable, further\nimprovements in conciseness and novelty are required. In another study, we show\nhow rationalization of incorrect model predictions erodes humans' trust in\nLLM-generated rationales. Motivated by these observations, we create a\ntwo-stage pipeline to review task predictions and eliminate potential incorrect\ndecisions before rationalization, enabling trustworthy rationale generation.\n","authors":["Aditi Mishra","Sajjadur Rahman","Hannah Kim","Kushan Mitra","Estevam Hruschka"],"pdf_url":"https://arxiv.org/pdf/2311.05085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00126v1","updated":"2024-01-31T19:11:58Z","published":"2024-01-31T19:11:58Z","title":"Common Sense Reasoning for Deep Fake Detection","summary":" State-of-the-art approaches rely on image-based features extracted via neural\nnetworks for the binary classification task of deepfake detection. While these\napproaches, trained in a supervised manner, extract likely fake features, they\nmay fall short in representing unnatural `non-physical' semantic facial\nattributes -- blurry hairlines, double eyebrows, rigid eye pupils, or unnatural\nskin shading. However, such facial attributes are generally easily perceived by\nhumans via common sense reasoning. Furthermore, image-based feature extraction\nmethods that provide visual explanations via saliency maps can be hard for\nhumans to interpret. To address these challenges, we propose the use of\ncommon sense reasoning to model deepfake detection, and extend it to the\nDeepfake Detection VQA (DD-VQA) task with the aim of modeling human intuition\nin explaining the reason behind labeling an image as either real or fake. To\nthis end, we introduce a new dataset that provides answers to questions related\nto the authenticity of an image, along with its corresponding explanations. 
We\nalso propose a Vision and Language Transformer-based framework for the DD-VQA\ntask, incorporating text and image aware feature alignment formulations.\nFinally, we evaluate our method on both the performance of deepfake detection\nand the quality of the generated explanations. We hope that this task inspires\nresearchers to explore new avenues for enhancing language-based\ninterpretability and cross-modality applications in the realm of deepfake\ndetection.\n","authors":["Yue Zhang","Ben Colman","Ali Shahriyari","Gaurav Bharaj"],"pdf_url":"https://arxiv.org/pdf/2402.00126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00123v1","updated":"2024-01-31T19:07:37Z","published":"2024-01-31T19:07:37Z","title":"Comparing Template-based and Template-free Language Model Probing","summary":" The differences between cloze-task language model (LM) probing with 1)\nexpert-made templates and 2) naturally-occurring text have often been\noverlooked. Here, we evaluate 16 different LMs on 10 probing English datasets\n-- 4 template-based and 6 template-free -- in general and biomedical domains to\nanswer the following research questions: (RQ1) Do model rankings differ between\nthe two approaches? (RQ2) Do models' absolute scores differ between the two\napproaches? (RQ3) Do the answers to RQ1 and RQ2 differ between general and\ndomain-specific models? Our findings are: 1) Template-free and template-based\napproaches often rank models differently, except for the top domain-specific\nmodels. 2) Scores decrease by up to 42% Acc@1 when comparing parallel\ntemplate-free and template-based prompts. 3) Perplexity is negatively\ncorrelated with accuracy in the template-free approach, but,\ncounter-intuitively, they are positively correlated for template-based probing.\n4) Models tend to predict the same answers frequently across prompts for\ntemplate-based probing, which is less common when employing template-free\ntechniques.\n","authors":["Sagi Shaier","Kevin Bennett","Lawrence E Hunter","Katharina von der Wense"],"pdf_url":"https://arxiv.org/pdf/2402.00123v1.pdf","comment":"Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2310.10688v2","updated":"2024-01-31T19:05:49Z","published":"2023-10-14T17:01:37Z","title":"A decoder-only foundation model for time-series forecasting","summary":" Motivated by recent advances in large language models for Natural Language\nProcessing (NLP), we design a time-series foundation model for forecasting\nwhose out-of-the-box zero-shot performance on a variety of public datasets\ncomes close to the accuracy of state-of-the-art supervised forecasting models\nfor each individual dataset. Our model is based on pretraining a\npatched-decoder style attention model on a large time-series corpus, and can\nwork well across different forecasting history lengths, prediction lengths and\ntemporal granularities.\n","authors":["Abhimanyu Das","Weihao Kong","Rajat Sen","Yichen Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.10688v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.18085v1","updated":"2024-01-31T18:59:59Z","published":"2024-01-31T18:59:59Z","title":"Motion Guidance: Diffusion-Based Image Editing with Differentiable\n Motion Estimators","summary":" Diffusion models are capable of generating impressive images conditioned on\ntext descriptions, and extensions of these models allow users to edit images at\na relatively coarse scale. However, the ability to precisely edit the layout,\nposition, pose, and shape of objects in images with diffusion models is still\ndifficult. To this end, we propose motion guidance, a zero-shot technique that\nallows a user to specify dense, complex motion fields that indicate where each\npixel in an image should move. Motion guidance works by steering the diffusion\nsampling process with the gradients through an off-the-shelf optical flow\nnetwork. Specifically, we design a guidance loss that encourages the sample to\nhave the desired motion, as estimated by a flow network, while also being\nvisually similar to the source image. By simultaneously sampling from a\ndiffusion model and guiding the sample to have low guidance loss, we can obtain\na motion-edited image. We demonstrate that our technique works on complex\nmotions and produces high quality edits of real and generated images.\n","authors":["Daniel Geng","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2401.18085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18084v1","updated":"2024-01-31T18:59:57Z","published":"2024-01-31T18:59:57Z","title":"Binding Touch to Everything: Learning Unified Multimodal Tactile\n Representations","summary":" The ability to associate touch with other modalities has huge implications\nfor humans and computational systems. However, multimodal learning with touch\nremains challenging due to the expensive data collection process and\nnon-standardized sensor outputs. We introduce UniTouch, a unified tactile model\nfor vision-based touch sensors connected to multiple modalities, including\nvision, language, and sound. We achieve this by aligning our UniTouch\nembeddings to pretrained image embeddings already associated with a variety of\nother modalities. We further propose learnable sensor-specific tokens, allowing\nthe model to learn from a set of heterogeneous tactile sensors, all at the same\ntime. UniTouch is capable of conducting various touch sensing tasks in the\nzero-shot setting, from robot grasping prediction to touch image question\nanswering. To the best of our knowledge, UniTouch is the first to demonstrate\nsuch capabilities. 
Project page: https://cfeng16.github.io/UniTouch/\n","authors":["Fengyu Yang","Chao Feng","Ziyang Chen","Hyoungseob Park","Daniel Wang","Yiming Dou","Ziyao Zeng","Xien Chen","Rit Gangopadhyay","Andrew Owens","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2401.18084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18083v1","updated":"2024-01-31T18:59:12Z","published":"2024-01-31T18:59:12Z","title":"Improved Scene Landmark Detection for Camera Localization","summary":" Camera localization methods based on retrieval, local feature matching, and\n3D structure-based pose estimation are accurate but require high storage, are\nslow, and are not privacy-preserving. A method based on scene landmark\ndetection (SLD) was recently proposed to address these limitations. It involves\ntraining a convolutional neural network (CNN) to detect a few predetermined,\nsalient, scene-specific 3D points or landmarks and computing camera pose from\nthe associated 2D-3D correspondences. Although SLD outperformed existing\nlearning-based approaches, it was notably less accurate than 3D structure-based\nmethods. In this paper, we show that the accuracy gap was due to insufficient\nmodel capacity and noisy labels during training. To mitigate the capacity\nissue, we propose to split the landmarks into subgroups and train a separate\nnetwork for each subgroup. To generate better training labels, we propose using\ndense reconstructions to estimate the visibility of scene landmarks. Finally,\nwe present a compact architecture to improve memory efficiency. Accuracy-wise,\nour approach is on par with state-of-the-art structure-based methods on the\nINDOOR-6 dataset but runs significantly faster and uses less storage. Code and\nmodels can be found at https://github.com/microsoft/SceneLandmarkLocalization.\n","authors":["Tien Do","Sudipta N. Sinha"],"pdf_url":"https://arxiv.org/pdf/2401.18083v1.pdf","comment":"To be presented at 3DV 2024"},{"id":"http://arxiv.org/abs/2401.18075v1","updated":"2024-01-31T18:56:09Z","published":"2024-01-31T18:56:09Z","title":"CARFF: Conditional Auto-encoded Radiance Field for 3D Scene Forecasting","summary":" We propose CARFF: Conditional Auto-encoded Radiance Field for 3D Scene\nForecasting, a method for predicting future 3D scenes given past observations,\nsuch as 2D ego-centric images. Our method maps an image to a distribution over\nplausible 3D latent scene configurations using a probabilistic encoder, and\npredicts the evolution of the hypothesized scenes through time. Our latent\nscene representation conditions a global Neural Radiance Field (NeRF) to\nrepresent a 3D scene model, which enables explainable predictions and\nstraightforward downstream applications. This approach extends beyond previous\nneural rendering work by considering complex scenarios of uncertainty in\nenvironmental states and dynamics. We employ a two-stage training of\nPose-Conditional-VAE and NeRF to learn 3D representations. Additionally, we\nauto-regressively predict latent scene representations as a partially\nobservable Markov decision process, utilizing a mixture density network. 
We\ndemonstrate the utility of our method in realistic scenarios using the CARLA\ndriving simulator, where CARFF can be used to enable efficient trajectory and\ncontingency planning in complex multi-agent autonomous driving scenarios\ninvolving visual occlusions.\n","authors":["Jiezhi Yang","Khushi Desai","Charles Packer","Harshil Bhatia","Nicholas Rhinehart","Rowan McAllister","Joseph Gonzalez"],"pdf_url":"https://arxiv.org/pdf/2401.18075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16468v2","updated":"2024-01-31T18:54:15Z","published":"2024-01-29T18:53:33Z","title":"High-Quality Image Restoration Following Human Instructions","summary":" Image restoration is a fundamental problem that involves recovering a\nhigh-quality clean image from its degraded observation. All-In-One image\nrestoration models can effectively restore images from various types and levels\nof degradation using degradation-specific information as prompts to guide the\nrestoration model. In this work, we present the first approach that uses\nhuman-written instructions to guide the image restoration model. Given natural\nlanguage prompts, our model can recover high-quality images from their degraded\ncounterparts, considering multiple degradation types. Our method, InstructIR,\nachieves state-of-the-art results on several restoration tasks including image\ndenoising, deraining, deblurring, dehazing, and (low-light) image enhancement.\nInstructIR improves +1dB over previous all-in-one restoration methods.\nMoreover, our dataset and results represent a novel benchmark for new research\non text-guided image restoration and enhancement. Our code, datasets and models\nare available at: https://github.com/mv-lab/InstructIR\n","authors":["Marcos V. Conde","Gregor Geigle","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2401.16468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06712v2","updated":"2024-01-31T18:44:22Z","published":"2023-12-10T22:07:42Z","title":"Separate-and-Enhance: Compositional Finetuning for Text2Image Diffusion\n Models","summary":" Despite recent significant strides achieved by diffusion-based Text-to-Image\n(T2I) models, current systems are still less capable of ensuring decent\ncompositional generation aligned with text prompts, particularly for the\nmulti-object generation. This work illuminates the fundamental reasons for such\nmisalignment, pinpointing issues related to low attention activation scores and\nmask overlaps. While previous research efforts have individually tackled these\nissues, we assert that a holistic approach is paramount. Thus, we propose two\nnovel objectives, the Separate loss and the Enhance loss, that reduce object\nmask overlaps and maximize attention scores, respectively. Our method diverges\nfrom conventional test-time-adaptation techniques, focusing on finetuning\ncritical parameters, which enhances scalability and generalizability.\nComprehensive evaluations demonstrate the superior performance of our model in\nterms of image realism, text-image alignment, and adaptability, notably\noutperforming prominent baselines. 
Ultimately, this research paves the way for\nT2I diffusion models with enhanced compositional capacities and broader\napplicability.\n","authors":["Zhipeng Bao","Yijun Li","Krishna Kumar Singh","Yu-Xiong Wang","Martial Hebert"],"pdf_url":"https://arxiv.org/pdf/2312.06712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18054v1","updated":"2024-01-31T18:20:42Z","published":"2024-01-31T18:20:42Z","title":"Benchmarking Sensitivity of Continual Graph Learning for Skeleton-Based\n Action Recognition","summary":" Continual learning (CL) is the research field that aims to build machine\nlearning models that can accumulate knowledge continuously over different tasks\nwithout retraining from scratch. Previous studies have shown that pre-training\ngraph neural networks (GNN) may lead to negative transfer (Hu et al., 2020)\nafter fine-tuning, a setting which is closely related to CL. Thus, we focus on\nstudying GNN in the continual graph learning (CGL) setting. We propose the\nfirst continual graph learning benchmark for spatio-temporal graphs and use it\nto benchmark well-known CGL methods in this novel setting. The benchmark is\nbased on the N-UCLA and NTU-RGB+D datasets for skeleton-based action\nrecognition. Beyond benchmarking for standard performance metrics, we study the\nclass and task-order sensitivity of CGL methods, i.e., the impact of learning\norder on each class/task's performance, and the architectural sensitivity of\nCGL methods with backbone GNN at various widths and depths. We reveal that\ntask-order robust methods can still be class-order sensitive and observe\nresults that contradict previous empirical observations on architectural\nsensitivity in CL.\n","authors":["Wei Wei","Tom De Schepper","Kevin Mets"],"pdf_url":"https://arxiv.org/pdf/2401.18054v1.pdf","comment":"This work is accepted at VISAPP 2024 as a short paper"},{"id":"http://arxiv.org/abs/2401.06550v2","updated":"2024-01-31T18:13:53Z","published":"2024-01-12T12:54:30Z","title":"Multimodal Urban Areas of Interest Generation via Remote Sensing Imagery\n and Geographical Prior","summary":" Urban area-of-interest (AOI) refers to an integrated urban functional zone\nwith defined boundaries. The rapid development of urban commerce has resulted\nin an increased demand for more precise requirements in defining AOIs. However,\nexisting research primarily concentrates on broad AOI mining for urban planning\nor regional economic analysis, failing to cater to the precise requirements of\nmobile Internet online-to-offline businesses. These businesses necessitate\naccuracy down to a specific community, school, or hospital. In this paper, we\npropose an end-to-end multimodal deep learning algorithm for detecting AOI\nfence polygon using remote sensing images and multi-semantics reference\ninformation. We then evaluate its timeliness through a cascaded module that\nincorporates dynamic human mobility and logistics address information.\nSpecifically, we begin by selecting a point-of-interest (POI) of specific\ncategory, and use it to recall corresponding remote sensing images, nearby\nPOIs, road nodes, human mobility, and logistics addresses to build a multimodal\ndetection model based on transformer encoder-decoder architecture, titled\nAOITR. 
In the model, in addition to the remote sensing images, multi-semantic\ninformation including core POI and road nodes is embedded and reorganized as\nthe query content part for the transformer decoder to generate the AOI polygon.\nMeanwhile, relatively dynamic distribution features of human mobility, nearby\nPOIs, and logistics addresses are used for AOI reliability evaluation through a\ncascaded feedforward network. The experimental results demonstrate that our\nalgorithm significantly outperforms two existing methods.\n","authors":["Chuanji Shi","Yingying Zhang","Jiaotuan Wang","Xin Guo","Qiqi Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.06550v2.pdf","comment":"9 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.18032v1","updated":"2024-01-31T17:54:43Z","published":"2024-01-31T17:54:43Z","title":"DROP: Decouple Re-Identification and Human Parsing with Task-specific\n Features for Occluded Person Re-identification","summary":" The paper introduces the Decouple Re-identificatiOn and human Parsing (DROP)\nmethod for occluded person re-identification (ReID). Unlike mainstream\napproaches using global features for simultaneous multi-task learning of ReID\nand human parsing, or relying on semantic information for attention guidance,\nDROP argues that the inferior performance of the former is due to distinct\ngranularity requirements for ReID and human parsing features. ReID focuses on\ninstance part-level differences between pedestrian parts, while human parsing\ncenters on semantic spatial context, reflecting the internal structure of the\nhuman body. To address this, DROP decouples features for ReID and human\nparsing, proposing detail-preserving upsampling to combine varying resolution\nfeature maps. Parsing-specific features for human parsing are decoupled, and\nhuman position information is exclusively added to the human parsing branch. In\nthe ReID branch, a part-aware compactness loss is introduced to enhance\ninstance-level part differences. Experimental results highlight the efficacy of\nDROP, especially achieving a Rank-1 accuracy of 76.8% on Occluded-Duke,\nsurpassing two mainstream methods. The codebase is accessible at\nhttps://github.com/shuguang-52/DROP.\n","authors":["Shuguang Dou","Xiangyang Jiang","Yuanpeng Tu","Junyao Gao","Zefan Qu","Qingsong Zhao","Cairong Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.18032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13530v2","updated":"2024-01-31T17:29:26Z","published":"2023-01-31T10:24:50Z","title":"Domain-Generalizable Multiple-Domain Clustering","summary":" This work generalizes the problem of unsupervised domain generalization to\nthe case in which no labeled samples are available (completely unsupervised).\nWe are given unlabeled samples from multiple source domains, and we aim to\nlearn a shared predictor that assigns examples to semantically related\nclusters. Evaluation is done by predicting cluster assignments in previously\nunseen domains. Towards this goal, we propose a two-stage training framework:\n(1) self-supervised pre-training for extracting domain invariant semantic\nfeatures. (2) multi-head cluster prediction with pseudo labels, which rely on\nboth the feature space and cluster head prediction, further leveraging a novel\nprediction-based label smoothing scheme. We demonstrate empirically that our\nmodel is more accurate than baselines that require fine-tuning using samples\nfrom the target domain or some level of supervision. 
Our code is available at\nhttps://github.com/AmitRozner/domain-generalizable-multiple-domain-clustering.\n","authors":["Amit Rozner","Barak Battash","Lior Wolf","Ofir Lindenbaum"],"pdf_url":"https://arxiv.org/pdf/2301.13530v2.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.17992v1","updated":"2024-01-31T16:52:19Z","published":"2024-01-31T16:52:19Z","title":"Multilinear Operator Networks","summary":" Despite the remarkable capabilities of deep neural networks in image\nrecognition, the dependence on activation functions remains a largely\nunexplored area and has yet to be eliminated. On the other hand, Polynomial\nNetworks are a class of models that do not require activation functions, but\nhave yet to perform on par with modern architectures. In this work, we aim to\nclose this gap and propose MONet, which relies solely on multilinear operators.\nThe core layer of MONet, called Mu-Layer, captures multiplicative interactions\nof the elements of the input token. MONet captures high-degree interactions of\nthe input elements and we demonstrate the efficacy of our approach on a series\nof image recognition and scientific computing benchmarks. The proposed model\noutperforms prior polynomial networks and performs on par with modern\narchitectures. We believe that MONet can inspire further research on models\nthat use entirely multilinear operations.\n","authors":["Yixin Cheng","Grigorios G. Chrysos","Markos Georgopoulos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2401.17992v1.pdf","comment":"International Conference on Learning Representations Poster(2024)"},{"id":"http://arxiv.org/abs/2302.06432v3","updated":"2024-01-31T16:50:58Z","published":"2023-02-13T15:12:11Z","title":"A Deep Learning-based Global and Segmentation-based Semantic Feature\n Fusion Approach for Indoor Scene Classification","summary":" This work proposes a novel approach that uses a semantic segmentation mask to\nobtain a 2D spatial layout of the segmentation-categories across the scene,\ndesignated by segmentation-based semantic features (SSFs). These features\nrepresent, per segmentation-category, the pixel count, as well as the 2D\naverage position and respective standard deviation values. Moreover, a\ntwo-branch network, GS2F2App, that exploits CNN-based global features extracted\nfrom RGB images and the segmentation-based features extracted from the proposed\nSSFs, is also proposed. GS2F2App was evaluated in two indoor scene benchmark\ndatasets: the SUN RGB-D and the NYU Depth V2, achieving state-of-the-art\nresults on both datasets.\n","authors":["Ricardo Pereira","Tiago Barros","Luis Garrote","Ana Lopes","Urbano J. Nunes"],"pdf_url":"https://arxiv.org/pdf/2302.06432v3.pdf","comment":"Published at Pattern Recognition Letters 2024 (DOI:\n 10.1016/j.patrec.2024.01.022)"},{"id":"http://arxiv.org/abs/2401.17985v1","updated":"2024-01-31T16:44:20Z","published":"2024-01-31T16:44:20Z","title":"Shrub of a thousand faces: an individual segmentation from satellite\n images using deep learning","summary":" Monitoring the distribution and size structure of long-living shrubs, such as\nJuniperus communis, can be used to estimate the long-term effects of climate\nchange on high-mountain and high-latitude ecosystems. Historical aerial\nvery-high-resolution imagery offers a retrospective tool to monitor shrub\ngrowth and distribution at high precision. Currently, deep learning models\nprovide impressive results for detecting and delineating the contour of objects\nwith defined shapes. 
However, adapting these models to detect natural objects\nthat express complex growth patterns, such as junipers, is still a challenging\ntask.\n This research presents a novel approach that leverages remotely sensed RGB\nimagery in conjunction with Mask R-CNN-based instance segmentation models to\nindividually delineate Juniperus shrubs above the treeline in Sierra Nevada\n(Spain). In this study, we propose a new data construction design that consists\nof using photo-interpreted (PI) and field work (FW) data to respectively\ndevelop and externally validate the model. We also propose a new shrub-tailored\nevaluation algorithm based on a new metric called Multiple Intersections over\nGround Truth Area (MIoGTA) to assess and optimize the model's shrub delineation\nperformance. Finally, we deploy the developed model for the first time to\ngenerate a wall-to-wall map of Juniperus individuals.\n The experimental results demonstrate the efficiency of our dual data\nconstruction approach in overcoming the limitations associated with traditional\nfield survey methods. They also highlight the robustness of the MIoGTA metric\nin evaluating instance segmentation models on species with complex growth\npatterns, showing more resilience against data annotation uncertainty.\nFurthermore, they show the effectiveness of employing Mask R-CNN with a\nResNet101-C4 backbone in delineating PI and FW shrubs, achieving F1-scores of\n87.87% and 76.86%, respectively.\n","authors":["Rohaifa Khaldi","Siham Tabik","Sergio Puertas-Ruiz","Julio Peñas de Giles","José Antonio Hódar Correa","Regino Zamora","Domingo Alcaraz Segura"],"pdf_url":"https://arxiv.org/pdf/2401.17985v1.pdf","comment":"39 pages, 20 figures"},{"id":"http://arxiv.org/abs/2401.17981v1","updated":"2024-01-31T16:38:32Z","published":"2024-01-31T16:38:32Z","title":"Enhancing Multimodal Large Language Models with Vision Detection Models:\n An Empirical Study","summary":" Despite the impressive capabilities of Multimodal Large Language Models\n(MLLMs) in integrating text and image modalities, challenges remain in\naccurately interpreting detailed visual elements. This paper presents an\nempirical study on enhancing MLLMs with state-of-the-art (SOTA) object\ndetection and Optical Character Recognition models to improve fine-grained\nimage understanding and reduce hallucination in responses. Our research\ninvestigates the embedding-based infusion of detection information, the impact\nof such infusion on the MLLMs' original abilities, and the interchangeability\nof detection models. We conduct systematic experiments with models such as\nLLaVA-1.5, DINO, and PaddleOCRv2, revealing that our approach not only refines\nMLLMs' performance in specific visual tasks but also maintains their original\nstrengths. The resulting enhanced MLLMs outperform SOTA models on 9 out of 10\nbenchmarks, achieving an improvement of up to 12.99% on the normalized average\nscore, marking a notable advancement in multimodal understanding. 
We release\nour code to facilitate further exploration into the fine-grained multimodal\ndialogue capabilities of MLLMs.\n","authors":["Qirui Jiao","Daoyuan Chen","Yilun Huang","Yaliang Li","Ying Shen"],"pdf_url":"https://arxiv.org/pdf/2401.17981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17972v1","updated":"2024-01-31T16:27:47Z","published":"2024-01-31T16:27:47Z","title":"MelNet: A Real-Time Deep Learning Algorithm for Object Detection","summary":" In this study, a novel deep learning algorithm for object detection, named\nMelNet, was introduced. MelNet underwent training utilizing the KITTI dataset\nfor object detection. Following 300 training epochs, MelNet attained an mAP\n(mean average precision) score of 0.732. Additionally, three alternative models\n- YOLOv5, EfficientDet, and Faster-RCNN-MobileNetv3 - were trained on the KITTI\ndataset and juxtaposed with MelNet for object detection.\n The outcomes underscore the efficacy of employing transfer learning in\ncertain instances. Notably, preexisting models trained on prominent datasets\n(e.g., ImageNet, COCO, and Pascal VOC) yield superior results. Another finding\nunderscores the viability of creating a new model tailored to a specific\nscenario and training it on a specific dataset. This investigation demonstrates\nthat MelNet, trained exclusively on the KITTI dataset, also surpasses\nEfficientDet after 150 epochs. Consequently, post-training, MelNet's\nperformance closely aligns with that of other pre-trained models.\n","authors":["Yashar Azadvatan","Murat Kurt"],"pdf_url":"https://arxiv.org/pdf/2401.17972v1.pdf","comment":"11 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2309.05930v2","updated":"2024-01-31T16:11:27Z","published":"2023-09-12T03:05:06Z","title":"Combining Deep Learning and Street View Imagery to Map Smallholder Crop\n Types","summary":" Accurate crop type maps are an essential source of information for monitoring\nyield progress at scale, projecting global crop production, and planning\neffective policies. To date, however, crop type maps remain challenging to\ncreate in low- and middle-income countries due to a lack of ground truth labels\nfor training machine learning models. Field surveys are the gold standard in\nterms of accuracy but require an often-prohibitively large amount of time,\nmoney, and statistical capacity. In recent years, street-level imagery, such as\nGoogle Street View, KartaView, and Mapillary, has become available around the\nworld. Such imagery contains rich information about crop types grown at\nparticular locations and times. In this work, we develop an automated system to\ngenerate crop type ground references using deep learning and Google Street View\nimagery. The method efficiently curates a set of street view images containing\ncrop fields, trains a model to predict crop type by utilizing weakly-labelled\nimages from disparate out-of-domain sources, and combines predicted labels with\nremote sensing time series to create a wall-to-wall crop type map. We show\nthat, in Thailand, the resulting country-wide map of rice, cassava, maize, and\nsugarcane achieves an accuracy of 93%. We publicly release the first-ever crop\ntype map for all of Thailand 2022 at 10m-resolution with no gaps. To our\nknowledge, this is the first time a 10m-resolution, multi-crop map has been\ncreated for any smallholder country. 
As the availability of roadside imagery\nexpands, our pipeline provides a way to map crop types at scale around the\nglobe, especially in underserved smallholder regions.\n","authors":["Jordi Laguarta Soler","Thomas Friedel","Sherrie Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05930v2.pdf","comment":"Accepted to AAAI-24: Special Track on AI for Social Impact"},{"id":"http://arxiv.org/abs/2303.14346v2","updated":"2024-01-31T16:00:54Z","published":"2023-03-25T03:32:01Z","title":"Collaborative Multi-Object Tracking with Conformal Uncertainty\n Propagation","summary":" Object detection and multiple object tracking (MOT) are essential components\nof self-driving systems. Accurate detection and uncertainty quantification are\nboth critical for onboard modules, such as perception, prediction, and\nplanning, to improve the safety and robustness of autonomous vehicles.\nCollaborative object detection (COD) has been proposed to improve detection\naccuracy and reduce uncertainty by leveraging the viewpoints of multiple\nagents. However, little attention has been paid to how to leverage the\nuncertainty quantification from COD to enhance MOT performance. In this paper,\nas the first attempt to address this challenge, we design an uncertainty\npropagation framework called MOT-CUP. Our framework first quantifies the\nuncertainty of COD through direct modeling and conformal prediction, and\npropagates this uncertainty information into the motion prediction and\nassociation steps. MOT-CUP is designed to work with different collaborative\nobject detectors and baseline MOT algorithms. We evaluate MOT-CUP on V2X-Sim, a\ncomprehensive collaborative perception dataset, and demonstrate a 2%\nimprovement in accuracy and a 2.67X reduction in uncertainty compared to the\nbaselines, e.g. SORT and ByteTrack. In scenarios characterized by high\nocclusion levels, our MOT-CUP demonstrates a noteworthy $4.01\\%$ improvement in\naccuracy. MOT-CUP demonstrates the importance of uncertainty quantification in\nboth COD and MOT, and provides the first attempt to improve the accuracy and\nreduce the uncertainty in MOT based on COD through uncertainty propagation. Our\ncode is public on https://coperception.github.io/MOT-CUP/.\n","authors":["Sanbao Su","Songyang Han","Yiming Li","Zhili Zhang","Chen Feng","Caiwen Ding","Fei Miao"],"pdf_url":"https://arxiv.org/pdf/2303.14346v2.pdf","comment":"This paper has been accepted by IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2401.17948v1","updated":"2024-01-31T15:57:21Z","published":"2024-01-31T15:57:21Z","title":"HyperZ$\\cdot$Z$\\cdot$W Operator Connects Slow-Fast Networks for Full\n Context Interaction","summary":" The self-attention mechanism utilizes large implicit weight matrices,\nprogrammed through dot product-based activations with very few trainable\nparameters, to enable long sequence modeling. In this paper, we investigate the\npossibility of discarding residual learning by employing large implicit kernels\nto achieve full context interaction at each layer of the network. To accomplish\nit, we introduce coordinate-based implicit MLPs as a slow network to generate\nhyper-kernels for another fast convolutional network. 
To get context-varying\nweights for fast dynamic encoding, we propose a\n$\\mathrm{Hyper}\\mathcal{Z{\\cdot}Z{\\cdot}W}$ operator that connects\nhyper-kernels ($\\mathcal{W}$) and hidden activations ($\\mathcal{Z}$) through\nsimple elementwise multiplication, followed by convolution of $\\mathcal{Z}$\nusing the context-dependent $\\mathcal{W}$. Based on this design, we present a\nnovel Terminator architecture that integrates hyper-kernels of different sizes\nto produce multi-branch hidden representations for enhancing the feature\nextraction capability of each layer. Additionally, a bottleneck layer is\nemployed to compress the concatenated channels, allowing only valuable\ninformation to propagate to the subsequent layers. Notably, our model\nincorporates several innovative components and exhibits excellent properties,\nsuch as introducing local feedback error for updating the slow network, stable\nzero-mean features, faster training convergence, and fewer model parameters.\nExtensive experimental results on pixel-level 1D and 2D image classification\nbenchmarks demonstrate the superior performance of our architecture.\n","authors":["Harvie Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17948v1.pdf","comment":"10 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2401.13554v2","updated":"2024-01-31T15:54:10Z","published":"2024-01-24T16:13:24Z","title":"PanAf20K: A Large Video Dataset for Wild Ape Detection and Behaviour\n Recognition","summary":" We present the PanAf20K dataset, the largest and most diverse open-access\nannotated video dataset of great apes in their natural environment. It\ncomprises more than 7 million frames across ~20,000 camera trap videos of\nchimpanzees and gorillas collected at 14 field sites in tropical Africa as part\nof the Pan African Programme: The Cultured Chimpanzee. The footage is\naccompanied by a rich set of annotations and benchmarks making it suitable for\ntraining and testing a variety of challenging and ecologically important\ncomputer vision tasks including ape detection and behaviour recognition.\nFurthering AI analysis of camera trap information is critical given the\nInternational Union for Conservation of Nature now lists all species in the\ngreat ape family as either Endangered or Critically Endangered. We hope the\ndataset can form a solid basis for engagement of the AI community to improve\nperformance, efficiency, and result interpretation in order to support\nassessments of great ape presence, abundance, distribution, and behaviour and\nthereby aid conservation efforts.\n","authors":["Otto Brookes","Majid Mirmehdi","Colleen Stephens","Samuel Angedakin","Katherine Corogenes","Dervla Dowd","Paula Dieguez","Thurston C. Hicks","Sorrel Jones","Kevin Lee","Vera Leinert","Juan Lapuente","Maureen S. McCarthy","Amelia Meier","Mizuki Murai","Emmanuelle Normand","Virginie Vergnes","Erin G. Wessling","Roman M. Wittig","Kevin Langergraber","Nuria Maldonado","Xinyu Yang","Klaus Zuberbuhler","Christophe Boesch","Mimi Arandjelovic","Hjalmar Kuhl","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2401.13554v2.pdf","comment":"Accepted at IJCV"},{"id":"http://arxiv.org/abs/2401.16416v2","updated":"2024-01-31T15:51:45Z","published":"2024-01-29T18:55:29Z","title":"Endo-4DGS: Endoscopic Monocular Scene Reconstruction with 4D Gaussian\n Splatting","summary":" In the realm of robot-assisted minimally invasive surgery, dynamic scene\nreconstruction can significantly enhance downstream tasks and improve surgical\noutcomes. 
Neural Radiance Fields (NeRF)-based methods have recently risen to\nprominence for their exceptional ability to reconstruct scenes. Nonetheless,\nthese methods are hampered by slow inference, prolonged training, and\nsubstantial computational demands. Additionally, some rely on stereo depth\nestimation, which is often infeasible due to the high costs and logistical\nchallenges associated with stereo cameras. Moreover, the monocular\nreconstruction quality for deformable scenes is currently inadequate. To\novercome these obstacles, we present Endo-4DGS, an innovative, real-time\nendoscopic dynamic reconstruction approach that utilizes 4D Gaussian Splatting\n(GS) and requires no ground truth depth data. This method extends 3D GS by\nincorporating a temporal component and leverages a lightweight MLP to capture\ntemporal Gaussian deformations. This effectively facilitates the reconstruction\nof dynamic surgical scenes with variable conditions. We also integrate\nDepth-Anything to generate pseudo-depth maps from monocular views, enhancing\nthe depth-guided reconstruction process. Our approach has been validated on two\nsurgical datasets, where it can effectively render in real-time, compute\nefficiently, and reconstruct with remarkable accuracy. These results underline\nthe vast potential of Endo-4DGS to improve surgical assistance.\n","authors":["Yiming Huang","Beilei Cui","Long Bai","Ziqi Guo","Mengya Xu","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2401.16416v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06225v2","updated":"2024-01-31T15:50:17Z","published":"2022-10-12T14:12:04Z","title":"On the Generalizability of ECG-based Stress Detection Models","summary":" Stress is prevalent in many aspects of everyday life including work,\nhealthcare, and social interactions. Many works have studied handcrafted\nfeatures from various bio-signals that are indicators of stress. Recently, deep\nlearning models have also been proposed to detect stress. Typically, stress\nmodels are trained and validated on the same dataset, often involving one\nstressful scenario. However, it is not practical to collect stress data for\nevery scenario. So, it is crucial to study the generalizability of these models\nand determine to what extent they can be used in other scenarios. In this\npaper, we explore the generalization capabilities of Electrocardiogram\n(ECG)-based deep learning models and models based on handcrafted ECG features,\ni.e., Heart Rate Variability (HRV) features. To this end, we train three HRV\nmodels and two deep learning models that use ECG signals as input. We use ECG\nsignals from two popular stress datasets - WESAD and SWELL-KW - differing in\nterms of stressors and recording devices. First, we evaluate the models using\nleave-one-subject-out (LOSO) cross-validation using training and validation\nsamples from the same dataset. Next, we perform a cross-dataset validation of\nthe models, that is, LOSO models trained on the WESAD dataset are validated\nusing SWELL-KW samples and vice versa. While deep learning models achieve the\nbest results on the same dataset, models based on HRV features considerably\noutperform them on data from a different dataset. This trend is observed for\nall the models on both datasets. Therefore, HRV models are a better choice for\nstress recognition in applications that are different from the dataset\nscenario. 
To the best of our knowledge, this is the first work to compare the\ncross-dataset generalizability between ECG-based deep learning models and HRV\nmodels.\n","authors":["Pooja Prajod","Elisabeth André"],"pdf_url":"https://arxiv.org/pdf/2210.06225v2.pdf","comment":"Published in Proceedings of 2022 21st IEEE International Conference\n on Machine Learning and Applications (ICMLA)"},{"id":"http://arxiv.org/abs/2401.17916v1","updated":"2024-01-31T15:32:44Z","published":"2024-01-31T15:32:44Z","title":"Source-free Domain Adaptive Object Detection in Remote Sensing Images","summary":" Recent studies have used unsupervised domain adaptive object detection\n(UDAOD) methods to bridge the domain gap in remote sensing (RS) images.\nHowever, UDAOD methods typically assume that the source domain data can be\naccessed during the domain adaptation process. This setting is often\nimpractical in the real world due to RS data privacy and transmission\ndifficulty. To address this challenge, we propose a practical source-free\nobject detection (SFOD) setting for RS images, which aims to perform target\ndomain adaptation using only the source pre-trained model. We propose a new\nSFOD method for RS images consisting of two parts: perturbed domain generation\nand alignment. The proposed multilevel perturbation constructs the perturbed\ndomain in a simple yet efficient form by perturbing the domain-variant features\nat the image level and feature level according to the color and style bias. The\nproposed multilevel alignment calculates feature and label consistency between\nthe perturbed domain and the target domain across the teacher-student network,\nand introduces the distillation of feature prototype to mitigate the noise of\npseudo-labels. By requiring the detector to be consistent in the perturbed\ndomain and the target domain, the detector is forced to focus on\ndomain-invariant features. Extensive results of three synthetic-to-real\nexperiments and three cross-sensor experiments have validated the effectiveness\nof our method, which does not require access to source domain RS images.\nFurthermore, experiments on computer vision datasets show that our method can\nbe extended to other fields as well. Our code will be available at:\nhttps://weixliu.github.io/ .\n","authors":["Weixing Liu","Jun Liu","Xin Su","Han Nie","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2401.17916v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2401.17910v1","updated":"2024-01-31T15:15:41Z","published":"2024-01-31T15:15:41Z","title":"Controllable Dense Captioner with Multimodal Embedding Bridging","summary":" In this paper, we propose a controllable dense captioner (ControlCap), which\naccommodates the user's intention in dense captioning by introducing linguistic\nguidance. ControlCap is defined as a multimodal embedding bridging\narchitecture, which comprises a multimodal embedding generation (MEG) module\nand a bi-directional embedding bridging (BEB) module. While the MEG module\nrepresents objects/regions by combining embeddings of detailed information with\ncontext-aware ones, it also endows ControlCap with the adaptability to\nspecialized controls by utilizing them as linguistic guidance. The BEB module\naligns the linguistic guidance with visual embeddings through\nborrowing/returning features from/to the visual domain and gathering such\nfeatures to predict text descriptions. Experiments on the Visual Genome and\nVG-COCO datasets show that ControlCap outperforms the state-of-the-art methods\nby 1.5% and 3.7% (mAP), respectively. 
Last but not least, with the capability of converting\nregion-category pairs to region-text pairs, ControlCap is able to act as a\npowerful data engine for dense captioning. Code is available at\nhttps://github.com/callsys/ControlCap.\n","authors":["Yuzhong Zhao","Yue Liu","Zonghao Guo","Weijia Wu","Chen Gong","Qixiang Ye","Fang Wan"],"pdf_url":"https://arxiv.org/pdf/2401.17910v1.pdf","comment":"https://github.com/callsys/ControlCap"},{"id":"http://arxiv.org/abs/2401.17904v1","updated":"2024-01-31T15:10:29Z","published":"2024-01-31T15:10:29Z","title":"Hi-SAM: Marrying Segment Anything Model for Hierarchical Text\n Segmentation","summary":" The Segment Anything Model (SAM), a profound vision foundation model\npre-trained on a large-scale dataset, breaks the boundaries of general\nsegmentation and sparks various downstream applications. This paper introduces\nHi-SAM, a unified model leveraging SAM for hierarchical text segmentation.\nHi-SAM excels in text segmentation across four hierarchies, including stroke,\nword, text-line, and paragraph, while realizing layout analysis as well.\nSpecifically, we first turn SAM into a high-quality text stroke segmentation\n(TSS) model through a parameter-efficient fine-tuning approach. We use this TSS\nmodel to iteratively generate the text stroke labels in a semi-automatic\nmanner, unifying labels across the four text hierarchies in the HierText\ndataset. Subsequently, with these complete labels, we launch the end-to-end\ntrainable Hi-SAM based on the TSS architecture with a customized hierarchical\nmask decoder. During inference, Hi-SAM offers both automatic mask generation\n(AMG) mode and promptable segmentation mode. In terms of the AMG mode, Hi-SAM\nsegments text stroke foreground masks initially, then samples foreground points\nfor hierarchical text mask generation and achieves layout analysis in passing.\nAs for the promptable mode, Hi-SAM provides word, text-line, and paragraph\nmasks with a single point click. Experimental results show the state-of-the-art\nperformance of our TSS model: 84.86% fgIOU on Total-Text and 88.96% fgIOU on\nTextSeg for text stroke segmentation. Moreover, compared to the previous\nspecialist for joint hierarchical detection and layout analysis on HierText,\nHi-SAM achieves significant improvements: 4.73% PQ and 5.39% F1 on the\ntext-line level, 5.49% PQ and 7.39% F1 on the paragraph level layout analysis,\nrequiring 20x fewer training epochs. The code is available at\nhttps://github.com/ymy-k/Hi-SAM.\n","authors":["Maoyuan Ye","Jing Zhang","Juhua Liu","Chenyu Liu","Baocai Yin","Cong Liu","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2401.17904v1.pdf","comment":"GitHub repository: https://github.com/ymy-k/Hi-SAM"},{"id":"http://arxiv.org/abs/2401.17895v1","updated":"2024-01-31T15:02:26Z","published":"2024-01-31T15:02:26Z","title":"ReplaceAnything3D: Text-Guided 3D Scene Editing with Compositional Neural\n Radiance Fields","summary":" We introduce the ReplaceAnything3D model (RAM3D), a novel text-guided 3D scene\nediting method that enables the replacement of specific objects within a scene.\nGiven multi-view images of a scene, a text prompt describing the object to\nreplace, and a text prompt describing the new object, our Erase-and-Replace\napproach can effectively swap objects in the scene with newly generated content\nwhile maintaining 3D consistency across multiple viewpoints. 
We demonstrate the\nversatility of ReplaceAnything3D by applying it to various realistic 3D scenes,\nshowcasing results of modified foreground objects that are well-integrated with\nthe rest of the scene without affecting its overall integrity.\n","authors":["Edward Bartrum","Thu Nguyen-Phuoc","Chris Xie","Zhengqin Li","Numair Khan","Armen Avetisyan","Douglas Lanman","Lei Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.17895v1.pdf","comment":"For our project page, see https://replaceanything3d.github.io/"},{"id":"http://arxiv.org/abs/2401.17053v2","updated":"2024-01-31T14:53:22Z","published":"2024-01-30T14:34:19Z","title":"BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane\n Extrapolation","summary":" We present BlockFusion, a diffusion-based model that generates 3D scenes as\nunit blocks and seamlessly incorporates new blocks to extend the scene.\nBlockFusion is trained using datasets of 3D blocks that are randomly cropped\nfrom complete 3D scene meshes. Through per-block fitting, all training blocks\nare converted into hybrid neural fields: a tri-plane containing the geometry\nfeatures, followed by a Multi-layer Perceptron (MLP) that decodes the signed\ndistance values. A variational auto-encoder is employed to compress the\ntri-planes into the latent tri-plane space, on which the denoising diffusion\nprocess is performed. Diffusion applied to the latent representations allows\nfor high-quality and diverse 3D scene generation. To expand a scene during\ngeneration, one needs only to append empty blocks to overlap with the current\nscene and extrapolate existing latent tri-planes to populate new blocks. The\nextrapolation is done by conditioning the generation process with the feature\nsamples from the overlapping tri-planes during the denoising iterations. Latent\ntri-plane extrapolation produces semantically and geometrically meaningful\ntransitions that harmoniously blend with the existing scene. A 2D layout\nconditioning mechanism is used to control the placement and arrangement of\nscene elements. Experimental results indicate that BlockFusion is capable of\ngenerating diverse, geometrically consistent and unbounded large 3D scenes with\nunprecedentedly high-quality shapes in both indoor and outdoor scenarios.\n","authors":["Zhennan Wu","Yang Li","Han Yan","Taizhang Shang","Weixuan Sun","Senbo Wang","Ruikai Cui","Weizhe Liu","Hiroyuki Sato","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2401.17053v2.pdf","comment":"Video: https://www.youtube.com/watch?v=PxIBtd6G0mA"},{"id":"http://arxiv.org/abs/2401.17883v1","updated":"2024-01-31T14:41:40Z","published":"2024-01-31T14:41:40Z","title":"Reimagining Reality: A Comprehensive Survey of Video Inpainting\n Techniques","summary":" This paper offers a comprehensive analysis of recent advancements in video\ninpainting techniques, a critical subset of computer vision and artificial\nintelligence. As a process that restores or fills in missing or corrupted\nportions of video sequences with plausible content, video inpainting has\nevolved significantly with the advent of deep learning methodologies. Despite\nthe plethora of existing methods and their swift development, the landscape\nremains complex, posing challenges to both novices and established researchers.\nOur study deconstructs major techniques, their underpinning theories, and their\neffective applications. Moreover, we conduct an exhaustive comparative study,\ncentering on two often-overlooked dimensions: visual quality and computational\nefficiency. 
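The tri-plane-plus-MLP hybrid neural field that BlockFusion fits per block (see above) is typically decoded by sampling three axis-aligned feature planes at a query point and mapping the aggregated features to a signed distance value. A minimal sketch, with channel count, resolution, and sum aggregation as illustrative assumptions:

```python
# Sketch of tri-plane feature decoding for a signed distance field.
import torch
import torch.nn.functional as F

def sample_triplane(planes, pts):
    """planes: (3, C, R, R) feature planes for the xy, xz, yz planes.
    pts: (N, 3) points in [-1, 1]^3. Returns (N, C) aggregated features."""
    coords = [pts[:, [0, 1]], pts[:, [0, 2]], pts[:, [1, 2]]]
    feats = 0
    for plane, uv in zip(planes, coords):
        grid = uv.view(1, -1, 1, 2)                     # (1, N, 1, 2)
        f = F.grid_sample(plane[None], grid, align_corners=True)
        feats = feats + f.view(plane.shape[0], -1).t()  # (N, C)
    return feats

# Tiny MLP decoding sampled features to signed distance values.
sdf_mlp = torch.nn.Sequential(
    torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 1))
planes = torch.randn(3, 32, 128, 128)
sdf = sdf_mlp(sample_triplane(planes, torch.rand(1024, 3) * 2 - 1))
```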
We adopt a human-centric approach to assess visual quality,\nenlisting a panel of annotators to evaluate the output of different video\ninpainting techniques. This provides a nuanced qualitative understanding that\ncomplements traditional quantitative metrics. Concurrently, we delve into the\ncomputational aspects, comparing inference times and memory demands across a\nstandardized hardware setup. This analysis underscores the balance between\nquality and efficiency: a critical consideration for practical applications\nwhere resources may be constrained. By integrating human validation and\ncomputational resource comparison, this survey not only clarifies the present\nlandscape of video inpainting techniques but also charts a course for future\nexplorations in this vibrant and evolving field.\n","authors":["Shreyank N Gowda","Yash Thakre","Shashank Narayana Gowda","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2401.17883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17881v1","updated":"2024-01-31T14:39:11Z","published":"2024-01-31T14:39:11Z","title":"PVLR: Prompt-driven Visual-Linguistic Representation Learning for\n Multi-Label Image Recognition","summary":" Multi-label image recognition is a fundamental task in computer vision.\nRecently, vision-language models have made notable advancements in this area.\nHowever, previous methods often failed to effectively leverage the rich\nknowledge within language models and instead incorporated label semantics into\nvisual features in a unidirectional manner. In this paper, we propose a\nPrompt-driven Visual-Linguistic Representation Learning (PVLR) framework to\nbetter leverage the capabilities of the linguistic modality. In PVLR, we first\nintroduce a dual-prompting strategy comprising Knowledge-Aware Prompting (KAP)\nand Context-Aware Prompting (CAP). KAP utilizes fixed prompts to capture the\nintrinsic semantic knowledge and relationships across all labels, while CAP\nemploys learnable prompts to capture context-aware label semantics and\nrelationships. Later, we propose an Interaction and Fusion Module (IFM) to\ninteract and fuse the representations obtained from KAP and CAP. In contrast to\nthe unidirectional fusion in previous works, we introduce a Dual-Modal\nAttention (DMA) that enables bidirectional interaction between textual and\nvisual features, yielding context-aware label representations and\nsemantic-related visual representations, which are subsequently used to\ncalculate similarities and generate final predictions for all labels. Extensive\nexperiments on three popular datasets including MS-COCO, Pascal VOC 2007, and\nNUS-WIDE demonstrate the superiority of PVLR.\n","authors":["Hao Tan","Zichang Tan","Jun Li","Jun Wan","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2401.17881v1.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.17879v1","updated":"2024-01-31T14:36:49Z","published":"2024-01-31T14:36:49Z","title":"AEROBLADE: Training-Free Detection of Latent Diffusion Images Using\n Autoencoder Reconstruction Error","summary":" With recent text-to-image models, anyone can generate deceptively realistic\nimages with arbitrary contents, fueling the growing threat of visual\ndisinformation. 
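A standardized efficiency comparison of the kind the survey above performs, wall-clock inference time plus peak GPU memory under identical inputs, can be sketched as a simple benchmarking helper; the warm-up and iteration counts below are arbitrary choices, not the survey's protocol.

```python
# Sketch: measure mean inference latency and peak GPU memory for a model.
import time
import torch

@torch.no_grad()
def benchmark(model, dummy_input, warmup=5, iters=20):
    model.eval().cuda()
    x = dummy_input.cuda()
    for _ in range(warmup):                  # warm up CUDA kernels/caches
        model(x)
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()
    start = time.perf_counter()
    for _ in range(iters):
        model(x)
    torch.cuda.synchronize()                 # wait for all queued kernels
    latency = (time.perf_counter() - start) / iters
    peak_mb = torch.cuda.max_memory_allocated() / 2**20
    return latency, peak_mb
```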
A key enabler for generating high-resolution images with low\ncomputational cost has been the development of latent diffusion models (LDMs).\nIn contrast to conventional diffusion models, LDMs perform the denoising\nprocess in the low-dimensional latent space of a pre-trained autoencoder (AE)\ninstead of the high-dimensional image space. Despite their relevance, the\nforensic analysis of LDMs is still in its infancy. In this work, we propose\nAEROBLADE, a novel detection method which exploits an inherent component of\nLDMs: the AE used to transform images between image and latent space. We find\nthat generated images can be more accurately reconstructed by the AE than real\nimages, allowing for a simple detection approach based on the reconstruction\nerror. Most importantly, our method is easy to implement and does not require\nany training, yet nearly matches the performance of detectors that rely on\nextensive training. We empirically demonstrate that AEROBLADE is effective\nagainst state-of-the-art LDMs including Stable Diffusion and Midjourney. Beyond\ndetection, our approach allows for the qualitative analysis of images, which\ncan be leveraged for identifying inpainted regions.\n","authors":["Jonas Ricker","Denis Lukovnikov","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2401.17879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17874v1","updated":"2024-01-31T14:32:56Z","published":"2024-01-31T14:32:56Z","title":"VR-based generation of photorealistic synthetic data for training\n hand-object tracking models","summary":" Supervised learning models for precise tracking of hand-object interactions\n(HOI) in 3D require large amounts of annotated data for training. Moreover, it\nis not intuitive for non-experts to label 3D ground truth (e.g. 6DoF object\npose) on 2D images. To address these issues, we present \"blender-hoisynth\", an\ninteractive synthetic data generator based on the Blender software.\nBlender-hoisynth can scalably generate and automatically annotate visual HOI\ntraining data. Other competing approaches usually generate synthetic HOI data\ncompletely without human input. While this may be beneficial in some\nscenarios, HOI applications inherently necessitate direct control over the HOIs\nas an expression of human intent. With blender-hoisynth, it is possible for\nusers to interact with objects via virtual hands using standard Virtual Reality\nhardware. The synthetically generated data are characterized by a high degree\nof photorealism and contain visually plausible and physically realistic videos\nof hands grasping objects and moving them around in 3D. To demonstrate the\nefficacy of our data generation, we replace large parts of the training data in\nthe well-known DexYCB dataset with hoisynth data and train a state-of-the-art\nHOI reconstruction model with it. We show that there is no significant\ndegradation in the model performance despite the data replacement.\n","authors":["Chengyan Zhang","Rahul Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2401.17874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19413v2","updated":"2024-01-31T14:31:23Z","published":"2023-10-30T10:24:21Z","title":"CARPE-ID: Continuously Adaptable Re-identification for Personalized\n Robot Assistance","summary":" In today's Human-Robot Interaction (HRI) scenarios, a prevailing tendency\nexists to assume that the robot shall cooperate with the closest individual or\nthat the scene involves merely a singular human actor. 
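The reconstruction-error test at the core of AEROBLADE, as described above, can be sketched with a pre-trained LDM autoencoder. Note that the paper measures reconstruction distance perceptually (LPIPS); the plain MSE and the specific checkpoint name below are simplifying assumptions.

```python
# Sketch: score an image by how well an LDM autoencoder reconstructs it.
import torch
from diffusers import AutoencoderKL
from torchvision import transforms
from PIL import Image

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").eval()

@torch.no_grad()
def reconstruction_error(path):
    tf = transforms.Compose([transforms.Resize((512, 512)),
                             transforms.ToTensor()])
    x = tf(Image.open(path).convert("RGB"))[None] * 2 - 1  # map to [-1, 1]
    z = vae.encode(x).latent_dist.mode()   # encode to the latent space
    x_hat = vae.decode(z).sample           # decode back to image space
    return torch.mean((x - x_hat) ** 2).item()
```

A threshold on this error, calibrated on images known to be real, then separates generated inputs (lower error) from real ones (higher error).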
However, in realistic\nscenarios, such as shop floor operations, such an assumption may not hold and\npersonalized target recognition by the robot in crowded environments is\nrequired. To fulfil this requirement, in this work, we propose a person\nre-identification module based on continual visual adaptation techniques that\nensure the robot's seamless cooperation with the appropriate individual even\nsubject to varying visual appearances or partial or complete occlusions. We\ntest the framework in isolation using recorded videos in a laboratory\nenvironment and an HRI scenario, i.e., a person-following task by a mobile\nrobot. The targets are asked to change their appearance during tracking and to\ndisappear from the camera field of view to test the challenging cases of\nocclusion and outfit variations. We compare our framework with one of the\nstate-of-the-art Multi-Object Tracking (MOT) methods and the results show that\nCARPE-ID can accurately track each selected target throughout the experiments\nin all cases (except two limit cases). At the same time, the state-of-the-art\nMOT method has a mean of 4 tracking errors per video.\n","authors":["Federico Rollo","Andrea Zunino","Nikolaos Tsagarakis","Enrico Mingo Hoffman","Arash Ajoudani"],"pdf_url":"https://arxiv.org/pdf/2310.19413v2.pdf","comment":"Accepted to the International Conference on Robotics and Automation\n (ICRA) 2024"},{"id":"http://arxiv.org/abs/2401.17868v1","updated":"2024-01-31T14:27:07Z","published":"2024-01-31T14:27:07Z","title":"Convolution Meets LoRA: Parameter Efficient Finetuning for Segment\n Anything Model","summary":" The Segment Anything Model (SAM) stands as a foundational framework for image\nsegmentation. While it exhibits remarkable zero-shot generalization in typical\nscenarios, its advantage diminishes when applied to specialized domains like\nmedical imagery and remote sensing. To address this limitation, this paper\nintroduces Conv-LoRA, a simple yet effective parameter-efficient fine-tuning\napproach. By integrating ultra-lightweight convolutional parameters into\nLow-Rank Adaptation (LoRA), Conv-LoRA can inject image-related inductive biases\ninto the plain ViT encoder, further reinforcing SAM's local prior assumption.\nNotably, Conv-LoRA not only preserves SAM's extensive segmentation knowledge\nbut also revives its capacity for learning high-level image semantics, which is\nconstrained by SAM's foreground-background segmentation pretraining.\nComprehensive experimentation across diverse benchmarks spanning multiple\ndomains underscores Conv-LoRA's superiority in adapting SAM to real-world\nsemantic segmentation tasks.\n","authors":["Zihan Zhong","Zhiqiang Tang","Tong He","Haoyang Fang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.17868v1.pdf","comment":"Accepted at ICLR 2024 Conference"},{"id":"http://arxiv.org/abs/2401.17862v1","updated":"2024-01-31T14:21:49Z","published":"2024-01-31T14:21:49Z","title":"Proximity QA: Unleashing the Power of Multi-Modal Large Language Models\n for Spatial Proximity Analysis","summary":" Multi-modal large language models (MLLMs) have demonstrated remarkable\nvision-language capabilities, primarily due to the exceptional in-context\nunderstanding and multi-task learning strengths of large language models\n(LLMs). The advent of visual instruction tuning has further enhanced MLLMs'\nperformance in vision-language understanding. 
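The Conv-LoRA idea summarized above, ultra-lightweight convolutional parameters inside a LoRA bottleneck, might look roughly like the following sketch. The rank, kernel size, token-to-grid reshaping, and placement are assumptions for illustration, not the paper's exact design.

```python
# Illustrative LoRA adapter with a small convolution in its bottleneck,
# adding a local inductive bias to the low-rank update of a frozen layer.
import torch
import torch.nn as nn

class ConvLoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank=4, feat_hw=32, alpha=1.0):
        super().__init__()
        self.base = base                      # frozen pretrained projection
        for p in self.base.parameters():
            p.requires_grad = False
        self.down = nn.Linear(base.in_features, rank, bias=False)
        self.conv = nn.Conv2d(rank, rank, 3, padding=1)   # local prior
        self.up = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.up.weight)        # start as an identity update
        self.feat_hw, self.alpha = feat_hw, alpha

    def forward(self, x):
        # x: (B, N, C) ViT tokens; assumes N == feat_hw * feat_hw.
        b, n, _ = x.shape
        h = self.down(x)                                        # (B, N, r)
        h2d = h.transpose(1, 2).reshape(b, -1, self.feat_hw, self.feat_hw)
        h = self.conv(h2d).flatten(2).transpose(1, 2)           # (B, N, r)
        return self.base(x) + self.alpha * self.up(h)
```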
However, while existing MLLMs\nadeptly recognize \\textit{what} objects are in an image, they still face\nchallenges in effectively discerning \\textit{where} these objects are,\nparticularly along the distance (scene depth) axis. To overcome this limitation\nin MLLMs, we introduce Proximity Question Answering (Proximity QA), a novel\nframework designed to enable MLLMs to infer the proximity relationship between\nobjects in images. The framework operates in two phases: the first phase\nfocuses on guiding the models to understand the relative depth of objects, and\nthe second phase further encourages the models to infer the proximity\nrelationships between objects based on their depth perceptions. We also propose\na VQA dataset called Proximity-110K, containing additional instructions that\nincorporate depth information and the proximity relationships of objects. We\nhave conducted extensive experiments to validate Proximity QA's superior\nability in depth perception and proximity analysis, outperforming other\nstate-of-the-art MLLMs. Code and dataset will be released at\n\\textcolor{magenta}{https://github.com/NorthSummer/ProximityQA.git}.\n","authors":["Jianing Li","Xi Nan","Ming Lu","Li Du","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17862v1.pdf","comment":"15 pages,version 1"},{"id":"http://arxiv.org/abs/2401.17857v1","updated":"2024-01-31T14:19:03Z","published":"2024-01-31T14:19:03Z","title":"Semantic Anything in 3D Gaussians","summary":" 3D Gaussian Splatting has emerged as an alternative 3D representation of\nNeural Radiance Fields (NeRFs), benefiting from its high-quality rendering\nresults and real-time rendering speed. Considering the 3D Gaussian\nrepresentation remains unparsed, it is necessary first to execute object\nsegmentation within this domain. Subsequently, scene editing and collision\ndetection can be performed, proving vital to a multitude of applications, such\nas virtual reality (VR), augmented reality (AR), game/movie production, etc. In\nthis paper, we propose a novel approach to achieve object segmentation in 3D\nGaussian via an interactive procedure without any training process and learned\nparameters. We refer to the proposed method as SA-GS, for Segment Anything in\n3D Gaussians. Given a set of clicked points in a single input view, SA-GS can\ngeneralize SAM to achieve 3D consistent segmentation via the proposed\nmulti-view mask generation and view-wise label assignment methods. We also\npropose a cross-view label-voting approach to assign labels from different\nviews. In addition, in order to address the boundary roughness issue of\nsegmented objects resulting from the non-negligible spatial sizes of 3D\nGaussian located at the boundary, SA-GS incorporates the simple but effective\nGaussian Decomposition scheme. Extensive experiments demonstrate that SA-GS\nachieves high-quality 3D segmentation results, which can also be easily applied\nfor scene editing and collision detection tasks. Codes will be released soon.\n","authors":["Xu Hu","Yuxi Wang","Lue Fan","Junsong Fan","Junran Peng","Zhen Lei","Qing Li","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17851v1","updated":"2024-01-31T14:13:01Z","published":"2024-01-31T14:13:01Z","title":"Instruction-Guided Scene Text Recognition","summary":" Multi-modal models have shown appealing performance in visual tasks recently,\nas instruction-guided training has evoked the ability to understand\nfine-grained visual content. 
However, current methods cannot be trivially\napplied to scene text recognition (STR) due to the gap between natural and text\nimages. In this paper, we introduce a novel paradigm that formulates STR as an\ninstruction learning problem, and propose instruction-guided scene text\nrecognition (IGTR) to achieve effective cross-modal learning. IGTR first\ngenerates rich and diverse instruction triplets of <condition, question,\nanswer>, serving as guidance for nuanced text image understanding. Then, we\ndevise an architecture with a dedicated cross-modal feature fusion module and a\nmulti-task answer head to effectively fuse the required instruction and image\nfeatures for answering questions. Built upon these designs, IGTR facilitates\naccurate text recognition by comprehending character attributes. Experiments on\nEnglish and Chinese benchmarks show that IGTR outperforms existing models by\nsignificant margins. Furthermore, by adjusting the instructions, IGTR enables\nvarious recognition schemes. These include zero-shot prediction, where the\nmodel is trained based on instructions not explicitly targeting character\nrecognition, and the recognition of rarely appearing and morphologically\nsimilar characters, which were previous challenges for existing models.\n","authors":["Yongkun Du","Zhineng Chen","Yuchen Su","Caiyan Jia","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.17851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11199v2","updated":"2024-01-31T14:11:56Z","published":"2023-08-22T05:21:31Z","title":"ConcatPlexer: Additional Dim1 Batching for Faster ViTs","summary":" Transformers have demonstrated tremendous success not only in the natural\nlanguage processing (NLP) domain but also in the field of computer vision,\nigniting various creative approaches and applications. Yet, the superior\nperformance and modeling flexibility of transformers came with a severe\nincrease in computation costs, and hence several works have proposed methods to\nreduce this burden. Inspired by a cost-cutting method originally proposed for\nlanguage models, Data Multiplexing (DataMUX), we propose a novel approach for\nefficient visual recognition that employs additional dim1 batching (i.e.,\nconcatenation) that greatly improves the throughput with little compromise in\nthe accuracy. We first introduce a naive adaptation of DataMux for vision\nmodels, Image Multiplexer, and devise novel components to overcome its\nweaknesses, rendering our final model, ConcatPlexer, at the sweet spot between\ninference speed and accuracy. The ConcatPlexer was trained on the ImageNet1K\nand CIFAR100 datasets and achieved 23.5% fewer GFLOPs than ViT-B/16, with 69.5%\nand 83.4% validation accuracy, respectively.\n","authors":["Donghoon Han","Seunghyeon Seo","Donghyeon Jeon","Jiho Jang","Chaerin Kong","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.11199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17828v1","updated":"2024-01-31T13:41:17Z","published":"2024-01-31T13:41:17Z","title":"Leveraging Swin Transformer for Local-to-Global Weakly Supervised\n Semantic Segmentation","summary":" In recent years, weakly supervised semantic segmentation using image-level\nlabels as supervision has received significant attention in the field of\ncomputer vision. Most existing methods have addressed the challenges arising\nfrom the lack of spatial information in these labels by focusing on\nfacilitating supervised learning through the generation of pseudo-labels from\nclass activation maps (CAMs). 
Due to the localized pattern detection of\nConvolutional Neural Networks (CNNs), CAMs often emphasize only the most\ndiscriminative parts of an object, making it challenging to accurately\ndistinguish foreground objects from each other and the background. Recent\nstudies have shown that Vision Transformer (ViT) features, due to their global\nview, are more effective in capturing the scene layout than CNNs. However, the\nuse of hierarchical ViTs has not been extensively explored in this field. This\nwork explores the use of Swin Transformer by proposing \"SWTformer\" to enhance\nthe accuracy of the initial seed CAMs by bringing local and global views\ntogether. SWTformer-V1 generates class probabilities and CAMs using only the\npatch tokens as features. SWTformer-V2 incorporates a multi-scale feature\nfusion mechanism to extract additional information and utilizes a\nbackground-aware mechanism to generate more accurate localization maps with\nimproved cross-object discrimination. Based on experiments on the PascalVOC\n2012 dataset, SWTformer-V1 achieves localization accuracy 0.98% mAP higher than\nstate-of-the-art models. It also generates initial localization maps that are\non average 0.82% mIoU higher than those of other methods, while depending only\non the classification network. SWTformer-V2 further improves the accuracy of\nthe generated seed CAMs by 5.32% mIoU, further proving the effectiveness of the\nlocal-to-global view provided by the Swin transformer.\n","authors":["Rozhan Ahmadi","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2401.17828v1.pdf","comment":"7 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2401.17821v1","updated":"2024-01-31T13:24:51Z","published":"2024-01-31T13:24:51Z","title":"Do Object Detection Localization Errors Affect Human Performance and\n Trust?","summary":" Bounding boxes are often used to communicate automatic object detection\nresults to humans, aiding humans in a multitude of tasks. We investigate the\nrelationship between bounding box localization errors and human task\nperformance. We use observer performance studies on a visual multi-object\ncounting task to measure both human trust and performance with different levels\nof bounding box accuracy. The results show that localization errors have no\nsignificant impact on human accuracy or trust in the system. Recall and\nprecision errors impact both human performance and trust, suggesting that\noptimizing algorithms based on the F1 score is more beneficial in\nhuman-computer tasks. Lastly, the paper offers an improvement on bounding boxes\nin multi-object counting tasks with center dots, showing improved performance\nand better resilience to localization inaccuracy.\n","authors":["Sven de Witte","Ombretta Strafforello","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2401.17821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15589v2","updated":"2024-01-31T13:19:38Z","published":"2023-09-27T11:44:58Z","title":"Domain generalization across tumor types, laboratories, and species --\n insights from the 2022 edition of the Mitosis Domain Generalization Challenge","summary":" Recognition of mitotic figures in histologic tumor specimens is highly\nrelevant to patient outcome assessment. This task is challenging for algorithms\nand human experts alike, with deterioration of algorithmic performance under\nshifts in image representations. 
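For reference, the seed CAMs that methods like SWTformer refine are conventionally derived by projecting the final feature map onto the classification weights of the target class. A minimal sketch with placeholder shapes:

```python
# Sketch: derive a class activation map (CAM) from a classifier's
# final feature map and its classification head weights.
import torch
import torch.nn.functional as F

def compute_cam(feature_map, fc_weight, class_idx, out_size=(224, 224)):
    """feature_map: (C, h, w) from the last stage;
    fc_weight: (num_classes, C) classification head weights."""
    cam = torch.einsum("c,chw->hw", fc_weight[class_idx], feature_map)
    cam = F.relu(cam)
    cam = cam - cam.min()
    cam = cam / (cam.max() + 1e-8)           # normalize to [0, 1]
    return F.interpolate(cam[None, None], size=out_size,
                         mode="bilinear", align_corners=False)[0, 0]
```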
Considerable covariate shifts occur when\nassessment is performed on different tumor types, images are acquired using\ndifferent digitization devices, or specimens are produced in different\nlaboratories. This observation motivated the inception of the 2022 challenge on\nMItosis Domain Generalization (MIDOG 2022). The challenge provided annotated\nhistologic tumor images from six different domains and evaluated the\nalgorithmic approaches for mitotic figure detection provided by nine challenge\nparticipants on ten independent domains. Ground truth for mitotic figure\ndetection was established in two ways: a three-expert consensus and an\nindependent, immunohistochemistry-assisted set of labels. This work represents\nan overview of the challenge tasks, the algorithmic strategies employed by the\nparticipants, and potential factors contributing to their success. With an\n$F_1$ score of 0.764 for the top-performing team, we summarize that domain\ngeneralization across various tumor domains is possible with today's deep\nlearning-based recognition pipelines. However, we also found that domain\ncharacteristics not present in the training set (feline as new species, spindle\ncell shape as new morphology and a new scanner) led to small but significant\ndecreases in performance. When assessed against the\nimmunohistochemistry-assisted reference standard, all methods resulted in\nreduced recall scores, but with only minor changes in the order of participants\nin the ranking.\n","authors":["Marc Aubreville","Nikolas Stathonikos","Taryn A. Donovan","Robert Klopfleisch","Jonathan Ganz","Jonas Ammeling","Frauke Wilm","Mitko Veta","Samir Jabari","Markus Eckstein","Jonas Annuscheit","Christian Krumnow","Engin Bozaba","Sercan Cayir","Hongyan Gu","Xiang 'Anthony' Chen","Mostafa Jahanifar","Adam Shephard","Satoshi Kondo","Satoshi Kasai","Sujatha Kotte","VG Saipradeep","Maxime W. Lafarge","Viktor H. Koelzer","Ziyue Wang","Yongbing Zhang","Sen Yang","Xiyue Wang","Katharina Breininger","Christof A. Bertram"],"pdf_url":"https://arxiv.org/pdf/2309.15589v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17807v1","updated":"2024-01-31T13:06:48Z","published":"2024-01-31T13:06:48Z","title":"Advances in 3D Generation: A Survey","summary":" Generating 3D models lies at the core of computer graphics and has been the\nfocus of decades of research. With the emergence of advanced neural\nrepresentations and generative models, the field of 3D content generation is\ndeveloping rapidly, enabling the creation of increasingly high-quality and\ndiverse 3D models. The rapid growth of this field makes it difficult to stay\nabreast of all recent developments. In this survey, we aim to introduce the\nfundamental methodologies of 3D generation methods and establish a structured\nroadmap, encompassing 3D representation, generation methods, datasets, and\ncorresponding applications. Specifically, we introduce the 3D representations\nthat serve as the backbone for 3D generation. Furthermore, we provide a\ncomprehensive overview of the rapidly growing literature on generation methods,\ncategorized by the type of algorithmic paradigms, including feedforward\ngeneration, optimization-based generation, procedural generation, and\ngenerative novel view synthesis. Lastly, we discuss available datasets,\napplications, and open challenges. 
We hope this survey will help readers\nexplore this exciting topic and foster further advancements in the field of 3D\ncontent generation.\n","authors":["Xiaoyu Li","Qi Zhang","Di Kang","Weihao Cheng","Yiming Gao","Jingbo Zhang","Zhihao Liang","Jing Liao","Yan-Pei Cao","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.17807v1.pdf","comment":"33 pages, 12 figures"},{"id":"http://arxiv.org/abs/2401.17803v1","updated":"2024-01-31T12:53:11Z","published":"2024-01-31T12:53:11Z","title":"SimAda: A Simple Unified Framework for Adapting Segment Anything Model\n in Underperformed Scenes","summary":" Segment anything model (SAM) has demonstrated excellent generalization\ncapabilities in common vision scenarios, yet it lacks an understanding of\nspecialized data. Although numerous works have focused on optimizing SAM for\ndownstream tasks, these task-specific approaches usually limit the\ngeneralizability to other downstream tasks. In this paper, we aim to\ninvestigate the impact of the general vision modules on finetuning SAM and\nenable them to generalize across all downstream tasks. We propose a simple\nunified framework called SimAda for adapting SAM in underperformed scenes.\nSpecifically, our framework abstracts the general modules of different methods\ninto basic design elements, and we design four variants based on a shared\ntheoretical framework. SimAda is simple yet effective, which removes all\ndataset-specific designs and focuses solely on general optimization, ensuring\nthat SimAda can be applied to all SAM-based and even Transformer-based models.\nWe conduct extensive experiments on nine datasets of six downstream tasks. The\nresults demonstrate that SimAda significantly improves the performance of SAM\non multiple downstream tasks and achieves state-of-the-art performance on most\nof them, without requiring task-specific designs. Code is available at:\nhttps://github.com/zongzi13545329/SimAda\n","authors":["Yiran Song","Qianyu Zhou","Xuequan Lu","Zhiwen Shao","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.17803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17797v1","updated":"2024-01-31T12:45:44Z","published":"2024-01-31T12:45:44Z","title":"M2-RAAP: A Multi-Modal Recipe for Advancing Adaptation-based\n Pre-training towards Effective and Efficient Zero-shot Video-text Retrieval","summary":" We present a Multi-Modal Recipe for Advancing Adaptation-based Pre-training\ntowards effective and efficient zero-shot video-text retrieval, dubbed M2-RAAP.\nBuilt on popular image-text models like CLIP, most current adaptation-based\nvideo-text pre-training methods are confronted by three major issues, i.e.,\nnoisy data corpus, time-consuming pre-training, and limited performance gain.\nTowards this end, we conduct a comprehensive study including four critical\nsteps in video-text pre-training. Specifically, we investigate 1) data\nfiltering and refinement, 2) video input type selection, 3) temporal modeling,\nand 4) video feature enhancement. We then summarize this empirical study into\nthe M2-RAAP recipe, where our technical contributions lie in 1) the data\nfiltering and text re-writing pipeline resulting in 1M high-quality bilingual\nvideo-text pairs, 2) the replacement of video inputs with key-frames to\naccelerate pre-training, and 3) the Auxiliary-Caption-Guided (ACG) strategy to\nenhance video features. 
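The key-frame substitution that M2-RAAP uses to accelerate pre-training (replacing dense video inputs with a few sampled frames, as listed above) can be sketched as uniform frame sampling; the sampling rule and frame count here are illustrative assumptions.

```python
# Sketch: uniformly sample a few key-frames from a video clip with OpenCV.
import cv2
import numpy as np

def extract_keyframes(video_path, num_frames=8):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    idxs = np.linspace(0, max(total - 1, 0), num_frames).astype(int)
    frames = []
    for i in idxs:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
        ok, frame = cap.read()
        if ok:
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return frames  # HxWx3 RGB arrays, fed to the image-text encoder
```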
We conduct extensive experiments by adapting three\nimage-text foundation models on two refined video-text datasets from different\nlanguages, validating the robustness and reproducibility of M2-RAAP for\nadaptation-based pre-training. Results demonstrate that M2-RAAP yields superior\nperformance with significantly reduced data (-90%) and time consumption (-95%),\nestablishing a new SOTA on four English zero-shot retrieval datasets and two\nChinese ones. We are preparing our refined bilingual data annotations and\ncodebase, which will be available at\nhttps://github.com/alipay/Ant-Multi-Modal-Framework/tree/main/prj/M2_RAAP.\n","authors":["Xingning Dong","Zipeng Feng","Chunluan Zhou","Xuzheng Yu","Ming Yang","Qingpei Guo"],"pdf_url":"https://arxiv.org/pdf/2401.17797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15885v2","updated":"2024-01-31T12:41:05Z","published":"2024-01-29T04:40:33Z","title":"Rectify the Regression Bias in Long-Tailed Object Detection","summary":" Long-tailed object detection faces great challenges because of its extremely\nimbalanced class distribution. Recent methods mainly focus on the\nclassification bias and its loss function design, while ignoring the subtle\ninfluence of the regression branch. This paper shows that the regression bias\nexists and does adversely and seriously impact the detection accuracy. While\nexisting methods fail to handle the regression bias, this paper hypothesizes\nthat the class-specific regression head for rare classes is its main cause. As\na result, three kinds of viable solutions to cater for the rare categories are\nproposed, including adding a class-agnostic branch, clustering heads and\nmerging heads. The proposed methods bring consistent and significant\nimprovements over existing long-tailed detection methods, especially in rare\nand common classes. The proposed method achieves state-of-the-art performance\non the large-vocabulary LVIS dataset with different backbones and\narchitectures. It generalizes well to more difficult evaluation metrics,\nrelatively balanced datasets, and the mask branch. This is the first attempt to\nreveal and explore rectification of the regression bias in long-tailed object\ndetection.\n","authors":["Ke Zhu","Minghao Fu","Jie Shao","Tianyu Liu","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2401.15885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10895v3","updated":"2024-01-31T12:40:51Z","published":"2023-07-20T14:18:44Z","title":"Variational Autoencoding of Dental Point Clouds","summary":" Digital dentistry has made significant advancements, yet numerous challenges\nremain. This paper introduces the FDI 16 dataset, an extensive collection of\ntooth meshes and point clouds. Additionally, we present a novel approach:\nVariational FoldingNet (VF-Net), a fully probabilistic variational autoencoder\ndesigned for point clouds. Notably, prior latent variable models for point\nclouds lack a one-to-one correspondence between input and output points.\nInstead, they rely on optimizing Chamfer distances, a metric that lacks a\nnormalized distributional counterpart, rendering it unsuitable for\nprobabilistic modeling. We replace the explicit minimization of Chamfer\ndistances with a suitable encoder, increasing computational efficiency while\nsimplifying the probabilistic extension. This allows for straightforward\napplication in various tasks, including mesh generation, shape completion, and\nrepresentation learning. 
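For context, the (squared) Chamfer distance that prior point-cloud models optimize, and which the VF-Net discussion above replaces, is simply a sum of mean nearest-neighbor distances in both directions:

```python
# Reference implementation of the squared Chamfer distance between
# two point clouds a: (N, 3) and b: (M, 3).
import torch

def chamfer_distance(a, b):
    d = torch.cdist(a, b) ** 2          # (N, M) pairwise squared distances
    return d.min(dim=1).values.mean() + d.min(dim=0).values.mean()
```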
Empirically, we provide evidence of lower\nreconstruction error in dental reconstruction and interpolation, showcasing\nstate-of-the-art performance in dental sample generation while identifying\nvaluable latent representations.\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00028v3","updated":"2024-01-31T12:34:48Z","published":"2023-12-29T03:08:57Z","title":"An Empirical Study of Scaling Law for OCR","summary":" The laws of model size, data volume, computation and model performance have\nbeen extensively studied in the field of Natural Language Processing (NLP).\nHowever, the scaling laws in Optical Character Recognition (OCR) have not yet\nbeen investigated. To address this, we conducted comprehensive studies that\ninvolved examining the correlation between performance and the scale of models,\ndata volume and computation in the field of text recognition. Conclusively, the\nstudy demonstrates smooth power laws between performance and model size, as\nwell as training data volume, when other influencing factors are held constant.\nAdditionally, we have constructed a large-scale dataset called REBU-Syn, which\ncomprises 6 million real samples and 18 million synthetic samples. Based on our\nscaling law and new dataset, we have successfully trained a scene text\nrecognition model, achieving a new state-of-the-art on 6 common test benchmarks\nwith a top-1 average accuracy of 97.42%. The models and dataset are publicly\navailable at https://github.com/large-ocr-model/large-ocr-model.github.io.\n","authors":["Miao Rang","Zhenni Bi","Chuanjian Liu","Yunhe Wang","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2401.00028v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17790v1","updated":"2024-01-31T12:32:18Z","published":"2024-01-31T12:32:18Z","title":"RADIN: Souping on a Budget","summary":" Model Soups, extending Stochastic Weight Averaging (SWA), combine models\nfine-tuned with different hyperparameters. Yet, their adoption is hindered by\ncomputational challenges due to subset selection issues. In this paper, we\npropose to speed up model soups by approximating soup performance using\naveraged ensemble logits performance. Theoretical insights validate the\ncongruence between ensemble logits and weight averaging soups across any mixing\nratios. Our Resource ADjusted soups craftINg (RADIN) procedure stands out by\nallowing flexible evaluation budgets, enabling users to adjust their\nexploration budget to the available resources while increasing performance at\nlower budgets compared to the previous greedy approach (up to 4% on ImageNet).\n","authors":["Thibaut Menes","Olivier Risser-Maroix"],"pdf_url":"https://arxiv.org/pdf/2401.17790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17789v1","updated":"2024-01-31T12:32:17Z","published":"2024-01-31T12:32:17Z","title":"Robustly overfitting latents for flexible neural image compression","summary":" Neural image compression has made a great deal of progress. State-of-the-art\nmodels are based on variational autoencoders and are outperforming classical\nmodels. Neural compression models learn to encode an image into a quantized\nlatent representation that can be efficiently sent to the decoder, which\ndecodes the quantized latent into a reconstructed image. 
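The approximation at the heart of RADIN, as described above, is to rank candidate soups by the quality of their averaged ensemble logits instead of actually evaluating each weight-averaged model. A simplified sketch, assuming identically structured models:

```python
# Sketch: uniform weight-averaging soup, plus the cheap ensemble-logit
# proxy used to estimate a candidate soup's accuracy before building it.
import copy
import torch

def soup(models):
    """Uniformly average the weights of identically structured models."""
    avg = copy.deepcopy(models[0])
    sds = [m.state_dict() for m in models]
    avg.load_state_dict({k: torch.stack([sd[k].float() for sd in sds]).mean(0)
                         for k in sds[0]})
    return avg

@torch.no_grad()
def ensemble_logit_accuracy(models, loader):
    """Proxy score: accuracy of the averaged logits of candidate members."""
    correct = total = 0
    for x, y in loader:
        logits = torch.stack([m(x) for m in models]).mean(0)
        correct += (logits.argmax(1) == y).sum().item()
        total += y.numel()
    return correct / total
```

The proxy requires only forward passes of already-evaluated members, which is what makes flexible evaluation budgets possible.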
While these models\nhave proven successful in practice, they lead to sub-optimal results due to\nimperfect optimization and limitations in the encoder and decoder capacity.\nRecent work shows how to use stochastic Gumbel annealing (SGA) to refine the\nlatents of pre-trained neural image compression models. We extend this idea by\nintroducing SGA+, which contains three different methods that build upon SGA.\nFurther, we give a detailed analysis of our proposed methods, show how they\nimprove performance, and show that they are less sensitive to hyperparameter\nchoices. In addition, we show how each method can be extended to three- instead\nof two-class rounding. Finally, we show how refinement of the latents with our\nbest-performing method improves the compression performance on the Tecnick\ndataset and how it can be deployed to partly move along the rate-distortion\ncurve.\n","authors":["Yura Perugachi-Diaz","Arwin Gansekoele","Sandjai Bhulai"],"pdf_url":"https://arxiv.org/pdf/2401.17789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08357v2","updated":"2024-01-31T12:18:10Z","published":"2024-01-16T13:35:28Z","title":"SAMF: Small-Area-Aware Multi-focus Image Fusion for Object Detection","summary":" Existing multi-focus image fusion (MFIF) methods often fail to preserve the\nuncertain transition region and detect small focus areas within large defocused\nregions accurately. To address this issue, this study proposes a new\nsmall-area-aware MFIF algorithm for enhancing object detection capability.\nFirst, we enhance the pixel attributes within the small focus and boundary\nregions, which are subsequently combined with visual saliency detection to\nobtain the pre-fusion results used to discriminate the distribution of focused\npixels. To accurately ensure pixel focus, we consider the source image as a\ncombination of focused, defocused, and uncertain regions and propose a\nthree-region segmentation strategy. Finally, we design an effective pixel\nselection rule to generate segmentation decision maps and obtain the final\nfusion results. Experiments demonstrated that the proposed method can\naccurately detect small and smooth focus areas while improving object detection\nperformance, outperforming existing methods in both subjective and objective\nevaluations. The source code is available at https://github.com/ixilai/SAMF.\n","authors":["Xilai Li","Xiaosong Li","Haishu Tan","Jinyang Li"],"pdf_url":"https://arxiv.org/pdf/2401.08357v2.pdf","comment":"Accepted to International Conference on Acoustics, Speech and Signal\n Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.17776v1","updated":"2024-01-31T12:16:39Z","published":"2024-01-31T12:16:39Z","title":"Double InfoGAN for Contrastive Analysis","summary":" Contrastive Analysis (CA) deals with the discovery of what is common and what\nis distinctive of a target domain compared to a background one. This is of\ngreat interest in many applications, such as medical imaging. Current\nstate-of-the-art (SOTA) methods are latent variable models based on VAE\n(CA-VAEs). However, they all either ignore important constraints or they don't\nenforce fundamental assumptions. This may lead to sub-optimal solutions where\ndistinctive factors are mistaken for common ones (or vice versa). Furthermore,\nthe generated images have a rather poor quality, typical of VAEs, decreasing\ntheir interpretability and usefulness. 
Here, we propose Double InfoGAN, the\nfirst GAN-based method for CA that leverages the high-quality synthesis of GAN\nand the separation power of InfoGAN. Experimental results on four visual\ndatasets, from simple synthetic examples to complex medical images, show that\nthe proposed method outperforms SOTA CA-VAEs in terms of latent separation and\nimage quality. Datasets and code are available online.\n","authors":["Florence Carton","Robin Louiset","Pietro Gori"],"pdf_url":"https://arxiv.org/pdf/2401.17776v1.pdf","comment":"Accepted at AISTATS 2024"},{"id":"http://arxiv.org/abs/2311.01886v2","updated":"2024-01-31T12:13:49Z","published":"2023-11-03T12:58:39Z","title":"Bridging the Gap between Multi-focus and Multi-modal: A Focused\n Integration Framework for Multi-modal Image Fusion","summary":" Multi-modal image fusion (MMIF) integrates valuable information from\ndifferent modality images into a fused one. However, the fusion of multiple\nvisible images with different focal regions and infrared images is an\nunprecedented challenge in real MMIF applications. This is because of the\nlimited depth of focus of visible optical lenses, which impedes the\nsimultaneous capture of the focal information within the same scene. To address\nthis issue, in this paper, we propose an MMIF framework for joint focused\nintegration and modality information extraction. Specifically, a\nsemi-sparsity-based smoothing filter is introduced to decompose the images into\nstructure and texture components. Subsequently, a novel multi-scale operator is\nproposed to fuse the texture components, capable of detecting significant\ninformation by considering the pixel focus attributes and relevant data from\nvarious modal images. Additionally, to achieve an effective capture of scene\nluminance and reasonable contrast maintenance, we consider the distribution of\nenergy information in the structural components in terms of multi-directional\nfrequency variance and information entropy. Extensive experiments on existing\nMMIF datasets, as well as the object detection and depth estimation tasks,\nconsistently demonstrate that the proposed algorithm can surpass the\nstate-of-the-art methods in visual perception and quantitative evaluation. The\ncode is available at https://github.com/ixilai/MFIF-MMIF.\n","authors":["Xilai Li","Xiaosong Li","Tao Ye","Xiaoqi Cheng","Wuyang Liu","Haishu Tan"],"pdf_url":"https://arxiv.org/pdf/2311.01886v2.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2024"},{"id":"http://arxiv.org/abs/2401.17773v1","updated":"2024-01-31T12:12:56Z","published":"2024-01-31T12:12:56Z","title":"SNP-S3: Shared Network Pre-training and Significant Semantic\n Strengthening for Various Video-Text Tasks","summary":" We present a framework for learning cross-modal video representations by\ndirectly pre-training on raw data to facilitate various downstream video-text\ntasks. Our main contributions lie in the pre-training framework and proxy\ntasks. First, based on the shortcomings of two mainstream pixel-level\npre-training architectures (limited applications or lower efficiency), we\npropose Shared Network Pre-training (SNP). By employing one shared BERT-type\nnetwork to refine textual and cross-modal features simultaneously, SNP is\nlightweight and could support various downstream applications. 
Second, based on the intuition\nthat people always pay attention to several \"significant words\" when\nunderstanding a sentence, we propose the Significant Semantic Strengthening\n(S3) strategy, which includes a novel masking and matching proxy task to\npromote the pre-training performance. Experiments conducted on three downstream\nvideo-text tasks and six datasets demonstrate that we establish a new\nstate-of-the-art in pixel-level video-text pre-training; we also achieve a\nsatisfactory balance between the pre-training efficiency and the fine-tuning\nperformance. The codebase is available at\nhttps://github.com/alipay/Ant-Multi-Modal-Framework/tree/main/prj/snps3_vtp.\n","authors":["Xingning Dong","Qingpei Guo","Tian Gan","Qing Wang","Jianlong Wu","Xiangyuan Ren","Yuan Cheng","Wei Chu"],"pdf_url":"https://arxiv.org/pdf/2401.17773v1.pdf","comment":"Accepted by TCSVT (IEEE Transactions on Circuits and Systems for\n Video Technology)"},{"id":"http://arxiv.org/abs/2401.17766v1","updated":"2024-01-31T11:51:24Z","published":"2024-01-31T11:51:24Z","title":"Fine-Grained Zero-Shot Learning: Advances, Challenges, and Prospects","summary":" Recent zero-shot learning (ZSL) approaches have integrated fine-grained\nanalysis, i.e., fine-grained ZSL, to mitigate the commonly known seen/unseen\ndomain bias and misaligned visual-semantics mapping problems, and have made\nprofound progress. Notably, this paradigm differs from existing close-set\nfine-grained methods and, therefore, can pose unique and nontrivial challenges.\nHowever, to the best of our knowledge, there remains a lack of systematic\nsummaries of this topic. To enrich the literature of this domain and provide a\nsound basis for its future development, in this paper, we present a broad\nreview of recent advances for fine-grained analysis in ZSL. Concretely, we\nfirst provide a taxonomy of existing methods and techniques with a thorough\nanalysis of each category. Then, we summarize the benchmark, covering publicly\navailable datasets, models, implementations, and some more details as a\nlibrary. Last, we sketch out some related applications. In addition, we discuss\nvital challenges and suggest potential future directions.\n","authors":["Jingcai Guo","Zhijie Rao","Song Guo","Jingren Zhou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2401.17766v1.pdf","comment":"11 pages, 1 figure, 4 tables"},{"id":"http://arxiv.org/abs/2309.10896v2","updated":"2024-01-31T11:47:24Z","published":"2023-09-19T19:42:26Z","title":"PLVS: A SLAM System with Points, Lines, Volumetric Mapping, and 3D\n Incremental Segmentation","summary":" This document presents PLVS: a real-time system that leverages sparse SLAM,\nvolumetric mapping, and 3D unsupervised incremental segmentation. PLVS stands\nfor Points, Lines, Volumetric mapping, and Segmentation. It supports RGB-D and\nstereo cameras, which may be optionally equipped with IMUs. The SLAM module is\nkeyframe-based, and extracts and tracks sparse points and line segments as\nfeatures. Volumetric mapping runs in parallel with the SLAM\nfront-end and generates a 3D reconstruction of the explored environment by\nfusing point clouds backprojected from keyframes. Different volumetric mapping\nmethods are supported and integrated in PLVS. We use a novel reprojection error\nto bundle-adjust line segments. This error exploits available depth information\nto stabilize the position estimates of line segment endpoints. 
An incremental\nand geometric-based segmentation method is implemented and integrated for RGB-D\ncameras in the PLVS framework. We present qualitative and quantitative\nevaluations of the PLVS framework on some publicly available datasets. The\nappendix details the adopted stereo line triangulation method and provides a\nderivation of the Jacobians we used for line error terms. The software is\navailable as open-source.\n","authors":["Luigi Freda"],"pdf_url":"https://arxiv.org/pdf/2309.10896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17759v1","updated":"2024-01-31T11:36:12Z","published":"2024-01-31T11:36:12Z","title":"Tiered approach for rapid damage characterisation of infrastructure\n enabled by remote sensing and deep learning technologies","summary":" Critical infrastructure such as bridges are systematically targeted during\nwars and conflicts. This is because critical infrastructure is vital for\nenabling connectivity and transportation of people and goods, and hence,\nunderpinning the national and international defence planning and economic\ngrowth. Mass destruction of bridges, along with minimal or no accessibility to\nthese assets during natural and anthropogenic disasters, prevents us from\ndelivering rapid recovery. As a result, systemic resilience is drastically\nreduced. A solution to this challenge is to use technology for stand-off\nobservations. Yet, no method exists to characterise damage at different scales,\ni.e. regional, asset, and structural (component), and more so there is little\nor no systematic correlation between assessments at scale. We propose an\nintegrated three-level tiered approach to fill this capability gap, and we\ndemonstrate the methods for damage characterisation enabled by fit-for-purpose\ndigital technologies. Next, this method is applied and validated to a case\nstudy in Ukraine that includes 17 bridges. From macro to micro, we deploy\ntechnology at scale, from Sentinel-1 SAR images, crowdsourced information, and\nhigh-resolution images to deep learning for damaged infrastructure. For the\nfirst time, the interferometric coherence difference and semantic segmentation\nof images were deployed to improve the reliability of damage characterisations\nfrom regional to infrastructure component level, when enhanced assessment\naccuracy is required. This integrated method improves the speed of\ndecision-making, and thus, enhances resilience. Keywords: critical\ninfrastructure, damage characterisation, targeted attacks, restoration\n","authors":["Nadiia Kopiika","Andreas Karavias","Pavlos Krassakis","Zehao Ye","Jelena Ninic","Nataliya Shakhovska","Nikolaos Koukouzas","Sotirios Argyroudis","Stergios-Aristoteles Mitoulis"],"pdf_url":"https://arxiv.org/pdf/2401.17759v1.pdf","comment":"Main text (34 pages,18 figures); Supplementary materials (13 pages)"},{"id":"http://arxiv.org/abs/2310.19620v2","updated":"2024-01-31T11:22:46Z","published":"2023-10-30T15:12:41Z","title":"Large Trajectory Models are Scalable Motion Predictors and Planners","summary":" Motion prediction and planning are vital tasks in autonomous driving, and\nrecent efforts have shifted to machine learning-based approaches. The\nchallenges include understanding diverse road topologies, reasoning traffic\ndynamics over a long time horizon, interpreting heterogeneous behaviors, and\ngenerating policies in a large continuous state space. 
Inspired by the success\nof large language models in addressing similar complexities through model\nscaling, we introduce a scalable trajectory model called State Transformer\n(STR). STR reformulates the motion prediction and motion planning problems by\narranging observations, states, and actions into one unified sequence modeling\ntask. Our approach unites trajectory generation problems with other sequence\nmodeling problems, powering rapid iterations with breakthroughs in neighboring\ndomains such as language modeling. Remarkably, experimental results reveal that\nlarge trajectory models (LTMs), such as STR, adhere to the scaling laws by\npresenting outstanding adaptability and learning efficiency. Qualitative\nresults further demonstrate that LTMs are capable of making plausible\npredictions in scenarios that diverge significantly from the training data\ndistribution. LTMs also learn to make complex reasoning for long-term\nplanning, without explicit loss designs or costly high-level annotations.\n","authors":["Qiao Sun","Shiduo Zhang","Danjiao Ma","Jingzhe Shi","Derun Li","Simian Luo","Yu Wang","Ningyi Xu","Guangzhi Cao","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.19620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12431v2","updated":"2024-01-31T10:57:40Z","published":"2023-12-19T18:57:34Z","title":"On Inference Stability for Diffusion Models","summary":" Denoising Probabilistic Models (DPMs) represent an emerging domain of\ngenerative models that excel in generating diverse and high-quality images.\nHowever, most current training methods for DPMs often neglect the correlation\nbetween timesteps, limiting the model's performance in generating images\neffectively. Notably, we theoretically point out that this issue can be caused\nby the cumulative estimation gap between the predicted and the actual\ntrajectory. To minimize that gap, we propose a novel \\textit{sequence-aware}\nloss that aims to reduce the estimation gap to enhance the sampling quality.\nFurthermore, we theoretically show that our proposed loss function is a tighter\nupper bound of the estimation loss in comparison with the conventional loss in\nDPMs. Experimental results on several benchmark datasets including CIFAR10,\nCelebA, and CelebA-HQ consistently show a remarkable improvement of our\nproposed method regarding the image generation quality measured by FID and\nInception Score compared to several DPM baselines. Our code and pre-trained\ncheckpoints are available at \\url{https://github.com/VinAIResearch/SA-DPM}.\n","authors":["Viet Nguyen","Giang Vu","Tung Nguyen Thanh","Khoat Than","Toan Tran"],"pdf_url":"https://arxiv.org/pdf/2312.12431v2.pdf","comment":"Oral presentation at AAAI 2024"},{"id":"http://arxiv.org/abs/2401.17736v1","updated":"2024-01-31T10:57:07Z","published":"2024-01-31T10:57:07Z","title":"Leveraging Human-Machine Interactions for Computer Vision Dataset\n Quality Enhancement","summary":" Large-scale datasets for single-label multi-class classification, such as\n\\emph{ImageNet-1k}, have been instrumental in advancing deep learning and\ncomputer vision. However, a critical and often understudied aspect is the\ncomprehensive quality assessment of these datasets, especially regarding\npotential multi-label annotation errors. In this paper, we introduce a\nlightweight, user-friendly, and scalable framework that synergizes human and\nmachine intelligence for efficient dataset validation and quality enhancement.\nWe term this novel framework \\emph{Multilabelfy}. 
Central to Multilabelfy is an\nadaptable web-based platform that systematically guides annotators through the\nre-evaluation process, effectively leveraging human-machine interactions to\nenhance dataset quality. By using Multilabelfy on the ImageNetV2 dataset, we\nfound that approximately $47.88\\%$ of the images contained at least two labels,\nunderscoring the need for more rigorous assessments of such influential\ndatasets. Furthermore, our analysis showed a negative correlation between the\nnumber of potential labels per image and model top-1 accuracy, illuminating a\ncrucial factor in model evaluation and selection. Our open-source framework,\nMultilabelfy, offers a convenient, lightweight solution for dataset\nenhancement, emphasizing multi-label proportions. This study tackles major\nchallenges in dataset integrity and provides key insights into model\nperformance evaluation. Moreover, it underscores the advantages of integrating\nhuman expertise with machine capabilities to produce more robust models and\ntrustworthy data development. The source code for Multilabelfy will be\navailable at https://github.com/esla/Multilabelfy.\n \\keywords{Computer Vision \\and Dataset Quality Enhancement \\and Dataset\nValidation \\and Human-Computer Interaction \\and Multi-label Annotation.}\n","authors":["Esla Timothy Anzaku","Hyesoo Hong","Jin-Woo Park","Wonjun Yang","Kangmin Kim","JongBum Won","Deshika Vinoshani Kumari Herath","Arnout Van Messem","Wesley De Neve"],"pdf_url":"https://arxiv.org/pdf/2401.17736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17728v1","updated":"2024-01-31T10:47:25Z","published":"2024-01-31T10:47:25Z","title":"COMET: Contrastive Mean Teacher for Online Source-Free Universal Domain\n Adaptation","summary":" In real-world applications, there is often a domain shift from training to\ntest data. This observation resulted in the development of test-time adaptation\n(TTA). It aims to adapt a pre-trained source model to the test data without\nrequiring access to the source data. Thereby, most existing works are limited\nto the closed-set assumption, i.e. there is no category shift between source\nand target domain. We argue that in a realistic open-world setting a category\nshift can appear in addition to a domain shift. This means, individual source\nclasses may not appear in the target domain anymore, samples of new classes may\nbe part of the target domain or even both at the same time. Moreover, in many\nreal-world scenarios the test data is not accessible all at once but arrives\nsequentially as a stream of batches demanding an immediate prediction. Hence,\nTTA must be applied in an online manner. To the best of our knowledge, the\ncombination of these aspects, i.e. online source-free universal domain\nadaptation (online SF-UniDA), has not been studied yet. In this paper, we\nintroduce a Contrastive Mean Teacher (COMET) tailored to this novel scenario.\nIt applies a contrastive loss to rebuild a feature space where the samples of\nknown classes build distinct clusters and the samples of new classes separate\nwell from them. It is complemented by an entropy loss which ensures that the\nclassifier output has a small entropy for samples of known classes and a large\nentropy for samples of new classes to be easily detected and rejected as\nunknown. To provide the losses with reliable pseudo labels, they are embedded\ninto a mean teacher (MT) framework. 
We evaluate our method across two datasets\nand all category shifts to set an initial benchmark for online SF-UniDA.\nThereby, COMET yields state-of-the-art performance and proves to be consistent\nand robust across a variety of different scenarios.\n","authors":["Pascal Schlachter","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2401.17728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11588v2","updated":"2024-01-31T10:15:08Z","published":"2023-05-19T10:58:04Z","title":"Text2NeRF: Text-Driven 3D Scene Generation with Neural Radiance Fields","summary":" Text-driven 3D scene generation is widely applicable to video gaming, film\nindustry, and metaverse applications that have a large demand for 3D scenes.\nHowever, existing text-to-3D generation methods are limited to producing 3D\nobjects with simple geometries and dreamlike styles that lack realism. In this\nwork, we present Text2NeRF, which is able to generate a wide range of 3D scenes\nwith complicated geometric structures and high-fidelity textures purely from a\ntext prompt. To this end, we adopt NeRF as the 3D representation and leverage a\npre-trained text-to-image diffusion model to constrain the 3D reconstruction of\nthe NeRF to reflect the scene description. Specifically, we employ the\ndiffusion model to infer the text-related image as the content prior and use a\nmonocular depth estimation method to offer the geometric prior. Both content\nand geometric priors are utilized to update the NeRF model. To guarantee\ntextured and geometric consistency between different views, we introduce a\nprogressive scene inpainting and updating strategy for novel view synthesis of\nthe scene. Our method requires no additional training data but only a natural\nlanguage description of the scene as the input. Extensive experiments\ndemonstrate that our Text2NeRF outperforms existing methods in producing\nphoto-realistic, multi-view consistent, and diverse 3D scenes from a variety of\nnatural language prompts. Our code is available at\nhttps://github.com/eckertzhang/Text2NeRF.\n","authors":["Jingbo Zhang","Xiaoyu Li","Ziyu Wan","Can Wang","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2305.11588v2.pdf","comment":"Accepted by TVCG; Homepage:\n https://eckertzhang.github.io/Text2NeRF.github.io/\n Code:https://github.com/eckertzhang/Text2NeRF"},{"id":"http://arxiv.org/abs/2401.17714v1","updated":"2024-01-31T10:09:26Z","published":"2024-01-31T10:09:26Z","title":"3D-Plotting Algorithm for Insects using YOLOv5","summary":" In ecological research, accurately collecting spatiotemporal position data is\na fundamental task for understanding the behavior and ecology of insects and\nother organisms. In recent years, advancements in computer vision techniques\nhave reached a stage of maturity where they can support, and in some cases,\nreplace manual observation. In this study, a simple and inexpensive method for\nmonitoring insects in three dimensions (3D) was developed so that their\nbehavior could be observed automatically in experimental environments. The main\nachievements of this study have been to create a 3D monitoring algorithm using\ninexpensive cameras and other equipment to design an adjusting algorithm for\ndepth error, and to validate how our plotting algorithm is quantitatively\nprecise, all of which had not been realized in conventional studies. 
By\noffering detailed 3D visualizations of insects, the plotting algorithm aids\nresearchers in more effectively comprehending how insects interact within their\nenvironments.\n","authors":["Daisuke Mori","Hiroki Hayami","Yasufumi Fujimoto","Isao Goto"],"pdf_url":"https://arxiv.org/pdf/2401.17714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03160v4","updated":"2024-01-31T10:02:49Z","published":"2023-09-06T16:59:36Z","title":"ResFields: Residual Neural Fields for Spatiotemporal Signals","summary":" Neural fields, a category of neural networks trained to represent\nhigh-frequency signals, have gained significant attention in recent years due\nto their impressive performance in modeling complex 3D data, such as signed\ndistance (SDFs) or radiance fields (NeRFs), via a single multi-layer perceptron\n(MLP). However, despite the power and simplicity of representing signals with\nan MLP, these methods still face challenges when modeling large and complex\ntemporal signals due to the limited capacity of MLPs. In this paper, we propose\nan effective approach to address this limitation by incorporating temporal\nresidual layers into neural fields, dubbed ResFields. It is a novel class of\nnetworks specifically designed to effectively represent complex temporal\nsignals. We conduct a comprehensive analysis of the properties of ResFields and\npropose a matrix factorization technique to reduce the number of trainable\nparameters and enhance generalization capabilities. Importantly, our\nformulation seamlessly integrates with existing MLP-based neural fields and\nconsistently improves results across various challenging tasks: 2D video\napproximation, dynamic shape modeling via temporal SDFs, and dynamic NeRF\nreconstruction. Lastly, we demonstrate the practical utility of ResFields by\nshowcasing its effectiveness in capturing dynamic 3D scenes from sparse RGBD\ncameras of a lightweight capture system.\n","authors":["Marko Mihajlovic","Sergey Prokudin","Marc Pollefeys","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2309.03160v4.pdf","comment":"[ICLR 2024 Spotlight] Project and code at:\n https://markomih.github.io/ResFields/"},{"id":"http://arxiv.org/abs/2401.17699v1","updated":"2024-01-31T09:38:44Z","published":"2024-01-31T09:38:44Z","title":"Unified Physical-Digital Face Attack Detection","summary":" Face Recognition (FR) systems can suffer from physical (i.e., print photo)\nand digital (i.e., DeepFake) attacks. However, previous related work rarely\nconsiders both situations at the same time. This implies the deployment of\nmultiple models and thus more computational burden. The main reasons for this\nlack of an integrated model are caused by two factors: (1) The lack of a\ndataset including both physical and digital attacks with ID consistency which\nmeans the same ID covers the real face and all attack types; (2) Given the\nlarge intra-class variance between these two attacks, it is difficult to learn\na compact feature space to detect both attacks simultaneously. To address these\nissues, we collect a Unified physical-digital Attack dataset, called\nUniAttackData. The dataset consists of $1,800$ participations of 2 and 12\nphysical and digital attacks, respectively, resulting in a total of 29,706\nvideos. 
Then, we propose a Unified Attack Detection framework based on\nVision-Language Models (VLMs), namely UniAttackDetection, which includes three\nmain modules: the Teacher-Student Prompts (TSP) module, focused on acquiring\nunified and specific knowledge respectively; the Unified Knowledge Mining (UKM)\nmodule, designed to capture a comprehensive feature space; and the Sample-Level\nPrompt Interaction (SLPI) module, aimed at grasping sample-level semantics.\nThese three modules seamlessly form a robust unified attack detection\nframework. Extensive experiments on UniAttackData and three other datasets\ndemonstrate the superiority of our approach for unified face attack detection.\n","authors":["Hao Fang","Ajian Liu","Haocheng Yuan","Junze Zheng","Dingheng Zeng","Yanhong Liu","Jiankang Deng","Sergio Escalera","Xiaoming Liu","Jun Wan","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2401.17699v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.17695v1","updated":"2024-01-31T09:31:28Z","published":"2024-01-31T09:31:28Z","title":"Datacube segmentation via Deep Spectral Clustering","summary":" Extended Vision techniques are ubiquitous in physics. However, the data cubes\nstemming from such analysis often pose a challenge in their interpretation, due\nto the intrinsic difficulty in discerning the relevant information from the\nspectra composing the data cube.\n Furthermore, the huge dimensionality of data cube spectra poses a complex\ntask in its statistical interpretation; nevertheless, this complexity contains\na massive amount of statistical information that can be exploited in an\nunsupervised manner to outline some essential properties of the case study at\nhand, e.g.~it is possible to obtain an image segmentation via (deep) clustering\nof data-cube's spectra, performed in a suitably defined low-dimensional\nembedding space.\n To tackle this topic, we explore the possibility of applying unsupervised\nclustering methods in encoded space, i.e. perform deep clustering on the\nspectral properties of datacube pixels. A statistical dimensional reduction is\nperformed by an ad hoc trained (Variational) AutoEncoder, in charge of mapping\nspectra into lower dimensional metric spaces, while the clustering process is\nperformed by a (learnable) iterative K-Means clustering algorithm.\n We apply this technique to two different use cases, of different physical\norigins: a set of Macro mapping X-Ray Fluorescence (MA-XRF) synthetic data on\npictorial artworks, and a dataset of simulated astrophysical observations.\n","authors":["Alessandro Bombini","Fernando García-Avello Bofías","Caterina Bracci","Michele Ginolfi","Chiara Ruberto"],"pdf_url":"https://arxiv.org/pdf/2401.17695v1.pdf","comment":"20 pages, 10 figures, doi for code repository, dataset and trained\n model available and reported in the paper"},{"id":"http://arxiv.org/abs/2306.05658v2","updated":"2024-01-31T09:30:09Z","published":"2023-06-09T03:53:12Z","title":"GMS-3DQA: Projection-based Grid Mini-patch Sampling for 3D Model Quality\n Assessment","summary":" Nowadays, most 3D model quality assessment (3DQA) methods have been aimed at\nimproving performance. However, little attention has been paid to the\ncomputational cost and inference time required for practical applications.\nModel-based 3DQA methods extract features directly from the 3D models, which\nare characterized by their high degree of complexity. 
As a result, many\nresearchers are inclined towards utilizing projection-based 3DQA methods.\nNevertheless, previous projection-based 3DQA methods directly extract features\nfrom multi-projections to ensure quality prediction accuracy, which calls for\nmore resource consumption and inevitably leads to inefficiency. Thus in this\npaper, we address this challenge by proposing a no-reference (NR)\nprojection-based \\textit{\\underline{G}rid \\underline{M}ini-patch\n\\underline{S}ampling \\underline{3D} Model \\underline{Q}uality\n\\underline{A}ssessment (GMS-3DQA)} method. The projection images are rendered\nfrom six perpendicular viewpoints of the 3D model to cover sufficient quality\ninformation. To reduce redundancy and inference resources, we propose a\nmulti-projection grid mini-patch sampling strategy (MP-GMS), which samples grid\nmini-patches from the multi-projections and forms the sampled grid mini-patches\ninto one quality mini-patch map (QMM). The Swin-Transformer tiny backbone is\nthen used to extract quality-aware features from the QMMs. The experimental\nresults show that the proposed GMS-3DQA outperforms existing state-of-the-art\nNR-3DQA methods on the point cloud quality assessment databases. The efficiency\nanalysis reveals that the proposed GMS-3DQA requires far less computational\nresources and inference time than other 3DQA competitors. The code will be\navailable at https://github.com/zzc-1998/GMS-3DQA.\n","authors":["Zicheng Zhang","Wei Sun","Houning Wu","Yingjie Zhou","Chunyi Li","Xiongkuo Min","Guangtao Zhai","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2306.05658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17664v1","updated":"2024-01-31T08:35:40Z","published":"2024-01-31T08:35:40Z","title":"Image Anything: Towards Reasoning-coherent and Training-free Multi-modal\n Image Generation","summary":" The multifaceted nature of human perception and comprehension indicates that,\nwhen we think, our body can naturally take any combination of senses, a.k.a.,\nmodalities and form a beautiful picture in our brain. For example, when we see\na cattery and simultaneously perceive the cat's purring sound, our brain can\nconstruct a picture of a cat in the cattery. Intuitively, generative AI models\nshould hold the versatility of humans and be capable of generating images from\nany combination of modalities efficiently and collaboratively. This paper\npresents ImgAny, a novel end-to-end multi-modal generative model that can mimic\nhuman reasoning and generate high-quality images. Our method serves as the\nfirst attempt in its capacity of efficiently and flexibly taking any\ncombination of seven modalities, ranging from language, audio to vision\nmodalities, including image, point cloud, thermal, depth, and event data. Our\nkey idea is inspired by human-level cognitive processes and involves the\nintegration and harmonization of multiple input modalities at both the entity\nand attribute levels without specific tuning across modalities. Accordingly,\nour method brings two novel training-free technical branches: 1) Entity Fusion\nBranch ensures the coherence between inputs and outputs. It extracts entity\nfeatures from the multi-modal representations powered by our specially\nconstructed entity knowledge graph; 2) Attribute Fusion Branch adeptly\npreserves and processes the attributes. It efficiently amalgamates distinct\nattributes from diverse input modalities via our proposed attribute knowledge\ngraph. 
Lastly, the entity and attribute features are adaptively fused as the\nconditional inputs to the pre-trained Stable Diffusion model for image\ngeneration. Extensive experiments under diverse modality combinations\ndemonstrate its exceptional capability for visual content creation.\n","authors":["Yuanhuiyi Lyu","Xu Zheng","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17654v1","updated":"2024-01-31T08:16:32Z","published":"2024-01-31T08:16:32Z","title":"All Beings Are Equal in Open Set Recognition","summary":" In open-set recognition (OSR), a promising strategy is exploiting\npseudo-unknown data outside given $K$ known classes as an additional $K$+$1$-th\nclass to explicitly model potential open space. However, treating unknown\nclasses without distinction is unequal for them relative to known classes due\nto the category-agnostic and scale-agnostic nature of the unknowns. This inevitably\nnot only disrupts the inherent distributions of unknown classes but also incurs\nboth class-wise and instance-wise imbalances between known and unknown classes.\nIdeally, the OSR problem should model the whole class space as $K$+$\\infty$,\nbut enumerating all unknowns is impractical. Since the core of OSR is to\neffectively model the boundaries of known classes, this means that just focusing on\nthe unknowns nearing the boundaries of targeted known classes seems sufficient.\nThus, as a compromise, we convert the open classes from infinite to $K$, with a\nnovel concept, Target-Aware Universum (TAU), and propose a simple yet effective\nframework Dual Contrastive Learning with Target-Aware Universum (DCTAU). In\ndetail, guided by the targeted known classes, TAU automatically expands the\nunknown classes from the previous $1$ to $K$, effectively alleviating the\ndistribution disruption and the imbalance issues mentioned above. Then, a novel\nDual Contrastive (DC) loss is designed, where all instances irrespective of\nknown or TAU are considered as positives to contrast with their respective\nnegatives. Experimental results indicate DCTAU sets a new state-of-the-art.\n","authors":["Chaohua Li","Enhao Zhang","Chuanxing Geng","SongCan Chen"],"pdf_url":"https://arxiv.org/pdf/2401.17654v1.pdf","comment":"Accepted by the main track of the 38th Annual AAAI Conference on\n Artificial Intelligence (AAAI 2024)"},{"id":"http://arxiv.org/abs/2401.17642v1","updated":"2024-01-31T07:51:52Z","published":"2024-01-31T07:51:52Z","title":"Exploring the Common Appearance-Boundary Adaptation for Nighttime\n Optical Flow","summary":" We investigate a challenging task of nighttime optical flow, which suffers\nfrom weakened texture and amplified noise. These degradations weaken\ndiscriminative visual features, thus causing invalid motion feature matching.\nTypically, existing methods employ domain adaptation to transfer knowledge from\nauxiliary domain to nighttime domain in either input visual space or output\nmotion space. However, this direct adaptation is ineffective, since there\nexists a large domain gap due to the intrinsic heterogeneous nature of the\nfeature representations between auxiliary and nighttime domains. To overcome\nthis issue, we explore a common-latent space as the intermediate bridge to\nreinforce the feature alignment between auxiliary and nighttime domains. In\nthis work, we exploit two auxiliary daytime and event domains, and propose a\nnovel common appearance-boundary adaptation framework for nighttime optical\nflow. 
In appearance adaptation, we employ the intrinsic image decomposition to\nembed the auxiliary daytime image and the nighttime image into a\nreflectance-aligned common space. We discover that motion distributions of the\ntwo reflectance maps are very similar, benefiting us to consistently transfer\nmotion appearance knowledge from daytime to nighttime domain. In boundary\nadaptation, we theoretically derive the motion correlation formula between\nnighttime image and accumulated events within a spatiotemporal gradient-aligned\ncommon space. We figure out that the correlation of the two spatiotemporal\ngradient maps shares significant discrepancy, benefitting us to contrastively\ntransfer boundary knowledge from event to nighttime domain. Moreover,\nappearance adaptation and boundary adaptation are complementary to each other,\nsince they could jointly transfer global motion and local boundary knowledge to\nthe nighttime domain.\n","authors":["Hanyu Zhou","Yi Chang","Haoyue Liu","Wending Yan","Yuxing Duan","Zhiwei Shi","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15977v2","updated":"2024-01-31T07:41:04Z","published":"2024-01-29T09:06:43Z","title":"Motion-I2V: Consistent and Controllable Image-to-Video Generation with\n Explicit Motion Modeling","summary":" We introduce Motion-I2V, a novel framework for consistent and controllable\nimage-to-video generation (I2V). In contrast to previous methods that directly\nlearn the complicated image-to-video mapping, Motion-I2V factorizes I2V into\ntwo stages with explicit motion modeling. For the first stage, we propose a\ndiffusion-based motion field predictor, which focuses on deducing the\ntrajectories of the reference image's pixels. For the second stage, we propose\nmotion-augmented temporal attention to enhance the limited 1-D temporal\nattention in video latent diffusion models. This module can effectively\npropagate reference image's feature to synthesized frames with the guidance of\npredicted trajectories from the first stage. Compared with existing methods,\nMotion-I2V can generate more consistent videos even at the presence of large\nmotion and viewpoint variation. By training a sparse trajectory ControlNet for\nthe first stage, Motion-I2V can support users to precisely control motion\ntrajectories and motion regions with sparse trajectory and region annotations.\nThis offers more controllability of the I2V process than solely relying on\ntextual instructions. Additionally, Motion-I2V's second stage naturally\nsupports zero-shot video-to-video translation. Both qualitative and\nquantitative comparisons demonstrate the advantages of Motion-I2V over prior\napproaches in consistent and controllable image-to-video generation. 
Please see\nour project page at https://xiaoyushi97.github.io/Motion-I2V/.\n","authors":["Xiaoyu Shi","Zhaoyang Huang","Fu-Yun Wang","Weikang Bian","Dasong Li","Yi Zhang","Manyuan Zhang","Ka Chun Cheung","Simon See","Hongwei Qin","Jifeng Dai","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2401.15977v2.pdf","comment":"Project page: https://xiaoyushi97.github.io/Motion-I2V/"},{"id":"http://arxiv.org/abs/2401.17629v1","updated":"2024-01-31T07:11:01Z","published":"2024-01-31T07:11:01Z","title":"Spatial-and-Frequency-aware Restoration method for Images based on\n Diffusion Models","summary":" Diffusion models have recently emerged as a promising framework for Image\nRestoration (IR), owing to their ability to produce high-quality\nreconstructions and their compatibility with established methods. Existing\nmethods for solving noisy inverse problems in IR consider only the pixel-wise\ndata-fidelity. In this paper, we propose SaFaRI, a spatial-and-frequency-aware\ndiffusion model for IR with Gaussian noise. Our model encourages images to\npreserve data-fidelity in both the spatial and frequency domains, resulting in\nenhanced reconstruction quality. We comprehensively evaluate the performance of\nour model on a variety of noisy inverse problems, including inpainting,\ndenoising, and super-resolution. Our thorough evaluation demonstrates that\nSaFaRI achieves state-of-the-art performance on both the ImageNet and\nFFHQ datasets, outperforming existing zero-shot IR methods in terms of LPIPS\nand FID metrics.\n","authors":["Kyungsung Lee","Donggyu Lee","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2401.17629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11395v2","updated":"2024-01-31T06:31:59Z","published":"2024-01-21T04:13:58Z","title":"UniM-OV3D: Uni-Modality Open-Vocabulary 3D Scene Understanding with\n Fine-Grained Feature Representation","summary":" 3D open-vocabulary scene understanding aims to recognize arbitrary novel\ncategories beyond the base label space. However, existing works not only fail\nto fully utilize all the available modal information in the 3D domain but also\nlack sufficient granularity in representing the features of each modality. In\nthis paper, we propose a unified multimodal 3D open-vocabulary scene\nunderstanding network, namely UniM-OV3D, which aligns point clouds with image,\nlanguage and depth. To better integrate global and local features of the point\nclouds, we design a hierarchical point cloud feature extraction module that\nlearns comprehensive fine-grained feature representations. Further, to\nfacilitate the learning of coarse-to-fine point-semantic representations from\ncaptions, we propose the utilization of hierarchical 3D caption pairs,\ncapitalizing on geometric constraints across various viewpoints of 3D scenes.\nExtensive experimental results demonstrate the effectiveness and superiority of\nour method in open-vocabulary semantic and instance segmentation, which\nachieves state-of-the-art performance on both indoor and outdoor benchmarks\nsuch as ScanNet, ScanNet200, S3DIS and nuScenes. 
Code is available at\nhttps://github.com/hithqd/UniM-OV3D.\n","authors":["Qingdong He","Jinlong Peng","Zhengkai Jiang","Kai Wu","Xiaozhong Ji","Jiangning Zhang","Yabiao Wang","Chengjie Wang","Mingang Chen","Yunsheng Wu"],"pdf_url":"https://arxiv.org/pdf/2401.11395v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09171v3","updated":"2024-01-31T06:27:21Z","published":"2023-03-16T09:29:05Z","title":"Empowering CAM-Based Methods with Capability to Generate Fine-Grained\n and High-Faithfulness Explanations","summary":" Recently, the explanation of neural network models has garnered considerable\nresearch attention. In computer vision, CAM (Class Activation Map)-based\nmethods and LRP (Layer-wise Relevance Propagation) method are two common\nexplanation methods. However, since most CAM-based methods can only generate\nglobal weights, they can only generate coarse-grained explanations at a deep\nlayer. LRP and its variants, on the other hand, can generate fine-grained\nexplanations. But the faithfulness of the explanations is too low. To address\nthese challenges, in this paper, we propose FG-CAM (Fine-Grained CAM), which\nextends CAM-based methods to enable generating fine-grained and\nhigh-faithfulness explanations. FG-CAM uses the relationship between two\nadjacent layers of feature maps with resolution differences to gradually\nincrease the explanation resolution, while finding the contributing pixels and\nfiltering out the pixels that do not contribute. Our method not only solves the\nshortcoming of CAM-based methods without changing their characteristics, but\nalso generates fine-grained explanations that have higher faithfulness than LRP\nand its variants. We also present FG-CAM with denoising, which is a variant of\nFG-CAM and is able to generate less noisy explanations with almost no change in\nexplanation faithfulness. Experimental results show that the performance of\nFG-CAM is almost unaffected by the explanation resolution. FG-CAM outperforms\nexisting CAM-based methods significantly in both shallow and intermediate\nlayers, and outperforms LRP and its variants significantly in the input layer.\nOur code is available at https://github.com/dongmo-qcq/FG-CAM.\n","authors":["Changqing Qiu","Fusheng Jin","Yining Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.09171v3.pdf","comment":"This paper has been accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2309.02169v3","updated":"2024-01-31T06:18:14Z","published":"2023-09-05T12:16:14Z","title":"Dual Relation Alignment for Composed Image Retrieval","summary":" Composed image retrieval, a task involving the search for a target image\nusing a reference image and a complementary text as the query, has witnessed\nsignificant advancements owing to the progress made in cross-modal modeling.\nUnlike the general image-text retrieval problem with only one alignment\nrelation, i.e., image-text, we argue for the existence of two types of\nrelations in composed image retrieval. The explicit relation pertains to the\nreference image & complementary text-target image, which is commonly exploited\nby existing methods. Besides this intuitive relation, the observations during\nour practice have uncovered another implicit yet crucial relation, i.e.,\nreference image & target image-complementary text, since we found that the\ncomplementary text can be inferred by studying the relation between the target\nimage and the reference image. 
Regrettably, existing methods largely focus on\nleveraging the explicit relation to learn their networks, while overlooking the\nimplicit relation. In response to this weakness, we propose a new framework for\ncomposed image retrieval, termed dual relation alignment, which integrates both\nexplicit and implicit relations to fully exploit the correlations among the\ntriplets. Specifically, we first design a vision compositor to fuse the reference image\nand the target image; the resulting representation then serves two\nroles: (1) counterpart for semantic alignment with the complementary text and\n(2) compensation for the complementary text to boost the explicit relation\nmodeling, thereby implanting the implicit relation into the alignment learning.\nOur method is evaluated on two popular datasets, CIRR and FashionIQ, through\nextensive experiments. The results confirm the effectiveness of our\ndual-relation learning in substantially enhancing composed image retrieval\nperformance.\n","authors":["Xintong Jiang","Yaxiong Wang","Yujiao Wu","Meng Wang","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2309.02169v3.pdf","comment":"The architecture of our model changes, hence the methodology and\n experiments change a lot. We have significantly revised the original\n manuscript of the paper, so a withdrawal of our original script is needed"},{"id":"http://arxiv.org/abs/2401.17617v1","updated":"2024-01-31T06:12:28Z","published":"2024-01-31T06:12:28Z","title":"Unveiling the Power of Self-supervision for Multi-view Multi-human\n Association and Tracking","summary":" Multi-view multi-human association and tracking (MvMHAT) is a new but\nimportant problem for multi-person scene video surveillance, aiming to track a\ngroup of people over time in each view, as well as to identify the same person\nacross different views at the same time, which is different from previous MOT\nand multi-camera MOT tasks that only consider over-time human tracking. This\nway, the videos for MvMHAT require more complex annotations while containing\nmore information for self-learning. In this work, we tackle this problem with a\nself-supervised learning aware end-to-end network. Specifically, we propose to\ntake advantage of the spatial-temporal self-consistency rationale by\nconsidering three properties of reflexivity, symmetry and transitivity. Besides\nthe reflexivity property that naturally holds, we design the self-supervised\nlearning losses based on the properties of symmetry and transitivity, for both\nappearance feature learning and assignment matrix optimization, to associate\nthe multiple humans over time and across views. Furthermore, to promote the\nresearch on MvMHAT, we build two new large-scale benchmarks for the network\ntraining and testing of different algorithms. Extensive experiments on the\nproposed benchmarks verify the effectiveness of our method. We have released\nthe benchmark and code to the public.\n","authors":["Wei Feng","Feifan Wang","Ruize Han","Zekun Qian","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11050v2","updated":"2024-01-31T06:00:39Z","published":"2023-10-17T07:37:32Z","title":"$k$-$t$ CLAIR: Self-Consistency Guided Multi-Prior Learning for Dynamic\n Parallel MR Image Reconstruction","summary":" Cardiac magnetic resonance imaging (CMR) has been widely used in clinical\npractice for the medical diagnosis of cardiac diseases. However, the long\nacquisition time hinders its development in real-time applications. 
Here, we\npropose a novel self-consistency guided multi-prior learning framework named\n$k$-$t$ CLAIR to exploit spatiotemporal correlations from highly undersampled\ndata for accelerated dynamic parallel MRI reconstruction. The $k$-$t$ CLAIR\nprogressively reconstructs faithful images by leveraging multiple complementary\npriors learned in the $x$-$t$, $x$-$f$, and $k$-$t$ domains in an iterative\nfashion, as dynamic MRI exhibits high spatiotemporal redundancy. Additionally,\n$k$-$t$ CLAIR incorporates calibration information for prior learning,\nresulting in a more consistent reconstruction. Experimental results on cardiac\ncine and T1W/T2W images demonstrate that $k$-$t$ CLAIR achieves high-quality\ndynamic MR reconstruction in terms of both quantitative and qualitative\nperformance.\n","authors":["Liping Zhang","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2310.11050v2.pdf","comment":"12 pages, 3 figures, 4 tables. CMRxRecon Challenge, MICCAI 2023"},{"id":"http://arxiv.org/abs/2401.17609v1","updated":"2024-01-31T05:44:01Z","published":"2024-01-31T05:44:01Z","title":"LaneGraph2Seq: Lane Topology Extraction with Language Model via\n Vertex-Edge Encoding and Connectivity Enhancement","summary":" Understanding road structures is crucial for autonomous driving. Intricate\nroad structures are often depicted using lane graphs, which include centerline\ncurves and connections forming a Directed Acyclic Graph (DAG). Accurate\nextraction of lane graphs relies on precisely estimating vertex and edge\ninformation within the DAG. Recent research highlights Transformer-based\nlanguage models' impressive sequence prediction abilities, making them\neffective for learning graph representations when graph data are encoded as\nsequences. However, existing studies focus mainly on modeling vertices\nexplicitly, leaving edge information simply embedded in the network.\nConsequently, these approaches fall short in the task of lane graph extraction.\nTo address this, we introduce LaneGraph2Seq, a novel approach for lane graph\nextraction. It leverages a language model with vertex-edge encoding and\nconnectivity enhancement. Our serialization strategy includes a vertex-centric\ndepth-first traversal and a concise edge-based partition sequence.\nAdditionally, we use classifier-free guidance combined with nucleus sampling to\nimprove lane connectivity. We validate our method on prominent datasets,\nnuScenes and Argoverse 2, showcasing consistent and compelling results. Our\nLaneGraph2Seq approach demonstrates superior performance compared to\nstate-of-the-art techniques in lane graph extraction.\n","authors":["Renyuan Peng","Xinyue Cai","Hang Xu","Jiachen Lu","Feng Wen","Wei Zhang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17609v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2205.15523v2","updated":"2024-01-31T05:30:22Z","published":"2022-05-31T03:47:08Z","title":"Variational Transfer Learning using Cross-Domain Latent Modulation","summary":" To successfully apply trained neural network models to new domains, powerful\ntransfer learning solutions are essential. We propose to introduce a novel\ncross-domain latent modulation mechanism to a variational autoencoder framework\nso as to achieve effective transfer learning. Our key idea is to procure deep\nrepresentations from one data domain and use it to influence the\nreparameterization of the latent variable of another domain. 
Specifically, deep\nrepresentations of the source and target domains are first extracted by a\nunified inference model and aligned by employing gradient reversal. The learned\ndeep representations are then cross-modulated to the latent encoding of the\nalternative domain, where consistency constraints are also applied. In the\nempirical validation that includes a number of transfer learning benchmark\ntasks for unsupervised domain adaptation and image-to-image translation, our\nmodel demonstrates competitive performance, which is also supported by evidence\nobtained from visualization.\n","authors":["Jinyong Hou","Jeremiah D. Deng","Stephen Cranefield","Xuejie Din"],"pdf_url":"https://arxiv.org/pdf/2205.15523v2.pdf","comment":"Under review. Extended version of a previous WACV paper\n (arXiv:2012.11727). 13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2201.09196v2","updated":"2024-01-31T05:30:08Z","published":"2022-01-23T06:45:47Z","title":"Learning to Predict Gradients for Semi-Supervised Continual Learning","summary":" A key challenge for machine intelligence is to learn new visual concepts\nwithout forgetting the previously acquired knowledge. Continual learning is\naimed towards addressing this challenge. However, there is a gap between\nexisting supervised continual learning and human-like intelligence, where human\nis able to learn from both labeled and unlabeled data. How unlabeled data\naffects learning and catastrophic forgetting in the continual learning process\nremains unknown. To explore these issues, we formulate a new semi-supervised\ncontinual learning method, which can be generically applied to existing\ncontinual learning models. Specifically, a novel gradient learner learns from\nlabeled data to predict gradients on unlabeled data. Hence, the unlabeled data\ncould fit into the supervised continual learning method. Different from\nconventional semi-supervised settings, we do not hypothesize that the\nunderlying classes, which are associated to the unlabeled data, are known to\nthe learning process. In other words, the unlabeled data could be very distinct\nfrom the labeled data. We evaluate the proposed method on mainstream continual\nlearning, adversarial continual learning, and semi-supervised learning tasks.\nThe proposed method achieves state-of-the-art performance on classification\naccuracy and backward transfer in the continual learning setting while\nachieving desired performance on classification accuracy in the semi-supervised\nlearning setting. This implies that the unlabeled images can enhance the\ngeneralizability of continual learning models on the predictive ability on\nunseen data and significantly alleviate catastrophic forgetting. The code is\navailable at \\url{https://github.com/luoyan407/grad_prediction.git}.\n","authors":["Yan Luo","Yongkang Wong","Mohan Kankanhalli","Qi Zhao"],"pdf_url":"https://arxiv.org/pdf/2201.09196v2.pdf","comment":"Accepted by IEEE Transactions on Neural Networks and Learning Systems\n (TNNLS)"},{"id":"http://arxiv.org/abs/2401.17217v2","updated":"2024-01-31T05:21:13Z","published":"2024-01-30T18:02:44Z","title":"GazeGPT: Augmenting Human Capabilities using Gaze-contingent Contextual\n AI for Smart Eyewear","summary":" Multimodal large language models (LMMs) excel in world knowledge and\nproblem-solving abilities. Through the use of a world-facing camera and\ncontextual AI, emerging smart accessories aim to provide a seamless interface\nbetween humans and LMMs. 
Yet, these wearable computing systems lack an\nunderstanding of the user's attention. We introduce GazeGPT as a new user\ninteraction paradigm for contextual AI. GazeGPT uses eye tracking to help the\nLMM understand which object in the world-facing camera view a user is paying\nattention to. Using extensive user evaluations, we show that this\ngaze-contingent mechanism is a faster and more accurate pointing mechanism than\nalternatives; that it augments human capabilities by significantly improving\ntheir accuracy in a dog-breed classification task; and that it is consistently\nranked as more natural than head- or body-driven selection mechanisms for\ncontextual AI. Moreover, we prototype a variety of application scenarios that\nsuggest GazeGPT could be of significant value to users as part of future\nAI-driven personal assistants.\n","authors":["Robert Konrad","Nitish Padmanaban","J. Gabriel Buckmaster","Kevin C. Boyle","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2401.17217v2.pdf","comment":"Project video: https://youtu.be/AuDFHHTK_m8"},{"id":"http://arxiv.org/abs/2401.17604v1","updated":"2024-01-31T05:20:29Z","published":"2024-01-31T05:20:29Z","title":"Computation and Parameter Efficient Multi-Modal Fusion Transformer for\n Cued Speech Recognition","summary":" Cued Speech (CS) is a pure visual coding method used by hearing-impaired\npeople that combines lip reading with several specific hand shapes to make the\nspoken language visible. Automatic CS recognition (ACSR) seeks to transcribe\nvisual cues of speech into text, which can help hearing-impaired people to\ncommunicate effectively. The visual information of CS contains lip reading and\nhand cueing, thus the fusion of them plays an important role in ACSR. However,\nmost previous fusion methods struggle to capture the global dependency present\nin long sequence inputs of multi-modal CS data. As a result, these methods\ngenerally fail to learn the effective cross-modal relationships that contribute\nto the fusion. Recently, attention-based transformers have been a prevalent\nidea for capturing the global dependency over the long sequence in multi-modal\nfusion, but existing multi-modal fusion transformers suffer from both poor\nrecognition accuracy and inefficient computation for the ACSR task. To address\nthese problems, we develop a novel computation and parameter efficient\nmulti-modal fusion transformer by proposing a novel Token-Importance-Aware\nAttention mechanism (TIAA), where a token utilization rate (TUR) is formulated\nto select the important tokens from the multi-modal streams. More precisely,\nTIAA firstly models the modality-specific fine-grained temporal dependencies\nover all tokens of each modality, and then learns the efficient cross-modal\ninteraction for the modality-shared coarse-grained temporal dependencies over\nthe important tokens of different modalities. Besides, a light-weight gated\nhidden projection is designed to control the feature flows of TIAA. 
The\nresulting model, named Economical Cued Speech Fusion Transformer (EcoCued),\nachieves state-of-the-art performance on all existing CS datasets, compared\nwith existing transformer-based fusion methods and ACSR fusion methods.\n","authors":["Lei Liu","Li Liu","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2401.17604v1.pdf","comment":"Accepted by TASLP"},{"id":"http://arxiv.org/abs/2401.17603v1","updated":"2024-01-31T05:13:53Z","published":"2024-01-31T05:13:53Z","title":"Topology-Aware Latent Diffusion for 3D Shape Generation","summary":" We introduce a new generative model that combines latent diffusion with\npersistent homology to create 3D shapes with high diversity, with a special\nemphasis on their topological characteristics. Our method involves representing\n3D shapes as implicit fields, then employing persistent homology to extract\ntopological features, including Betti numbers and persistence diagrams. The\nshape generation process consists of two steps. Initially, we employ a\ntransformer-based autoencoding module to embed the implicit representation of\neach 3D shape into a set of latent vectors. Subsequently, we navigate through\nthe learned latent space via a diffusion model. By strategically incorporating\ntopological features into the diffusion process, our generative module is able\nto produce a richer variety of 3D shapes with different topological structures.\nFurthermore, our framework is flexible, supporting generation tasks constrained\nby a variety of inputs, including sparse and partial point clouds, as well as\nsketches. By modifying the persistence diagrams, we can alter the topology of\nthe shapes generated from these input modalities.\n","authors":["Jiangbei Hu","Ben Fei","Baixin Xu","Fei Hou","Weidong Yang","Shengfa Wang","Na Lei","Chen Qian","Ying He"],"pdf_url":"https://arxiv.org/pdf/2401.17603v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.14718v2","updated":"2024-01-31T05:09:44Z","published":"2024-01-26T08:59:38Z","title":"A Survey on Video Prediction: From Deterministic to Generative\n Approaches","summary":" Video prediction, a fundamental task in computer vision, aims to enable\nmodels to generate sequences of future frames based on existing video content.\nThis task has garnered widespread application across various domains. In this\npaper, we comprehensively survey both historical and contemporary works in this\nfield, encompassing the most widely used datasets and algorithms. Our survey\nscrutinizes the challenges and evolving landscape of video prediction within\nthe realm of computer vision. We propose a novel taxonomy centered on the\nstochastic nature of video prediction algorithms. 
This taxonomy accentuates the\ngradual transition from deterministic to generative prediction methodologies,\nunderlining significant advancements and shifts in approach.\n","authors":["Ruibo Ming","Zhewei Huang","Zhuoxuan Ju","Jianming Hu","Lihui Peng","Shuchang Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.14718v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2401.17600v1","updated":"2024-01-31T04:57:12Z","published":"2024-01-31T04:57:12Z","title":"Good at captioning, bad at counting: Benchmarking GPT-4V on Earth\n observation data","summary":" Large Vision-Language Models (VLMs) have demonstrated impressive performance\non complex tasks involving visual input with natural language instructions.\nHowever, it remains unclear to what extent capabilities on natural images\ntransfer to Earth observation (EO) data, which are predominantly satellite and\naerial images less common in VLM training data. In this work, we propose a\ncomprehensive benchmark to gauge the progress of VLMs toward being useful tools\nfor EO data by assessing their abilities on scene understanding, localization\nand counting, and change detection tasks. Motivated by real-world applications,\nour benchmark includes scenarios like urban monitoring, disaster relief, land\nuse, and conservation. We discover that, although state-of-the-art VLMs like\nGPT-4V possess extensive world knowledge that leads to strong performance on\nopen-ended tasks like location understanding and image captioning, their poor\nspatial reasoning limits usefulness on object localization and counting tasks.\nOur benchmark will be made publicly available at https://vleo.danielz.ch/ and\non Hugging Face at\nhttps://huggingface.co/collections/mit-ei/vleo-benchmark-datasets-65b789b0466555489cce0d70\nfor easy model evaluation.\n","authors":["Chenhui Zhang","Sherrie Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17600v1.pdf","comment":"62 pages; work in progress"},{"id":"http://arxiv.org/abs/2008.11945v6","updated":"2024-01-31T04:45:32Z","published":"2020-08-27T06:53:53Z","title":"Moderately Supervised Learning: Definition, Framework and Generality","summary":" Learning with supervision has achieved remarkable success in numerous\nartificial intelligence (AI) applications. In the current literature, by\nreferring to the properties of the labels prepared for the training dataset,\nlearning with supervision is categorized as supervised learning (SL) and weakly\nsupervised learning (WSL). SL concerns the situation where the training data\nset is assigned with ideal (complete, exact and accurate) labels, while WSL\nconcerns the situation where the training data set is assigned with non-ideal\n(incomplete, inexact or inaccurate) labels. However, various solutions for SL\ntasks have shown that the given labels are not always easy to learn, and the\ntransformation from the given labels to easy-to-learn targets can significantly\naffect the performance of the final SL solutions. Without considering the\nproperties of the transformation from the given labels to easy-to-learn\ntargets, the definition of SL conceals some details that can be critical to\nbuilding the appropriate solutions for specific SL tasks. Thus, for engineers\nin the AI application field, it is desirable to reveal these details\nsystematically. 
This article attempts to achieve this goal by expanding the\ncategorization of SL and investigating the sub-type moderately supervised\nlearning (MSL) that concerns the situation where the given labels are ideal,\nbut due to the simplicity in annotation, careful designs are required to\ntransform the given labels into easy-to-learn targets. From the perspectives of\nthe definition, framework and generality, we conceptualize MSL to present a\ncomplete fundamental basis to systematically analyse MSL tasks. Meanwhile, by\nrevealing the relation between the conceptualization of MSL and the\nmathematicians' vision, this paper also establishes a tutorial that AI\napplication engineers can refer to for viewing a problem to be solved from the\nmathematicians' vision.\n","authors":["Yongquan Yang"],"pdf_url":"https://arxiv.org/pdf/2008.11945v6.pdf","comment":"This is the final published version (33 pages)"},{"id":"http://arxiv.org/abs/2401.17593v1","updated":"2024-01-31T04:34:31Z","published":"2024-01-31T04:34:31Z","title":"Head and Neck Tumor Segmentation from [18F]F-FDG PET/CT Images Based on\n 3D Diffusion Model","summary":" Head and neck (H&N) cancers are among the most prevalent types of cancer\nworldwide, and [18F]F-FDG PET/CT is widely used for H&N cancer management.\nRecently, the diffusion model has demonstrated remarkable performance in\nvarious image-generation tasks. In this work, we proposed a 3D diffusion model\nto accurately perform H&N tumor segmentation from 3D PET and CT volumes. The 3D\ndiffusion model was developed considering the 3D nature of PET and CT images\nacquired. During the reverse process, the model utilized a 3D UNet structure\nand took the concatenation of PET, CT, and Gaussian noise volumes as the\nnetwork input to generate the tumor mask. Experiments based on the HECKTOR\nchallenge dataset were conducted to evaluate the effectiveness of the proposed\ndiffusion model. Several state-of-the-art techniques based on U-Net and\nTransformer structures were adopted as the reference methods. Benefits of\nemploying both PET and CT as the network input as well as further extending the\ndiffusion model from 2D to 3D were investigated based on various quantitative\nmetrics and the uncertainty maps generated. Results showed that the proposed 3D\ndiffusion model could generate more accurate segmentation results compared with\nother methods. Compared to the diffusion model in 2D format, the proposed 3D\nmodel yielded superior results. Our experiments also highlighted the advantage\nof utilizing dual-modality PET and CT data over only single-modality data for\nH&N tumor segmentation.\n","authors":["Yafei Dong","Kuang Gong"],"pdf_url":"https://arxiv.org/pdf/2401.17593v1.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.17592v1","updated":"2024-01-31T04:32:41Z","published":"2024-01-31T04:32:41Z","title":"Local Feature Matching Using Deep Learning: A Survey","summary":" Local feature matching enjoys wide-ranging applications in the realm of\ncomputer vision, encompassing domains such as image retrieval, 3D\nreconstruction, and object recognition. However, challenges persist in\nimproving the accuracy and robustness of matching due to factors like viewpoint\nand lighting variations. In recent years, the introduction of deep learning\nmodels has sparked widespread exploration into local feature matching\ntechniques. The objective of this endeavor is to furnish a comprehensive\noverview of local feature matching methods. 
These methods are categorized into\ntwo key segments based on the presence of detectors. The Detector-based\ncategory encompasses models inclusive of Detect-then-Describe, Joint Detection\nand Description, Describe-then-Detect, as well as Graph Based techniques. In\ncontrast, the Detector-free category comprises CNN Based, Transformer Based,\nand Patch Based methods. Our study extends beyond methodological analysis,\nincorporating evaluations of prevalent datasets and metrics to facilitate a\nquantitative comparison of state-of-the-art techniques. The paper also explores\nthe practical application of local feature matching in diverse domains such as\nStructure from Motion, Remote Sensing Image Registration, and Medical Image\nRegistration, underscoring its versatility and significance across various\nfields. Ultimately, we endeavor to outline the current challenges faced in this\ndomain and furnish future research directions, thereby serving as a reference\nfor researchers involved in local feature matching and its interconnected\ndomains.\n","authors":["Shibiao Xu","Shunpeng Chen","Rongtao Xu","Changwei Wang","Peng Lu","Li Guo"],"pdf_url":"https://arxiv.org/pdf/2401.17592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17583v1","updated":"2024-01-31T03:58:28Z","published":"2024-01-31T03:58:28Z","title":"Agile But Safe: Learning Collision-Free High-Speed Legged Locomotion","summary":" Legged robots navigating cluttered environments must be jointly agile for\nefficient task execution and safe to avoid collisions with obstacles or humans.\nExisting studies either develop conservative controllers (< 1.0 m/s) to ensure\nsafety, or focus on agility without considering potentially fatal collisions.\nThis paper introduces Agile But Safe (ABS), a learning-based control framework\nthat enables agile and collision-free locomotion for quadrupedal robots. ABS\ninvolves an agile policy to execute agile motor skills amidst obstacles and a\nrecovery policy to prevent failures, collaboratively achieving high-speed and\ncollision-free navigation. The policy switch in ABS is governed by a learned\ncontrol-theoretic reach-avoid value network, which also guides the recovery\npolicy as an objective function, thereby safeguarding the robot in a closed\nloop. The training process involves the learning of the agile policy, the\nreach-avoid value network, the recovery policy, and an exteroception\nrepresentation network, all in simulation. These trained modules can be\ndirectly deployed in the real world with onboard sensing and computation,\nleading to high-speed and collision-free navigation in confined indoor and\noutdoor spaces with both static and dynamic obstacles.\n","authors":["Tairan He","Chong Zhang","Wenli Xiao","Guanqi He","Changliu Liu","Guanya Shi"],"pdf_url":"https://arxiv.org/pdf/2401.17583v1.pdf","comment":"Project website: https://agile-but-safe.github.io/"},{"id":"http://arxiv.org/abs/2311.12831v3","updated":"2024-01-31T03:53:31Z","published":"2023-10-02T06:06:32Z","title":"ECNR: Efficient Compressive Neural Representation of Time-Varying\n Volumetric Datasets","summary":" Due to its conceptual simplicity and generality, compressive neural\nrepresentation has emerged as a promising alternative to traditional\ncompression methods for managing massive volumetric datasets. The current\npractice of neural compression utilizes a single large multilayer perceptron\n(MLP) to encode the global volume, incurring slow training and inference. 
This\npaper presents an efficient compressive neural representation (ECNR) solution\nfor time-varying data compression, utilizing the Laplacian pyramid for adaptive\nsignal fitting. Following a multiscale structure, we leverage multiple small\nMLPs at each scale for fitting local content or residual blocks. By assigning\nsimilar blocks to the same MLP via size uniformization, we enable balanced\nparallelization among MLPs to significantly speed up training and inference.\nWorking in concert with the multiscale structure, we tailor a deep compression\nstrategy to compact the resulting model. We show the effectiveness of ECNR with\nmultiple datasets and compare it with state-of-the-art compression methods\n(mainly SZ3, TTHRESH, and neurcomp). The results position ECNR as a promising\nsolution for volumetric data compression.\n","authors":["Kaiyuan Tang","Chaoli Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12831v3.pdf","comment":"Accepted by IEEE PacificVis 2024 (conference papers track)"},{"id":"http://arxiv.org/abs/2401.15893v2","updated":"2024-01-31T03:53:05Z","published":"2024-01-29T05:16:19Z","title":"Arbitrary-Scale Downscaling of Tidal Current Data Using Implicit\n Continuous Representation","summary":" Numerical models have long been used to understand geoscientific phenomena,\nincluding tidal currents, crucial for renewable energy production and coastal\nengineering. However, their computational cost hinders generating data of\nvarying resolutions. As an alternative, deep learning-based downscaling methods\nhave gained traction due to their faster inference speeds. However, most of them are\nlimited to inference at only a fixed scale and overlook important characteristics of\nthe target geoscientific data. In this paper, we propose a novel downscaling\nframework for tidal current data, addressing its unique characteristics, which\nare dissimilar to images: heterogeneity and local dependency. Moreover, our\nframework can generate any arbitrary-scale output utilizing a continuous\nrepresentation model. Our proposed framework demonstrates significantly\nimproved flow velocity predictions by 93.21% (MSE) and 63.85% (MAE) compared to\nthe Baseline model while achieving a remarkable 33.2% reduction in FLOPs.\n","authors":["Dongheon Lee","Seungmyong Jeong","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2401.15893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17571v1","updated":"2024-01-31T03:28:11Z","published":"2024-01-31T03:28:11Z","title":"Is Registering Raw Tagged-MR Enough for Strain Estimation in the Era of\n Deep Learning?","summary":" Magnetic Resonance Imaging with tagging (tMRI) has long been utilized for\nquantifying tissue motion and strain during deformation. However, a phenomenon\nknown as tag fading, a gradual decrease in tag visibility over time, often\ncomplicates post-processing. The first contribution of this study is to model\ntag fading by considering the interplay between $T_1$ relaxation and the\nrepeated application of radio frequency (RF) pulses during serial imaging\nsequences. This is a factor that has been overlooked in prior research on tMRI\npost-processing. Further, we have observed an emerging trend of utilizing raw\ntagged MRI within a deep learning-based (DL) registration framework for motion\nestimation. In this work, we evaluate and analyze the impact of commonly used\nimage similarity objectives in training DL registrations on raw tMRI. 
This is\nthen compared with the Harmonic Phase-based approach, a traditional approach\nwhich is claimed to be robust to tag fading. Our findings, derived from both\nsimulated images and an actual phantom scan, reveal the limitations of various\nsimilarity losses in raw tMRI and emphasize caution in registration tasks where\nimage intensity changes over time.\n","authors":["Zhangxing Bian","Ahmed Alshareef","Shuwen Wei","Junyu Chen","Yuli Wang","Jonghye Woo","Dzung L. Pham","Jiachen Zhuo","Aaron Carass","Jerry L. Prince"],"pdf_url":"https://arxiv.org/pdf/2401.17571v1.pdf","comment":"Accepted to SPIE Medical Imaging 2024 (oral)"},{"id":"http://arxiv.org/abs/2308.07490v2","updated":"2024-01-31T03:07:35Z","published":"2023-08-14T22:47:48Z","title":"BSED: Baseline Shapley-Based Explainable Detector","summary":" Explainable artificial intelligence (XAI) has witnessed significant advances\nin the field of object recognition, with saliency maps being used to highlight\nimage features relevant to the predictions of learned models. Although these\nadvances have made AI-based technology more interpretable to humans, several\nissues have come to light. Some approaches present explanations irrelevant to\npredictions, and cannot guarantee the validity of XAI (axioms). In this study,\nwe propose the Baseline Shapley-based Explainable Detector (BSED), which\nextends the Shapley value to object detection, thereby enhancing the validity\nof interpretation. The Shapley value can attribute the prediction of a learned\nmodel to a baseline feature while satisfying the explainability axioms. The\nprocessing cost for the BSED is within a reasonable range, while the original\nShapley value is prohibitively computationally expensive. Furthermore, BSED is\na generalizable method that can be applied to various detectors in a\nmodel-agnostic manner, and can interpret various detection targets without\nfine-grained parameter tuning. These strengths can enable the practical\napplicability of XAI. We present quantitative and qualitative comparisons with\nexisting methods to demonstrate the superior performance of our method in terms\nof explanation validity. Moreover, we present some applications, such as\ncorrecting detection based on explanations from our method.\n","authors":["Michihiro Kuroki","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2308.07490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17547v1","updated":"2024-01-31T02:25:52Z","published":"2024-01-31T02:25:52Z","title":"Task-Oriented Diffusion Model Compression","summary":" As recent advancements in large-scale Text-to-Image (T2I) diffusion models\nhave yielded remarkably high-quality image generation, diverse downstream\nImage-to-Image (I2I) applications have emerged. Despite the impressive results\nachieved by these I2I models, their practical utility is hampered by their\nlarge model size and the computational burden of the iterative denoising\nprocess. In this paper, we explore the compression potential of these I2I\nmodels in a task-oriented manner and introduce a novel method for reducing both\nmodel size and the number of timesteps. Through extensive experiments, we\nobserve key insights and use our empirical knowledge to develop practical\nsolutions that aim for near-optimal results with minimal exploration costs. We\nvalidate the effectiveness of our method by applying it to InstructPix2Pix for\nimage editing and StableSR for image restoration. 
Our approach achieves\nsatisfactory output quality with 39.2% and 56.4% reduction in model footprint\nand 81.4% and 68.7% decrease in latency for InstructPix2Pix and StableSR,\nrespectively.\n","authors":["Geonung Kim","Beomsu Kim","Eunhyeok Park","Sunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2401.17547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17544v1","updated":"2024-01-31T02:18:27Z","published":"2024-01-31T02:18:27Z","title":"Trainable Fixed-Point Quantization for Deep Learning Acceleration on\n FPGAs","summary":" Quantization is a crucial technique for deploying deep learning models on\nresource-constrained devices, such as embedded FPGAs. Prior efforts mostly\nfocus on quantizing matrix multiplications, leaving other layers like BatchNorm\nor shortcuts in floating-point form, even though fixed-point arithmetic is more\nefficient on FPGAs. A common practice is to fine-tune a pre-trained model to\nfixed-point for FPGA deployment, which can potentially degrade accuracy.\n This work presents QFX, a novel trainable fixed-point quantization approach\nthat automatically learns the binary-point position during model training.\nAdditionally, we introduce a multiplier-free quantization strategy within QFX\nto minimize DSP usage. QFX is implemented as a PyTorch-based library that\nefficiently emulates fixed-point arithmetic, supported by FPGA HLS, in a\ndifferentiable manner during backpropagation. With minimal effort, models\ntrained with QFX can readily be deployed through HLS, producing the same\nnumerical results as their software counterparts. Our evaluation shows that\ncompared to post-training quantization, QFX can quantize models trained with\nelement-wise layers quantized to fewer bits and achieve higher accuracy on both\nCIFAR-10 and ImageNet datasets. We further demonstrate the efficacy of\nmultiplier-free quantization using a state-of-the-art binarized neural network\naccelerator designed for an embedded FPGA (AMD Xilinx Ultra96 v2). We plan to\nrelease QFX in open-source format.\n","authors":["Dingyi Dai","Yichi Zhang","Jiahao Zhang","Zhanqiu Hu","Yaohui Cai","Qi Sun","Zhiru Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17542v1","updated":"2024-01-31T02:09:21Z","published":"2024-01-31T02:09:21Z","title":"Data-Effective Learning: A Comprehensive Medical Benchmark","summary":" Data-effective learning aims to use data in the most impactful way to train\nAI models, which involves strategies that focus on data quality rather than\nquantity, ensuring the data used for training has high informational value.\nData-effective learning plays a profound role in accelerating AI training,\nreducing computational costs, and saving data storage, which is very important\nas the volume of medical data in recent years has grown beyond many people's\nexpectations. However, due to the lack of standards and a comprehensive\nbenchmark, medical data-effective learning remains poorly studied. To\naddress this gap, our paper introduces a comprehensive benchmark specifically\nfor evaluating data-effective learning in the medical field. 
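Returning to the trainable fixed-point quantization described two entries above: its core mechanism can be sketched as a fake-quantization module whose binary-point position is itself a learnable parameter, with straight-through estimators (STE) carrying gradients past the rounding. This is a simplified, hypothetical rendering of the idea, not the QFX library.

import torch
import torch.nn as nn

class TrainableFixedPoint(nn.Module):
    # Emulates signed fixed-point arithmetic with a learnable binary point.
    def __init__(self, total_bits: int = 8):
        super().__init__()
        self.total_bits = total_bits
        self.frac_bits = nn.Parameter(torch.tensor(4.0))  # learnable binary point

    def forward(self, x):
        # STE for rounding the (continuous) binary-point position itself.
        frac = self.frac_bits + (torch.round(self.frac_bits) - self.frac_bits).detach()
        scale = 2.0 ** frac
        qmax = 2.0 ** (self.total_bits - 1) - 1
        y = x * scale
        y = y + (torch.round(y) - y).detach()    # STE for value rounding
        y = torch.clamp(y, -qmax - 1.0, qmax)    # saturate to the bit budget
        return y / scale

# Usage: wrap weights/activations, e.g. y = TrainableFixedPoint(8)(torch.randn(4))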
This benchmark\nincludes a dataset with millions of data samples from 31 medical centers\n(DataDEL), a baseline method for comparison (MedDEL), and a new evaluation\nmetric (NormDEL) to objectively measure data-effective learning performance.\nOur extensive experimental results show the baseline MedDEL can achieve\nperformance comparable to the original large dataset with only 5% of the data.\nEstablishing such an open data-effective learning benchmark is crucial for the\nmedical AI research community because it facilitates efficient data use,\npromotes collaborative breakthroughs, and fosters the development of\ncost-effective, scalable, and impactful healthcare solutions. The project can\nbe accessed at\nhttps://github.com/shadow2469/Data-Effective-Learning-A-Comprehensive-Medical-Benchmark.git.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12076v3","updated":"2024-01-31T02:01:11Z","published":"2023-11-20T03:51:58Z","title":"Towards Few-shot Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is critical for ensuring the reliability\nof open-world intelligent systems. Despite the notable advancements in existing\nOOD detection methodologies, our study identifies a significant performance\ndrop under the scarcity of training samples. In this context, we introduce a\nnovel few-shot OOD detection benchmark, carefully constructed to address this\ngap. Our empirical analysis reveals the superiority of Parameter-Efficient\nFine-Tuning (PEFT) strategies, such as visual prompt tuning and visual adapter\ntuning, over conventional techniques, including fully fine-tuning and linear\nprobing tuning in the few-shot OOD detection task. Recognizing that some crucial\ninformation from the pre-trained model, which is pivotal for OOD detection, may\nbe lost during the fine-tuning process, we propose a method termed\nDomain-Specific and General Knowledge Fusion (DSGF). This approach is designed\nto be compatible with diverse fine-tuning frameworks. Our experiments show that\nthe integration of DSGF significantly enhances the few-shot OOD detection\ncapabilities across various methods and fine-tuning methodologies, including\nfully fine-tuning, visual adapter tuning, and visual prompt tuning. The code\nwill be released.\n","authors":["Jiuqing Dong","Yongbin Gao","Heng Zhou","Jun Cen","Yifan Yao","Sook Yoon","Park Dong Sun"],"pdf_url":"https://arxiv.org/pdf/2311.12076v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.09641v2","updated":"2024-01-31T01:33:16Z","published":"2022-09-09T20:21:03Z","title":"Calibrating Segmentation Networks with Margin-based Label Smoothing","summary":" Despite the undeniable progress in visual recognition tasks fueled by deep\nneural networks, there exists recent evidence showing that these models are\npoorly calibrated, resulting in over-confident predictions. The standard\npractices of minimizing the cross entropy loss during training promote the\npredicted softmax probabilities to match the one-hot label assignments.\nNevertheless, this yields a pre-softmax activation of the correct class that is\nsignificantly larger than the remaining activations, which exacerbates the\nmiscalibration problem. Recent observations from the classification literature\nsuggest that loss functions that embed implicit or explicit maximization of the\nentropy of predictions yield state-of-the-art calibration performances. 
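As a concrete instance of this family, a cross-entropy objective with an explicit confidence penalty (equivalently, an entropy bonus) can be sketched as below; this is a generic illustration of entropy-encouraging calibration losses, distinct from the margin-based generalization the abstract goes on to propose.

import torch
import torch.nn.functional as F

# Cross-entropy plus a penalty on confidence: rewarding prediction entropy
# discourages the peaked softmax outputs associated with miscalibration.
def confidence_penalized_ce(logits, targets, beta: float = 0.1):
    ce = F.cross_entropy(logits, targets)
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * probs.clamp_min(1e-12).log()).sum(dim=-1).mean()
    return ce - beta * entropy  # higher entropy lowers the loss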
Despite\nthese findings, the impact of these losses in the relevant task of calibrating\nmedical image segmentation networks remains unexplored. In this work, we\nprovide a unifying constrained-optimization perspective of current\nstate-of-the-art calibration losses. Specifically, these losses could be viewed\nas approximations of a linear penalty (or a Lagrangian term) imposing equality\nconstraints on logit distances. This points to an important limitation of such\nunderlying equality constraints, whose ensuing gradients constantly push\ntowards a non-informative solution, which might prevent the model from reaching\nthe best compromise between discriminative performance and calibration during\ngradient-based optimization. Following our observations, we propose a\nsimple and flexible generalization based on inequality constraints, which\nimposes a controllable margin on logit distances. Comprehensive experiments on\na variety of public medical image segmentation benchmarks demonstrate that our\nmethod sets novel state-of-the-art results on these tasks in terms of network\ncalibration, whereas the discriminative performance is also improved.\n","authors":["Balamurali Murugesan","Bingyuan Liu","Adrian Galdran","Ismail Ben Ayed","Jose Dolz"],"pdf_url":"https://arxiv.org/pdf/2209.09641v2.pdf","comment":"MedIA 2023. The code is available at\n https://github.com/Bala93/MarginLoss. arXiv admin note: substantial text\n overlap with arXiv:2111.15430"},{"id":"http://arxiv.org/abs/2401.11143v3","updated":"2024-01-31T01:22:43Z","published":"2024-01-20T06:42:32Z","title":"Gaussian Adaptive Attention is All You Need: Robust Contextual\n Representations Across Multiple Modalities","summary":" We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a\nnovel probabilistic attention framework, and the Gaussian Adaptive Transformer\n(GAT), designed to enhance information aggregation across multiple modalities,\nincluding Speech, Text and Vision. GAAM integrates learnable mean and variance\ninto its attention mechanism, implemented in a Multi-Headed framework enabling\nit to collectively model any Probability Distribution for dynamic recalibration\nof feature significance. This method demonstrates significant improvements,\nespecially with highly non-stationary data, surpassing the state-of-the-art\nattention techniques in model performance (up to approximately +20% in\naccuracy) by identifying key elements within the feature space. GAAM's\ncompatibility with dot-product-based attention models and relatively low number\nof parameters showcases its adaptability and potential to boost existing\nattention frameworks. Empirically, GAAM exhibits superior adaptability and\nefficacy across a diverse range of tasks, including emotion recognition in\nspeech, image classification, and text classification, thereby establishing its\nrobustness and versatility in handling multi-modal data. Furthermore, we\nintroduce the Importance Factor (IF), a new learning-based metric that enhances\nthe explainability of models trained with GAAM-based methods. 
Overall, GAAM\nrepresents an advancement towards the development of better-performing and more\nexplainable attention models across multiple modalities.\n","authors":["Georgios Ioannides","Aman Chadha","Aaron Elkins"],"pdf_url":"https://arxiv.org/pdf/2401.11143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12862v3","updated":"2024-01-31T01:05:14Z","published":"2023-09-22T13:37:10Z","title":"Associative Transformer","summary":" Emerging from the pairwise attention in conventional Transformers, there is a\ngrowing interest in sparse attention mechanisms that align more closely with\nlocalized, contextual learning in the biological brain. Existing studies such\nas the Coordination method employ iterative cross-attention mechanisms with a\nbottleneck to enable the sparse association of inputs. However, these methods\nare parameter inefficient and fail in more complex relational reasoning tasks.\nTo this end, we propose Associative Transformer (AiT) to enhance the\nassociation among sparsely attended input patches, improving parameter\nefficiency and performance in relational reasoning tasks. AiT leverages a\nlearnable explicit memory, comprised of various specialized priors, with a\nbottleneck attention to facilitate the extraction of diverse localized\nfeatures. Moreover, we propose a novel associative memory-enabled patch\nreconstruction with a Hopfield energy function. The extensive experiments in\nfour image classification tasks with three different sizes of AiT demonstrate\nthat AiT requires significantly fewer parameters and attention layers while\noutperforming Vision Transformers and a broad range of sparse Transformers.\nAdditionally, AiT establishes new SOTA performance in the Sort-of-CLEVR\ndataset, outperforming the previous Coordination method.\n","authors":["Yuwei Sun","Hideya Ochiai","Zhirong Wu","Stephen Lin","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2309.12862v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.01327v5","updated":"2024-01-31T00:40:51Z","published":"2022-03-02T17:38:44Z","title":"Hyperspectral Pixel Unmixing with Latent Dirichlet Variational\n Autoencoder","summary":" We present a method for hyperspectral pixel {\\it unmixing}. The proposed\nmethod assumes that (1) {\\it abundances} can be encoded as Dirichlet\ndistributions and (2) spectra of {\\it endmembers} can be represented as\nmultivariate Normal distributions. The method solves the problem of abundance\nestimation and endmember extraction within a variational autoencoder setting\nwhere a Dirichlet bottleneck layer models the abundances, and the decoder\nperforms endmember extraction. The proposed method can also leverage the\ntransfer learning paradigm, where the model is only trained on synthetic data\ncontaining pixels that are linear combinations of one or more endmembers of\ninterest. In this case, we retrieve endmembers (spectra) from the United States\nGeological Survey Spectral Library. The model thus trained can be subsequently\nused to perform pixel unmixing on \"real data\" that contains a subset of the\nendmembers used to generate the synthetic data. The model achieves\nstate-of-the-art results on several benchmarks: Cuprite, Urban Hydice and\nSamson. We also present a new synthetic dataset, OnTech-HSI-Syn-21, that can be\nused to study hyperspectral pixel unmixing methods. 
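The heart of such an unmixing autoencoder fits in a few lines: a simplex-constrained bottleneck predicts per-pixel abundances and a linear decoder mixes learnable endmember spectra. The sketch below substitutes a softmax for the full Dirichlet reparameterization and is an illustration, not the authors' model.

import torch
import torch.nn as nn

class UnmixingAE(nn.Module):
    def __init__(self, n_bands: int = 224, n_endmembers: int = 4):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(n_bands, 128), nn.ReLU(),
            nn.Linear(128, n_endmembers))
        # Rows converge to endmember spectra during training.
        self.endmembers = nn.Parameter(torch.rand(n_endmembers, n_bands))

    def forward(self, pixels):                         # (N, n_bands)
        abundances = torch.softmax(self.encoder(pixels), dim=-1)
        reconstruction = abundances @ self.endmembers  # linear mixing model
        return reconstruction, abundances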
We showcase the transfer learning\ncapabilities of the proposed model on Cuprite and OnTech-HSI-Syn-21 datasets.\nIn summary, the proposed method can be applied to pixel unmixing in a variety\nof domains, including agriculture, forestry, mineralogy, analysis of materials,\nhealthcare, etc. Additionally, the proposed method eschews the need for\nlabelled data for training by leveraging the transfer learning paradigm, where\nthe model is trained on synthetic data generated using the endmembers present\nin the \"real\" data.\n","authors":["Kiran Mantripragada","Faisal Z. Qureshi"],"pdf_url":"https://arxiv.org/pdf/2203.01327v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17515v1","updated":"2024-01-31T00:16:02Z","published":"2024-01-31T00:16:02Z","title":"Towards Image Semantics and Syntax Sequence Learning","summary":" Convolutional neural networks and vision transformers have achieved\noutstanding performance in machine perception, particularly for image\nclassification. Although these image classifiers excel at predicting\nimage-level class labels, they may not discriminate missing or shifted parts\nwithin an object. As a result, they may fail to detect corrupted images that\ninvolve missing or disarrayed semantic information in the object composition.\nOn the contrary, human perception easily distinguishes such corruptions. To\nmitigate this gap, we introduce the concept of \"image grammar\", consisting of\n\"image semantics\" and \"image syntax\", to denote the semantics of parts or\npatches of an image and the order in which these parts are arranged to create a\nmeaningful object. To learn the image grammar relative to a class of visual\nobjects/scenes, we propose a weakly supervised two-stage approach. In the first\nstage, we use a deep clustering framework that relies on iterative clustering\nand feature refinement to produce part-semantic segmentation. In the second\nstage, we incorporate a recurrent bi-LSTM module to process a sequence of\nsemantic segmentation patches to capture the image syntax. Our framework is\ntrained to reason over patch semantics and detect faulty syntax. We benchmark\nthe performance of several grammar learning models in detecting patch\ncorruptions. Finally, we verify the capabilities of our framework on the Celeb\nand SUNRGBD datasets and demonstrate that it can achieve a grammar validation\naccuracy of 70 to 90% in a wide variety of semantic and syntactical corruption\nscenarios.\n","authors":["Chun Tao","Timur Ibrayev","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2401.17515v1.pdf","comment":"21 pages, 22 figures, 5 tables"},{"id":"http://arxiv.org/abs/2402.00243v1","updated":"2024-01-31T23:52:14Z","published":"2024-01-31T23:52:14Z","title":"Capacity Constraint Analysis Using Object Detection for Smart\n Manufacturing","summary":" The increasing popularity of Deep Learning (DL) based Object Detection (OD)\nmethods and their real-world applications have opened new venues in smart\nmanufacturing. Traditional industries struck by capacity constraints after\nCoronavirus Disease (COVID-19) require non-invasive methods for in-depth\noperations' analysis to optimize and increase their revenue. In this study, we\nhave initially developed a Convolutional Neural Network (CNN) based OD model to\ntackle this issue. This model is trained to accurately identify the presence of\nchairs and individuals on the production floor. The identified objects are then\npassed to the CNN based tracker, which tracks them throughout their life cycle\nin the workstation. 
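Downstream of detection and tracking, a capacity-constraint analysis of this kind reduces to aggregating per-object presence intervals into utilization figures such as the station productivity quoted just below. A hypothetical sketch of that aggregation step:

from typing import List, Tuple

def station_utilization(intervals: List[Tuple[float, float]],
                        shift_seconds: float) -> float:
    # Percentage of a shift during which the station was productive,
    # given (start, end) occupancy intervals from the tracker.
    busy = sum(end - start for start, end in intervals)
    return 100.0 * busy / shift_seconds

# e.g. two busy periods within an 8-hour shift:
print(station_utilization([(0, 9000), (12000, 21000)], 8 * 3600))  # 62.5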
The extracted meta-data is further processed through a\nnovel framework for the capacity constraint analysis. We identified that\nStation C was only 70.6% productive over 6 months. Additionally, the time\nspent at each station is recorded and aggregated for each object. This data\nproves helpful in conducting annual audits and effectively managing labor and\nmaterial over time.\n","authors":["Hafiz Mughees Ahmad","Afshin Rahimi","Khizer Hayat"],"pdf_url":"https://arxiv.org/pdf/2402.00243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00240v1","updated":"2024-01-31T23:48:48Z","published":"2024-01-31T23:48:48Z","title":"Spectral Norm of Convolutional Layers with Circular and Zero Paddings","summary":" This paper leverages the use of \\emph{Gram iteration}, an efficient,\ndeterministic, and differentiable method for computing spectral norm with an\nupper bound guarantee. Designed for circular convolutional layers, we\ngeneralize the use of the Gram iteration to zero-padding convolutional layers\nand prove its quadratic convergence. We also provide theorems for bridging the\ngap between the spectral norms of circular and zero-padding convolutions. We\ndesign a \\emph{spectral rescaling} that can be used as a competitive\n$1$-Lipschitz layer that enhances network robustness. Demonstrated through\nexperiments, our method outperforms state-of-the-art techniques in precision,\ncomputational cost, and scalability. The code of experiments is available at\nhttps://github.com/blaisedelattre/lip4conv.\n","authors":["Blaise Delattre","Quentin Barthélemy","Alexandre Allauzen"],"pdf_url":"https://arxiv.org/pdf/2402.00240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00225v1","updated":"2024-01-31T23:06:39Z","published":"2024-01-31T23:06:39Z","title":"Geometry aware 3D generation from in-the-wild images in ImageNet","summary":" Generating accurate 3D models is a challenging problem that traditionally\nrequires explicit learning from 3D datasets using supervised learning. Although\nrecent advances have shown promise in learning 3D models from 2D images, these\nmethods often rely on well-structured datasets with multi-view images of each\ninstance or camera pose information. Furthermore, these datasets usually\ncontain clean backgrounds with simple shapes, making them expensive to acquire\nand hard to generalize, which limits the applicability of these methods. To\novercome these limitations, we propose a method for reconstructing 3D geometry\nfrom the diverse and unstructured ImageNet dataset without camera pose\ninformation. We use an efficient triplane representation to learn 3D models\nfrom 2D images and modify the architecture of the generator backbone based on\nStyleGAN2 to adapt to the highly diverse dataset. To prevent mode collapse and\nimprove the training stability on diverse data, we propose to use multi-view\ndiscrimination. The trained generator can produce class-conditional 3D models\nas well as renderings from arbitrary viewpoints. The class-conditional\ngeneration results demonstrate significant improvement over the current\nstate-of-the-art method. 
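Looking back at the Gram iteration two entries above: for a plain matrix, the estimate can be written in a few lines of NumPy, normalizing each iterate and tracking the scale in log space to avoid overflow. This generic version is illustrative and omits the paper's convolution-specific handling.

import numpy as np

# Gram iteration: repeated Gram squaring turns singular values s_i into
# s_i^(2^k), so the appropriately deflated Frobenius norm upper-bounds the
# spectral norm and converges to it quadratically.
def gram_spectral_norm(w: np.ndarray, iters: int = 8) -> float:
    g = w.astype(np.float64)
    log_scale = 0.0
    for _ in range(iters):
        fro = np.linalg.norm(g)              # Frobenius norm
        g = g / fro                          # normalize to avoid overflow
        log_scale = 2.0 * (log_scale + np.log(fro))
        g = g.T @ g                          # Gram squaring
    return float(np.exp((np.log(np.linalg.norm(g)) + log_scale) / 2.0 ** iters))

w = np.random.randn(64, 64)
print(gram_spectral_norm(w), np.linalg.svd(w, compute_uv=False)[0])  # ~equal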
Additionally, using PTI, we can efficiently\nreconstruct the whole 3D geometry from single-view images.\n","authors":["Qijia Shen","Guangrun Wang"],"pdf_url":"https://arxiv.org/pdf/2402.00225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06377v3","updated":"2024-01-31T22:57:22Z","published":"2023-08-11T20:21:54Z","title":"CATS v2: Hybrid encoders for robust medical segmentation","summary":" Convolutional Neural Networks (CNNs) have exhibited strong performance in\nmedical image segmentation tasks by capturing high-level (local) information,\nsuch as edges and textures. However, due to the limited field of view of the\nconvolution kernel, it is hard for CNNs to fully represent global information.\nRecently, transformers have shown good performance for medical image\nsegmentation due to their ability to better model long-range dependencies.\nNevertheless, transformers struggle to capture high-level spatial features as\neffectively as CNNs. A good segmentation model should learn a better\nrepresentation from local and global features to be both precise and\nsemantically accurate. In our previous work, we proposed CATS, which is a\nU-shaped segmentation network augmented with a transformer encoder. In this\nwork, we further extend this model and propose CATS v2 with hybrid encoders.\nSpecifically, hybrid encoders consist of a CNN-based encoder path paralleled to\na transformer path with a shifted window, which better leverage both local and\nglobal information to produce robust 3D medical image segmentation. We fuse the\ninformation from the convolutional encoder and the transformer at the skip\nconnections of different resolutions to form the final segmentation. The\nproposed method is evaluated on three public challenge datasets: Beyond the\nCranial Vault (BTCV), Cross-Modality Domain Adaptation (CrossMoDA) and task 5\nof Medical Segmentation Decathlon (MSD-5), to segment abdominal organs,\nvestibular schwannoma (VS) and prostate, respectively. Compared with the\nstate-of-the-art methods, our approach demonstrates superior performance in\nterms of higher Dice scores. Our code is publicly available at\nhttps://github.com/MedICL-VU/CATS.\n","authors":["Hao Li","Han Liu","Dewei Hu","Xing Yao","Jiacheng Wang","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2308.06377v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00186v1","updated":"2024-01-31T21:28:40Z","published":"2024-01-31T21:28:40Z","title":"Distance and Collision Probability Estimation from Gaussian Surface\n Models","summary":" This paper describes continuous-space methodologies to estimate the collision\nprobability, Euclidean distance and gradient between an ellipsoidal robot model\nand an environment surface modeled as a set of Gaussian distributions.\nContinuous-space collision probability estimation is critical for\nuncertainty-aware motion planning. Most collision detection and avoidance\napproaches assume the robot is modeled as a sphere, but ellipsoidal\nrepresentations provide tighter approximations and enable navigation in\ncluttered and narrow spaces. State-of-the-art methods derive the Euclidean\ndistance and gradient by processing raw point clouds, which is computationally\nexpensive for large workspaces. Recent advances in Gaussian surface modeling\n(e.g. mixture models, splatting) enable compressed and high-fidelity surface\nrepresentations. Few methods exist to estimate continuous-space occupancy from\nsuch models. 
They require Gaussians to model free space and are unable to\nestimate the collision probability, Euclidean distance and gradient for an\nellipsoidal robot. The proposed methods bridge this gap by extending prior work\nin ellipsoid-to-ellipsoid Euclidean distance and collision probability\nestimation to Gaussian surface models. A geometric blending approach is also\nproposed to improve collision probability estimation. The approaches are\nevaluated with numerical 2D and 3D experiments using real-world point cloud\ndata.\n","authors":["Kshitij Goel","Wennie Tabib"],"pdf_url":"https://arxiv.org/pdf/2402.00186v1.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2402.00175v1","updated":"2024-01-31T21:05:34Z","published":"2024-01-31T21:05:34Z","title":"Weakly-Supervised Detection of Bone Lesions in CT","summary":" The skeletal region is one of the common sites of metastatic spread of cancer\nin the breast and prostate. CT is routinely used to measure the size of lesions\nin the bones. However, they can be difficult to spot due to the wide variations\nin their sizes, shapes, and appearances. Precise localization of such lesions\nwould enable reliable tracking of interval changes (growth, shrinkage, or\nunchanged status). To that end, an automated technique to detect bone lesions\nis highly desirable. In this pilot work, we developed a pipeline to detect bone\nlesions (lytic, blastic, and mixed) in CT volumes via a proxy segmentation\ntask. First, we used the bone lesions that were prospectively marked by\nradiologists in a few 2D slices of CT volumes and converted them into weak 3D\nsegmentation masks. Then, we trained a 3D full-resolution nnUNet model using\nthese weak 3D annotations to segment the lesions and thereby detected them. Our\nautomated method detected bone lesions in CT with a precision of 96.7% and\nrecall of 47.3% despite the use of incomplete and partial training data. To the\nbest of our knowledge, we are the first to attempt the direct detection of bone\nlesions in CT via a proxy segmentation task.\n","authors":["Tao Sheng","Tejas Sudharshan Mathai","Alexander Shieh","Ronald M. Summers"],"pdf_url":"https://arxiv.org/pdf/2402.00175v1.pdf","comment":"Accepted at SPIE 2024"},{"id":"http://arxiv.org/abs/2402.00163v1","updated":"2024-01-31T20:37:35Z","published":"2024-01-31T20:37:35Z","title":"Improving Object Detection Quality in Football Through Super-Resolution\n Techniques","summary":" This study explores the potential of super-resolution techniques in enhancing\nobject detection accuracy in football. Given the sport's fast-paced nature and\nthe critical importance of precise object (e.g. ball, player) tracking for both\nanalysis and broadcasting, super-resolution could offer significant\nimprovements. We investigate how advanced image processing through\nsuper-resolution impacts the accuracy and reliability of object detection\nalgorithms in processing football match footage.\n Our methodology involved applying state-of-the-art super-resolution\ntechniques to a diverse set of football match videos from SoccerNet, followed\nby object detection using Faster R-CNN. The performance of these algorithms,\nboth with and without super-resolution enhancement, was rigorously evaluated in\nterms of detection accuracy.\n The results indicate a marked improvement in object detection accuracy when\nsuper-resolution preprocessing is applied. 
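The evaluated pipeline itself is simple to sketch: upscale each low-resolution frame, then run an off-the-shelf detector on the enlarged image. In the hypothetical sketch below, bicubic-style resizing stands in for the learned super-resolution model (e.g. RLFN) used in the study.

import torch
import torchvision
from torchvision.transforms.functional import resize

detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
detector.eval()

def detect_with_sr(frame: torch.Tensor, scale: int = 4):
    # frame: (3, H, W) float tensor in [0, 1]; a learned SR network would
    # replace this naive resize in the actual experiments.
    upscaled = resize(frame, [frame.shape[1] * scale, frame.shape[2] * scale])
    with torch.no_grad():
        return detector([upscaled])[0]       # dict with boxes, labels, scores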
The improvement of object detection\nthrough the integration of super-resolution techniques yields significant\nbenefits, especially for low-resolution scenarios, with a notable 12\\% increase\nin mean Average Precision (mAP) at an IoU (Intersection over Union) range of\n0.50:0.95 for 320x240 size images when increasing the resolution fourfold using\nRLFN. As the dimensions increase, the magnitude of improvement becomes more\nsubdued; however, a discernible improvement in the quality of detection is\nconsistently evident. Additionally, we discuss the implications of these\nfindings for real-time sports analytics, player tracking, and the overall\nviewing experience. The study contributes to the growing field of sports\ntechnology by demonstrating the practical benefits and limitations of\nintegrating super-resolution techniques in football analytics and broadcasting.\n","authors":["Karolina Seweryn","Gabriel Chęć","Szymon Łukasik","Anna Wróblewska"],"pdf_url":"https://arxiv.org/pdf/2402.00163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00137v1","updated":"2024-01-31T19:30:04Z","published":"2024-01-31T19:30:04Z","title":"Multimodal Neurodegenerative Disease Subtyping Explained by ChatGPT","summary":" Alzheimer's disease (AD) is the most prevalent neurodegenerative disease; yet\nits currently available treatments are limited to stopping disease progression.\nMoreover, the effectiveness of these treatments is not guaranteed due to the\nheterogeneity of the disease. Therefore, it is essential to be able to identify\nthe disease subtypes at a very early stage. Current data driven approaches are\nable to classify the subtypes at later stages of AD or related disorders, but\nstruggle when predicting at the asymptomatic or prodromal stage. Moreover, most\nexisting models either lack explainability behind the classification or only\nuse a single modality for the assessment, limiting the scope of its analysis. Thus,\nwe propose a multimodal framework that uses early-stage indicators such as\nimaging, genetics and clinical assessments to classify AD patients into\nsubtypes at early stages. Similarly, we build prompts and use large language\nmodels, such as ChatGPT, to interpret the findings of our model. In our\nframework, we propose a tri-modal co-attention mechanism (Tri-COAT) to\nexplicitly learn the cross-modal feature associations. Our proposed model\noutperforms baseline models and provides insight into key cross-modal feature\nassociations supported by known biological mechanisms.\n","authors":["Diego Machado Reyes","Hanqing Chao","Juergen Hahn","Li Shen","Pingkun Yan"],"pdf_url":"https://arxiv.org/pdf/2402.00137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00129v1","updated":"2024-01-31T19:14:12Z","published":"2024-01-31T19:14:12Z","title":"CMRNext: Camera to LiDAR Matching in the Wild for Localization and\n Extrinsic Calibration","summary":" LiDARs are widely used for mapping and localization in dynamic environments.\nHowever, their high cost limits their widespread adoption. On the other hand,\nmonocular localization in LiDAR maps using inexpensive cameras is a\ncost-effective alternative for large-scale deployment. Nevertheless, most\nexisting approaches struggle to generalize to new sensor setups and\nenvironments, requiring retraining or fine-tuning. 
In this paper, we present\nCMRNext, a novel approach for camera-LIDAR matching that is independent of\nsensor-specific parameters, generalizable, and can be used in the wild for\nmonocular localization in LiDAR maps and camera-LiDAR extrinsic calibration.\nCMRNext exploits recent advances in deep neural networks for matching\ncross-modal data and standard geometric techniques for robust pose estimation.\nWe reformulate the point-pixel matching problem as an optical flow estimation\nproblem and solve the Perspective-n-Point problem based on the resulting\ncorrespondences to find the relative pose between the camera and the LiDAR\npoint cloud. We extensively evaluate CMRNext on six different robotic\nplatforms, including three publicly available datasets and three in-house\nrobots. Our experimental evaluations demonstrate that CMRNext outperforms\nexisting approaches on both tasks and effectively generalizes to previously\nunseen environments and sensor setups in a zero-shot manner. We make the code\nand pre-trained models publicly available at http://cmrnext.cs.uni-freiburg.de .\n","authors":["Daniele Cattaneo","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2402.00129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00128v1","updated":"2024-01-31T19:12:56Z","published":"2024-01-31T19:12:56Z","title":"Real-time Traffic Object Detection for Autonomous Driving","summary":" With recent advances in computer vision, it appears that autonomous driving\nwill be part of modern society sooner rather than later. However, there are\nstill a significant number of concerns to address. Although modern computer\nvision techniques demonstrate superior performance, they tend to prioritize\naccuracy over efficiency, which is a crucial aspect of real-time applications.\nLarge object detection models typically require higher computational power,\nwhich is achieved by using more sophisticated onboard hardware. For autonomous\ndriving, these requirements translate to increased fuel costs and, ultimately,\na reduction in mileage. Further, despite their computational demands, the\nexisting object detectors are far from being real-time. In this research, we\nassess the robustness of our previously proposed, highly efficient pedestrian\ndetector LSFM on well-established autonomous driving benchmarks, including\ndiverse weather conditions and nighttime scenes. Moreover, we extend our LSFM\nmodel for general object detection to achieve real-time object detection in\ntraffic scenes. We evaluate its performance, low latency, and generalizability\non traffic object detection datasets. Furthermore, we discuss the inadequacy of\nthe current key performance indicator employed by object detection systems in\nthe context of autonomous driving and propose a more suitable alternative that\nincorporates real-time requirements.\n","authors":["Abdul Hannan Khan","Syed Tahseen Raza Rizvi","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2402.00128v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2402.00126v1","updated":"2024-01-31T19:11:58Z","published":"2024-01-31T19:11:58Z","title":"Common Sense Reasoning for Deep Fake Detection","summary":" State-of-the-art approaches rely on image-based features extracted via neural\nnetworks for the deepfake detection binary classification. 
While these\napproaches trained in the supervised sense extract likely fake features, they\nmay fall short in representing unnatural `non-physical' semantic facial\nattributes -- blurry hairlines, double eyebrows, rigid eye pupils, or unnatural\nskin shading. However, such facial attributes are generally easily perceived by\nhumans via common sense reasoning. Furthermore, image-based feature extraction\nmethods that provide visual explanation via saliency maps can be hard for\nhumans to interpret. To address these challenges, we propose the use of\ncommon sense reasoning to model deepfake detection, and extend it to the\nDeepfake Detection VQA (DD-VQA) task with the aim of modeling human intuition\nin explaining the reason behind labeling an image as either real or fake. To\nthis end, we introduce a new dataset that provides answers to the questions\nrelated to the authenticity of an image, along with its corresponding\nexplanations. We also propose a Vision and Language Transformer-based framework\nfor the DD-VQA task, incorporating text and image aware feature alignment\nformulations. Finally, we evaluate our method on both the performance of\ndeepfake detection and the quality of the generated explanations. We hope that\nthis task inspires researchers to explore new avenues for enhancing\nlanguage-based interpretability and cross-modality applications in the realm of\ndeepfake detection.\n","authors":["Yue Zhang","Ben Colman","Ali Shahriyari","Gaurav Bharaj"],"pdf_url":"https://arxiv.org/pdf/2402.00126v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.18064v1","updated":"2024-01-31T18:41:08Z","published":"2024-01-31T18:41:08Z","title":"Neural Locality Sensitive Hashing for Entity Blocking","summary":" Locality-sensitive hashing (LSH) is a fundamental algorithmic technique\nwidely employed in large-scale data processing applications, such as\nnearest-neighbor search, entity resolution, and clustering. However, its\napplicability in some real-world scenarios is limited due to the need for\ncareful design of hashing functions that align with specific metrics. Existing\nLSH-based Entity Blocking solutions primarily rely on generic similarity\nmetrics such as Jaccard similarity, whereas practical use cases often demand\ncomplex and customized similarity rules surpassing the capabilities of generic\nsimilarity metrics. Consequently, designing LSH functions for these customized\nsimilarity rules presents considerable challenges. In this research, we propose\na neuralization approach to enhance locality-sensitive hashing by training deep\nneural networks to serve as hashing functions for complex metrics. We assess\nthe effectiveness of this approach within the context of the entity resolution\nproblem, which frequently involves the use of task-specific metrics in\nreal-world applications. Specifically, we introduce NLSHBlock (Neural-LSH\nBlock), a novel blocking methodology that leverages pre-trained language\nmodels, fine-tuned with a novel LSH-based loss function. Through extensive\nevaluations conducted on a diverse range of real-world datasets, we demonstrate\nthe superiority of NLSHBlock over existing methods, exhibiting significant\nperformance improvements. 
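For contrast with the learned hashing functions described above, classical random-hyperplane (SimHash-style) LSH blocking over entity embeddings can be sketched as follows; this is the generic-metric baseline that a neural hash such as NLSHBlock is trained to improve upon.

import numpy as np
from collections import defaultdict

def lsh_blocks(embeddings: np.ndarray, n_bits: int = 16, seed: int = 0):
    # Sign patterns against random hyperplanes serve as bucket keys, so
    # similar embeddings tend to collide in the same block.
    rng = np.random.default_rng(seed)
    planes = rng.standard_normal((embeddings.shape[1], n_bits))
    codes = (embeddings @ planes > 0).astype(np.uint8)
    blocks = defaultdict(list)
    for i, code in enumerate(codes):
        blocks[code.tobytes()].append(i)
    return blocks  # candidate pairs are drawn only within each bucket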
Furthermore, we showcase the efficacy of NLSHBlock in\nenhancing the performance of the entity matching phase, particularly within the\nsemi-supervised setting.\n","authors":["Runhui Wang","Luyang Kong","Yefan Tao","Andrew Borthwick","Davor Golac","Henrik Johnson","Shadie Hijazi","Dong Deng","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.18064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16108v2","updated":"2024-01-31T16:51:37Z","published":"2024-01-29T12:27:18Z","title":"Future Impact Decomposition in Request-level Recommendations","summary":" In recommender systems, reinforcement learning solutions have shown promising\nresults in optimizing the interaction sequence between users and the system\nover the long-term performance. For practical reasons, the policy's actions are\ntypically designed as recommending a list of items to handle users' frequent\nand continuous browsing requests more efficiently. In this list-wise\nrecommendation scenario, the user state is updated upon every request in the\ncorresponding MDP formulation. However, this request-level formulation is\nessentially inconsistent with the user's item-level behavior. In this study, we\ndemonstrate that an item-level optimization approach can better utilize item\ncharacteristics and optimize the policy's performance even under the\nrequest-level MDP. We support this claim by comparing the performance of\nstandard request-level methods with the proposed item-level actor-critic\nframework in both simulation and online experiments. Furthermore, we show that\na reward-based future decomposition strategy can better express the item-wise\nfuture impact and improve the recommendation accuracy in the long term. To\nachieve a more thorough understanding of the decomposition strategy, we propose\na model-based re-weighting framework with adversarial learning that further\nboosts the performance and investigate its correlation with the reward-based\nstrategy.\n","authors":["Xiaobei Wang","Shuchang Liu","Xueliang Wang","Qingpeng Cai","Lantao Hu","Han Li","Peng Jiang","Guangming Xie"],"pdf_url":"https://arxiv.org/pdf/2401.16108v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.17952v1","updated":"2024-01-31T15:59:16Z","published":"2024-01-31T15:59:16Z","title":"Error-Tolerant E-Discovery Protocols","summary":" We consider the multi-party classification problem introduced by Dong,\nHartline, and Vijayaraghavan (2022) in the context of electronic discovery\n(e-discovery). Based on a request for production from the requesting party, the\nresponding party is required to provide documents that are responsive to the\nrequest except for those that are legally privileged. Our goal is to find a\nprotocol that verifies that the responding party sends almost all responsive\ndocuments while minimizing the disclosure of non-responsive documents. We\nprovide protocols in the challenging non-realizable setting, where the instance\nmay not be perfectly separated by a linear classifier. We demonstrate\nempirically that our protocol successfully manages to find almost all relevant\ndocuments, while incurring only a small disclosure of non-responsive documents.\nWe complement this with a theoretical analysis of our protocol in the\nsingle-dimensional setting, and other experiments on simulated data which\nsuggest that the non-responsive disclosure incurred by our protocol may be\nunavoidable.\n","authors":["Jinshuo Dong","Jason D. 
Hartline","Liren Shan","Aravindan Vijayaraghavan"],"pdf_url":"https://arxiv.org/pdf/2401.17952v1.pdf","comment":"28 pages, 6 figures, CSLAW 2024"},{"id":"http://arxiv.org/abs/2401.17878v1","updated":"2024-01-31T14:36:44Z","published":"2024-01-31T14:36:44Z","title":"A Survey on Data-Centric Recommender Systems","summary":" Recommender systems (RS) have become essential tools for mitigating\ninformation overload in a range of real-world scenarios. Recent trends in RS\nhave seen a paradigm shift, moving the spotlight from model-centric innovations\nto the importance of data quality and quantity. This evolution has given rise\nto the concept of data-centric recommender systems (Data-Centric RS), marking a\nsignificant development in the field. This survey provides the first systematic\noverview of Data-Centric RS, covering 1) the foundational concepts of\nrecommendation data and Data-Centric RS; 2) three primary issues in\nrecommendation data; 3) recent research developed to address these issues; and\n4) several potential future directions in Data-Centric RS.\n","authors":["Riwei Lai","Li Chen","Rui Chen","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17859v1","updated":"2024-01-31T14:19:08Z","published":"2024-01-31T14:19:08Z","title":"Towards Semantic Consistency: Dirichlet Energy Driven Robust Multi-Modal\n Entity Alignment","summary":" In Multi-Modal Knowledge Graphs (MMKGs), Multi-Modal Entity Alignment (MMEA)\nis crucial for identifying identical entities across diverse modal attributes.\nHowever, semantic inconsistency, mainly due to missing modal attributes, poses\na significant challenge. Traditional approaches rely on attribute\ninterpolation, but this often introduces modality noise, distorting the\noriginal semantics. Moreover, the lack of a universal theoretical framework\nlimits advancements in achieving semantic consistency. This study introduces a\nnovel approach, DESAlign, which addresses these issues by applying a\ntheoretical framework based on Dirichlet energy to ensure semantic consistency.\nWe discover that semantic inconsistency leads to model overfitting to modality\nnoise, causing performance fluctuations, particularly when modalities are\nmissing. DESAlign innovatively combats over-smoothing and interpolates absent\nsemantics using existing modalities. Our approach includes a multi-modal\nknowledge graph learning strategy and a propagation technique that employs\nexisting semantic features to compensate for missing ones, providing explicit\nEuler solutions. Comprehensive evaluations across 18 benchmarks, including\nmonolingual and bilingual scenarios, demonstrate that DESAlign surpasses\nexisting methods, setting a new standard in performance. Further testing on 42\nbenchmarks with high rates of missing modalities confirms its robustness,\noffering an effective solution to semantic inconsistency in real-world MMKGs.\n","authors":["Yuanyi Wang","Haifeng Sun","Jiabo Wang","Jingyu Wang","Wei Tang","Qi Qi","Shaoling Sun","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2401.17859v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2307.16210 by other authors"},{"id":"http://arxiv.org/abs/2401.17855v1","updated":"2024-01-31T14:17:00Z","published":"2024-01-31T14:17:00Z","title":"Network-based Topic Structure Visualization","summary":" In the real world, many topics are inter-correlated, making it challenging to\ninvestigate their structure and relationships. 
Understanding the interplay\nbetween topics and their relevance can provide valuable insights for\nresearchers, guiding their studies and informing the direction of research. In\nthis paper, we utilize the topic-words distribution, obtained from topic\nmodels, as item-response data to model the structure of topics using a latent\nspace item response model. By estimating the latent positions of topics based\non their distances toward words, we can capture the underlying topic structure\nand reveal their relationships. Visualizing the latent positions of topics in\nEuclidean space allows for an intuitive understanding of their proximity and\nassociations. We interpret relationships among topics by characterizing each\ntopic based on representative words selected using a newly proposed scoring\nscheme. Additionally, we assess the maturity of topics by tracking their latent\npositions using different word sets, providing insights into the robustness of\ntopics. To demonstrate the effectiveness of our approach, we analyze the topic\ncomposition of COVID-19 studies during the early stage of its emergence using\nbiomedical literature in the PubMed database. The software and data used in\nthis paper are publicly available at https://github.com/jeon9677/gViz .\n","authors":["Yeseul Jeon","Jina Park","Ick Hoon Jin","Dongjun Chungc"],"pdf_url":"https://arxiv.org/pdf/2401.17855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17839v1","updated":"2024-01-31T13:57:24Z","published":"2024-01-31T13:57:24Z","title":"Global-Liar: Factuality of LLMs over Time and Geographic Regions","summary":" The increasing reliance on AI-driven solutions, particularly Large Language\nModels (LLMs) like the GPT series, for information retrieval highlights the\ncritical need for their factuality and fairness, especially amidst the rampant\nspread of misinformation and disinformation online. Our study evaluates the\nfactual accuracy, stability, and biases in widely adopted GPT models, including\nGPT-3.5 and GPT-4, contributing to reliability and integrity of AI-mediated\ninformation dissemination.\n We introduce 'Global-Liar,' a dataset uniquely balanced in terms of\ngeographic and temporal representation, facilitating a more nuanced evaluation\nof LLM biases. Our analysis reveals that newer iterations of GPT models do not\nalways equate to improved performance. Notably, the GPT-4 version from March\ndemonstrates higher factual accuracy than its subsequent June release.\nFurthermore, a concerning bias is observed, privileging statements from the\nGlobal North over the Global South, thus potentially exacerbating existing\ninformational inequities. Regions such as Africa and the Middle East are at a\ndisadvantage, with much lower factual accuracy. The performance fluctuations\nover time suggest that model updates may not consistently benefit all regions\nequally.\n Our study also offers insights into the impact of various LLM configuration\nsettings, such as binary decision forcing, model re-runs and temperature, on\nmodel's factuality. Models constrained to binary (true/false) choices exhibit\nreduced factuality compared to those allowing an 'unclear' option. Single\ninference at a low temperature setting matches the reliability of majority\nvoting across various configurations. The insights gained highlight the need\nfor culturally diverse and geographically inclusive model training and\nevaluation. 
This approach is key to achieving global equity in technology,\ndistributing AI benefits fairly worldwide.\n","authors":["Shujaat Mirza","Bruno Coelho","Yuyuan Cui","Christina Pöpper","Damon McCoy"],"pdf_url":"https://arxiv.org/pdf/2401.17839v1.pdf","comment":"24 pages, 12 figures, 9 tables"},{"id":"http://arxiv.org/abs/2306.08915v2","updated":"2024-01-31T11:53:57Z","published":"2023-06-15T07:38:25Z","title":"Prompt Performance Prediction for Image Generation","summary":" The ability to predict the performance of a query before results are returned\nhas been a longstanding challenge in Information Retrieval (IR) systems.\nInspired by this task, we introduce, in this paper, a novel task called \"Prompt\nPerformance Prediction\" (PPP) that aims to predict the performance of a prompt,\nbefore obtaining the actual generated images. We demonstrate the plausibility\nof our task by measuring the correlation coefficient between predicted and\nactual performance scores across three datasets containing pairs of prompts\nand generated images, as well as three art-domain datasets of real images and\nreal user appreciation ratings. Our results show promising performance\nprediction capabilities, suggesting potential applications for optimizing user\nprompts.\n","authors":["Nicolas Bizzozzero","Ihab Bendidi","Olivier Risser-Maroix"],"pdf_url":"https://arxiv.org/pdf/2306.08915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06633v2","updated":"2024-01-31T11:07:32Z","published":"2024-01-12T15:26:40Z","title":"Ada-Retrieval: An Adaptive Multi-Round Retrieval Paradigm for Sequential\n Recommendations","summary":" Retrieval models aim at selecting a small set of item candidates which match\nthe preference of a given user. They play a vital role in large-scale\nrecommender systems since subsequent models such as rankers highly depend on\nthe quality of item candidates. However, most existing retrieval models employ\na single-round inference paradigm, which may not adequately capture the dynamic\nnature of user preferences and may get stuck in one area of the item space. In this\npaper, we propose Ada-Retrieval, an adaptive multi-round retrieval paradigm for\nrecommender systems that iteratively refines user representations to better\ncapture potential candidates in the full item space. Ada-Retrieval comprises\ntwo key modules: the item representation adapter and the user representation\nadapter, designed to inject context information into items' and users'\nrepresentations. The framework maintains a model-agnostic design, allowing\nseamless integration with various backbone models such as RNNs or Transformers.\nWe perform experiments on three widely used public datasets, incorporating five\npowerful sequential recommenders as backbone models. Our results demonstrate\nthat Ada-Retrieval significantly enhances the performance of various base\nmodels, with consistent improvements observed across different datasets. Our\ncode and data are publicly available at:\nhttps://github.com/ll0ruc/Ada-Retrieval.\n","authors":["Lei Li","Jianxun Lian","Xiao Zhou","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2401.06633v2.pdf","comment":"9 pages, Accepted to AAAI2024"},{"id":"http://arxiv.org/abs/2401.17723v1","updated":"2024-01-31T10:35:53Z","published":"2024-01-31T10:35:53Z","title":"LoRec: Large Language Model for Robust Sequential Recommendation against\n Poisoning Attacks","summary":" Sequential recommender systems stand out for their ability to capture users'\ndynamic interests and the patterns of item-to-item transitions. 
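Stepping back to Ada-Retrieval's multi-round paradigm: the loop can be sketched as repeated top-k retrieval interleaved with a user-representation update. Everything below (the GRU-cell adapter, the mean-pooled round summary) is a hypothetical, minimal rendering of the idea rather than the released code.

import torch
import torch.nn as nn

def multi_round_retrieve(user_vec, item_emb, adapter: nn.GRUCell,
                         rounds: int = 3, k: int = 50):
    retrieved, seen = [], set()
    for _ in range(rounds):
        scores = item_emb @ user_vec                  # (num_items,)
        if seen:                                      # exclude earlier picks
            scores[list(seen)] = float("-inf")
        top = torch.topk(scores, k).indices
        retrieved.append(top)
        seen.update(top.tolist())
        context = item_emb[top].mean(dim=0)           # summary of this round
        user_vec = adapter(context.unsqueeze(0), user_vec.unsqueeze(0))[0]
    return torch.cat(retrieved)

# e.g.: multi_round_retrieve(torch.randn(64), torch.randn(10_000, 64),
#                            nn.GRUCell(64, 64))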
However, the\ninherent openness of sequential recommender systems renders them vulnerable to\npoisoning attacks, where fraudulent users are injected into the training data\nto manipulate learned patterns. Traditional defense strategies predominantly\ndepend on predefined assumptions or rules extracted from specific known\nattacks, limiting their generalizability to unknown attack types. To solve the\nabove problems, considering the rich open-world knowledge encapsulated in Large\nLanguage Models (LLMs), our research initially focuses on the capabilities of\nLLMs in the detection of unknown fraudulent activities within recommender\nsystems, a strategy we denote as LLM4Dec. Empirical evaluations demonstrate the\nsubstantial capability of LLMs in identifying unknown fraudsters, leveraging\ntheir expansive, open-world knowledge.\n Building upon this, we propose the integration of LLMs into defense\nstrategies to extend their effectiveness beyond the confines of known attacks.\nWe propose LoRec, an advanced framework that employs LLM-Enhanced Calibration\nto strengthen the robustness of sequential recommender systems against\npoisoning attacks. LoRec integrates an LLM-enhanced CalibraTor (LCT) that\nrefines the training process of sequential recommender systems with knowledge\nderived from LLMs, applying a user-wise reweighting to diminish the impact of\nfraudsters injected by attacks. By incorporating LLMs' open-world knowledge,\nthe LCT effectively converts the limited, specific priors or rules into a more\ngeneral pattern of fraudsters, offering improved defenses against poisoning\nattacks. Our comprehensive experiments validate that LoRec, as a general\nframework, significantly strengthens the robustness of sequential recommender\nsystems.\n","authors":["Kaike Zhang","Qi Cao","Yunfan Wu","Fei Sun","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.17723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14678v2","updated":"2024-01-31T09:34:37Z","published":"2024-01-26T06:42:04Z","title":"Prompt-enhanced Federated Content Representation Learning for\n Cross-domain Recommendation","summary":" Cross-domain Recommendation (CDR) as one of the effective techniques in\nalleviating the data sparsity issues has been widely studied in recent years.\nHowever, previous works may cause domain privacy leakage since they necessitate\nthe aggregation of diverse domain data into a centralized server during the\ntraining process. Though several studies have conducted privacy preserving CDR\nvia Federated Learning (FL), they still have the following limitations: 1) They\nneed to upload users' personal information to the central server, posing the\nrisk of leaking user privacy. 2) Existing federated methods mainly rely on\natomic item IDs to represent items, which prevents them from modeling items in\na unified feature space, increasing the challenge of knowledge transfer among\ndomains. 3) They are all based on the premise of knowing overlapped users\nbetween domains, which proves impractical in real-world applications. To\naddress the above limitations, we focus on Privacy-preserving Cross-domain\nRecommendation (PCDR) and propose PFCR as our solution. For Limitation 1, we\ndevelop a FL schema by exclusively utilizing users' interactions with local\nclients and devising an encryption method for gradient encryption. For\nLimitation 2, we model items in a universal feature space by their description\ntexts. 
For Limitation 3, we initially learn federated content representations,\nharnessing the generality of natural language to establish bridges between\ndomains. Subsequently, we craft two prompt fine-tuning strategies to tailor the\npre-trained model to the target domain. Extensive experiments on two real-world\ndatasets demonstrate the superiority of our PFCR method compared to the SOTA\napproaches.\n","authors":["Lei Guo","Ziang Lu","Junliang Yu","Nguyen Quoc Viet Hung","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2401.14678v2.pdf","comment":"11 pages, 3 figures, accepted by WWW 2024"},{"id":"http://arxiv.org/abs/2401.17645v1","updated":"2024-01-31T07:58:54Z","published":"2024-01-31T07:58:54Z","title":"ReSLLM: Large Language Models are Strong Resource Selectors for\n Federated Search","summary":" Federated search, which involves integrating results from multiple\nindependent search engines, will become increasingly pivotal in the context of\nRetrieval-Augmented Generation pipelines empowering LLM-based applications such\nas chatbots. These systems often distribute queries among various search\nengines, ranging from specialized (e.g., PubMed) to general (e.g., Google),\nbased on the nature of user utterances. A critical aspect of federated search\nis resource selection - the selection of appropriate resources prior to issuing\nthe query to ensure high-quality and rapid responses, and to contain costs\nassociated with calling the external search engines. However, current SOTA\nresource selection methodologies primarily rely on feature-based learning\napproaches. These methods often involve the labour-intensive and expensive\ncreation of training labels for each resource. In contrast, LLMs have exhibited\nstrong effectiveness as zero-shot methods across NLP and IR tasks. We\nhypothesise that, in the context of federated search, LLMs can assess the\nrelevance of resources without the need for extensive predefined labels or\nfeatures. In this paper, we propose ReSLLM. Our ReSLLM method exploits LLMs to\ndrive the selection of resources in federated search in a zero-shot setting. In\naddition, we devise an unsupervised fine-tuning protocol, the Synthetic Label\nAugmentation Tuning (SLAT), where the relevance of previously logged queries\nand snippets from resources is predicted using an off-the-shelf LLM and then in\nturn used to fine-tune ReSLLM with respect to resource selection. Our empirical\nevaluation and analysis detail the factors influencing the effectiveness of\nLLMs in this context. The results showcase the merits of ReSLLM for resource\nselection: not only competitive effectiveness in the zero-shot setting, but\nalso large gains when fine-tuned using the SLAT protocol.\n","authors":["Shuai Wang","Shengyao Zhuang","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2401.17645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17630v1","updated":"2024-01-31T07:20:56Z","published":"2024-01-31T07:20:56Z","title":"Towards Personalized Privacy: User-Governed Data Contribution for\n Federated Recommendation","summary":" Federated recommender systems (FedRecs) have gained significant attention for\ntheir potential to protect users' privacy by keeping users' private data locally\nand only communicating model parameters/gradients to the server.
Nevertheless,\nthe currently existing architecture of FedRecs assumes that all users have the\nsame 0-privacy budget, i.e., they do not upload any data to the server, thus\noverlooking those users who are less concerned about privacy and are willing to\nupload data to get a better recommendation service. To bridge this gap, this\npaper explores a user-governed data contribution federated recommendation\narchitecture where users are free to take control of whether they share data\nand the proportion of data they share with the server. To this end, this paper\npresents a cloud-device collaborative graph neural network federated\nrecommendation model, named CDCGNNFed. It trains user-centric ego graphs\nlocally, and high-order graphs based on user-shared data in the server in a\ncollaborative manner via contrastive learning. Furthermore, a graph mending\nstrategy is utilized to predict missing links in the graph on the server, thus\nleveraging the capabilities of graph neural networks over high-order graphs.\nExtensive experiments were conducted on two public datasets, and the results\ndemonstrate the effectiveness of the proposed method.\n","authors":["Liang Qu","Wei Yuan","Ruiqi Zheng","Lizhen Cui","Yuhui Shi","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2401.17630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17543v1","updated":"2024-01-31T02:14:31Z","published":"2024-01-31T02:14:31Z","title":"Fréchet Distance for Offline Evaluation of Information Retrieval\n Systems with Sparse Labels","summary":" The rapid advancement of natural language processing, information retrieval\n(IR), computer vision, and other technologies has presented significant\nchallenges in evaluating the performance of these systems. One of the main\nchallenges is the scarcity of human-labeled data, which hinders the fair and\naccurate assessment of these systems. In this work, we specifically focus on\nevaluating IR systems with sparse labels, borrowing from recent research on\nevaluating computer vision tasks and taking inspiration from the success of using\nFr\\'echet Inception Distance (FID) in assessing text-to-image generation\nsystems. We propose leveraging the Fr\\'echet Distance to measure the distance\nbetween the distributions of relevant judged items and retrieved results. Our\nexperimental results on the MS MARCO V1 dataset and TREC Deep Learning Tracks query\nsets demonstrate the effectiveness of the Fr\\'echet Distance as a metric for\nevaluating IR systems, particularly in settings where only a few labels are\navailable. This approach contributes to the advancement of evaluation\nmethodologies in real-world scenarios such as the assessment of generative IR\nsystems.\n","authors":["Negar Arabzadeh","Charles L. A. Clarke"],"pdf_url":"https://arxiv.org/pdf/2401.17543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07713v2","updated":"2024-01-31T23:27:26Z","published":"2023-10-11T17:59:05Z","title":"InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining","summary":" Pretraining auto-regressive large language models (LLMs) with retrieval\ndemonstrates better perplexity and factual accuracy by leveraging external\ndatabases. However, the size of existing pretrained retrieval-augmented LLMs is\nstill limited (e.g., Retro has 7.5B parameters), which limits the effectiveness\nof instruction tuning and zero-shot generalization. In this work, we introduce\nRetro 48B, the largest LLM pretrained with retrieval.
Specifically, we continue\nto pretrain a 43B GPT model on an additional 100 billion tokens using the Retro\naugmentation method by retrieving from 1.2 trillion tokens. Notably, the\nobtained foundation model, Retro 48B, largely outperforms the counterpart GPT\n43B trained on 1.2T tokens in terms of perplexity with only 2.58% additional\nGPU hours, demonstrating the significant scaling potential of the method. After\ninstruction tuning on Retro, InstructRetro demonstrates significant improvement\nover the instruction-tuned GPT on a wide range of zero-shot tasks.\nSpecifically, the average improvement of InstructRetro is 7% over its GPT\ncounterpart across 8 short-form QA and reading comprehension tasks, 10% over\nGPT across 4 challenging long-form QA tasks, and 16% over GPT across 3\nsummarization tasks. Surprisingly, we find that one can ablate the encoder from\nthe InstructRetro architecture and directly use its decoder backbone, while\nachieving comparable results. Our results highlight the promise of\nobtaining a better GPT decoder through continued pretraining with retrieval before\ninstruction tuning. Our code and checkpoints are publicly available at:\nhttps://github.com/NVIDIA/Megatron-LM/tree/InstructRetro/tools/retro.\n","authors":["Boxin Wang","Wei Ping","Lawrence McAfee","Peng Xu","Bo Li","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.07713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03367v1","updated":"2024-01-31T22:06:07Z","published":"2024-01-31T22:06:07Z","title":"RAG-Fusion: a New Take on Retrieval-Augmented Generation","summary":" Infineon has identified a need for engineers, account managers, and customers\nto rapidly obtain product information. This problem is traditionally addressed\nwith retrieval-augmented generation (RAG) chatbots, but in this study, I\nevaluated the use of the newly popularized RAG-Fusion method. RAG-Fusion\ncombines RAG and reciprocal rank fusion (RRF) by generating multiple queries,\nreranking them with reciprocal scores and fusing the documents and scores.\nThrough manually evaluating answers on accuracy, relevance, and\ncomprehensiveness, I found that RAG-Fusion was able to provide accurate and\ncomprehensive answers due to the generated queries contextualizing the original\nquery from various perspectives. However, some answers strayed off topic when\nthe generated queries' relevance to the original query was insufficient. This\nresearch marks significant progress in artificial intelligence (AI) and natural\nlanguage processing (NLP) applications and demonstrates transformations in a\nglobal and multi-industry context.\n","authors":["Zackary Rackauckas"],"pdf_url":"https://arxiv.org/pdf/2402.03367v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2402.03366v1","updated":"2024-01-31T14:06:26Z","published":"2024-01-31T14:06:26Z","title":"Uncertainty-Aware Explainable Recommendation with Large Language Models","summary":" Providing explanations within the recommendation system would boost user\nsatisfaction and foster trust, especially by elaborating on the reasons for\nselecting recommended items tailored to the user. The predominant approach in\nthis domain revolves around generating text-based explanations, with a notable\nemphasis on applying large language models (LLMs). However, refining LLMs for\nexplainable recommendations proves impractical due to time constraints and\ncomputing resource limitations.
As an alternative, the current approach\ninvolves training the prompt rather than the LLM. In this study, we developed a\nmodel that utilizes the ID vectors of user and item inputs as prompts for\nGPT-2. We employed a joint training mechanism within a multi-task learning\nframework to optimize both the recommendation task and explanation task. This\nstrategy enables a more effective exploration of users' interests, improving\nrecommendation effectiveness and user satisfaction. In our experiments,\nthe method achieves 1.59 DIV, 0.57 USR, and 0.41 FCR on the Yelp, TripAdvisor,\nand Amazon datasets, respectively, demonstrating superior performance over four\nSOTA methods in terms of explainability evaluation metrics. In addition, we\nidentified that the proposed model is able to ensure stable textual quality on\nthe three public datasets.\n","authors":["Yicui Peng","Hao Chen","Chingsheng Lin","Guo Huang","Jinrong Hu","Hui Guo","Bin Kong","Shu Hu","Xi Wu","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03365v1","updated":"2024-01-31T11:03:58Z","published":"2024-01-31T11:03:58Z","title":"Heterophily-Aware Fair Recommendation using Graph Convolutional Networks","summary":" In recent years, graph neural networks (GNNs) have become a popular tool to\nimprove the accuracy and performance of recommender systems. Modern recommender\nsystems are not only designed to serve the end users, but also to benefit other\nparticipants, such as items and item providers. These participants may have\ndifferent or conflicting goals and interests, which raise the need for fairness\nand popularity bias considerations. GNN-based recommendation methods also face\nthe challenges of unfairness and popularity bias, and their normalization and\naggregation processes suffer from these challenges. In this paper, we propose a\nfair GNN-based recommender system, called HetroFair, to improve items' side\nfairness. HetroFair uses two separate components to generate fairness-aware\nembeddings: i) fairness-aware attention, which incorporates the dot product in the\nnormalization process of GNNs to decrease the effect of nodes' degrees, and\nii) heterophily feature weighting to assign distinct weights to different\nfeatures during the aggregation process. In order to evaluate the effectiveness\nof HetroFair, we conduct extensive experiments over six real-world datasets.\nOur experimental results reveal that HetroFair not only alleviates the\nunfairness and popularity bias on the items' side, but also achieves superior\naccuracy on the users' side. Our implementation is publicly available at\nhttps://github.com/NematGH/HetroFair\n","authors":["Nemat Gholinejad","Mostafa Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2402.03365v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.18079v1","updated":"2024-01-31T18:58:14Z","published":"2024-01-31T18:58:14Z","title":"KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache\n Quantization","summary":" LLMs are seeing growing use for applications such as document analysis and\nsummarization which require large context windows, and with these large context\nwindows KV cache activations surface as the dominant contributor to memory\nconsumption during inference. Quantization is a promising approach for\ncompressing KV cache activations; however, existing solutions fail to represent\nactivations accurately in ultra-low precisions, such as sub-4-bit.
In this\nwork, we present KVQuant, which addresses this problem by incorporating novel\nmethods for quantizing cached KV activations, including: (i) Per-Channel Key\nQuantization, where we adjust the dimension along which we quantize the Key\nactivations to better match the distribution; (ii) Pre-RoPE Key Quantization,\nwhere we quantize Key activations before the rotary positional embedding to\nmitigate its impact on quantization; (iii) Non-Uniform KV Cache Quantization,\nwhere we derive per-layer sensitivity-weighted non-uniform datatypes that\nbetter represent the distributions; (iv) Per-Vector Dense-and-Sparse\nQuantization, where we isolate outliers separately for each vector to minimize\nskews in quantization ranges; and (v) Q-Norm, where we normalize quantization\ncentroids in order to mitigate distribution shift, providing additional\nbenefits for 2-bit quantization. By applying our method to the LLaMA, LLaMA-2,\nand Mistral models, we achieve $<0.1$ perplexity degradation with 3-bit\nquantization on both Wikitext-2 and C4, outperforming existing approaches. Our\nmethod enables serving the LLaMA-7B model with a context length of up to 1\nmillion on a single A100-80GB GPU and up to 10 million on an 8-GPU system.\n","authors":["Coleman Hooper","Sehoon Kim","Hiva Mohammadzadeh","Michael W. Mahoney","Yakun Sophia Shao","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2401.18079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16468v2","updated":"2024-01-31T18:54:15Z","published":"2024-01-29T18:53:33Z","title":"High-Quality Image Restoration Following Human Instructions","summary":" Image restoration is a fundamental problem that involves recovering a\nhigh-quality clean image from its degraded observation. All-In-One image\nrestoration models can effectively restore images from various types and levels\nof degradation using degradation-specific information as prompts to guide the\nrestoration model. In this work, we present the first approach that uses\nhuman-written instructions to guide the image restoration model. Given natural\nlanguage prompts, our model can recover high-quality images from their degraded\ncounterparts, considering multiple degradation types. Our method, InstructIR,\nachieves state-of-the-art results on several restoration tasks including image\ndenoising, deraining, deblurring, dehazing, and (low-light) image enhancement.\nInstructIR improves +1dB over previous all-in-one restoration methods.\nMoreover, our dataset and results represent a novel benchmark for new research\non text-guided image restoration and enhancement. Our code, datasets and models\nare available at: https://github.com/mv-lab/InstructIR\n","authors":["Marcos V. Conde","Gregor Geigle","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2401.16468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18070v1","updated":"2024-01-31T18:48:20Z","published":"2024-01-31T18:48:20Z","title":"Do Language Models Exhibit the Same Cognitive Biases in Problem Solving\n as Human Learners?","summary":" There is increasing interest in employing large language models (LLMs) as\ncognitive models. For such purposes, it is central to understand which\ncognitive properties are well-modeled by LLMs, and which are not. In this work,\nwe study the biases of LLMs in relation to those known in children when solving\narithmetic word problems. 
Surveying the learning science literature, we posit\nthat the problem-solving process can be split into three distinct steps: text\ncomprehension, solution planning and solution execution. We construct tests for\neach one in order to understand which parts of this process can be faithfully\nmodeled by current state-of-the-art LLMs. We generate a novel set of word\nproblems for each of these tests, using a neuro-symbolic method that enables\nfine-grained control over the problem features. We find evidence that LLMs,\nwith and without instruction-tuning, exhibit human-like biases in both the\ntext-comprehension and the solution-planning steps of the solving process, but\nnot during the final step which relies on the problem's arithmetic expressions\n(solution execution).\n","authors":["Andreas Opedal","Alessandro Stolfo","Haruki Shirakami","Ying Jiao","Ryan Cotterell","Bernhard Schölkopf","Abulhair Saparov","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2401.18070v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.18059v1","updated":"2024-01-31T18:30:21Z","published":"2024-01-31T18:30:21Z","title":"RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval","summary":" Retrieval-augmented language models can better adapt to changes in world\nstate and incorporate long-tail knowledge. However, most existing methods\nretrieve only short contiguous chunks from a retrieval corpus, limiting\nholistic understanding of the overall document context. We introduce the novel\napproach of recursively embedding, clustering, and summarizing chunks of text,\nconstructing a tree with differing levels of summarization from the bottom up.\nAt inference time, our RAPTOR model retrieves from this tree, integrating\ninformation across lengthy documents at different levels of abstraction.\nControlled experiments show that retrieval with recursive summaries offers\nsignificant improvements over traditional retrieval-augmented LMs on several\ntasks. On question-answering tasks that involve complex, multi-step reasoning,\nwe show state-of-the-art results; for example, by coupling RAPTOR retrieval\nwith the use of GPT-4, we can improve the best performance on the QuALITY\nbenchmark by 20% in absolute accuracy.\n","authors":["Parth Sarthi","Salman Abdullah","Aditi Tuli","Shubh Khanna","Anna Goldie","Christopher D. Manning"],"pdf_url":"https://arxiv.org/pdf/2401.18059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18058v1","updated":"2024-01-31T18:29:39Z","published":"2024-01-31T18:29:39Z","title":"LongAlign: A Recipe for Long Context Alignment of Large Language Models","summary":" Extending large language models to effectively handle long contexts requires\ninstruction fine-tuning on input sequences of similar length. To address this,\nwe present LongAlign -- a recipe of the instruction data, training, and\nevaluation for long context alignment. First, we construct a long\ninstruction-following dataset using Self-Instruct. To ensure the data\ndiversity, it covers a broad range of tasks from various long context sources.\nSecond, we adopt the packing and sorted batching strategies to speed up\nsupervised fine-tuning on data with varied length distributions. Additionally,\nwe develop a loss weighting method to balance the contribution to the loss\nacross different sequences during packing training. Third, we introduce the\nLongBench-Chat benchmark for evaluating instruction-following capabilities on\nqueries of 10k-100k in length. 
Experiments show that LongAlign outperforms\nexisting recipes for LLMs in long context tasks by up to 30\\%, while also\nmaintaining their proficiency in handling short, generic tasks. The code, data,\nand long-aligned models are open-sourced at https://github.com/THUDM/LongAlign.\n","authors":["Yushi Bai","Xin Lv","Jiajie Zhang","Yuze He","Ji Qi","Lei Hou","Jie Tang","Yuxiao Dong","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2401.18058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18057v1","updated":"2024-01-31T18:29:10Z","published":"2024-01-31T18:29:10Z","title":"Rank Supervised Contrastive Learning for Time Series Classification","summary":" Recently, various contrastive learning techniques have been developed to\ncategorize time series data and exhibit promising performance. A general\nparadigm is to utilize appropriate augmentations and construct feasible\npositive samples such that the encoder can yield robust and discriminative\nrepresentations by mapping similar data points closer together in the feature\nspace while pushing dissimilar data points farther apart. Despite its efficacy,\nthe fine-grained relative similarity (e.g., rank) information of positive\nsamples is largely ignored, especially when labeled samples are limited. To\nthis end, we present Rank Supervised Contrastive Learning (RankSCL) to perform\ntime series classification. Different from conventional contrastive learning\nframeworks, RankSCL augments raw data in a targeted way in the embedding space\nand adopts certain filtering rules to select more informative positive and\nnegative pairs of samples. Moreover, a novel rank loss is developed to assign\ndifferent weights to different levels of positive samples, enabling the encoder\nto extract the fine-grained information of the same class and produce a clear\nboundary among different classes. Thorough empirical studies on 128 UCR\ndatasets and 30 UEA datasets demonstrate that the proposed RankSCL can achieve\nstate-of-the-art performance compared to existing baseline methods.\n","authors":["Qianying Ren","Dongsheng Luo","Dongjin Song"],"pdf_url":"https://arxiv.org/pdf/2401.18057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18054v1","updated":"2024-01-31T18:20:42Z","published":"2024-01-31T18:20:42Z","title":"Benchmarking Sensitivity of Continual Graph Learning for Skeleton-Based\n Action Recognition","summary":" Continual learning (CL) is the research field that aims to build machine\nlearning models that can accumulate knowledge continuously over different tasks\nwithout retraining from scratch. Previous studies have shown that pre-training\ngraph neural networks (GNNs) may lead to negative transfer (Hu et al., 2020)\nafter fine-tuning, a setting which is closely related to CL. Thus, we focus on\nstudying GNNs in the continual graph learning (CGL) setting. We propose the\nfirst continual graph learning benchmark for spatio-temporal graphs and use it\nto benchmark well-known CGL methods in this novel setting. The benchmark is\nbased on the N-UCLA and NTU-RGB+D datasets for skeleton-based action\nrecognition. Beyond benchmarking for standard performance metrics, we study the\nclass and task-order sensitivity of CGL methods, i.e., the impact of learning\norder on each class/task's performance, and the architectural sensitivity of\nCGL methods with backbone GNNs at various widths and depths.
We reveal that\ntask-order robust methods can still be class-order sensitive and observe\nresults that contradict previous empirical observations on architectural\nsensitivity in CL.\n","authors":["Wei Wei","Tom De Schepper","Kevin Mets"],"pdf_url":"https://arxiv.org/pdf/2401.18054v1.pdf","comment":"This work is accepted at VISAPP 2024 as a short paper"},{"id":"http://arxiv.org/abs/2401.18047v1","updated":"2024-01-31T18:08:06Z","published":"2024-01-31T18:08:06Z","title":"Epidemic Modeling using Hybrid of Time-varying SIRD, Particle Swarm\n Optimization, and Deep Learning","summary":" Epidemiological models are best suited to model an epidemic if the spread\npattern is stationary. To deal with non-stationary patterns and multiple waves\nof an epidemic, we develop a hybrid model encompassing epidemic modeling,\nparticle swarm optimization, and deep learning. The model mainly caters to\nthree objectives for better prediction: 1. Periodic estimation of the model\nparameters. 2. Incorporating the impact of all the aspects using data fitting and\nparameter optimization. 3. Deep-learning-based prediction of the model\nparameters. In our model, we use a system of ordinary differential equations\n(ODEs) for Susceptible-Infected-Recovered-Dead (SIRD) epidemic modeling,\nParticle Swarm Optimization (PSO) for model parameter optimization, and\nstacked-LSTM for forecasting the model parameters. Initial or one-time\nestimation of model parameters is not able to model multiple waves of an\nepidemic. So, we estimate the model parameters periodically (weekly). We use\nPSO to identify the optimum values of the model parameters. We next train the\nstacked-LSTM on the optimized parameters, and perform forecasting of the model\nparameters for the upcoming four weeks. Further, we feed the LSTM-forecasted\nparameters into the SIRD model to forecast the number of COVID-19 cases. We\nevaluate the model on three highly affected countries, namely the USA, India,\nand the UK. The proposed hybrid model is able to deal with multiple waves, and\nhas outperformed existing methods on all three datasets.\n","authors":["Naresh Kumar","Seba Susan"],"pdf_url":"https://arxiv.org/pdf/2401.18047v1.pdf","comment":"Accepted in ICCCNT 2023"},{"id":"http://arxiv.org/abs/2311.06643v2","updated":"2024-01-31T18:06:16Z","published":"2023-11-11T18:58:01Z","title":"Privacy Risks Analysis and Mitigation in Federated Learning for Medical\n Images","summary":" Federated learning (FL) is gaining increasing popularity in the medical\ndomain for analyzing medical images, which is considered an effective technique\nto safeguard sensitive patient data and comply with privacy regulations.\nHowever, several recent studies have revealed that the default settings of FL\nmay leak private training data under privacy attacks. Thus, it is still unclear\nwhether and to what extent such privacy risks of FL exist in the medical\ndomain, and if so, \"how to mitigate such risks?\". In this paper, first, we\npropose a holistic framework for Medical data Privacy risk analysis and\nmitigation in Federated Learning (MedPFL) to analyze privacy risks and develop\neffective mitigation strategies in FL for protecting private medical data.\nSecond, we demonstrate the substantial privacy risks of using FL to process\nmedical images, where adversaries can easily perform privacy attacks to\nreconstruct private medical images accurately.
Third, we show that the defense\napproach of adding random noises may not always work effectively to protect\nmedical images against privacy attacks in FL, which poses unique and pressing\nchallenges associated with medical data for privacy protection.\n","authors":["Badhan Chandra Das","M. Hadi Amini","Yanzhao Wu"],"pdf_url":"https://arxiv.org/pdf/2311.06643v2.pdf","comment":"V1"},{"id":"http://arxiv.org/abs/2401.18039v1","updated":"2024-01-31T18:01:36Z","published":"2024-01-31T18:01:36Z","title":"Variable selection for Naïve Bayes classification","summary":" The Na\\\"ive Bayes has proven to be a tractable and efficient method for\nclassification in multivariate analysis. However, features are usually\ncorrelated, a fact that violates the Na\\\"ive Bayes' assumption of conditional\nindependence, and may deteriorate the method's performance. Moreover, datasets\nare often characterized by a large number of features, which may complicate the\ninterpretation of the results as well as slow down the method's execution.\n In this paper we propose a sparse version of the Na\\\"ive Bayes classifier\nthat is characterized by three properties. First, the sparsity is achieved\ntaking into account the correlation structure of the covariates. Second,\ndifferent performance measures can be used to guide the selection of features.\nThird, performance constraints on groups of higher interest can be included.\nOur proposal leads to a smart search, which yields competitive running times,\nwhereas the flexibility in terms of performance measure for classification is\nintegrated. Our findings show that, when compared against well-referenced\nfeature selection approaches, the proposed sparse Na\\\"ive Bayes obtains\ncompetitive results regarding accuracy, sparsity and running times for balanced\ndatasets. In the case of datasets with unbalanced (or with different\nimportance) classes, a better compromise between classification rates for the\ndifferent classes is achieved.\n","authors":["Rafael Blanquero","Emilio Carrizosa","Pepa Ramírez-Cobo","M. Remedios Sillero-Denamiel"],"pdf_url":"https://arxiv.org/pdf/2401.18039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18035v1","updated":"2024-01-31T17:59:57Z","published":"2024-01-31T17:59:57Z","title":"Optimizing contrastive learning for cortical folding pattern detection","summary":" The human cerebral cortex has many bumps and grooves called gyri and sulci.\nEven though there is a high inter-individual consistency for the main cortical\nfolds, this is not the case when we examine the exact shapes and details of the\nfolding patterns. Because of this complexity, characterizing the cortical\nfolding variability and relating them to subjects' behavioral characteristics\nor pathologies is still an open scientific problem. Classical approaches\ninclude labeling a few specific patterns, either manually or\nsemi-automatically, based on geometric distances, but the recent availability\nof MRI image datasets of tens of thousands of subjects makes modern\ndeep-learning techniques particularly attractive. Here, we build a\nself-supervised deep-learning model to detect folding patterns in the cingulate\nregion. We train a contrastive self-supervised model (SimCLR) on both Human\nConnectome Project (1101 subjects) and UKBioBank (21070 subjects) datasets with\ntopological-based augmentations on the cortical skeletons, which are\ntopological objects that capture the shape of the folds. 
We explore several\nbackbone architectures (convolutional network, DenseNet, and PointNet) for\nSimCLR. For evaluation and testing, we perform a linear classification task on\na database manually labeled for the presence of the \"double-parallel\" folding\npattern in the cingulate region, which is related to schizophrenia\ncharacteristics. The best model, giving a test AUC of 0.76, is a convolutional\nnetwork with 6 layers, a 10-dimensional latent space, a linear projection head,\nand the branch-clipping augmentation. This is the first time that a\nself-supervised deep learning model has been applied to cortical skeletons on\nsuch a large dataset and quantitatively evaluated. We can now envisage the next\nstep: applying it to other brain regions to detect other biomarkers.\n","authors":["Aymeric Gaudin","Louise Guillon","Clara Fischer","Arnaud Cachia","Denis Rivière","Jean-François Mangin","Joël Chavas"],"pdf_url":"https://arxiv.org/pdf/2401.18035v1.pdf","comment":"9 pages, 6 figures, 1 table, SPIE Imaging 2024"},{"id":"http://arxiv.org/abs/2307.06555v5","updated":"2024-01-31T17:57:17Z","published":"2023-07-13T04:46:05Z","title":"Deep Network Approximation: Beyond ReLU to Diverse Activation Functions","summary":" This paper explores the expressive power of deep neural networks for a\ndiverse range of activation functions. An activation function set $\\mathscr{A}$\nis defined to encompass the majority of commonly used activation functions,\nsuch as $\\mathtt{ReLU}$, $\\mathtt{LeakyReLU}$, $\\mathtt{ReLU}^2$,\n$\\mathtt{ELU}$, $\\mathtt{CELU}$, $\\mathtt{SELU}$, $\\mathtt{Softplus}$,\n$\\mathtt{GELU}$, $\\mathtt{SiLU}$, $\\mathtt{Swish}$, $\\mathtt{Mish}$,\n$\\mathtt{Sigmoid}$, $\\mathtt{Tanh}$, $\\mathtt{Arctan}$, $\\mathtt{Softsign}$,\n$\\mathtt{dSiLU}$, and $\\mathtt{SRS}$. We demonstrate that for any activation\nfunction $\\varrho\\in \\mathscr{A}$, a $\\mathtt{ReLU}$ network of width $N$ and\ndepth $L$ can be approximated to arbitrary precision by a $\\varrho$-activated\nnetwork of width $3N$ and depth $2L$ on any bounded set. This finding enables\nthe extension of most approximation results achieved with $\\mathtt{ReLU}$\nnetworks to a wide variety of other activation functions, albeit with slightly\nincreased constants. Significantly, we establish that the (width,$\\,$depth)\nscaling factors can be further reduced from $(3,2)$ to $(1,1)$ if $\\varrho$\nfalls within a specific subset of $\\mathscr{A}$. This subset includes\nactivation functions such as $\\mathtt{ELU}$, $\\mathtt{CELU}$, $\\mathtt{SELU}$,\n$\\mathtt{Softplus}$, $\\mathtt{GELU}$, $\\mathtt{SiLU}$, $\\mathtt{Swish}$, and\n$\\mathtt{Mish}$.\n","authors":["Shijun Zhang","Jianfeng Lu","Hongkai Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.06555v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15496v2","updated":"2024-01-31T17:36:29Z","published":"2024-01-27T20:20:39Z","title":"Baichuan2-Sum: Instruction Finetune Baichuan2-7B Model for Dialogue\n Summarization","summary":" Large language models (LLMs) like Llama, Baichuan, and Bloom show\nremarkable ability with instruction fine-tuning in many natural language tasks.\nNevertheless, for the dialogue summarization task, which aims to generate\nsummaries for different roles in dialogue, most of the state-of-the-art methods\nare built on small models (e.g., BART and BERT). Existing methods try to add\ntask-specific optimizations to small models, such as adding a global-local\ncentrality score.
In this paper, we propose an instruction fine-tuning model,\nBaichuan2-Sum, for role-oriented dialogue summarization. By setting different\ninstructions for different roles, the model can learn from the dialogue\ninteractions and output the expected summaries. Furthermore, we applied the NEFTune\ntechnique to add suitable noise during training to improve the results. The\nexperiments demonstrate that the proposed model achieves new\nstate-of-the-art results on two public dialogue summarization datasets: CSDS\nand SAMSUM. We release our model and related code to facilitate future studies\non the dialogue summarization task.\n","authors":["Jianfei Xiao","Yancan Chen","Yimin Ou","Hanyi Yu","Yiyong Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.15496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13530v2","updated":"2024-01-31T17:29:26Z","published":"2023-01-31T10:24:50Z","title":"Domain-Generalizable Multiple-Domain Clustering","summary":" This work generalizes the problem of unsupervised domain generalization to\nthe case in which no labeled samples are available (completely unsupervised).\nWe are given unlabeled samples from multiple source domains, and we aim to\nlearn a shared predictor that assigns examples to semantically related\nclusters. Evaluation is done by predicting cluster assignments in previously\nunseen domains. Towards this goal, we propose a two-stage training framework:\n(1) self-supervised pre-training for extracting domain-invariant semantic\nfeatures; (2) multi-head cluster prediction with pseudo labels, which rely on\nboth the feature space and cluster head prediction, further leveraging a novel\nprediction-based label smoothing scheme. We demonstrate empirically that our\nmodel is more accurate than baselines that require fine-tuning using samples\nfrom the target domain or some level of supervision. Our code is available at\nhttps://github.com/AmitRozner/domain-generalizable-multiple-domain-clustering.\n","authors":["Amit Rozner","Barak Battash","Lior Wolf","Ofir Lindenbaum"],"pdf_url":"https://arxiv.org/pdf/2301.13530v2.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.18018v1","updated":"2024-01-31T17:28:24Z","published":"2024-01-31T17:28:24Z","title":"Prompt-Driven LLM Safeguarding via Directed Representation Optimization","summary":" Prepending model inputs with safety prompts is a common practice of\nsafeguarding large language models (LLMs) from complying with queries that\ncontain harmful intents. However, the working mechanisms of safety prompts have\nnot yet been fully understood, which hinders the potential for automatically\noptimizing them for improved LLM safety. Motivated by this problem, we\ninvestigate the impact of safety prompts from the perspective of model\nrepresentations. We find that in models' representation space, harmful and\nharmless queries can be largely distinguished, but this is not noticeably\nenhanced by safety prompts. Instead, the queries' representations are moved by\ndifferent safety prompts in similar directions, where models become more prone\nto refusal (i.e., refusing to provide assistance) even when the queries are\nharmless. Inspired by these findings, we propose a method called DRO (Directed\nRepresentation Optimization) for automatic safety prompt optimization. DRO\ntreats safety prompts as continuous, trainable embeddings and learns to move\nthe representations of harmful/harmless queries along/opposite the direction in\nwhich the model's refusal probability increases.
We demonstrate that DRO\nremarkably improves the safeguarding performance of human-crafted safety\nprompts and outperforms strong baselines, as evaluated on out-of-domain\nbenchmarks, without compromising the general model capability.\n","authors":["Chujie Zheng","Fan Yin","Hao Zhou","Fandong Meng","Jie Zhou","Kai-Wei Chang","Minlie Huang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2401.18018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18017v1","updated":"2024-01-31T17:28:05Z","published":"2024-01-31T17:28:05Z","title":"Causal Discovery by Kernel Deviance Measures with Heterogeneous\n Transforms","summary":" The discovery of causal relationships in a set of random variables is a\nfundamental objective of science and has also recently been argued as being an\nessential component towards real machine intelligence. One class of causal\ndiscovery techniques is founded on the argument that there are inherent\nstructural asymmetries between the causal and anti-causal direction which could\nbe leveraged in determining the direction of causation. Capturing\nthese discrepancies between cause and effect remains a challenge, and many\ncurrent state-of-the-art algorithms propose to compare the norms of the kernel\nmean embeddings of the conditional distributions. In this work, we argue that\nsuch approaches based on RKHS embeddings are insufficient in capturing\nprincipal markers of cause-effect asymmetry involving higher-order structural\nvariabilities of the conditional distributions. We propose Kernel Intrinsic\nInvariance Measure with Heterogeneous Transform (KIIM-HT) which introduces a\nnovel score measure based on heterogeneous transformation of RKHS embeddings to\nextract relevant higher-order moments of the conditional densities for causal\ndiscovery. Inference is made by comparing the score of each hypothetical\ncause-effect direction. Tests and comparisons on a synthetic dataset, a\ntwo-dimensional synthetic dataset and the real-world benchmark dataset\nT\\\"ubingen Cause-Effect Pairs verify our approach. In addition, we conduct a\nsensitivity analysis to the regularization parameter to faithfully compare\nprevious work to our method and an experiment with trials on varied\nhyperparameter values to showcase the robustness of our algorithm.\n","authors":["Tim Tse","Zhitang Chen","Shengyu Zhu","Yue Liu"],"pdf_url":"https://arxiv.org/pdf/2401.18017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18012v1","updated":"2024-01-31T17:20:28Z","published":"2024-01-31T17:20:28Z","title":"Causal Coordinated Concurrent Reinforcement Learning","summary":" In this work, we propose a novel algorithmic framework for data sharing and\ncoordinated exploration for the purpose of learning more data-efficient and\nbetter performing policies under a concurrent reinforcement learning (CRL)\nsetting. In contrast to other work, which assumes that all agents\nact under identical environments, we relax this restriction and instead\nconsider the formulation where each agent acts within an environment which\nshares a global structure but also exhibits individual variations. Our\nalgorithm leverages a causal inference algorithm in the form of Additive Noise\nModel - Mixture Model (ANM-MM) in extracting model parameters governing\nindividual differentials via independence enforcement.
We propose a new data\nsharing scheme based on a similarity measure of the extracted model parameters\nand demonstrate superior learning speeds on a set of autoregressive, pendulum,\nand cart-pole swing-up tasks. Finally, we show the effectiveness of diverse\naction selection between common agents under a sparse reward setting. To the\nbest of our knowledge, this is the first work to consider non-identical\nenvironments in CRL and one of the few works which seek to integrate causal\ninference with reinforcement learning (RL).\n","authors":["Tim Tse","Isaac Chan","Zhitang Chen"],"pdf_url":"https://arxiv.org/pdf/2401.18012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18006v1","updated":"2024-01-31T17:08:34Z","published":"2024-01-31T17:08:34Z","title":"EEG-GPT: Exploring Capabilities of Large Language Models for EEG\n Classification and Interpretation","summary":" In conventional machine learning (ML) approaches applied to\nelectroencephalography (EEG), the focus is often limited, isolating specific\nbrain activities occurring across disparate temporal scales (from transient\nspikes in milliseconds to seizures lasting minutes) and spatial scales (from\nlocalized high-frequency oscillations to global sleep activity). This siloed\napproach limits the development of EEG ML models that exhibit multi-scale\nelectrophysiological understanding and classification capabilities. Moreover,\ntypical ML EEG approaches utilize black-box methods, limiting their\ninterpretability and trustworthiness in clinical contexts. Thus, we propose\nEEG-GPT, a unifying approach to EEG classification that leverages advances in\nlarge language models (LLMs). EEG-GPT achieves excellent performance comparable\nto current state-of-the-art deep learning methods in classifying normal from\nabnormal EEG in a few-shot learning paradigm utilizing only 2% of training\ndata. Furthermore, it offers the distinct advantages of providing intermediate\nreasoning steps and coordinating specialist EEG tools across multiple scales in\nits operation, offering transparent and interpretable step-by-step\nverification, thereby promoting trustworthiness in clinical contexts.\n","authors":["Jonathan W. Kim","Ahmed Alaa","Danilo Bernardo"],"pdf_url":"https://arxiv.org/pdf/2401.18006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17992v1","updated":"2024-01-31T16:52:19Z","published":"2024-01-31T16:52:19Z","title":"Multilinear Operator Networks","summary":" Despite the remarkable capabilities of deep neural networks in image\nrecognition, the dependence on activation functions remains a largely\nunexplored area and has yet to be eliminated. On the other hand, Polynomial\nNetworks are a class of models that do not require activation functions but\nhave yet to perform on par with modern architectures. In this work, we aim to\nclose this gap and propose MONet, which relies solely on multilinear operators.\nThe core layer of MONet, called Mu-Layer, captures multiplicative interactions\nof the elements of the input token. MONet captures high-degree interactions of\nthe input elements and we demonstrate the efficacy of our approach on a series\nof image recognition and scientific computing benchmarks. The proposed model\noutperforms prior polynomial networks and performs on par with modern\narchitectures. We believe that MONet can inspire further research on models\nthat use entirely multilinear operations.\n","authors":["Yixin Cheng","Grigorios G.
Chrysos","Markos Georgopoulos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2401.17992v1.pdf","comment":"International Conference on Learning Representations Poster(2024)"},{"id":"http://arxiv.org/abs/2401.17975v1","updated":"2024-01-31T16:31:54Z","published":"2024-01-31T16:31:54Z","title":"Understanding polysemanticity in neural networks through coding theory","summary":" Despite substantial efforts, neural network interpretability remains an\nelusive goal, with previous research failing to provide succinct explanations\nof most single neurons' impact on the network output. This limitation is due to\nthe polysemantic nature of most neurons, whereby a given neuron is involved in\nmultiple unrelated network states, complicating the interpretation of that\nneuron. In this paper, we apply tools developed in neuroscience and information\ntheory to propose both a novel practical approach to network interpretability\nand theoretical insights into polysemanticity and the density of codes. We\ninfer levels of redundancy in the network's code by inspecting the\neigenspectrum of the activation's covariance matrix. Furthermore, we show how\nrandom projections can reveal whether a network exhibits a smooth or\nnon-differentiable code and hence how interpretable the code is. This same\nframework explains the advantages of polysemantic neurons to learning\nperformance and explains trends found in recent results by Elhage et\nal.~(2022). Our approach advances the pursuit of interpretability in neural\nnetworks, providing insights into their underlying structure and suggesting new\navenues for circuit-level interpretability.\n","authors":["Simon C. Marshall","Jan H. Kirchner"],"pdf_url":"https://arxiv.org/pdf/2401.17975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.06917v6","updated":"2024-01-31T16:29:09Z","published":"2021-10-13T17:55:06Z","title":"Extracting Dynamical Models from Data","summary":" The problem of determining the underlying dynamics of a system when only\ngiven data of its state over time has challenged scientists for decades. In\nthis paper, the approach of using machine learning to model the updates of the\nphase space variables is introduced; this is done as a function of the phase\nspace variables. (More generally, the modeling is done over functions of the\njet space.) This approach (named FJet) allows one to accurately replicate the\ndynamics, and is demonstrated on the examples of the damped harmonic\noscillator, the damped pendulum, and the Duffing oscillator; the underlying\ndifferential equation is also accurately recovered for each example. In\naddition, the results in no way depend on how the data is sampled over time\n(i.e., regularly or irregularly). It is demonstrated that a regression\nimplementation of FJet is similar to the model resulting from a Taylor series\nexpansion of the Runge-Kutta (RK) numerical integration scheme. This\nidentification confers the advantage of explicitly revealing the function space\nto use in the modeling, as well as the associated uncertainty quantification\nfor the updates. Finally, it is shown in the undamped harmonic oscillator\nexample that the stability of the updates is stable $10^9$ times longer than\nwith $4$th-order RK (with time step $0.1$).\n","authors":["Michael F. 
Zimmer"],"pdf_url":"https://arxiv.org/pdf/2110.06917v6.pdf","comment":"19 pages, 18 figures"},{"id":"http://arxiv.org/abs/2401.17972v1","updated":"2024-01-31T16:27:47Z","published":"2024-01-31T16:27:47Z","title":"MelNet: A Real-Time Deep Learning Algorithm for Object Detection","summary":" In this study, a novel deep learning algorithm for object detection, named\nMelNet, was introduced. MelNet underwent training utilizing the KITTI dataset\nfor object detection. Following 300 training epochs, MelNet attained an mAP\n(mean average precision) score of 0.732. Additionally, three alternative models\n-YOLOv5, EfficientDet, and Faster-RCNN-MobileNetv3- were trained on the KITTI\ndataset and juxtaposed with MelNet for object detection.\n The outcomes underscore the efficacy of employing transfer learning in\ncertain instances. Notably, preexisting models trained on prominent datasets\n(e.g., ImageNet, COCO, and Pascal VOC) yield superior results. Another finding\nunderscores the viability of creating a new model tailored to a specific\nscenario and training it on a specific dataset. This investigation demonstrates\nthat training MelNet exclusively on the KITTI dataset also surpasses\nEfficientDet after 150 epochs. Consequently, post-training, MelNet's\nperformance closely aligns with that of other pre-trained models.\n","authors":["Yashar Azadvatan","Murat Kurt"],"pdf_url":"https://arxiv.org/pdf/2401.17972v1.pdf","comment":"11 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2401.17967v1","updated":"2024-01-31T16:16:48Z","published":"2024-01-31T16:16:48Z","title":"CONCORD: Towards a DSL for Configurable Graph Code Representation","summary":" Deep learning is widely used to uncover hidden patterns in large code\ncorpora. To achieve this, constructing a format that captures the relevant\ncharacteristics and features of source code is essential. Graph-based\nrepresentations have gained attention for their ability to model structural and\nsemantic information. However, existing tools lack flexibility in constructing\ngraphs across different programming languages, limiting their use.\nAdditionally, the output of these tools often lacks interoperability and\nresults in excessively large graphs, making graph-based neural networks\ntraining slower and less scalable.\n We introduce CONCORD, a domain-specific language to build customizable graph\nrepresentations. It implements reduction heuristics to reduce graphs' size\ncomplexity. 
We demonstrate its effectiveness in code smell detection as an\nillustrative use case and show that: first, CONCORD can produce code\nrepresentations automatically per the specified configuration, and second, our\nheuristics can achieve comparable performance with significantly reduced size.\nCONCORD will help researchers a) create and experiment with customizable\ngraph-based code representations for different software engineering tasks\ninvolving DL, b) reduce the engineering work to generate graph representations,\nc) address the issue of scalability in GNN models, and d) enhance the\nreproducibility of experiments in research through a standardized approach to\ncode representation and analysis.\n","authors":["Mootez Saad","Tushar Sharma"],"pdf_url":"https://arxiv.org/pdf/2401.17967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17958v1","updated":"2024-01-31T16:07:44Z","published":"2024-01-31T16:07:44Z","title":"Convergence Analysis for General Probability Flow ODEs of Diffusion\n Models in Wasserstein Distances","summary":" Score-based generative modeling with probability flow ordinary differential\nequations (ODEs) has achieved remarkable success in a variety of applications.\nWhile various fast ODE-based samplers have been proposed in the literature and\nemployed in practice, the theoretical understandings about convergence\nproperties of the probability flow ODE are still quite limited. In this paper,\nwe provide the first non-asymptotic convergence analysis for a general class of\nprobability flow ODE samplers in 2-Wasserstein distance, assuming accurate\nscore estimates. We then consider various examples and establish results on the\niteration complexity of the corresponding ODE-based samplers.\n","authors":["Xuefeng Gao","Lingjiong Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.17958v1.pdf","comment":"47 pages, 3 tables. arXiv admin note: text overlap with\n arXiv:2311.11003"},{"id":"http://arxiv.org/abs/2307.05318v3","updated":"2024-01-31T15:54:58Z","published":"2023-07-11T15:01:48Z","title":"Predicting small molecules solubilities on endpoint devices using deep\n ensemble neural networks","summary":" Aqueous solubility is a valuable yet challenging property to predict.\nComputing solubility using first-principles methods requires accounting for the\ncompeting effects of entropy and enthalpy, resulting in long computations for\nrelatively poor accuracy. Data-driven approaches, such as deep learning, offer\nimproved accuracy and computational efficiency but typically lack uncertainty\nquantification. Additionally, ease of use remains a concern for any\ncomputational technique, resulting in the sustained popularity of group-based\ncontribution methods. In this work, we addressed these problems with a deep\nlearning model with predictive uncertainty that runs on a static website\n(without a server). This approach moves computing needs onto the website\nvisitor without requiring installation, removing the need to pay for and\nmaintain servers. Our model achieves satisfactory results in solubility\nprediction. Furthermore, we demonstrate how to create molecular property\nprediction models that balance uncertainty and ease of use. The code is\navailable at https://github.com/ur-whitelab/mol.dev, and the model is usable at\nhttps://mol.dev.\n","authors":["Mayk Caldas Ramos","Andrew D. 
White"],"pdf_url":"https://arxiv.org/pdf/2307.05318v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12612v2","updated":"2024-01-31T15:52:01Z","published":"2023-08-24T07:22:29Z","title":"Try with Simpler -- An Evaluation of Improved Principal Component\n Analysis in Log-based Anomaly Detection","summary":" The rapid growth of deep learning (DL) has spurred interest in enhancing\nlog-based anomaly detection. This approach aims to extract meaning from log\nevents (log message templates) and develop advanced DL models for anomaly\ndetection. However, these DL methods face challenges like heavy reliance on\ntraining data, labels, and computational resources due to model complexity. In\ncontrast, traditional machine learning and data mining techniques are less\ndata-dependent and more efficient but less effective than DL. To make log-based\nanomaly detection more practical, the goal is to enhance traditional techniques\nto match DL's effectiveness. Previous research in a different domain (linking\nquestions on Stack Overflow) suggests that optimized traditional techniques can\nrival state-of-the-art DL methods. Drawing inspiration from this concept, we\nconducted an empirical study. We optimized the unsupervised PCA (Principal\nComponent Analysis), a traditional technique, by incorporating lightweight\nsemantic-based log representation. This addresses the issue of unseen log\nevents in training data, enhancing log representation. Our study compared seven\nlog-based anomaly detection methods, including four DL-based, two traditional,\nand the optimized PCA technique, using public and industrial datasets. Results\nindicate that the optimized unsupervised PCA technique achieves similar\neffectiveness to advanced supervised/semi-supervised DL methods while being\nmore stable with limited training data and resource-efficient. This\ndemonstrates the adaptability and strength of traditional techniques through\nsmall yet impactful adaptations.\n","authors":["Lin Yang","Junjie Chen","Shutao Gao","Zhihao Gong","Hongyu Zhang","Yue Kang","Huaan Li"],"pdf_url":"https://arxiv.org/pdf/2308.12612v2.pdf","comment":"Accepted by TOSEM"},{"id":"http://arxiv.org/abs/2210.06225v2","updated":"2024-01-31T15:50:17Z","published":"2022-10-12T14:12:04Z","title":"On the Generalizability of ECG-based Stress Detection Models","summary":" Stress is prevalent in many aspects of everyday life including work,\nhealthcare, and social interactions. Many works have studied handcrafted\nfeatures from various bio-signals that are indicators of stress. Recently, deep\nlearning models have also been proposed to detect stress. Typically, stress\nmodels are trained and validated on the same dataset, often involving one\nstressful scenario. However, it is not practical to collect stress data for\nevery scenario. So, it is crucial to study the generalizability of these models\nand determine to what extent they can be used in other scenarios. In this\npaper, we explore the generalization capabilities of Electrocardiogram\n(ECG)-based deep learning models and models based on handcrafted ECG features,\ni.e., Heart Rate Variability (HRV) features. To this end, we train three HRV\nmodels and two deep learning models that use ECG signals as input. We use ECG\nsignals from two popular stress datasets - WESAD and SWELL-KW - differing in\nterms of stressors and recording devices. First, we evaluate the models using\nleave-one-subject-out (LOSO) cross-validation using training and validation\nsamples from the same dataset. 
Next, we perform a cross-dataset validation of\nthe models, that is, LOSO models trained on the WESAD dataset are validated\nusing SWELL-KW samples and vice versa. While deep learning models achieve the\nbest results on the same dataset, models based on HRV features considerably\noutperform them on data from a different dataset. This trend is observed for\nall the models on both datasets. Therefore, HRV models are a better choice for\nstress recognition in applications that are different from the dataset\nscenario. To the best of our knowledge, this is the first work to compare the\ncross-dataset generalizability between ECG-based deep learning models and HRV\nmodels.\n","authors":["Pooja Prajod","Elisabeth André"],"pdf_url":"https://arxiv.org/pdf/2210.06225v2.pdf","comment":"Published in Proceedings of 2022 21st IEEE International Conference\n on Machine Learning and Applications (ICMLA)"},{"id":"http://arxiv.org/abs/2209.08316v2","updated":"2024-01-31T15:49:34Z","published":"2022-09-17T12:01:35Z","title":"An Empathetic AI Coach for Self-Attachment Therapy","summary":" In this work, we present a new dataset and a computational strategy for a\ndigital coach that aims to guide users in practicing the protocols of\nself-attachment therapy. Our framework augments a rule-based conversational\nagent with a deep-learning classifier for identifying the underlying emotion in\na user's text response, as well as a deep-learning assisted retrieval method\nfor producing novel, fluent and empathetic utterances. We also craft a set of\nhuman-like personas that users can choose to interact with. Our goal is to\nachieve a high level of engagement during virtual therapy sessions. We evaluate\nthe effectiveness of our framework in a non-clinical trial with N=16\nparticipants, all of whom have had at least four interactions with the agent\nover the course of five days. We find that our platform is consistently rated\nhigher for empathy, user engagement and usefulness than the simple rule-based\nframework. Finally, we provide guidelines to further improve the design and\nperformance of the application, in accordance with the feedback received.\n","authors":["Lisa Alazraki","Ali Ghachem","Neophytos Polydorou","Foaad Khosmood","Abbas Edalat"],"pdf_url":"https://arxiv.org/pdf/2209.08316v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17919v1","updated":"2024-01-31T15:33:37Z","published":"2024-01-31T15:33:37Z","title":"LOCOST: State-Space Models for Long Document Abstractive Summarization","summary":" State-space models are a low-complexity alternative to transformers for\nencoding long sequences and capturing long-term dependencies. We propose\nLOCOST: an encoder-decoder architecture based on state-space models for\nconditional text generation with long context inputs. With a computational\ncomplexity of $O(L \\log L)$, this architecture can handle significantly longer\nsequences than state-of-the-art models that are based on sparse attention\npatterns. We evaluate our model on a series of long document abstractive\nsummarization tasks. 
The model reaches 93-96% of the performance of\nthe top-performing sparse transformers of the same size while\nsaving up to 50% memory during training and up to 87% during inference.\nAdditionally, LOCOST effectively handles input texts exceeding 600K tokens at\ninference time, setting new state-of-the-art results on full-book summarization\nand opening new perspectives for long input processing.\n","authors":["Florian Le Bronnec","Song Duong","Mathieu Ravaut","Alexandre Allauzen","Nancy F. Chen","Vincent Guigue","Alberto Lumbreras","Laure Soulier","Patrick Gallinari"],"pdf_url":"https://arxiv.org/pdf/2401.17919v1.pdf","comment":"9 pages, 5 figures, 7 tables, EACL 2024 conference"},{"id":"http://arxiv.org/abs/2308.13352v3","updated":"2024-01-31T14:53:18Z","published":"2023-08-25T12:47:59Z","title":"A Generic Machine Learning Framework for Fully-Unsupervised Anomaly\n Detection with Contaminated Data","summary":" Anomaly detection (AD) tasks have been solved using machine learning\nalgorithms in various domains and applications. The great majority of these\nalgorithms use normal data to train a residual-based model and assign anomaly\nscores to unseen samples based on their dissimilarity with the learned normal\nregime. The underlying assumption of these approaches is that anomaly-free data\nis available for training. This is, however, often not the case in real-world\noperational settings, where the training data may be contaminated with an\nunknown fraction of abnormal samples. Training with contaminated data, in turn,\ninevitably leads to a deteriorated AD performance of the residual-based\nalgorithms.\n In this paper we introduce a framework for a fully unsupervised refinement of\ncontaminated training data for AD tasks. The framework is generic and can be\napplied to any residual-based machine learning model. We demonstrate the\napplication of the framework to two public datasets of multivariate time series\nmachine data from different application fields. We show its clear superiority\nover the naive approach of training with contaminated data without refinement.\nMoreover, we compare it to the ideal, unrealistic reference in which\nanomaly-free data would be available for training. The method is based on\nevaluating the contribution of individual samples to the generalization ability\nof a given model, and contrasting the contribution of anomalies with that of\nnormal samples. As a result, the proposed approach is comparable to, and often\noutperforms, training with normal samples only.\n","authors":["Markus Ulmer","Jannik Zgraggen","Lilach Goren Huber"],"pdf_url":"https://arxiv.org/pdf/2308.13352v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.11309v2","updated":"2024-01-31T14:39:49Z","published":"2021-05-18T16:59:52Z","title":"Efficiently Solving High-Order and Nonlinear ODEs with Rational Fraction\n Polynomial: the Ratio Net","summary":" Recent advances in solving ordinary differential equations (ODEs) with neural\nnetworks have been remarkable. Neural networks excel at serving as trial\nfunctions and approximating solutions within functional spaces, aided by\ngradient backpropagation algorithms. However, challenges remain in solving\ncomplex ODEs, including high-order and nonlinear cases, emphasizing the need\nfor improved efficiency and effectiveness. Traditional methods have typically\nrelied on established knowledge integration to improve problem-solving\nefficiency. 
In contrast, this study takes a different approach by introducing a\nnew neural network architecture for constructing trial functions, known as the\nratio net. This architecture draws inspiration from rational fraction\npolynomial approximation functions, specifically the Pade approximant. Empirical\ntrials demonstrate that the proposed method achieves higher\nefficiency than existing approaches, including polynomial-based and\nmultilayer perceptron (MLP) neural network-based methods. The ratio net holds\npromise for advancing the efficiency and effectiveness of solving differential\nequations.\n","authors":["Chenxin Qin","Ruhao Liu","Maocai Li","Shengyuan Li","Yi Liu","Chichun Zhou"],"pdf_url":"https://arxiv.org/pdf/2105.11309v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17880v1","updated":"2024-01-31T14:37:06Z","published":"2024-01-31T14:37:06Z","title":"Graph Attention-based Reinforcement Learning for Trajectory Design and\n Resource Assignment in Multi-UAV Assisted Communication","summary":" In multiple unmanned aerial vehicle (UAV)-assisted downlink\ncommunication, it is challenging for UAV base stations (UAV BSs) to realize\ntrajectory design and resource assignment in unknown environments. The\ncooperation and competition between UAV BSs in the communication network lead\nto a Markov game problem. Multi-agent reinforcement learning is a promising\nsolution for this decision-making problem. However, there are still many common\nissues, such as the instability of the system and low utilization of historical\ndata, that limit its application. In this paper, a novel graph-attention\nmulti-agent trust region (GA-MATR) reinforcement learning framework is proposed\nto solve the multi-UAV assisted communication problem. A graph recurrent network\nis introduced to process and analyze the complex topology of the communication\nnetwork, so as to extract useful information and patterns from observational\ninformation. The attention mechanism provides additional weighting for conveyed\ninformation, so that the critic network can accurately evaluate the value of\nbehavior for UAV BSs. This provides more reliable feedback signals and helps\nthe actor network update the strategy more effectively. Ablation simulations\nindicate that the proposed approach attains improved convergence over the\nbaselines. UAV BSs learn the optimal communication strategies to achieve their\nmaximum cumulative rewards. Additionally, the multi-agent trust region method with\nmonotonic convergence provides an estimated Nash equilibrium for the multi-UAV\nassisted communication Markov game.\n","authors":["Zikai Feng","Di Wu","Mengxing Huang","Chau Yuen"],"pdf_url":"https://arxiv.org/pdf/2401.17880v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2401.17870v1","updated":"2024-01-31T14:27:35Z","published":"2024-01-31T14:27:35Z","title":"Efficient Subseasonal Weather Forecast using Teleconnection-informed\n Transformers","summary":" Subseasonal forecasting, which is pivotal for agriculture, water resource\nmanagement, and early warning of disasters, faces challenges due to the chaotic\nnature of the atmosphere. Recent advances in machine learning (ML) have\nrevolutionized weather forecasting by achieving predictive skill competitive\nwith numerical models. However, training such foundation models requires\nthousands of GPU days, which causes substantial carbon emissions and limits\ntheir broader applicability. 
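The rational trial-function idea in the ratio-net abstract above can be illustrated in a few lines of PyTorch; the two-branch architecture, the positive-denominator trick, and the toy ODE y' = y are assumptions for illustration, not the authors' exact design.

```python
import torch
import torch.nn as nn

class RatioNet(nn.Module):
    """Trial function y(x) = P(x) / (1 + Q(x)^2): a rational, Pade-style
    form; squaring the denominator keeps it positive (an assumed choice)."""
    def __init__(self, hidden=32):
        super().__init__()
        self.p = nn.Sequential(nn.Linear(1, hidden), nn.Tanh(), nn.Linear(hidden, 1))
        self.q = nn.Sequential(nn.Linear(1, hidden), nn.Tanh(), nn.Linear(hidden, 1))

    def forward(self, x):
        return self.p(x) / (1.0 + self.q(x) ** 2)

# Fit y' = y with y(0) = 1 on [0, 1] by minimizing the ODE residual.
net = RatioNet()
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
x = torch.linspace(0.0, 1.0, 64).unsqueeze(1).requires_grad_(True)
for _ in range(2000):
    y = net(x)
    dy = torch.autograd.grad(y, x, torch.ones_like(y), create_graph=True)[0]
    loss = ((dy - y) ** 2).mean() + (net(torch.zeros(1, 1)) - 1.0).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
```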
Moreover, ML models tend to fool the pixel-wise\nerror scores by producing smoothed results which lack physical consistency and\nmeteorological meaning. To deal with the aforementioned problems, we propose a\nteleconnection-informed transformer. Our architecture leverages the pretrained\nPangu model to achieve good initial weights and integrates a\nteleconnection-informed temporal module to improve predictability in an\nextended temporal range. Remarkably, by adjusting 1.1% of the Pangu model's\nparameters, our method enhances predictability on four surface and five\nupper-level atmospheric variables at a two-week lead time. Furthermore, the\nteleconnection-filtered features improve the spatial granularity of outputs\nsignificantly, indicating their potential physical consistency. Our research\nunderscores the importance of atmospheric and oceanic teleconnections in\ndriving future weather conditions. Besides, it presents a resource-efficient\npathway for researchers to leverage existing foundation models on versatile\ndownstream tasks.\n","authors":["Shan Zhao","Zhitong Xiong","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.17870v1.pdf","comment":"Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2401.17868v1","updated":"2024-01-31T14:27:07Z","published":"2024-01-31T14:27:07Z","title":"Convolution Meets LoRA: Parameter Efficient Finetuning for Segment\n Anything Model","summary":" The Segment Anything Model (SAM) stands as a foundational framework for image\nsegmentation. While it exhibits remarkable zero-shot generalization in typical\nscenarios, its advantage diminishes when applied to specialized domains like\nmedical imagery and remote sensing. To address this limitation, this paper\nintroduces Conv-LoRA, a simple yet effective parameter-efficient fine-tuning\napproach. By integrating ultra-lightweight convolutional parameters into\nLow-Rank Adaptation (LoRA), Conv-LoRA can inject image-related inductive biases\ninto the plain ViT encoder, further reinforcing SAM's local prior assumption.\nNotably, Conv-LoRA not only preserves SAM's extensive segmentation knowledge\nbut also revives its capacity of learning high-level image semantics, which is\nconstrained by SAM's foreground-background segmentation pretraining.\nComprehensive experimentation across diverse benchmarks spanning multiple\ndomains underscores Conv-LoRA's superiority in adapting SAM to real-world\nsemantic segmentation tasks.\n","authors":["Zihan Zhong","Zhiqiang Tang","Tong He","Haoyang Fang","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.17868v1.pdf","comment":"Accepted at ICLR 2024 Conference"},{"id":"http://arxiv.org/abs/2401.17865v1","updated":"2024-01-31T14:23:51Z","published":"2024-01-31T14:23:51Z","title":"Manipulating Predictions over Discrete Inputs in Machine Teaching","summary":" Machine teaching often involves the creation of an optimal (typically\nminimal) dataset to help a model (referred to as the `student') achieve\nspecific goals given by a teacher. While abundant in the continuous domain, the\nstudies on the effectiveness of machine teaching in the discrete domain are\nrelatively limited. This paper focuses on machine teaching in the discrete\ndomain, specifically on manipulating student models' predictions based on the\ngoals of teachers via changing the training data efficiently. We formulate this\ntask as a combinatorial optimization problem and solve it by proposing an\niterative searching algorithm. 
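A generic version of such an iterative search over discrete training edits might look as follows; the greedy single-label-flip objective and the logistic-regression student are illustrative stand-ins, not the paper's algorithm.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def greedy_teaching(X, y, x_target, target_class, budget=5):
    """Iteratively flip the one training label that most increases the
    retrained student's probability of `target_class` on `x_target`.
    Binary labels in {0, 1} are assumed."""
    y = y.copy()
    for _ in range(budget):
        student = LogisticRegression().fit(X, y)
        if student.predict(x_target[None])[0] == target_class:
            break  # teaching goal reached
        best_i, best_p = None, -1.0
        for i in range(len(y)):  # evaluate every single-flip candidate
            y_try = y.copy()
            y_try[i] = 1 - y_try[i]
            p = (LogisticRegression().fit(X, y_try)
                 .predict_proba(x_target[None])[0, target_class])
            if p > best_p:
                best_i, best_p = i, p
        y[best_i] = 1 - y[best_i]  # commit the best flip this round
    return y
```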
Our algorithm demonstrates significant numerical\nmerit in scenarios where a teacher attempts to correct erroneous\npredictions to improve the student model, or to maliciously manipulate the\nmodel into misclassifying specific samples into a target class for personal\ngain. Experimental results show that our proposed algorithm\neffectively and efficiently manipulates the predictions of the model,\nsurpassing conventional baselines.\n","authors":["Xiaodong Wu","Yufei Han","Hayssam Dahrouj","Jianbing Ni","Zhenwen Liang","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17865v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.11199v2","updated":"2024-01-31T14:11:56Z","published":"2023-08-22T05:21:31Z","title":"ConcatPlexer: Additional Dim1 Batching for Faster ViTs","summary":" Transformers have demonstrated tremendous success not only in the natural\nlanguage processing (NLP) domain but also in the field of computer vision,\nigniting various creative approaches and applications. Yet, the superior\nperformance and modeling flexibility of transformers came with a severe\nincrease in computation costs, and hence several works have proposed methods to\nreduce this burden. Inspired by a cost-cutting method originally proposed for\nlanguage models, Data Multiplexing (DataMUX), we propose a novel approach for\nefficient visual recognition that employs additional dim1 batching (i.e.,\nconcatenation) that greatly improves the throughput with little compromise in\nthe accuracy. We first introduce a naive adaptation of DataMux for vision\nmodels, Image Multiplexer, and devise novel components to overcome its\nweaknesses, rendering our final model, ConcatPlexer, at the sweet spot between\ninference speed and accuracy. ConcatPlexer was trained on the ImageNet1K and\nCIFAR100 datasets, achieving 23.5% fewer GFLOPs than ViT-B/16 with 69.5% and\n83.4% validation accuracy, respectively.\n","authors":["Donghoon Han","Seunghyeon Seo","Donghyeon Jeon","Jiho Jang","Chaerin Kong","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.11199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17019v2","updated":"2024-01-31T14:08:52Z","published":"2023-12-28T13:42:59Z","title":"Efficient Learning of Long-Range and Equivariant Quantum Systems","summary":" In this work, we consider a fundamental task in quantum many-body physics -\nfinding and learning ground states of quantum Hamiltonians and their\nproperties. Recent works have studied the task of predicting the ground state\nexpectation value of sums of geometrically local observables by learning from\ndata. For short-range gapped Hamiltonians, a sample complexity that is\nlogarithmic in the number of qubits and quasipolynomial in the error was\nobtained. Here we extend these results beyond the local requirements on both\nHamiltonians and observables, motivated by the relevance of long-range\ninteractions in molecular and atomic systems. For interactions decaying as a\npower law with exponent greater than twice the dimension of the system, we\nrecover the same efficient logarithmic scaling with respect to the number of\nqubits, but the dependence on the error worsens to exponential. Further, we\nshow that learning algorithms equivariant under the automorphism group of the\ninteraction hypergraph achieve a sample complexity reduction, leading in\nparticular to a constant number of samples for learning sums of local\nobservables in systems with periodic boundary conditions. 
We demonstrate the\nefficient scaling in practice by learning from DMRG simulations of $1$D\nlong-range and disordered systems with up to $128$ qubits. Finally, we provide\nan analysis of the concentration of expectation values of global observables\nstemming from the central limit theorem, resulting in increased prediction\naccuracy.\n","authors":["Štěpán Šmíd","Roberto Bondesan"],"pdf_url":"https://arxiv.org/pdf/2312.17019v2.pdf","comment":"51 pages"},{"id":"http://arxiv.org/abs/2310.13786v3","updated":"2024-01-31T14:04:54Z","published":"2023-10-20T19:32:54Z","title":"Fundamental Limits of Membership Inference Attacks on Machine Learning\n Models","summary":" Membership inference attacks (MIA) can reveal whether a particular data point\nwas part of the training dataset, potentially exposing sensitive information\nabout individuals. This article provides theoretical guarantees by exploring\nthe fundamental statistical limitations associated with MIAs on machine\nlearning models. More precisely, we first derive the statistical quantity that\ngoverns the effectiveness and success of such attacks. We then deduce that in a\nvery general regression setting with overfitting algorithms, attacks may have a\nhigh probability of success. Finally, we investigate several situations for\nwhich we provide bounds on this quantity of interest. Our results enable us to\ndeduce the accuracy of potential attacks based on the number of samples and\nother structural parameters of learning models. In certain instances, these\nparameters can be directly estimated from the dataset.\n","authors":["Eric Aubinais","Elisabeth Gassiat","Pablo Piantanida"],"pdf_url":"https://arxiv.org/pdf/2310.13786v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17838v1","updated":"2024-01-31T13:56:08Z","published":"2024-01-31T13:56:08Z","title":"A Cross-View Hierarchical Graph Learning Hypernetwork for Skill\n Demand-Supply Joint Prediction","summary":" The rapidly changing landscape of technology and industries leads to dynamic\nskill requirements, making it crucial for employees and employers to anticipate\nsuch shifts to maintain a competitive edge in the labor market. Existing\nefforts in this area either rely on domain-expert knowledge or regard skill\nevolution as a simplified time series forecasting problem. However, both\napproaches overlook the sophisticated relationships among different skills and\nthe interconnection between skill demand and supply variations. In this paper,\nwe propose a Cross-view Hierarchical Graph learning Hypernetwork (CHGH)\nframework for joint skill demand-supply prediction. Specifically, CHGH is an\nencoder-decoder network consisting of i) a cross-view graph encoder to capture\nthe interconnection between skill demand and supply, ii) a hierarchical graph\nencoder to model the co-evolution of skills from a cluster-wise perspective,\nand iii) a conditional hyper-decoder to jointly predict demand and supply\nvariations by incorporating historical demand-supply gaps. 
Extensive\nexperiments on three real-world datasets demonstrate the superiority of the\nproposed framework compared to seven baselines and the effectiveness of the\nthree modules.\n","authors":["Wenshuo Chao","Zhaopeng Qiu","Likang Wu","Zhuoning Guo","Zhi Zheng","Hengshu Zhu","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2401.17838v1.pdf","comment":"11 pages, 7 figures, AAAI24"},{"id":"http://arxiv.org/abs/2401.17835v1","updated":"2024-01-31T13:52:11Z","published":"2024-01-31T13:52:11Z","title":"Predicting the Future with Simple World Models","summary":" World models can represent potentially high-dimensional pixel observations in\ncompact latent spaces, making it tractable to model the dynamics of the\nenvironment. However, the latent dynamics inferred by these models may still be\nhighly complex. Abstracting the dynamics of the environment with simple models\ncan have several benefits. If the latent dynamics are simple, the model may\ngeneralize better to novel transitions, and discover useful latent\nrepresentations of environment states. We propose a regularization scheme that\nsimplifies the world model's latent dynamics. Our model, the Parsimonious\nLatent Space Model (PLSM), minimizes the mutual information between latent\nstates and the dynamics that arise between them. This makes the dynamics softly\nstate-invariant, and the effects of the agent's actions more predictable. We\ncombine the PLSM with three different model classes used for i) future latent\nstate prediction, ii) video prediction, and iii) planning. We find that our\nregularization improves accuracy, generalization, and performance in downstream\ntasks.\n","authors":["Tankred Saanum","Peter Dayan","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2401.17835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17823v1","updated":"2024-01-31T13:28:07Z","published":"2024-01-31T13:28:07Z","title":"Privacy-preserving data release leveraging optimal transport and\n particle gradient descent","summary":" We present a novel approach for differentially private data synthesis of\nprotected tabular datasets, a relevant task in highly sensitive domains such as\nhealthcare and government. Current state-of-the-art methods predominantly use\nmarginal-based approaches, where a dataset is generated from private estimates\nof the marginals. In this paper, we introduce PrivPGD, a new generation method\nfor marginal-based private data synthesis, leveraging tools from optimal\ntransport and particle gradient descent. Our algorithm outperforms existing\nmethods on a large range of datasets while being highly scalable and offering\nthe flexibility to incorporate additional domain-specific constraints.\n","authors":["Konstantin Donhauser","Javier Abad","Neha Hulkund","Fanny Yang"],"pdf_url":"https://arxiv.org/pdf/2401.17823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17809v1","updated":"2024-01-31T13:08:45Z","published":"2024-01-31T13:08:45Z","title":"SWEA: Changing Factual Knowledge in Large Language Models via Subject\n Word Embedding Altering","summary":" Model editing has recently gained widespread attention. Current model editing\nmethods primarily involve modifying model parameters or adding additional\nmodules to the existing model. However, the former causes irreversible damage\nto LLMs, while the latter incurs additional inference overhead and fuzzy vector\nmatching is not always reliable. 
To address these issues, we propose an\nexpandable Subject Word Embedding Altering (SWEA) framework, which modifies the\nrepresentation of subjects to achieve knowledge editing during the\ninference stage. SWEA uses precise key matching outside the model and performs\nreliable subject word embedding altering, thus protecting the original weights\nof the model without increasing inference overhead. We then propose an\noptimizing-then-suppressing fusion method, which first optimizes the embedding vector for\nthe editing target and then suppresses the Knowledge Embedding Dimension (KED)\nto obtain the final fused embedding. We thus propose the SWEAOS method for editing\nfactual knowledge in LLMs. We demonstrate the state-of-the-art performance of\nSWEAOS on the COUNTERFACT and zsRE datasets. To further validate the reasoning\nability of SWEAOS in editing knowledge, we evaluate it on the more complex\nRIPPLEEDITS benchmark. The results on two subdatasets demonstrate that our\nSWEAOS possesses state-of-the-art reasoning ability.\n","authors":["Xiaopeng Li","Shasha Li","Bin Ji","Shezheng Song","Xi Wang","Jun Ma","Jie Yu","Xiaodong Liu","Jing Wang","Weimin Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17809v1.pdf","comment":"Work in progress; Our code will be released"},{"id":"http://arxiv.org/abs/2305.12095v4","updated":"2024-01-31T13:05:35Z","published":"2023-05-20T05:16:31Z","title":"CARD: Channel Aligned Robust Blend Transformer for Time Series\n Forecasting","summary":" Recent studies have demonstrated the great power of Transformer models for\ntime series forecasting. One of the key elements that lead to the transformer's\nsuccess is the channel-independent (CI) strategy to improve the training\nrobustness. However, ignoring the correlation among different channels\nin CI limits the model's forecasting capacity. In this work, we design a\nspecial Transformer, i.e., {\bf C}hannel {\bf A}ligned {\bf R}obust Blen{\bf d}\nTransformer (CARD for short), that addresses key shortcomings of the CI-type\nTransformer in time series forecasting. First, CARD introduces a\nchannel-aligned attention structure that allows it to capture both temporal\ncorrelations among signals and dynamical dependence among multiple variables\nover time. Second, in order to efficiently utilize the multi-scale knowledge,\nwe design a token blend module to generate tokens with different resolutions.\nThird, we introduce a robust loss function for time series forecasting to\nalleviate the potential overfitting issue. This new loss function weights the\nimportance of forecasting over a finite horizon based on prediction\nuncertainties. Our evaluation of multiple long-term and short-term forecasting\ndatasets demonstrates that CARD significantly outperforms state-of-the-art time\nseries forecasting methods. The code is available at the following anonymous\nrepository: \url{https://anonymous.4open.science/r/CARD-6EEC}\n","authors":["Wang Xue","Tian Zhou","Qingsong Wen","Jinyang Gao","Bolin Ding","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2305.12095v4.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.17802v1","updated":"2024-01-31T12:52:10Z","published":"2024-01-31T12:52:10Z","title":"Distillation Enhanced Time Series Forecasting Network with Momentum\n Contrastive Learning","summary":" Contrastive representation learning is crucial in time series analysis as it\nalleviates the issue of data noise and incompleteness as well as the sparsity of\nsupervision signals. 
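Returning to the SWEA framework above, its core mechanic (key matching outside the model, then swapping in a precomputed fused embedding for the subject tokens at inference) admits a very small sketch; the function name, the `edit_table` structure, and the zero-vector edit in the example are all hypothetical.

```python
import torch

def alter_subject_embeddings(input_ids, token_embeds, edit_table):
    """Wherever a subject token id appears, overwrite its embedding with a
    precomputed fused vector; the model's own weights stay untouched.
    `edit_table` (token id -> vector) stands in for the fusion procedure."""
    out = token_embeds.clone()
    for tok_id, fused in edit_table.items():
        out[input_ids == tok_id] = fused  # exact key match, outside the model
    return out

# Toy usage: a batch of ids (2, 5) with embeddings (2, 5, 16), one token edited.
ids = torch.randint(0, 100, (2, 5))
embeds = torch.randn(2, 5, 16)
edited = alter_subject_embeddings(ids, embeds, {int(ids[0, 0]): torch.zeros(16)})
```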
However, existing contrastive learning frameworks usually\nfocus on intra-temporal features, which fails to fully exploit the intricate\nnature of time series data. To address this issue, we propose DE-TSMCL, an\ninnovative distillation enhanced framework for long sequence time series\nforecasting. Specifically, we design a learnable data augmentation mechanism\nwhich adaptively learns whether to mask a timestamp to obtain optimized\nsub-sequences. Then, we propose a contrastive learning task with momentum\nupdate to explore inter-sample and intra-temporal correlations of time series\nto learn the underlying structural features of the unlabeled time series.\nMeanwhile, we design a supervised task to learn more robust representations and\nfacilitate the contrastive learning process. Finally, we jointly optimize the\nabove two tasks. By deriving the model loss from multiple tasks, we can learn\neffective representations for the downstream forecasting task. Extensive\nexperiments against state-of-the-art baselines demonstrate the\neffectiveness of DE-TSMCL, with a maximum improvement of 27.3%.\n","authors":["Haozhi Gao","Qianqian Ren","Jinbao Li"],"pdf_url":"https://arxiv.org/pdf/2401.17802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10895v3","updated":"2024-01-31T12:40:51Z","published":"2023-07-20T14:18:44Z","title":"Variational Autoencoding of Dental Point Clouds","summary":" Digital dentistry has made significant advancements, yet numerous challenges\nremain. This paper introduces the FDI 16 dataset, an extensive collection of\ntooth meshes and point clouds. Additionally, we present a novel approach:\nVariational FoldingNet (VF-Net), a fully probabilistic variational autoencoder\ndesigned for point clouds. Notably, prior latent variable models for point\nclouds lack a one-to-one correspondence between input and output points.\nInstead, they rely on optimizing Chamfer distances, a metric that lacks a\nnormalized distributional counterpart, rendering it unsuitable for\nprobabilistic modeling. We replace the explicit minimization of Chamfer\ndistances with a suitable encoder, increasing computational efficiency while\nsimplifying the probabilistic extension. This allows for straightforward\napplication in various tasks, including mesh generation, shape completion, and\nrepresentation learning. Empirically, we provide evidence of lower\nreconstruction error in dental reconstruction and interpolation, showcasing\nstate-of-the-art performance in dental sample generation while identifying\nvaluable latent representations.\n","authors":["Johan Ziruo Ye","Thomas Ørkild","Peter Lempel Søndergaard","Søren Hauberg"],"pdf_url":"https://arxiv.org/pdf/2307.10895v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15176v3","updated":"2024-01-31T12:40:40Z","published":"2023-07-27T20:11:07Z","title":"RCT Rejection Sampling for Causal Estimation Evaluation","summary":" Confounding is a significant obstacle to unbiased estimation of causal\neffects from observational data. For settings with high-dimensional covariates\n-- such as text data, genomics, or the behavioral social sciences --\nresearchers have proposed methods to adjust for confounding by adapting machine\nlearning methods to the goal of causal estimation. However, empirical\nevaluation of these adjustment methods has been challenging and limited. 
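The "contrastive learning task with momentum update" in the DE-TSMCL abstract above suggests a MoCo-style exponential-moving-average key encoder; the layer sizes and momentum value in this sketch are assumptions, not the paper's configuration.

```python
import copy
import torch
import torch.nn as nn

# The query encoder is trained by backprop; the key encoder is its
# exponential moving average and receives no gradients.
query_enc = nn.Sequential(nn.Linear(24, 64), nn.ReLU(), nn.Linear(64, 32))
key_enc = copy.deepcopy(query_enc)
for p in key_enc.parameters():
    p.requires_grad_(False)

@torch.no_grad()
def momentum_update(m=0.99):
    """Slowly drag each key-encoder parameter toward its query counterpart."""
    for pq, pk in zip(query_enc.parameters(), key_enc.parameters()):
        pk.mul_(m).add_((1.0 - m) * pq)
```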
In\nthis work, we build on a promising empirical evaluation strategy that\nsimplifies evaluation design and uses real data: subsampling randomized\ncontrolled trials (RCTs) to create confounded observational datasets while\nusing the average causal effects from the RCTs as ground-truth. We contribute a\nnew sampling algorithm, which we call RCT rejection sampling, and provide\ntheoretical guarantees that causal identification holds in the observational\ndata to allow for valid comparisons to the ground-truth RCT. Using synthetic\ndata, we show our algorithm indeed results in low bias when oracle estimators\nare evaluated on the confounded samples, which is not always the case for a\npreviously proposed algorithm. In addition to this identification result, we\nhighlight several finite data considerations for evaluation designers who plan\nto use RCT rejection sampling on their own datasets. As a proof of concept, we\nimplement an example evaluation pipeline and walk through these finite data\nconsiderations with a novel, real-world RCT -- which we release publicly --\nconsisting of approximately 70k observations and text data as high-dimensional\ncovariates. Together, these contributions build towards a broader agenda of\nimproved empirical evaluation for causal estimation.\n","authors":["Katherine A. Keith","Sergey Feldman","David Jurgens","Jonathan Bragg","Rohit Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.15176v3.pdf","comment":"Code and data at https://github.com/kakeith/rct_rejection_sampling"},{"id":"http://arxiv.org/abs/2310.20703v2","updated":"2024-01-31T12:39:06Z","published":"2023-10-31T17:59:05Z","title":"Vanishing Gradients in Reinforcement Finetuning of Language Models","summary":" Pretrained language models are commonly aligned with human preferences and\ndownstream tasks via reinforcement finetuning (RFT), which refers to maximizing\na (possibly learned) reward function using policy gradient algorithms. This\nwork identifies a fundamental optimization obstacle in RFT: we prove that the\nexpected gradient for an input vanishes when its reward standard deviation\nunder the model is small, even if the expected reward is far from optimal.\nThrough experiments on an RFT benchmark and controlled environments, as well as\na theoretical analysis, we then demonstrate that vanishing gradients due to\nsmall reward standard deviation are prevalent and detrimental, leading to\nextremely slow reward maximization. Lastly, we explore ways to overcome\nvanishing gradients in RFT. We find the common practice of an initial\nsupervised finetuning (SFT) phase to be the most promising candidate, which\nsheds light on its importance in an RFT pipeline. Moreover, we show that a\nrelatively small number of SFT optimization steps on as few as 1% of the input\nsamples can suffice, indicating that the initial SFT phase need not be\nexpensive in terms of compute and data labeling efforts. 
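The RCT-subsampling idea above can be demonstrated numerically: rejection-sample a synthetic RCT so that treatment becomes correlated with a covariate, and watch the naive estimator drift from the known ground truth. The acceptance rule below is a simplified stand-in, not the paper's algorithm.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 100_000
x = rng.binomial(1, 0.5, n)                      # covariate
t = rng.binomial(1, 0.5, n)                      # randomized treatment
y = 1.0 * t + 2.0 * x + rng.normal(0.0, 1.0, n)  # outcome; true ATE = 1.0

# Keep units more often when t agrees with x, inducing confounding
# in the retained subsample (a toy acceptance rule).
p_keep = np.where(t == x, 0.8, 0.2)
keep = rng.uniform(size=n) < p_keep
naive = y[keep & (t == 1)].mean() - y[keep & (t == 0)].mean()
print(f"true ATE = 1.0, naive estimate on confounded subsample = {naive:.2f}")
```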
Overall, our results\nemphasize that being mindful of inputs whose expected gradient vanishes, as\nmeasured by the reward standard deviation, is crucial for successful execution\nof RFT.\n","authors":["Noam Razin","Hattie Zhou","Omid Saremi","Vimal Thilak","Arwen Bradley","Preetum Nakkiran","Joshua Susskind","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2310.20703v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.17791v1","updated":"2024-01-31T12:33:31Z","published":"2024-01-31T12:33:31Z","title":"Graph Transformers without Positional Encodings","summary":" Recently, Transformers for graph representation learning have become\nincreasingly popular, achieving state-of-the-art performance on a wide variety\nof datasets, either alone or in combination with message-passing graph neural\nnetworks (MP-GNNs). Infusing graph inductive biases into the innately\nstructure-agnostic transformer architecture in the form of structural or\npositional encodings (PEs) is key to achieving these impressive results.\nHowever, designing such encodings is tricky and disparate attempts have been\nmade to engineer such encodings, including Laplacian eigenvectors, relative\nrandom-walk probabilities (RRWP), spatial encodings, centrality encodings, and edge\nencodings. In this work, we argue that such encodings may not be required\nat all, provided the attention mechanism itself incorporates information about\nthe graph structure. We introduce Eigenformer, which uses a novel\nspectrum-aware attention mechanism cognizant of the Laplacian spectrum of the\ngraph, and empirically show that it achieves performance comparable to SOTA\nMP-GNN architectures and Graph Transformers on a number of standard GNN\nbenchmark datasets, even surpassing the SOTA on some datasets. We also find\nthat our architecture is much faster to train in terms of number of epochs,\npresumably due to the innate graph inductive biases.\n","authors":["Ayush Garg"],"pdf_url":"https://arxiv.org/pdf/2401.17791v1.pdf","comment":"Independent Research"},{"id":"http://arxiv.org/abs/2401.17790v1","updated":"2024-01-31T12:32:18Z","published":"2024-01-31T12:32:18Z","title":"RADIN: Souping on a Budget","summary":" Model Soups, extending Stochastic Weights Averaging (SWA), combine models\nfine-tuned with different hyperparameters. Yet, their adoption is hindered by\ncomputational challenges due to subset selection issues. In this paper, we\npropose to speed up model soups by approximating soup performance using\nthe performance of averaged ensemble logits. Theoretical insights validate the\ncongruence between ensemble logits and weight averaging soups across all mixing\nratios. Our Resource ADjusted soups craftINg (RADIN) procedure stands out by\nallowing flexible evaluation budgets, enabling users to adapt their exploration\nbudget to their resources while increasing performance at lower\nbudgets compared to the previous greedy approach (up to 4% on ImageNet).\n","authors":["Thibaut Menes","Olivier Risser-Maroix"],"pdf_url":"https://arxiv.org/pdf/2401.17790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17789v1","updated":"2024-01-31T12:32:17Z","published":"2024-01-31T12:32:17Z","title":"Robustly overfitting latents for flexible neural image compression","summary":" Neural image compression has made a great deal of progress. State-of-the-art\nmodels are based on variational autoencoders and are outperforming classical\nmodels. 
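The logit-averaging proxy from the RADIN abstract above can be sketched directly: instead of materializing and evaluating each weight-averaged soup, score the average of the candidates' logits. The placeholder logits and labels below are assumptions.

```python
import numpy as np

def averaged_logit_score(logits_per_model, labels):
    """Cheap proxy for a candidate soup: average the candidates' logits and
    measure the accuracy of that ensemble, rather than retraining and
    evaluating the weight-averaged model itself."""
    avg = logits_per_model.mean(axis=0)   # (models, n, classes) -> (n, classes)
    return (avg.argmax(axis=1) == labels).mean()

logits = np.random.randn(4, 1000, 10)     # 4 fine-tuned candidates, toy values
labels = np.random.randint(0, 10, 1000)
print(averaged_logit_score(logits, labels))
```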
Neural compression models learn to encode an image into a quantized\nlatent representation that can be efficiently sent to the decoder, which\ndecodes the quantized latent into a reconstructed image. While these models\nhave proven successful in practice, they lead to sub-optimal results due to\nimperfect optimization and limitations in the encoder and decoder capacity.\nRecent work shows how to use stochastic Gumbel annealing (SGA) to refine the\nlatents of pre-trained neural image compression models. We extend this idea by\nintroducing SGA+, which contains three different methods that build upon SGA.\nFurther, we give a detailed analysis of our proposed methods, show how they\nimprove performance, and show that they are less sensitive to hyperparameter\nchoices. Besides, we show how each method can be extended to three- instead of\ntwo-class rounding. Finally, we show how refinement of the latents with our\nbest-performing method improves the compression performance on the Tecnick\ndataset and how it can be deployed to partly move along the rate-distortion\ncurve.\n","authors":["Yura Perugachi-Diaz","Arwin Gansekoele","Sandjai Bhulai"],"pdf_url":"https://arxiv.org/pdf/2401.17789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17781v1","updated":"2024-01-31T12:23:55Z","published":"2024-01-31T12:23:55Z","title":"Vision-Assisted Digital Twin Creation for mmWave Beam Management","summary":" In the context of communication networks, digital twin technology provides a\nmeans to replicate the radio frequency (RF) propagation environment as well as\nthe system behaviour, allowing for a way to optimize the performance of a\ndeployed system based on simulations. One of the key challenges in the\napplication of Digital Twin technology to mmWave systems is the prevalent\nchannel simulators' stringent requirements on the accuracy of the 3D Digital\nTwin, reducing the feasibility of the technology in real applications. We\npropose a practical Digital Twin creation pipeline and a channel simulator,\nthat relies only on a single mounted camera and position information. We\ndemonstrate the performance benefits compared to methods that do not explicitly\nmodel the 3D environment, on downstream sub-tasks in beam acquisition, using\nthe real-world dataset of the DeepSense6G challenge\n","authors":["Maximilian Arnold","Bence Major","Fabio Valerio Massoli","Joseph B. Soriaga","Arash Behboodi"],"pdf_url":"https://arxiv.org/pdf/2401.17781v1.pdf","comment":"ICC2024 accepted paper. Copyright IEEE"},{"id":"http://arxiv.org/abs/2401.17780v1","updated":"2024-01-31T12:23:24Z","published":"2024-01-31T12:23:24Z","title":"A Policy Gradient Primal-Dual Algorithm for Constrained MDPs with\n Uniform PAC Guarantees","summary":" We study a primal-dual reinforcement learning (RL) algorithm for the online\nconstrained Markov decision processes (CMDP) problem, wherein the agent\nexplores an optimal policy that maximizes return while satisfying constraints.\nDespite its widespread practical use, the existing theoretical literature on\nprimal-dual RL algorithms for this problem only provides sublinear regret\nguarantees and fails to ensure convergence to optimal policies. In this paper,\nwe introduce a novel policy gradient primal-dual algorithm with uniform\nprobably approximate correctness (Uniform-PAC) guarantees, simultaneously\nensuring convergence to optimal policies, sublinear regret, and polynomial\nsample complexity for any target accuracy. 
Notably, this represents the first\nUniform-PAC algorithm for the online CMDP problem. In addition to the\ntheoretical guarantees, we empirically demonstrate in a simple CMDP that our\nalgorithm converges to optimal policies, while an existing algorithm exhibits\noscillatory performance and constraint violation.\n","authors":["Toshinori Kitamura","Tadashi Kozuno","Masahiro Kato","Yuki Ichihara","Soichiro Nishimori","Akiyoshi Sannai","Sho Sonoda","Wataru Kumagai","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2401.17780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12459v2","updated":"2024-01-31T11:56:57Z","published":"2023-08-23T22:50:52Z","title":"Consistent Signal Reconstruction from Streaming Multivariate Time Series","summary":" Digitalizing real-world analog signals typically involves sampling in time\nand discretizing in amplitude. Subsequent signal reconstructions inevitably\nincur an error that depends on the amplitude resolution and the temporal\ndensity of the acquired samples. From an implementation viewpoint, consistent\nsignal reconstruction methods have been shown to achieve a favorable error-rate decay as the\nsampling rate increases. However, these results were obtained in offline\nsettings. Therefore, a research gap exists regarding methods for consistent\nsignal reconstruction from data streams. Solving this problem is of great\nimportance because such methods could run at a lower computational cost than\nthe existing offline ones or be used under real-time requirements without\nlosing the benefits of ensuring consistency. In this paper, we formalize for\nthe first time the concept of consistent signal reconstruction from streaming\ntime-series data. Then, we present a signal reconstruction method able to\nenforce consistency and also exploit the spatiotemporal dependencies of\nstreaming multivariate time-series data to further reduce the signal\nreconstruction error. Our experiments show that our proposed method achieves a\nfavorable error-rate decay with the sampling rate compared to a similar but\nnon-consistent reconstruction.\n","authors":["Emilio Ruiz-Moreno","Luis Miguel López-Ramos","Baltasar Beferull-Lozano"],"pdf_url":"https://arxiv.org/pdf/2308.12459v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.06353v3","updated":"2024-01-31T11:49:06Z","published":"2023-12-11T13:03:21Z","title":"Federated Full-Parameter Tuning of Billion-Sized Language Models with\n Communication Cost under 18 Kilobytes","summary":" Pre-trained large language models (LLMs) need fine-tuning to improve their\nresponsiveness to natural language instructions. Federated learning offers a\nway to fine-tune LLMs using the abundant data on end devices without\ncompromising data privacy. Most existing federated fine-tuning methods for LLMs\nrely on parameter-efficient fine-tuning techniques, which may not reach the\nperformance height possible with full-parameter tuning. However, federated\nfull-parameter tuning of LLMs is a non-trivial problem due to the immense\ncommunication cost. This work introduces FedKSeed, which employs zeroth-order\noptimization with a finite set of random seeds. 
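For the policy gradient primal-dual scheme for CMDPs discussed a few entries above, the generic Lagrangian template is a pair of coupled updates; this is a sketch of that standard template (step sizes and the projection of the multiplier are assumed), not the paper's Uniform-PAC algorithm.

```python
def primal_dual_step(theta, lam, grad_return, grad_cost, avg_cost,
                     budget=1.0, lr_theta=1e-2, lr_lam=1e-2):
    """One step for: maximize E[return] subject to E[cost] <= budget.
    The policy parameter ascends the Lagrangian; the multiplier ascends
    the constraint violation and is clipped to stay nonnegative."""
    theta = theta + lr_theta * (grad_return - lam * grad_cost)  # primal ascent
    lam = max(0.0, lam + lr_lam * (avg_cost - budget))          # dual ascent
    return theta, lam
```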
It significantly reduces\ntransmission requirements between the server and clients to just a few random\nseeds and scalar gradients, amounting to only a few thousand bytes, making\nfederated full-parameter tuning of billion-sized LLMs possible on devices.\nBuilding on it, we develop a strategy enabling probability-differentiated seed\nsampling, prioritizing perturbations with greater impact on model accuracy.\nExperiments across six scenarios with various LLMs, datasets and data\npartitions demonstrate that our approach outperforms existing federated LLM\nfine-tuning methods in both communication efficiency and new task\ngeneralization.\n","authors":["Zhen Qin","Daoyuan Chen","Bingchen Qian","Bolin Ding","Yaliang Li","Shuiguang Deng"],"pdf_url":"https://arxiv.org/pdf/2312.06353v3.pdf","comment":"Codes are available at\n https://github.com/alibaba/FederatedScope/tree/FedKSeed. We will continuously\n update the codebase and arXiv version"},{"id":"http://arxiv.org/abs/2401.17760v1","updated":"2024-01-31T11:37:14Z","published":"2024-01-31T11:37:14Z","title":"Regularized Linear Discriminant Analysis Using a Nonlinear Covariance\n Matrix Estimator","summary":" Linear discriminant analysis (LDA) is a widely used technique for data\nclassification. The method offers adequate performance in many classification\nproblems, but it becomes inefficient when the data covariance matrix is\nill-conditioned. This often occurs when the feature space's dimensionality is\nhigher than or comparable to the training data size. Regularized LDA (RLDA)\nmethods based on regularized linear estimators of the data covariance matrix\nhave been proposed to cope with such a situation. The performance of RLDA\nmethods is well studied, with optimal regularization schemes already proposed.\nIn this paper, we investigate the capability of a positive semidefinite\nridge-type estimator of the inverse covariance matrix that coincides with a\nnonlinear (NL) covariance matrix estimator. The estimator is derived by\nreformulating the score function of the optimal classifier utilizing linear\nestimation methods, which eventually results in the proposed NL-RLDA\nclassifier. We derive asymptotic and consistent estimators of the proposed\ntechnique's misclassification rate under the assumptions of a double-asymptotic\nregime and multivariate Gaussian model for the classes. The consistent\nestimator, coupled with a one-dimensional grid search, is used to set the value\nof the regularization parameter required for the proposed NL-RLDA classifier.\nPerformance evaluations based on both synthetic and real data demonstrate the\neffectiveness of the proposed classifier. The proposed technique outperforms\nstate-of-the-art methods across multiple datasets.\n","authors":["Maaz Mahadi","Tarig Ballal","Muhammad Moinuddin","Tareq Y. Al-Naffouri","Ubaid M. Al-Saggaf"],"pdf_url":"https://arxiv.org/pdf/2401.17760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17752v1","updated":"2024-01-31T11:26:03Z","published":"2024-01-31T11:26:03Z","title":"PF-GNN: Differentiable particle filtering based approximation of\n universal graph representations","summary":" Message passing Graph Neural Networks (GNNs) are known to be limited in\nexpressive power by the 1-WL color-refinement test for graph isomorphism. 
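The seed-plus-scalar trick behind FedKSeed's communication savings above can be sketched with a simultaneous-perturbation finite difference: the perturbation direction is fully determined by a seed, so only the seed and one scalar cross the wire. Function names, epsilon, and the learning rate are assumptions.

```python
import numpy as np

def client_step(params, loss_fn, seed, eps=1e-3):
    """Client: estimate the directional gradient along a perturbation that
    is regenerated from `seed`; return only (seed, scalar)."""
    z = np.random.default_rng(seed).standard_normal(params.shape)
    g = (loss_fn(params + eps * z) - loss_fn(params - eps * z)) / (2 * eps)
    return seed, g

def server_apply(params, seed, g, lr=1e-4):
    """Server: rebuild the identical perturbation from the seed and apply
    the scalar-scaled update; no full gradient is ever transmitted."""
    z = np.random.default_rng(seed).standard_normal(params.shape)
    return params - lr * g * z
```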
Other\nmore expressive models either are computationally expensive or need\npreprocessing to extract structural features from the graph. In this work, we\npropose to make GNNs universal by guiding the learning process with exact\nisomorphism solver techniques which operate on the paradigm of\nIndividualization and Refinement (IR), a method to artificially introduce\nasymmetry and further refine the coloring when 1-WL stops. Isomorphism solvers\ngenerate a search tree of colorings whose leaves uniquely identify the graph.\nHowever, the tree grows exponentially large and needs hand-crafted pruning\ntechniques which are not desirable from a learning perspective. We take a\nprobabilistic view and approximate the search tree of colorings (i.e.\nembeddings) by sampling multiple paths from root to leaves of the search tree.\nTo learn more discriminative representations, we guide the sampling process\nwith particle filter updates, a principled approach for sequential state\nestimation. Our algorithm is end-to-end differentiable, can be applied with any\nGNN as backbone and learns richer graph representations with only linear\nincrease in runtime. Experimental evaluation shows that our approach\nconsistently outperforms leading GNN models on both synthetic benchmarks for\nisomorphism detection as well as real-world datasets.\n","authors":["Mohammed Haroon Dupty","Yanfei Dong","Wee Sun Lee"],"pdf_url":"https://arxiv.org/pdf/2401.17752v1.pdf","comment":"Published as a conference paper at ICLR 2022"},{"id":"http://arxiv.org/abs/2401.17743v1","updated":"2024-01-31T11:02:45Z","published":"2024-01-31T11:02:45Z","title":"Algorithmic Robust Forecast Aggregation","summary":" Forecast aggregation combines the predictions of multiple forecasters to\nimprove accuracy. However, the lack of knowledge about forecasters' information\nstructure hinders optimal aggregation. Given a family of information\nstructures, robust forecast aggregation aims to find the aggregator with\nminimal worst-case regret compared to the omniscient aggregator. Previous\napproaches for robust forecast aggregation rely on heuristic observations and\nparameter tuning. We propose an algorithmic framework for robust forecast\naggregation. Our framework provides efficient approximation schemes for general\ninformation aggregation with a finite family of possible information\nstructures. In the setting considered by Arieli et al. (2018) where two agents\nreceive independent signals conditioned on a binary state, our framework also\nprovides efficient approximation schemes by imposing Lipschitz conditions on\nthe aggregator or discrete conditions on agents' reports. Numerical experiments\ndemonstrate the effectiveness of our method by providing a nearly optimal\naggregator in the setting considered by Arieli et al. (2018).\n","authors":["Yongkang Guo","Jason D. Hartline","Zhihuan Huang","Yuqing Kong","Anant Shah","Fang-Yi Yu"],"pdf_url":"https://arxiv.org/pdf/2401.17743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16986v2","updated":"2024-01-31T11:01:06Z","published":"2024-01-30T13:15:59Z","title":"Causal Machine Learning for Cost-Effective Allocation of Development Aid","summary":" The Sustainable Development Goals (SDGs) of the United Nations provide a\nblueprint of a better future by 'leaving no one behind', and, to achieve the\nSDGs by 2030, poor countries require immense volumes of development aid. 
In\nthis paper, we develop a causal machine learning framework for predicting\nheterogeneous treatment effects of aid disbursements to inform effective aid\nallocation. Specifically, our framework comprises three components: (i) a\nbalancing autoencoder that uses representation learning to embed\nhigh-dimensional country characteristics while addressing treatment selection\nbias; (ii) a counterfactual generator to compute counterfactual outcomes for\nvarying aid volumes to address small sample-size settings; and (iii) an\ninference model that is used to predict heterogeneous treatment-response\ncurves. We demonstrate the effectiveness of our framework using data with\nofficial development aid earmarked to end HIV/AIDS in 105 countries, amounting\nto more than USD 5.2 billion. For this, we first show that our framework\nsuccessfully computes heterogeneous treatment-response curves using\nsemi-synthetic data. Then, we demonstrate our framework using real-world HIV\ndata. Our framework points to large opportunities for a more effective aid\nallocation, suggesting that the total number of new HIV infections could be\nreduced by up to 3.3% (~50,000 cases) compared to the current allocation\npractice.\n","authors":["Milan Kuzmanovic","Dennis Frauen","Tobias Hatt","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2401.16986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17739v1","updated":"2024-01-31T10:59:57Z","published":"2024-01-31T10:59:57Z","title":"Operator learning without the adjoint","summary":" There is a mystery at the heart of operator learning: how can one recover a\nnon-self-adjoint operator from data without probing the adjoint? Current\npractical approaches suggest that one can accurately recover an operator while\nonly using data generated by the forward action of the operator without access\nto the adjoint. However, naively, it seems essential to sample the action of\nthe adjoint. In this paper, we partially explain this mystery by proving that\nwithout querying the adjoint, one can approximate a family of non-self-adjoint\ninfinite-dimensional compact operators via projection onto a Fourier basis. We\nthen apply the result to recovering Green's functions of elliptic partial\ndifferential operators and derive an adjoint-free sample complexity bound.\nWhile existing theory justifies low sample complexity in operator learning,\nours is the first adjoint-free analysis that attempts to close the gap between\ntheory and practice.\n","authors":["Nicolas Boullé","Diana Halikias","Samuel E. Otto","Alex Townsend"],"pdf_url":"https://arxiv.org/pdf/2401.17739v1.pdf","comment":"49 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.17738v1","updated":"2024-01-31T10:58:59Z","published":"2024-01-31T10:58:59Z","title":"Harnessing Smartwatch Microphone Sensors for Cough Detection and\n Classification","summary":" This study investigates the potential of using smartwatches with built-in\nmicrophone sensors for monitoring coughs and detecting various cough types. We\nconducted a study involving 32 participants and collected 9 hours of audio data\nin a controlled manner. Afterward, we processed this data using a structured\napproach, resulting in 223 positive cough samples. We further improved the\ndataset through augmentation techniques and employed a specialized 1D CNN\nmodel. 
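A specialized 1D CNN of the kind just mentioned can be very small; the channel counts, kernel sizes, and binary cough/non-cough head below are assumptions, not the study's architecture.

```python
import torch
import torch.nn as nn

# A compact 1D CNN over raw audio windows.
cough_cnn = nn.Sequential(
    nn.Conv1d(1, 16, kernel_size=9, stride=2), nn.ReLU(),
    nn.Conv1d(16, 32, kernel_size=9, stride=2), nn.ReLU(),
    nn.AdaptiveAvgPool1d(1), nn.Flatten(),
    nn.Linear(32, 2),                          # cough vs. non-cough
)
logits = cough_cnn(torch.randn(8, 1, 16_000))  # batch of 1 s windows at 16 kHz
```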
This model achieved an accuracy of 98.49% when participants were not\nwalking and 98.2% while walking, showing that smartwatches can detect coughs.\nMoreover, our research successfully identified four distinct types of coughs\nusing clustering techniques.\n","authors":["Pranay Jaiswal","Haroon R. Lone"],"pdf_url":"https://arxiv.org/pdf/2401.17738v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2401.17737v1","updated":"2024-01-31T10:58:13Z","published":"2024-01-31T10:58:13Z","title":"Hierarchical Bias-Driven Stratification for Interpretable Causal Effect\n Estimation","summary":" Interpretability and transparency are essential for incorporating causal\neffect models from observational data into policy decision-making. They can\nprovide trust in the model in the absence of ground-truth labels for evaluating\nthe accuracy of such models. To date, attempts at transparent causal effect\nestimation consist of applying post hoc explanation methods to black-box\nmodels, which are not interpretable. Here, we present BICauseTree: an\ninterpretable balancing method that identifies clusters where natural\nexperiments occur locally. Our approach builds on decision trees with a\ncustomized objective function to improve balancing and reduce treatment\nallocation bias. Consequently, it can additionally detect subgroups presenting\npositivity violations, exclude them, and provide a covariate-based definition\nof the target population we can infer from and generalize to. We evaluate the\nmethod's performance using synthetic and realistic datasets, explore its\nbias-interpretability tradeoff, and show that it is comparable with existing\napproaches.\n","authors":["Lucile Ter-Minassian","Liran Szlak","Ehud Karavani","Chris Holmes","Yishai Shimoni"],"pdf_url":"https://arxiv.org/pdf/2401.17737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17733v1","updated":"2024-01-31T10:54:34Z","published":"2024-01-31T10:54:34Z","title":"Towards Physical Plausibility in Neuroevolution Systems","summary":" The growing use of Artificial Intelligence (AI) models, especially Deep\nNeural Networks (DNNs), is increasing power consumption during training and\ninference, posing environmental concerns and driving the need for more\nenergy-efficient algorithms and hardware solutions. This work addresses the\ngrowing energy consumption problem in Machine Learning (ML), particularly\nduring the inference phase. Even a slight reduction in power usage can lead to\nsignificant energy savings, benefiting users, companies, and the environment.\nOur approach focuses on maximizing the accuracy of Artificial Neural Network\n(ANN) models using a neuroevolutionary framework whilst minimizing their power\nconsumption. To do so, power consumption is considered in the fitness function.\nWe introduce a new mutation strategy that stochastically reintroduces modules\nof layers, with power-efficient modules having a higher chance of being chosen.\nWe also introduce a novel technique that trains two separate models in a\nsingle training step, promoting one of them to be more power efficient\nthan the other while maintaining similar accuracy. 
The results demonstrate a\nreduction in power consumption of ANN models by up to 29.2% without a\nsignificant decrease in predictive performance.\n","authors":["Gabriel Cortês","Nuno Lourenço","Penousal Machado"],"pdf_url":"https://arxiv.org/pdf/2401.17733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14440v2","updated":"2024-01-31T10:52:52Z","published":"2024-01-25T14:47:05Z","title":"Semantic Sensitivities and Inconsistent Predictions: Measuring the\n Fragility of NLI Models","summary":" Recent studies of the emergent capabilities of transformer-based Natural\nLanguage Understanding (NLU) models have indicated that they have an\nunderstanding of lexical and compositional semantics. We provide evidence that\nsuggests these claims should be taken with a grain of salt: we find that\nstate-of-the-art Natural Language Inference (NLI) models are sensitive to\nminor semantics-preserving surface-form variations, which lead to sizable\ninconsistent model decisions during inference. Notably, this behaviour differs\nfrom valid, in-depth comprehension of compositional semantics; however, it\nemerges neither when evaluating model accuracy on standard benchmarks nor when\nprobing for syntactic, monotonic, and logically robust reasoning. We propose a\nnovel framework to measure the extent of semantic sensitivity. To this end, we\nevaluate NLI models on adversarially generated examples containing minor\nsemantics-preserving surface-form input noise. This is achieved using\nconditional text generation, with the explicit condition that the NLI model\npredicts the relationship between the original and adversarial inputs as a\nsymmetric equivalence entailment. We systematically study the effects of the\nphenomenon across NLI models for $\textbf{in-}$ and $\textbf{out-of-}$ domain\nsettings. Our experiments show that semantic sensitivity causes performance\ndegradations of $12.92\%$ and $23.71\%$ on average over $\textbf{in-}$ and\n$\textbf{out-of-}$ domain settings, respectively. We further perform ablation\nstudies, analysing this phenomenon across models, datasets, and variations in\ninference and show that semantic sensitivity can lead to major inconsistency\nwithin model predictions.\n","authors":["Erik Arakelyan","Zhaoqi Liu","Isabelle Augenstein"],"pdf_url":"https://arxiv.org/pdf/2401.14440v2.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2401.15469v2","updated":"2024-01-31T10:17:28Z","published":"2024-01-27T17:43:08Z","title":"Wind speed super-resolution and validation: from ERA5 to CERRA via\n diffusion models","summary":" The Copernicus Regional Reanalysis for Europe, CERRA, is a high-resolution\nregional reanalysis dataset for the European domain. In recent years it has\nshown significant utility across various climate-related tasks, ranging from\nforecasting and climate change research to renewable energy prediction,\nresource management, air quality risk assessment, and the forecasting of rare\nevents, among others. Unfortunately, the availability of CERRA is lagging two\nyears behind the current date, due to constraints in acquiring the requisite\nexternal data and the intensive computational demands inherent in its\ngeneration. As a solution, this paper introduces a novel method using diffusion\nmodels to approximate CERRA downscaling in a data-driven manner, without\nadditional information. By leveraging the lower resolution ERA5 dataset, which\nprovides boundary conditions for CERRA, we approach this as a super-resolution\ntask. 
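The power-aware fitness and mutation described in the neuroevolution abstract above might be sketched as follows; the trade-off weight and the inverse-power sampling rule are assumptions for illustration.

```python
import numpy as np

def power_aware_fitness(accuracy, power_watts, alpha=0.1):
    """Fitness that also charges for inference power; `alpha` is an assumed
    trade-off weight, not a value from the paper."""
    return accuracy - alpha * power_watts

def reintroduce_module(modules, power_per_module, rng=None):
    """Mutation: pick a layer module to reinsert, giving power-efficient
    modules a proportionally higher chance of being chosen."""
    rng = rng or np.random.default_rng()
    w = 1.0 / np.asarray(power_per_module, dtype=float)
    return modules[rng.choice(len(modules), p=w / w.sum())]
```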
Focusing on wind speed around Italy, our model, trained on existing CERRA\ndata, shows promising results, closely mirroring original CERRA data.\nValidation with in-situ observations further confirms the model's accuracy in\napproximating ground measurements.\n","authors":["Fabio Merizzi","Andrea Asperti","Stefano Colamonaco"],"pdf_url":"https://arxiv.org/pdf/2401.15469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09622v4","updated":"2024-01-31T09:51:15Z","published":"2022-05-19T15:37:26Z","title":"What Is Fairness? On the Role of Protected Attributes and Fictitious\n Worlds","summary":" A growing body of literature in fairness-aware ML (fairML) aspires to\nmitigate machine learning (ML)-related unfairness in automated decision-making\n(ADM) by defining metrics that measure fairness of an ML model and by proposing\nmethods that ensure that trained ML models achieve low values in those metrics.\nHowever, the underlying concept of fairness, i.e., the question of what\nfairness is, is rarely discussed, leaving a considerable gap between centuries\nof philosophical discussion and recent adoption of the concept in the ML\ncommunity. In this work, we try to bridge this gap by formalizing a consistent\nconcept of fairness and by translating the philosophical considerations into a\nformal framework for the training and evaluation of ML models in ADM systems.\nWe derive that fairness problems can already arise without the presence of\nprotected attributes (PAs), pointing out that fairness and predictive\nperformance are not irreconcilable counterparts, but rather that the latter is\nnecessary to achieve the former. Moreover, we argue why and how causal\nconsiderations are necessary when assessing fairness in the presence of PAs by\nproposing a fictitious, normatively desired (FiND) world where the PAs have no\ncausal effects. In practice, this FiND world must be approximated by a warped\nworld, for which the causal effects of the PAs must be removed from the\nreal-world data. Eventually, we achieve greater linguistic clarity for the\ndiscussion of fairML. We propose first algorithms for practical applications\nand present illustrative experiments on COMPAS data.\n","authors":["Ludwig Bothmann","Kristina Peters","Bernd Bischl"],"pdf_url":"https://arxiv.org/pdf/2205.09622v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17705v1","updated":"2024-01-31T09:49:46Z","published":"2024-01-31T09:49:46Z","title":"Predicting suicidal behavior among Indian adults using childhood trauma,\n mental health questionnaires and machine learning cascade ensembles","summary":" Among young adults, suicide is India's leading cause of death, accounting for\nan alarming national suicide rate of around 16%. In recent years, machine\nlearning algorithms have emerged to predict suicidal behavior using various\nbehavioral traits. But to date, the efficacy of machine learning algorithms in\npredicting suicidal behavior in the Indian context has not been explored in\nliterature. In this study, different machine learning algorithms and ensembles\nwere developed to predict suicide behavior based on childhood trauma, different\nmental health parameters, and other behavioral factors. The dataset was\nacquired from 391 individuals from a wellness center in India. Information\nregarding their childhood trauma, psychological wellness, and other mental\nhealth issues was acquired through standardized questionnaires. 
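As a rough, hypothetical sketch of the layered ensemble described in the suicidal-behavior abstract above, a scikit-learn stacking ensemble stands in for the paper's cascade of a support vector machine, decision trees, and random forest (the actual cascade design may differ):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Synthetic stand-in for the questionnaire features (391 respondents).
X, y = make_classification(n_samples=391, n_features=20, random_state=0)
ensemble = StackingClassifier(
    estimators=[("svm", SVC(probability=True)), ("dt", DecisionTreeClassifier())],
    final_estimator=RandomForestClassifier(n_estimators=100),
)
print(ensemble.fit(X, y).score(X, y))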
Results\nrevealed that cascade ensemble learning methods using a support vector machine,\ndecision trees, and random forest were able to classify suicidal behavior with\nan accuracy of 95.04% using data from childhood trauma and mental health\nquestionnaires. The study highlights the potential of using these machine\nlearning ensembles to identify individuals with suicidal tendencies so that\ntargeted interventions can be provided efficiently.\n","authors":["Akash K Rao","Gunjan Y Trivedi","Riri G Trivedi","Anshika Bajpai","Gajraj Singh Chauhan","Vishnu K Menon","Kathirvel Soundappan","Hemalatha Ramani","Neha Pandya","Varun Dutt"],"pdf_url":"https://arxiv.org/pdf/2401.17705v1.pdf","comment":"11 pages, presented at the 4th International Conference on Frontiers\n in Computing and Systems (COMSYS 2023), Himachal Pradesh, October 2023"},{"id":"http://arxiv.org/abs/2401.17695v1","updated":"2024-01-31T09:31:28Z","published":"2024-01-31T09:31:28Z","title":"Datacube segmentation via Deep Spectral Clustering","summary":" Extended Vision techniques are ubiquitous in physics. However, the data cubes\nstemming from such analysis often pose a challenge in their interpretation, due\nto the intrinsic difficulty in discerning the relevant information from the\nspectra composing the data cube.\n Furthermore, the huge dimensionality of data cube spectra poses a complex\ntask in its statistical interpretation; nevertheless, this complexity contains\na massive amount of statistical information that can be exploited in an\nunsupervised manner to outline some essential properties of the case study at\nhand, e.g.~it is possible to obtain an image segmentation via (deep) clustering\nof data-cube's spectra, performed in a suitably defined low-dimensional\nembedding space.\n To tackle this topic, we explore the possibility of applying unsupervised\nclustering methods in encoded space, i.e. perform deep clustering on the\nspectral properties of datacube pixels. A statistical dimensional reduction is\nperformed by an ad hoc trained (Variational) AutoEncoder, in charge of mapping\nspectra into lower dimensional metric spaces, while the clustering process is\nperformed by a (learnable) iterative K-Means clustering algorithm.\n We apply this technique to two different use cases, of different physical\norigins: a set of Macro mapping X-Ray Fluorescence (MA-XRF) synthetic data on\npictorial artworks, and a dataset of simulated astrophysical observations.\n","authors":["Alessandro Bombini","Fernando García-Avello Bofías","Caterina Bracci","Michele Ginolfi","Chiara Ruberto"],"pdf_url":"https://arxiv.org/pdf/2401.17695v1.pdf","comment":"20 pages, 10 figures, doi for code repository, dataset and trained\n model available and reported in the paper"},{"id":"http://arxiv.org/abs/2401.17675v1","updated":"2024-01-31T08:52:45Z","published":"2024-01-31T08:52:45Z","title":"Convergence analysis of t-SNE as a gradient flow for point cloud on a\n manifold","summary":" We present a theoretical foundation regarding the boundedness of the t-SNE\nalgorithm. t-SNE employs gradient descent iteration with Kullback-Leibler (KL)\ndivergence as the objective function, aiming to identify a set of points that\nclosely resemble the original data points in a high-dimensional space,\nminimizing KL divergence. 
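A compact sketch of the encode-then-cluster pattern from the datacube-segmentation abstract above; a plain autoencoder and scikit-learn KMeans stand in for the paper's (Variational) AutoEncoder and learnable iterative K-Means, so the details are assumptions:

import torch
from sklearn.cluster import KMeans

spectra = torch.randn(1000, 64)  # toy stand-in: 1000 pixels, 64 spectral channels
encoder = torch.nn.Sequential(torch.nn.Linear(64, 16), torch.nn.ReLU(), torch.nn.Linear(16, 2))
decoder = torch.nn.Sequential(torch.nn.Linear(2, 16), torch.nn.ReLU(), torch.nn.Linear(16, 64))
opt = torch.optim.Adam([*encoder.parameters(), *decoder.parameters()], lr=1e-3)

for _ in range(200):  # train on reconstruction loss only
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(decoder(encoder(spectra)), spectra)
    loss.backward()
    opt.step()

# Cluster pixels in the learned low-dimensional embedding space.
labels = KMeans(n_clusters=4, n_init=10).fit_predict(encoder(spectra).detach().numpy())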
Investigating t-SNE properties such as perplexity and\naffinity under a weak convergence assumption on the sampled dataset, we examine\nthe behavior of points generated by t-SNE under continuous gradient flow.\nDemonstrating that points generated by t-SNE remain bounded, we leverage this\ninsight to establish the existence of a minimizer for KL divergence.\n","authors":["Seonghyeon Jeong","Hau-Tieng Wu"],"pdf_url":"https://arxiv.org/pdf/2401.17675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16092v2","updated":"2024-01-31T08:33:37Z","published":"2024-01-29T12:02:28Z","title":"Multilingual Text-to-Image Generation Magnifies Gender Stereotypes and\n Prompt Engineering May Not Help You","summary":" Text-to-image generation models have recently achieved astonishing results in\nimage quality, flexibility, and text alignment and are consequently employed in\na fast-growing number of applications. Through improvements in multilingual\nabilities, a larger community now has access to this kind of technology. Yet,\nas we will show, multilingual models suffer similarly from (gender) biases as\nmonolingual models. Furthermore, the natural expectation is that these models\nwill provide similar results across languages, but this is not the case and\nthere are important differences between languages. Thus, we propose a novel\nbenchmark MAGBIG intending to foster research in multilingual models without\ngender bias. We investigate whether multilingual T2I models magnify gender bias\nwith MAGBIG. To this end, we use multilingual prompts requesting portrait\nimages of persons of a certain occupation or trait (using adjectives). Our\nresults show not only that models deviate from the normative assumption that\neach gender should be equally likely to be generated, but that there are also\nbig differences across languages. Furthermore, we investigate prompt\nengineering strategies, i.e. the use of indirect, neutral formulations, as a\npossible remedy for these biases. Unfortunately, they help only to a limited\nextent and result in worse text-to-image alignment. Consequently, this work\ncalls for more research into diverse representations across languages in image\ngenerators.\n","authors":["Felix Friedrich","Katharina Hämmerl","Patrick Schramowski","Jindrich Libovicky","Kristian Kersting","Alexander Fraser"],"pdf_url":"https://arxiv.org/pdf/2401.16092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17657v1","updated":"2024-01-31T08:21:35Z","published":"2024-01-31T08:21:35Z","title":"An attempt to generate new bridge types from latent space of\n energy-based model","summary":" We use an energy-based model for bridge-type innovation. The loss function is\nexplained by game theory; the logic is clear and the formula is simple.\nThis avoids the use of maximum likelihood estimation to explain the loss\nfunction and eliminates the need for Monte Carlo methods to solve the normalized\ndenominator. Assuming that the bridge-type population follows a Boltzmann\ndistribution, a neural network is constructed to represent the energy function.\nLangevin dynamics is used to generate new samples with low energy\nvalues, thus establishing an energy-based generative model of bridge types.\nThe energy function is trained on a symmetric structured image dataset of three-span beam\nbridges, arch bridges, cable-stayed bridges, and suspension bridges to accurately\ncalculate the energy values of real and fake samples. 
Sampling from the latent\nspace, the energy function uses a gradient descent algorithm to transform the\nsampled points into low-energy samples, thereby generating new bridge\ntypes different from the dataset. Because training in this attempt was unstable and slow, new\nbridge types are generated only rarely and the generated images are of low\ndefinition.\n","authors":["Hongjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17657v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.17653v1","updated":"2024-01-31T08:13:35Z","published":"2024-01-31T08:13:35Z","title":"A primer on synthetic health data","summary":" Recent advances in deep generative models have greatly expanded the potential\nto create realistic synthetic health datasets. These synthetic datasets aim to\npreserve the characteristics, patterns, and overall scientific conclusions\nderived from sensitive health datasets without disclosing patient identity or\nsensitive information. Thus, synthetic data can facilitate safe data sharing\nthat supports a range of initiatives including the development of new\npredictive models, advanced health IT platforms, and general project ideation\nand hypothesis development. However, many questions and challenges remain,\nincluding how to consistently evaluate a synthetic dataset's similarity and\npredictive utility in comparison to the original real dataset and risk to\nprivacy when shared. Additional regulatory and governance issues have not been\nwidely addressed. In this primer, we map the state of synthetic health data,\nincluding generation and evaluation methods and tools, existing examples of\ndeployment, the regulatory and ethical landscape, access and governance\noptions, and opportunities for further development.\n","authors":["Jennifer Anne Bartell","Sander Boisen Valentin","Anders Krogh","Henning Langberg","Martin Bøgsted"],"pdf_url":"https://arxiv.org/pdf/2401.17653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12784v3","updated":"2024-01-31T08:07:23Z","published":"2023-12-20T06:10:27Z","title":"Fast Cell Library Characterization for Design Technology Co-Optimization\n Based on Graph Neural Networks","summary":" Design technology co-optimization (DTCO) plays a critical role in achieving\noptimal power, performance, and area (PPA) for advanced semiconductor process\ndevelopment. Cell library characterization is essential in DTCO flow, but\ntraditional methods are time-consuming and costly. To overcome these\nchallenges, we propose a graph neural network (GNN)-based machine learning\nmodel for rapid and accurate cell library characterization. Our model\nincorporates cell structures and demonstrates high prediction accuracy across\nvarious process-voltage-temperature (PVT) corners and technology parameters.\nValidation with 512 unseen technology corners and over one million test data\npoints shows accurate predictions of delay, power, and input pin capacitance\nfor 33 types of cells, with a mean absolute percentage error (MAPE) $\\le$ 0.95%\nand a speed-up of 100X compared with SPICE simulations. Additionally, we\ninvestigate system-level metrics such as worst negative slack (WNS), leakage\npower, and dynamic power using predictions obtained from the GNN-based model on\nunseen corners. Our model achieves precise predictions, with absolute error\n$\\le$3.0 ps for WNS, percentage errors $\\le$0.60% for leakage power, and\n$\\le$0.99% for dynamic power, when compared to the golden reference. 
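A minimal sketch of the Langevin-dynamics sampling step referenced in the energy-based bridge-generation abstract above; the quadratic toy energy and step sizes are assumptions, not the paper's trained network:

import torch

def langevin_sample(energy, x, steps=100, step_size=0.01):
    # Unadjusted Langevin dynamics: drift down the energy gradient, add noise.
    for _ in range(steps):
        x = x.detach().requires_grad_(True)
        grad = torch.autograd.grad(energy(x).sum(), x)[0]
        x = x - 0.5 * step_size * grad + (step_size ** 0.5) * torch.randn_like(x)
    return x.detach()

# Toy quadratic energy: samples drift toward the low-energy region at the origin.
samples = langevin_sample(lambda x: (x ** 2).sum(dim=1), torch.randn(8, 2))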
With the\ndeveloped model, we further propose a fine-grained drive strength\ninterpolation methodology to enhance PPA for small-to-medium-scale designs,\nresulting in an approximate 1-3% improvement.\n","authors":["Tianliang Ma","Zhihui Deng","Xuguang Sun","Leilai Shao","Kainlu Low"],"pdf_url":"https://arxiv.org/pdf/2312.12784v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14442v2","updated":"2024-01-31T07:12:52Z","published":"2024-01-25T16:04:17Z","title":"Improving Antibody Humanness Prediction using Patent Data","summary":" We investigate the potential of patent data for improving the antibody\nhumanness prediction using a multi-stage, multi-loss training process.\nHumanness serves as a proxy for the immunogenic response to antibody\ntherapeutics, one of the major causes of attrition in drug discovery and a\nchallenging obstacle for their use in clinical settings. We pose the initial\nlearning stage as a weakly-supervised contrastive-learning problem, where each\nantibody sequence is associated with possibly multiple identifiers of function\nand the objective is to learn an encoder that groups them according to their\npatented properties. We then freeze a part of the contrastive encoder and\ncontinue training it on the patent data using the cross-entropy loss to predict\nthe humanness score of a given antibody sequence. We illustrate the utility of\nthe patent data and our approach by performing inference on three different\nimmunogenicity datasets, unseen during training. Our empirical results\ndemonstrate that the learned model consistently outperforms the alternative\nbaselines and establishes new state-of-the-art on five out of six inference\ntasks, irrespective of the used metric.\n","authors":["Talip Ucar","Aubin Ramon","Dino Oglic","Rebecca Croasdale-Wood","Tom Diethe","Pietro Sormanni"],"pdf_url":"https://arxiv.org/pdf/2401.14442v2.pdf","comment":"13 pages, 6 figures, Code: https://github.com/AstraZeneca/SelfPAD"},{"id":"http://arxiv.org/abs/2401.17629v1","updated":"2024-01-31T07:11:01Z","published":"2024-01-31T07:11:01Z","title":"Spatial-and-Frequency-aware Restoration method for Images based on\n Diffusion Models","summary":" Diffusion models have recently emerged as a promising framework for Image\nRestoration (IR), owing to their ability to produce high-quality\nreconstructions and their compatibility with established methods. Existing\nmethods for solving noisy inverse problems in IR consider only pixel-wise\ndata-fidelity. In this paper, we propose SaFaRI, a spatial-and-frequency-aware\ndiffusion model for IR with Gaussian noise. Our model encourages images to\npreserve data-fidelity in both the spatial and frequency domains, resulting in\nenhanced reconstruction quality. We comprehensively evaluate the performance of\nour model on a variety of noisy inverse problems, including inpainting,\ndenoising, and super-resolution. 
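To illustrate the dual-domain fidelity idea in the SaFaRI abstract above, a small sketch combining pixel-space and Fourier-space penalties; the specific weighting and loss form are assumptions, not the paper's exact objective:

import torch

def spatial_and_frequency_fidelity(x, y, alpha=0.5):
    # Penalize mismatch both in pixel space and in the 2-D Fourier domain.
    spatial = torch.mean((x - y) ** 2)
    frequency = torch.mean(torch.abs(torch.fft.fft2(x) - torch.fft.fft2(y)) ** 2)
    return spatial + alpha * frequency

loss = spatial_and_frequency_fidelity(torch.rand(1, 32, 32), torch.rand(1, 32, 32))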
Our thorough evaluation demonstrates that\nSaFaRI achieves state-of-the-art performance on both the ImageNet datasets and\nFFHQ datasets, outperforming existing zero-shot IR methods in terms of LPIPS\nand FID metrics.\n","authors":["Kyungsung Lee","Donggyu Lee","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2401.17629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15623v2","updated":"2024-01-31T06:58:51Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocesses lack transparency. This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nnecessitate annotations or additional training data. The injection of the\nextracted knowledge necessitates the addition of only simple neural modules. We\nemploy the Convex Polytopic Model (CPM) as a feature extraction tool for DST\ntasks and illustrate that the acquired features correlate with the syntactic\nand semantic patterns in the dialogues. This correlation facilitates a\ncomprehensive understanding of the linguistic features influencing the DST\nmodel's decision-making process. We benchmark this framework on various DST\ntasks and observe a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2401.17626v1","updated":"2024-01-31T06:58:26Z","published":"2024-01-31T06:58:26Z","title":"Generative AI to Generate Test Data Generators","summary":" Generating fake data is an essential dimension of modern software testing, as\ndemonstrated by the number and significance of data faking libraries. Yet,\ndevelopers of faking libraries cannot keep up with the wide range of data to be\ngenerated for different natural languages and domains. In this paper, we assess\nthe ability of generative AI for generating test data in different domains. We\ndesign three types of prompts for Large Language Models (LLMs), which perform\ntest data generation tasks at different levels of integrability: 1) raw test\ndata generation, 2) synthesizing programs in a specific language that generate\nuseful test data, and 3) producing programs that use state-of-the-art faker\nlibraries. We evaluate our approach by prompting LLMs to generate test data for\n11 domains. The results show that LLMs can successfully generate realistic test\ndata generators in a wide range of domains at all three levels of\nintegrability.\n","authors":["Benoit Baudry","Khashayar Etemadi","Sen Fang","Yogya Gamage","Yi Liu","Yuxin Liu","Martin Monperrus","Javier Ron","André Silva","Deepika Tiwari"],"pdf_url":"https://arxiv.org/pdf/2401.17626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17615v1","updated":"2024-01-31T05:59:38Z","published":"2024-01-31T05:59:38Z","title":"Graph Multi-Similarity Learning for Molecular Property Prediction","summary":" Effective molecular representation learning is essential for molecular\nproperty prediction. 
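The three integrability levels in the test-data-generation abstract above map naturally onto three prompt templates; the wording below is hypothetical, purely to make the levels concrete:

PROMPTS = {
    "raw_data": "Generate 10 realistic {domain} records as JSON.",
    "generator_program": "Write a Python function that returns one realistic random {domain} record.",
    "faker_program": "Write a Python function that uses a faker library to build a realistic {domain} record.",
}
for level, template in PROMPTS.items():
    print(level, "->", template.format(domain="postal address"))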
Contrastive learning, a prominent self-supervised approach\nfor molecular representation learning, relies on establishing positive and\nnegative pairs. However, this binary similarity categorization oversimplifies\nthe nature of complex molecular relationships and overlooks the degree of\nrelative similarities among molecules, posing challenges to the effectiveness\nand generality of representation learning. In response to this challenge, we\npropose the Graph Multi-Similarity Learning for Molecular Property Prediction\n(GraphMSL) framework. GraphMSL incorporates a generalized multi-similarity\nmetric in a continuous scale, capturing self-similarity and relative\nsimilarities. The unimodal multi-similarity metrics are derived from various\nchemical modalities, and the fusion of these metrics into a multimodal form\nsignificantly enhances the effectiveness of GraphMSL. In addition, the\nflexibility of fusion function can reshape the focus of the model to convey\ndifferent chemical semantics. GraphMSL proves effective in drug discovery\nevaluations through various downstream tasks and post-hoc analysis of learnt\nrepresentations. Its notable performance suggests significant potential for the\nexploration of new drug candidates.\n","authors":["Hao Xu","Zhengyang Zhou","Pengyu Hong"],"pdf_url":"https://arxiv.org/pdf/2401.17615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17612v1","updated":"2024-01-31T05:52:11Z","published":"2024-01-31T05:52:11Z","title":"IGCN: Integrative Graph Convolutional Networks for Multi-modal Data","summary":" Recent advances in Graph Neural Networks (GNN) have led to a considerable\ngrowth in graph data modeling for multi-modal data which contains various types\nof nodes and edges. Although some integrative prediction solutions have been\ndeveloped recently for network-structured data, these methods have some\nrestrictions. For a node classification task involving multi-modal data,\ncertain data modalities may perform better when predicting one class, while\nothers might excel in predicting a different class. Thus, to obtain a better\nlearning representation, advanced computational methodologies are required for\nthe integrative analysis of multi-modal data. Moreover, existing integrative\ntools lack a comprehensive and cohesive understanding of the rationale behind\ntheir specific predictions, making them unsuitable for enhancing model\ninterpretability. Addressing these restrictions, we introduce a novel\nintegrative neural network approach for multi-modal data networks, named\nIntegrative Graph Convolutional Networks (IGCN). IGCN learns node embeddings\nfrom multiple topologies and fuses the multiple node embeddings into a weighted\nform by assigning attention coefficients to the node embeddings. Our proposed\nattention mechanism helps identify which types of data receive more emphasis\nfor each sample to predict a certain class. Therefore, IGCN has the potential\nto unravel previously unknown characteristics within different node\nclassification tasks. 
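A minimal sketch of the attention-weighted embedding fusion described in the IGCN abstract above, with a single linear scorer assumed in place of the paper's actual attention mechanism:

import torch

def attention_fuse(embeddings, scorer):
    # embeddings: list of per-modality node embeddings, each (nodes, dim).
    stacked = torch.stack(embeddings, dim=1)             # (nodes, modalities, dim)
    weights = torch.softmax(scorer(stacked).squeeze(-1), dim=1)
    return (weights.unsqueeze(-1) * stacked).sum(dim=1)  # (nodes, dim)

scorer = torch.nn.Linear(8, 1)  # assumed stand-in for the attention scorer
fused = attention_fuse([torch.randn(5, 8) for _ in range(3)], scorer)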
We benchmarked IGCN on several datasets from different\ndomains, including a multi-omics dataset to predict cancer subtypes and a\nmulti-modal clinical dataset to predict the progression of Alzheimer's disease.\nExperimental results show that IGCN outperforms or is on par with the\nstate-of-the-art and baseline methods.\n","authors":["Cagri Ozdemir","Mohammad Al Olaimat","Yashu Vashishath","Serdar Bozdag","Alzheimer's Disease Neuroimaging Initiative"],"pdf_url":"https://arxiv.org/pdf/2401.17612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01145v3","updated":"2024-01-31T05:50:05Z","published":"2024-01-02T10:55:01Z","title":"HAAQI-Net: A non-intrusive neural music quality assessment model for\n hearing aids","summary":" This paper introduces HAAQI-Net, a non-intrusive deep learning model for\nmusic quality assessment tailored to hearing aid users. In contrast to\ntraditional methods like the Hearing Aid Audio Quality Index (HAAQI), HAAQI-Net\nutilizes a Bidirectional Long Short-Term Memory (BLSTM) with attention. It\ntakes an assessed music sample and a hearing loss pattern as input, generating\na predicted HAAQI score. The model employs the pre-trained Bidirectional\nEncoder representation from Audio Transformers (BEATs) for acoustic feature\nextraction. Comparing predicted scores with ground truth, HAAQI-Net achieves a\nLongitudinal Concordance Correlation (LCC) of 0.9368, Spearman's Rank\nCorrelation Coefficient (SRCC) of 0.9486, and Mean Squared Error (MSE) of\n0.0064. Notably, this high performance comes with a substantial reduction in\ninference time: from 62.52 seconds (by HAAQI) to 2.54 seconds (by HAAQI-Net),\nserving as an efficient music quality assessment model for hearing aid users.\n","authors":["Dyah A. M. G. Wisnu","Epri W. Pratiwi","Stefano Rini","Ryandhimas E. Zezario","Hsin-Min Wang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2401.01145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.15523v2","updated":"2024-01-31T05:30:22Z","published":"2022-05-31T03:47:08Z","title":"Variational Transfer Learning using Cross-Domain Latent Modulation","summary":" To successfully apply trained neural network models to new domains, powerful\ntransfer learning solutions are essential. We propose to introduce a novel\ncross-domain latent modulation mechanism to a variational autoencoder framework\nso as to achieve effective transfer learning. Our key idea is to procure deep\nrepresentations from one data domain and use it to influence the\nreparameterization of the latent variable of another domain. Specifically, deep\nrepresentations of the source and target domains are first extracted by a\nunified inference model and aligned by employing gradient reversal. The learned\ndeep representations are then cross-modulated to the latent encoding of the\nalternative domain, where consistency constraints are also applied. In the\nempirical validation that includes a number of transfer learning benchmark\ntasks for unsupervised domain adaptation and image-to-image translation, our\nmodel demonstrates competitive performance, which is also supported by evidence\nobtained from visualization.\n","authors":["Jinyong Hou","Jeremiah D. Deng","Stephen Cranefield","Xuejie Din"],"pdf_url":"https://arxiv.org/pdf/2205.15523v2.pdf","comment":"Under review. Extended version of a previous WACV paper\n (arXiv:2012.11727). 
13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2201.09196v2","updated":"2024-01-31T05:30:08Z","published":"2022-01-23T06:45:47Z","title":"Learning to Predict Gradients for Semi-Supervised Continual Learning","summary":" A key challenge for machine intelligence is to learn new visual concepts\nwithout forgetting the previously acquired knowledge. Continual learning is\naimed at addressing this challenge. However, there is a gap between\nexisting supervised continual learning and human-like intelligence, where humans\nare able to learn from both labeled and unlabeled data. How unlabeled data\naffects learning and catastrophic forgetting in the continual learning process\nremains unknown. To explore these issues, we formulate a new semi-supervised\ncontinual learning method, which can be generically applied to existing\ncontinual learning models. Specifically, a novel gradient learner learns from\nlabeled data to predict gradients on unlabeled data. Hence, the unlabeled data\ncould fit into the supervised continual learning method. Different from\nconventional semi-supervised settings, we do not hypothesize that the\nunderlying classes, which are associated with the unlabeled data, are known to\nthe learning process. In other words, the unlabeled data could be very distinct\nfrom the labeled data. We evaluate the proposed method on mainstream continual\nlearning, adversarial continual learning, and semi-supervised learning tasks.\nThe proposed method achieves state-of-the-art performance on classification\naccuracy and backward transfer in the continual learning setting while\nachieving desired performance on classification accuracy in the semi-supervised\nlearning setting. This implies that the unlabeled images can enhance the\ngeneralizability of continual learning models on the predictive ability on\nunseen data and significantly alleviate catastrophic forgetting. The code is\navailable at \\url{https://github.com/luoyan407/grad_prediction.git}.\n","authors":["Yan Luo","Yongkang Wong","Mohan Kankanhalli","Qi Zhao"],"pdf_url":"https://arxiv.org/pdf/2201.09196v2.pdf","comment":"Accepted by IEEE Transactions on Neural Networks and Learning Systems\n (TNNLS)"},{"id":"http://arxiv.org/abs/2401.15294v2","updated":"2024-01-31T05:23:56Z","published":"2024-01-27T04:42:50Z","title":"Integral Operator Approaches for Scattered Data Fitting on Spheres","summary":" This paper focuses on scattered data fitting problems on spheres. We study\nthe approximation performance of a class of weighted spectral filter\nalgorithms, including Tikhonov regularization, Landweber iteration, spectral\ncut-off, and iterated Tikhonov, in fitting noisy data with possibly unbounded\nrandom noise. For the analysis, we develop an integral operator approach that\ncan be regarded as an extension of the widely used sampling inequality approach\nand norming set method in the community of scattered data fitting. After\nproviding an equivalence between the operator differences and quadrature rules,\nwe succeed in deriving optimal Sobolev-type error estimates of weighted\nspectral filter algorithms. Our derived error estimates do not suffer from the\nsaturation phenomenon of Tikhonov regularization in the literature or the\nnative-space barrier of existing error analyses, and they adapt to different\nembedding spaces. 
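As a toy illustration of the Tikhonov member of the spectral-filter family named in the abstract above (a 1-D Gaussian kernel stands in for a kernel on the sphere; all parameters are assumptions):

import numpy as np

def tikhonov_fit(K, y, lam):
    # Tikhonov-regularized coefficients: alpha = (K + n*lam*I)^{-1} y.
    n = len(y)
    return np.linalg.solve(K + n * lam * np.eye(n), y)

x = np.linspace(0.0, 1.0, 50)
K = np.exp(-((x[:, None] - x[None, :]) ** 2) / 0.01)
y = np.sin(2 * np.pi * x) + 0.1 * np.random.default_rng(0).standard_normal(50)
y_hat = K @ tikhonov_fit(K, y, lam=1e-3)  # smoothed fit of the noisy samples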
We also propose a divide-and-conquer scheme to equip weighted\nspectral filter algorithms to reduce their computational burden and present the\noptimal approximation error bounds.\n","authors":["Shao-Bo Lin"],"pdf_url":"https://arxiv.org/pdf/2401.15294v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16729v2","updated":"2024-01-31T05:11:54Z","published":"2024-01-30T04:01:18Z","title":"Widely Linear Matched Filter: A Lynchpin towards the Interpretability of\n Complex-valued CNNs","summary":" A recent study on the interpretability of real-valued convolutional neural\nnetworks (CNNs) {Stankovic_Mandic_2023CNN} has revealed a direct and physically\nmeaningful link with the task of finding features in data through matched\nfilters. However, applying this paradigm to illuminate the interpretability of\ncomplex-valued CNNs meets a formidable obstacle: the extension of matched\nfiltering to a general class of noncircular complex-valued data, referred to\nhere as the widely linear matched filter (WLMF), has been only implicit in the\nliterature. To this end, to establish the interpretability of the operation of\ncomplex-valued CNNs, we introduce a general WLMF paradigm, provide its solution\nand undertake analysis of its performance. For rigor, our WLMF solution is\nderived without imposing any assumption on the probability density of noise.\nThe theoretical advantages of the WLMF over its standard strictly linear\ncounterpart (SLMF) are provided in terms of their output signal-to-noise-ratios\n(SNRs), with WLMF consistently exhibiting enhanced SNR. Moreover, the lower\nbound on the SNR gain of WLMF is derived, together with condition to attain\nthis bound. This serves to revisit the convolution-activation-pooling chain in\ncomplex-valued CNNs through the lens of matched filtering, which reveals the\npotential of WLMFs to provide physical interpretability and enhance\nexplainability of general complex-valued CNNs. Simulations demonstrate the\nagreement between the theoretical and numerical results.\n","authors":["Qingchen Wang","Zhe Li","Zdenka Babic","Wei Deng","Ljubiša Stanković","Danilo P. Mandic"],"pdf_url":"https://arxiv.org/pdf/2401.16729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00162v3","updated":"2024-01-31T05:00:25Z","published":"2023-06-30T22:36:41Z","title":"What Do Self-Supervised Speech Models Know About Words?","summary":" Many self-supervised speech models (S3Ms) have been introduced over the last\nfew years, improving performance and data efficiency on various speech tasks.\nHowever, these empirical successes alone do not give a complete picture of what\nis learned during pre-training. Recent work has begun analyzing how S3Ms encode\ncertain properties, such as phonetic and speaker information, but we still lack\na proper understanding of knowledge encoded at the word level and beyond. In\nthis work, we use lightweight analysis methods to study segment-level\nlinguistic properties -- word identity, boundaries, pronunciation, syntactic\nfeatures, and semantic features -- encoded in S3Ms. We present a comparative\nstudy of layer-wise representations from ten S3Ms and find that (i) the\nframe-level representations within each word segment are not all equally\ninformative, and (ii) the pre-training objective and model size heavily\ninfluence the accessibility and distribution of linguistic information across\nlayers. 
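For reference alongside the widely linear matched filter (WLMF) abstract above, a minimal real-valued, strictly linear matched filter of the kind the paper takes as its baseline; the template and noise level are invented for the demo:

import numpy as np

rng = np.random.default_rng(0)
template = np.array([1.0, 2.0, 3.0, 2.0, 1.0])
signal = np.concatenate([np.zeros(20), template, np.zeros(20)])
noisy = signal + 0.5 * rng.standard_normal(signal.size)

# Matched filtering = correlation with the template; the response peaks
# where the feature is located in the noisy observation.
response = np.correlate(noisy, template, mode="valid")
print(int(np.argmax(response)))  # ~20, the embedded position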
We also find that on several tasks -- word discrimination, word\nsegmentation, and semantic sentence similarity -- S3Ms trained with visual\ngrounding outperform their speech-only counterparts. Finally, our task-based\nanalyses demonstrate improved performance on word segmentation and acoustic\nword discrimination while using simpler methods than prior work.\n","authors":["Ankita Pasad","Chung-Ming Chien","Shane Settle","Karen Livescu"],"pdf_url":"https://arxiv.org/pdf/2307.00162v3.pdf","comment":"Pre-MIT Press publication version"},{"id":"http://arxiv.org/abs/2008.11945v6","updated":"2024-01-31T04:45:32Z","published":"2020-08-27T06:53:53Z","title":"Moderately Supervised Learning: Definition, Framework and Generality","summary":" Learning with supervision has achieved remarkable success in numerous\nartificial intelligence (AI) applications. In the current literature, by\nreferring to the properties of the labels prepared for the training dataset,\nlearning with supervision is categorized as supervised learning (SL) and weakly\nsupervised learning (WSL). SL concerns the situation where the training data\nset is assigned ideal (complete, exact and accurate) labels, while WSL\nconcerns the situation where the training data set is assigned non-ideal\n(incomplete, inexact or inaccurate) labels. However, various solutions for SL\ntasks have shown that the given labels are not always easy to learn, and the\ntransformation from the given labels to easy-to-learn targets can significantly\naffect the performance of the final SL solutions. Without considering the\nproperties of the transformation from the given labels to easy-to-learn\ntargets, the definition of SL conceals some details that can be critical to\nbuilding the appropriate solutions for specific SL tasks. Thus, for engineers\nin the AI application field, it is desirable to reveal these details\nsystematically. This article attempts to achieve this goal by expanding the\ncategorization of SL and investigating the sub-type moderately supervised\nlearning (MSL) that concerns the situation where the given labels are ideal,\nbut due to the simplicity in annotation, careful designs are required to\ntransform the given labels into easy-to-learn targets. From the perspectives of\nthe definition, framework and generality, we conceptualize MSL to present a\ncomplete fundamental basis to systematically analyse MSL tasks. Meanwhile, by\nrevealing the relation between the conceptualization of MSL and the\nmathematicians' vision, this paper also establishes a tutorial that AI\napplication engineers can refer to when viewing a problem to be solved from the\nmathematicians' vision.\n","authors":["Yongquan Yang"],"pdf_url":"https://arxiv.org/pdf/2008.11945v6.pdf","comment":"This is the final published version (33 pages)"},{"id":"http://arxiv.org/abs/2401.17585v1","updated":"2024-01-31T04:12:59Z","published":"2024-01-31T04:12:59Z","title":"Propagation and Pitfalls: Reasoning-based Assessment of Knowledge\n Editing through Counterfactual Tasks","summary":" Current approaches of knowledge editing struggle to effectively propagate\nupdates to interconnected facts. In this work, we delve into the barriers that\nhinder the appropriate propagation of updated knowledge within these models for\naccurate reasoning. To support our analysis, we introduce a novel\nreasoning-based benchmark -- ReCoE (Reasoning-based Counterfactual Editing\ndataset) -- which covers six common reasoning schemes in the real world. 
We conduct\na thorough analysis of existing knowledge editing techniques, including input\naugmentation, finetuning, and locate-and-edit. We find that all model editing\nmethods show notably low performance on this dataset, especially in certain\nreasoning schemes. Our analysis over the chain-of-thought generation of edited\nmodels further uncovers key reasons behind the inadequacy of existing knowledge\nediting methods from a reasoning standpoint, involving aspects of fact-wise\nediting, fact recall ability, and coherence in generation. We will make our\nbenchmark publicly available.\n","authors":["Wenyue Hua","Jiang Guo","Mingwen Dong","Henghui Zhu","Patrick Ng","Zhiguo Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17585v1.pdf","comment":"22 pages, 14 figures, 5 tables"},{"id":"http://arxiv.org/abs/2401.17583v1","updated":"2024-01-31T03:58:28Z","published":"2024-01-31T03:58:28Z","title":"Agile But Safe: Learning Collision-Free High-Speed Legged Locomotion","summary":" Legged robots navigating cluttered environments must be jointly agile for\nefficient task execution and safe to avoid collisions with obstacles or humans.\nExisting studies either develop conservative controllers (< 1.0 m/s) to ensure\nsafety, or focus on agility without considering potentially fatal collisions.\nThis paper introduces Agile But Safe (ABS), a learning-based control framework\nthat enables agile and collision-free locomotion for quadrupedal robots. ABS\ninvolves an agile policy to execute agile motor skills amidst obstacles and a\nrecovery policy to prevent failures, collaboratively achieving high-speed and\ncollision-free navigation. The policy switch in ABS is governed by a learned\ncontrol-theoretic reach-avoid value network, which also guides the recovery\npolicy as an objective function, thereby safeguarding the robot in a closed\nloop. The training process involves the learning of the agile policy, the\nreach-avoid value network, the recovery policy, and an exteroception\nrepresentation network, all in simulation. These trained modules can be\ndirectly deployed in the real world with onboard sensing and computation,\nleading to high-speed and collision-free navigation in confined indoor and\noutdoor spaces with both static and dynamic obstacles.\n","authors":["Tairan He","Chong Zhang","Wenli Xiao","Guanqi He","Changliu Liu","Guanya Shi"],"pdf_url":"https://arxiv.org/pdf/2401.17583v1.pdf","comment":"Project website: https://agile-but-safe.github.io/"},{"id":"http://arxiv.org/abs/2305.15002v2","updated":"2024-01-31T03:56:22Z","published":"2023-05-24T10:41:24Z","title":"A RelEntLess Benchmark for Modelling Graded Relations between Named\n Entities","summary":" Relations such as \"is influenced by\", \"is known for\" or \"is a competitor of\"\nare inherently graded: we can rank entity pairs based on how well they satisfy\nthese relations, but it is hard to draw a line between those pairs that satisfy\nthem and those that do not. Such graded relations play a central role in many\napplications, yet they are typically not covered by existing Knowledge Graphs.\nIn this paper, we consider the possibility of using Large Language Models\n(LLMs) to fill this gap. To this end, we introduce a new benchmark, in which\nentity pairs have to be ranked according to how much they satisfy a given\ngraded relation. The task is formulated as a few-shot ranking problem, where\nmodels only have access to a description of the relation and five prototypical\ninstances. 
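A minimal sketch of the policy-switching logic from the ABS abstract above; the sign convention of the reach-avoid value, the threshold, and the stub policies are all assumptions for illustration:

agile_policy = lambda state: "sprint"       # stub for the learned agile policy
recovery_policy = lambda state: "brake"     # stub for the learned recovery policy
reach_avoid_value = lambda state: 1.0 if state["dist_to_obstacle"] < 0.3 else -1.0

def select_action(state, threshold=0.0):
    # A positive reach-avoid value flags imminent failure risk, so control
    # is handed to the recovery policy; otherwise the agile policy runs.
    if reach_avoid_value(state) > threshold:
        return recovery_policy(state)
    return agile_policy(state)

print(select_action({"dist_to_obstacle": 0.2}))  # brake
print(select_action({"dist_to_obstacle": 2.0}))  # sprint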
We use the proposed benchmark to evaluate state-of-the-art relation\nembedding strategies as well as several recent LLMs, covering both publicly\navailable LLMs and closed models such as GPT-4. Overall, we find a strong\ncorrelation between model size and performance, with smaller Language Models\nstruggling to outperform a naive baseline. The results of the largest Flan-T5\nand OPT models are remarkably strong, although a clear gap with human\nperformance remains.\n","authors":["Asahi Ushio","Jose Camacho Collados","Steven Schockaert"],"pdf_url":"https://arxiv.org/pdf/2305.15002v2.pdf","comment":"EACL 2024 main conference"},{"id":"http://arxiv.org/abs/2311.12831v3","updated":"2024-01-31T03:53:31Z","published":"2023-10-02T06:06:32Z","title":"ECNR: Efficient Compressive Neural Representation of Time-Varying\n Volumetric Datasets","summary":" Due to its conceptual simplicity and generality, compressive neural\nrepresentation has emerged as a promising alternative to traditional\ncompression methods for managing massive volumetric datasets. The current\npractice of neural compression utilizes a single large multilayer perceptron\n(MLP) to encode the global volume, incurring slow training and inference. This\npaper presents an efficient compressive neural representation (ECNR) solution\nfor time-varying data compression, utilizing the Laplacian pyramid for adaptive\nsignal fitting. Following a multiscale structure, we leverage multiple small\nMLPs at each scale for fitting local content or residual blocks. By assigning\nsimilar blocks to the same MLP via size uniformization, we enable balanced\nparallelization among MLPs to significantly speed up training and inference.\nWorking in concert with the multiscale structure, we tailor a deep compression\nstrategy to compact the resulting model. We show the effectiveness of ECNR with\nmultiple datasets and compare it with state-of-the-art compression methods\n(mainly SZ3, TTHRESH, and neurcomp). The results position ECNR as a promising\nsolution for volumetric data compression.\n","authors":["Kaiyuan Tang","Chaoli Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12831v3.pdf","comment":"Accepted by IEEE PacificVis 2024 (conference papers track)"},{"id":"http://arxiv.org/abs/2401.17580v1","updated":"2024-01-31T03:51:30Z","published":"2024-01-31T03:51:30Z","title":"Graph Contrastive Learning with Cohesive Subgraph Awareness","summary":" Graph contrastive learning (GCL) has emerged as a state-of-the-art strategy\nfor learning representations of diverse graphs including social and biomedical\nnetworks. GCL widely uses stochastic graph topology augmentation, such as\nuniform node dropping, to generate augmented graphs. However, such stochastic\naugmentations may severely damage the intrinsic properties of a graph and\ndeteriorate the following representation learning process. We argue that\nincorporating an awareness of cohesive subgraphs during the graph augmentation\nand learning processes has the potential to enhance GCL performance. To this\nend, we propose a novel unified framework called CTAug, to seamlessly integrate\ncohesion awareness into various existing GCL mechanisms. In particular, CTAug\ncomprises two specialized modules: topology augmentation enhancement and graph\nlearning enhancement. The former module generates augmented graphs that\ncarefully preserve cohesion properties, while the latter module bolsters the\ngraph encoder's ability to discern subgraph patterns. 
Theoretical analysis\nshows that CTAug can strictly improve existing GCL mechanisms. Empirical\nexperiments verify that CTAug can achieve state-of-the-art performance for\ngraph representation learning, especially for graphs with high degrees. The\ncode is available at https://doi.org/10.5281/zenodo.10594093, or\nhttps://github.com/wuyucheng2002/CTAug.\n","authors":["Yucheng Wu","Leye Wang","Xiao Han","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2401.17580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17574v1","updated":"2024-01-31T03:39:07Z","published":"2024-01-31T03:39:07Z","title":"Scavenging Hyena: Distilling Transformers into Long Convolution Models","summary":" The rapid evolution of Large Language Models (LLMs), epitomized by\narchitectures like GPT-4, has reshaped the landscape of natural language\nprocessing. This paper introduces a pioneering approach to address the\nefficiency concerns associated with LLM pre-training, proposing the use of\nknowledge distillation for cross-architecture transfer. Leveraging insights\nfrom the efficient Hyena mechanism, our method replaces attention heads in\ntransformer models by Hyena, offering a cost-effective alternative to\ntraditional pre-training while confronting the challenge of processing long\ncontextual information, inherent in quadratic attention mechanisms. Unlike\nconventional compression-focused methods, our technique not only enhances\ninference speed but also surpasses pre-training in terms of both accuracy and\nefficiency. In the era of evolving LLMs, our work contributes to the pursuit of\nsustainable AI solutions, striking a balance between computational power and\nenvironmental impact.\n","authors":["Tokiniaina Raharison Ralambomihanta","Shahrad Mohammadzadeh","Mohammad Sami Nur Islam","Wassim Jabbour","Laurence Liang"],"pdf_url":"https://arxiv.org/pdf/2401.17574v1.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.17323v3","updated":"2024-01-31T03:35:44Z","published":"2023-05-27T01:56:09Z","title":"Some Primal-Dual Theory for Subgradient Methods for Strongly Convex\n Optimization","summary":" We consider (stochastic) subgradient methods for strongly convex but\npotentially nonsmooth non-Lipschitz optimization. We provide new equivalent\ndual descriptions (in the style of dual averaging) for the classic subgradient\nmethod, the proximal subgradient method, and the switching subgradient method.\nThese equivalences enable $O(1/T)$ convergence guarantees in terms of both\ntheir classic primal gap and a not previously analyzed dual gap for strongly\nconvex optimization. Consequently, our theory provides these classic methods\nwith simple, optimal stopping criteria and optimality certificates at no added\ncomputational cost. Our results apply to a wide range of stepsize selections\nand of non-Lipschitz ill-conditioned problems where the early iterations of the\nsubgradient method may diverge exponentially quickly (a phenomenon which, to\nthe best of our knowledge, no prior works address). 
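A textbook instance of the subgradient method analysed in the strongly convex setting of the abstract above, with the classic 2/(mu*(k+1)) stepsize and weighted averaging; the example objective is invented:

import numpy as np

def subgradient_method(subgrad, x0, mu, iters=1000):
    # Stepsize 2/(mu*(k+1)) with weights 2k/(T(T+1)) gives the O(1/T) rate.
    x = np.asarray(x0, dtype=float)
    avg = np.zeros_like(x)
    for k in range(1, iters + 1):
        x = x - (2.0 / (mu * (k + 1))) * subgrad(x)
        avg += (2.0 * k / (iters * (iters + 1))) * x
    return avg

# f(x) = |x| + 0.5*x^2 is 1-strongly convex and nonsmooth at its minimizer 0.
print(subgradient_method(lambda x: np.sign(x) + x, np.array([5.0]), mu=1.0))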
Even in the presence of\nsuch undesirable behaviors, our theory still ensures and bounds eventual\nconvergence.\n","authors":["Benjamin Grimmer","Danlin Li"],"pdf_url":"https://arxiv.org/pdf/2305.17323v3.pdf","comment":"22 pages, major revision shortened the write-up and unified the\n analysis to be done just once in a single \"super\" setting"},{"id":"http://arxiv.org/abs/2401.17573v1","updated":"2024-01-31T03:35:08Z","published":"2024-01-31T03:35:08Z","title":"Tensor-based process control and monitoring for semiconductor\n manufacturing with unstable disturbances","summary":" With the development and popularity of sensors installed in manufacturing\nsystems, complex data are collected during manufacturing processes, which\nbrings challenges for traditional process control methods. This paper proposes\na novel process control and monitoring method for the complex structure of\nhigh-dimensional image-based overlay errors (modeled in tensor form), which are\ncollected in semiconductor manufacturing processes. The proposed method aims to\nreduce overlay errors using limited control recipes. We first build a\nhigh-dimensional process model and propose different tensor-on-vector\nregression algorithms to estimate parameters in the model to alleviate the\ncurse of dimensionality. Then, based on the estimate of tensor parameters, the\nexponentially weighted moving average (EWMA) controller for tensor data is\ndesigned whose stability is theoretically guaranteed. Considering the fact that\nlow-dimensional control recipes cannot compensate for all high-dimensional\ndisturbances on the image, control residuals are monitored to prevent\nsignificant drifts of uncontrollable high-dimensional disturbances. Through\nextensive simulations and real case studies, the performances of parameter\nestimation algorithms and the EWMA controller in tensor space are evaluated.\nCompared with existing image-based feedback controllers, the superiority of our\nmethod is verified especially when disturbances are not stable.\n","authors":["Yanrong Li","Juan Du","Fugee Tsung","Wei Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.17573v1.pdf","comment":"30 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.04836v2","updated":"2024-01-31T03:29:10Z","published":"2023-08-09T09:58:42Z","title":"Beyond Surprise: Improving Exploration Through Surprise Novelty","summary":" We present a new computing model for intrinsic rewards in reinforcement\nlearning that addresses the limitations of existing surprise-driven\nexplorations. The reward is the novelty of the surprise rather than the\nsurprise norm. We estimate the surprise novelty as retrieval errors of a memory\nnetwork wherein the memory stores and reconstructs surprises. Our surprise\nmemory (SM) augments the capability of surprise-based intrinsic motivators,\nmaintaining the agent's interest in exciting exploration while reducing\nunwanted attraction to unpredictable or noisy observations. 
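To make the controller in the tensor process-control abstract above concrete, a scalar, unit-gain EWMA run-to-run controller; the paper's version operates on tensor outputs with estimated gains, so this shows only the core recurrence:

import numpy as np

def ewma_control(observe, steps=50, lam=0.3):
    # Estimate the process disturbance with an exponentially weighted moving
    # average and choose the next recipe to cancel the estimate.
    a_hat, u = 0.0, 0.0
    for _ in range(steps):
        y = observe(u)                             # measured output under recipe u
        a_hat = lam * (y - u) + (1 - lam) * a_hat  # update disturbance estimate
        u = -a_hat                                 # compensating recipe
    return u

rng = np.random.default_rng(1)
process = lambda u: u + 2.0 + 0.1 * rng.standard_normal()  # constant disturbance
print(ewma_control(process))  # approaches -2.0, cancelling the disturbance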
Our experiments\ndemonstrate that the SM combined with various surprise predictors exhibits\nefficient exploring behaviors and significantly boosts the final performance in\nsparse reward environments, including Noisy-TV, navigation and challenging\nAtari games.\n","authors":["Hung Le","Kien Do","Dung Nguyen","Svetha Venkatesh"],"pdf_url":"https://arxiv.org/pdf/2308.04836v2.pdf","comment":"17 pages including Appendix"},{"id":"http://arxiv.org/abs/2301.11673v4","updated":"2024-01-31T03:02:06Z","published":"2023-01-27T12:13:06Z","title":"Bayesian Self-Supervised Contrastive Learning","summary":" Recent years have witnessed many successful applications of contrastive\nlearning in diverse domains, yet its self-supervised version still presents many\nexciting challenges. As the negative samples are drawn from unlabeled datasets,\na randomly selected sample may actually be a false negative to an anchor,\nleading to incorrect encoder training. This paper proposes a new\nself-supervised contrastive loss called the BCL loss that still uses random\nsamples from the unlabeled data while correcting the resulting bias with\nimportance weights. The key idea is to design the desired sampling distribution\nfor sampling hard true negative samples under the Bayesian framework. The\nprominent advantage is that the desired sampling distribution has a\nparametric structure, with a location parameter for debiasing false negatives\nand a concentration parameter for mining hard negatives. Experiments\nvalidate the effectiveness and superiority of the BCL loss.\n","authors":["Bin Liu","Bang Wang","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2301.11673v4.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2310.16592v2","updated":"2024-01-31T02:49:52Z","published":"2023-10-25T12:28:20Z","title":"Over-the-air Federated Policy Gradient","summary":" In recent years, over-the-air aggregation has been widely considered in\nlarge-scale distributed learning, optimization, and sensing. In this paper, we\npropose the over-the-air federated policy gradient algorithm, where all agents\nsimultaneously broadcast an analog signal carrying local information to a\ncommon wireless channel, and a central controller uses the received aggregated\nwaveform to update the policy parameters. We investigate the effect of noise\nand channel distortion on the convergence of the proposed algorithm, and\nestablish the complexities of communication and sampling for finding an\n$\\epsilon$-approximate stationary point. Finally, we present some simulation\nresults to show the effectiveness of the algorithm.\n","authors":["Huiwen Yang","Lingying Huang","Subhrakanti Dey","Ling Shi"],"pdf_url":"https://arxiv.org/pdf/2310.16592v2.pdf","comment":"To appear at IEEE ICC 2024"},{"id":"http://arxiv.org/abs/2401.17548v1","updated":"2024-01-31T02:26:09Z","published":"2024-01-31T02:26:09Z","title":"Rethinking Channel Dependence for Multivariate Time Series Forecasting:\n Learning from Leading Indicators","summary":" Recently, channel-independent methods have achieved state-of-the-art\nperformance in multivariate time series (MTS) forecasting. Despite reducing\noverfitting risks, these methods miss potential opportunities in utilizing\nchannel dependence for accurate predictions. We argue that there exist locally\nstationary lead-lag relationships between variates, i.e., some lagged variates\nmay follow the leading indicators within a short time period. 
Exploiting such\nchannel dependence is beneficial since leading indicators offer advance\ninformation that can be used to reduce the forecasting difficulty of the lagged\nvariates. In this paper, we propose a new method named LIFT that first\nefficiently estimates leading indicators and their leading steps at each time\nstep and then judiciously allows the lagged variates to utilize the advance\ninformation from leading indicators. LIFT serves as a plugin that can be\nseamlessly combined with arbitrary time series forecasting methods.\nExtensive experiments on six real-world datasets demonstrate that LIFT improves\nthe state-of-the-art methods by 5.5% in average forecasting performance.\n","authors":["Lifan Zhao","Yanyan Shen"],"pdf_url":"https://arxiv.org/pdf/2401.17548v1.pdf","comment":"Accepted to ICLR 2024. Preprint version"},{"id":"http://arxiv.org/abs/2401.17546v1","updated":"2024-01-31T02:20:21Z","published":"2024-01-31T02:20:21Z","title":"Effective Multi-Stage Training Model For Edge Computing Devices In\n Intrusion Detection","summary":" Intrusion detection poses a significant challenge within expansive and\npersistently interconnected environments. As malicious code continues to\nadvance and sophisticated attack methodologies proliferate, various advanced\ndeep learning-based detection approaches have been proposed. Nevertheless, the\ncomplexity and accuracy of intrusion detection models still need further\nenhancement to render them more adaptable to diverse system categories,\nparticularly within resource-constrained devices, such as those embedded in\nedge computing systems. This research introduces a three-stage training\nparadigm, augmented by an enhanced pruning methodology and model compression\ntechniques. The objective is to elevate the system's effectiveness,\nconcurrently maintaining a high level of accuracy for intrusion detection.\nEmpirical assessments conducted on the UNSW-NB15 dataset evince that this\nsolution notably reduces the model's dimensions, while upholding accuracy\nlevels equivalent to similar proposals.\n","authors":["Thua Huynh Trong","Thanh Nguyen Hoang"],"pdf_url":"https://arxiv.org/pdf/2401.17546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17544v1","updated":"2024-01-31T02:18:27Z","published":"2024-01-31T02:18:27Z","title":"Trainable Fixed-Point Quantization for Deep Learning Acceleration on\n FPGAs","summary":" Quantization is a crucial technique for deploying deep learning models on\nresource-constrained devices, such as embedded FPGAs. Prior efforts mostly\nfocus on quantizing matrix multiplications, leaving other layers like BatchNorm\nor shortcuts in floating-point form, even though fixed-point arithmetic is more\nefficient on FPGAs. A common practice is to fine-tune a pre-trained model to\nfixed-point for FPGA deployment, which can potentially degrade accuracy.\n This work presents QFX, a novel trainable fixed-point quantization approach\nthat automatically learns the binary-point position during model training.\nAdditionally, we introduce a multiplier-free quantization strategy within QFX\nto minimize DSP usage. QFX is implemented as a PyTorch-based library that\nefficiently emulates fixed-point arithmetic, supported by FPGA HLS, in a\ndifferentiable manner during backpropagation. With minimal effort, models\ntrained with QFX can readily be deployed through HLS, producing the same\nnumerical results as their software counterparts. 
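A brute-force sketch of the lead-lag estimation step described in the LIFT abstract above; LIFT estimates leading indicators efficiently at each time step, whereas this toy simply scans shifts for the best cross-correlation:

import numpy as np

def estimate_lead(leader, lagged, max_steps=10):
    # Return the shift (and correlation) at which `lagged` best follows `leader`.
    best = (0, -np.inf)
    for s in range(1, max_steps + 1):
        corr = np.corrcoef(leader[:-s], lagged[s:])[0, 1]
        best = max(best, (s, corr), key=lambda t: t[1])
    return best

rng = np.random.default_rng(2)
leader = rng.standard_normal(200)
lagged = np.roll(leader, 3) + 0.1 * rng.standard_normal(200)  # follows at lag 3
print(estimate_lead(leader, lagged))  # (3, ~0.99)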
Our evaluation shows that\ncompared to post-training quantization, QFX can quantize models trained with\nelement-wise layers quantized to fewer bits and achieve higher accuracy on both\nCIFAR-10 and ImageNet datasets. We further demonstrate the efficacy of\nmultiplier-free quantization using a state-of-the-art binarized neural network\naccelerator designed for an embedded FPGA (AMD Xilinx Ultra96 v2). We plan to\nrelease QFX in open-source format.\n","authors":["Dingyi Dai","Yichi Zhang","Jiahao Zhang","Zhanqiu Hu","Yaohui Cai","Qi Sun","Zhiru Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16655v2","updated":"2024-01-31T02:14:18Z","published":"2024-01-30T01:18:41Z","title":"Rademacher Complexity of Neural ODEs via Chen-Fliess Series","summary":" We show how continuous-depth neural ODE models can be framed as single-layer,\ninfinite-width nets using the Chen--Fliess series expansion for nonlinear ODEs.\nIn this net, the output \"weights\" are taken from the signature of the control\ninput -- a tool used to represent infinite-dimensional paths as a sequence of\ntensors -- which comprises iterated integrals of the control input over a\nsimplex. The \"features\" are taken to be iterated Lie derivatives of the output\nfunction with respect to the vector fields in the controlled ODE model. The\nmain result of this work applies this framework to derive compact expressions\nfor the Rademacher complexity of ODE models that map an initial condition to a\nscalar output at some terminal time. The result leverages the straightforward\nanalysis afforded by single-layer architectures. We conclude with some examples\ninstantiating the bound for some specific systems and discuss potential\nfollow-up work.\n","authors":["Joshua Hanson","Maxim Raginsky"],"pdf_url":"https://arxiv.org/pdf/2401.16655v2.pdf","comment":"14 pages; submitted to L4DC 2024"},{"id":"http://arxiv.org/abs/2401.17542v1","updated":"2024-01-31T02:09:21Z","published":"2024-01-31T02:09:21Z","title":"Data-Effective Learning: A Comprehensive Medical Benchmark","summary":" Data-effective learning aims to use data in the most impactful way to train\nAI models, which involves strategies that focus on data quality rather than\nquantity, ensuring the data used for training has high informational value.\nData-effective learning plays a profound role in accelerating AI training,\nreducing computational costs, and saving data storage, which is very important\nas the volume of medical data in recent years has grown beyond many people's\nexpectations. However, due to the lack of standards and comprehensive\nbenchmark, research on medical data-effective learning is poorly studied. To\naddress this gap, our paper introduces a comprehensive benchmark specifically\nfor evaluating data-effective learning in the medical field. This benchmark\nincludes a dataset with millions of data samples from 31 medical centers\n(DataDEL), a baseline method for comparison (MedDEL), and a new evaluation\nmetric (NormDEL) to objectively measure data-effective learning performance.\nOur extensive experimental results show the baseline MedDEL can achieve\nperformance comparable to the original large dataset with only 5% of the data.\nEstablishing such an open data-effective learning benchmark is crucial for the\nmedical AI research community because it facilitates efficient data use,\npromotes collaborative breakthroughs, and fosters the development of\ncost-effective, scalable, and impactful healthcare solutions. 
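A minimal sketch of differentiable fixed-point emulation in the spirit of the QFX abstract above, using a straight-through estimator on rounding and a learnable binary-point position; QFX's actual formulation and API may differ:

import torch

def ste_round(x):
    # Round in the forward pass, pass gradients straight through backward.
    return x + (torch.round(x) - x).detach()

class FixedPointQuant(torch.nn.Module):
    def __init__(self, bits=8):
        super().__init__()
        self.bits = bits
        self.frac_bits = torch.nn.Parameter(torch.tensor(4.0))  # learnable binary point

    def forward(self, x):
        scale = 2.0 ** ste_round(self.frac_bits)
        qmax = 2 ** (self.bits - 1) - 1
        return torch.clamp(ste_round(x * scale), -qmax - 1, qmax) / scale

y = FixedPointQuant()(torch.randn(4, requires_grad=True))
y.sum().backward()  # gradients flow to both the input and frac_bits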
The project can\nbe accessed at\nhttps://github.com/shadow2469/Data-Effective-Learning-A-Comprehensive-Medical-Benchmark.git.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17541v1","updated":"2024-01-31T02:08:43Z","published":"2024-01-31T02:08:43Z","title":"Towards Understanding Variants of Invariant Risk Minimization through\n the Lens of Calibration","summary":" Machine learning models traditionally assume that training and test data are\nindependently and identically distributed. However, in real-world applications,\nthe test distribution often differs from training. This problem, known as\nout-of-distribution generalization, challenges conventional models. Invariant\nRisk Minimization (IRM) emerges as a solution, aiming to identify features\ninvariant across different environments to enhance out-of-distribution\nrobustness. However, IRM's complexity, particularly its bi-level optimization,\nhas led to the development of various approximate methods. Our study\ninvestigates these approximate IRM techniques, employing the Expected\nCalibration Error (ECE) as a key metric. ECE, which measures the reliability of\nmodel predictions, serves as an indicator of whether models effectively capture\nenvironment-invariant features. Through a comparative analysis of datasets with\ndistributional shifts, we observe that Information Bottleneck-based IRM, which\ncondenses representational information, achieves a favorable balance between\nimproving ECE and preserving accuracy. This finding is pivotal, as it\ndemonstrates a feasible path to maintaining robustness without compromising\naccuracy. Nonetheless, our experiments also caution against\nover-regularization, which can diminish accuracy. This underscores the\nnecessity for a systematic approach in evaluating out-of-distribution\ngeneralization metrics, one that goes beyond mere accuracy to address the nuanced\ninterplay between accuracy and calibration.\n","authors":["Kotaro Yoshida","Hiroki Naganuma"],"pdf_url":"https://arxiv.org/pdf/2401.17541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13139v5","updated":"2024-01-31T02:00:13Z","published":"2023-10-19T20:32:25Z","title":"Graph Neural Networks with polynomial activations have limited\n expressivity","summary":" The expressivity of Graph Neural Networks (GNNs) can be entirely\ncharacterized by appropriate fragments of the first order logic. Namely, any\nquery of the two variable fragment of graded modal logic (GC2) interpreted over\nlabeled graphs can be expressed using a GNN whose size depends only on the\ndepth of the query. As pointed out by [Barcelo et al., 2020, Grohe, 2021], this\ndescription holds for a family of activation functions, leaving the\npossibility of a hierarchy of logics expressible by GNNs depending on the\nchosen activation function. In this article, we show that such a hierarchy indeed\nexists by proving that GC2 queries cannot be expressed by GNNs with polynomial\nactivation functions. 
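The Expected Calibration Error used as the key metric in the IRM study above is a standard binned statistic; a generic NumPy reference implementation (not code from that paper) is:

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins: int = 10) -> float:
    """Binned ECE: confidence-vs-accuracy gap, weighted by bin occupancy."""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            gap = abs(correct[mask].mean() - confidences[mask].mean())
            ece += mask.mean() * gap
    return float(ece)
```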
This implies a separation between polynomial and popular\nnon-polynomial activations (such as Rectified Linear Units) and answers an open\nquestion formulated by [Grohe, 2021].\n","authors":["Sammy Khalife"],"pdf_url":"https://arxiv.org/pdf/2310.13139v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14577v2","updated":"2024-01-31T01:53:05Z","published":"2024-01-26T00:32:31Z","title":"An Algorithm for Streaming Differentially Private Data","summary":" Much of the research in differential privacy has focused on offline\napplications with the assumption that all data is available at once. When these\nalgorithms are applied in practice to streams where data is collected over\ntime, this either violates the privacy guarantees or results in poor utility.\nWe derive an algorithm for differentially private synthetic streaming data\ngeneration, especially curated towards spatial datasets. Furthermore, we\nprovide a general framework for online selective counting among a collection of\nqueries which forms a basis for many tasks such as query answering and\nsynthetic data generation. The utility of our algorithm is verified on both\nreal-world and simulated datasets.\n","authors":["Girish Kumar","Thomas Strohmer","Roman Vershynin"],"pdf_url":"https://arxiv.org/pdf/2401.14577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15292v2","updated":"2024-01-31T01:52:46Z","published":"2024-01-27T04:24:19Z","title":"Adaptive Block Sparse Regularization under Arbitrary Linear Transform","summary":" We propose a convex signal reconstruction method for block sparsity under\narbitrary linear transform with unknown block structure. The proposed method is\na generalization of the existing method LOP-$\\ell_2$/$\\ell_1$ and can\nreconstruct signals with block sparsity under non-invertible transforms, unlike\nLOP-$\\ell_2$/$\\ell_1$. Our work broadens the scope of block sparse\nregularization, enabling more versatile and powerful applications across\nvarious signal processing domains. We derive an iterative algorithm for solving the\nproposed method and provide conditions for its convergence to the optimal\nsolution. Numerical experiments demonstrate the effectiveness of the proposed\nmethod.\n","authors":["Takanobu Furuhashi","Hidekata Hontani","Tatsuya Yokota"],"pdf_url":"https://arxiv.org/pdf/2401.15292v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.17539v1","updated":"2024-01-31T01:51:29Z","published":"2024-01-31T01:51:29Z","title":"Enhancing Score-Based Sampling Methods with Ensembles","summary":" We introduce ensembles within score-based sampling methods to develop\ngradient-free approximate sampling techniques that leverage the collective\ndynamics of particle ensembles to compute approximate reverse diffusion drifts.\nWe introduce the underlying methodology, emphasizing its relationship with\ngenerative diffusion models and the previously introduced F\\\"ollmer sampler. We\ndemonstrate the efficacy of ensemble strategies through various examples,\nranging from low- to medium-dimensionality sampling problems, including\nmulti-modal and highly non-Gaussian probability distributions, and provide\ncomparisons to traditional methods like NUTS. Our findings highlight the\npotential of ensemble strategies for modeling complex probability distributions\nin situations where gradients are unavailable. 
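For the ensemble-based sampler above, the simplest way to illustrate a gradient-free drift is to fit a Gaussian to the particle ensemble and use its analytic score; the paper's actual drift construction is more elaborate, so treat this only as a sketch of the idea:

```python
import numpy as np

def ensemble_score_estimate(x: np.ndarray, ensemble: np.ndarray) -> np.ndarray:
    """Gradient-free score sketch: grad log N(x; mu, Sigma) = -Sigma^{-1}(x - mu),
    with mu and Sigma estimated from the particle ensemble (shape: n x d, d >= 2)."""
    mu = ensemble.mean(axis=0)
    sigma = np.cov(ensemble, rowvar=False) + 1e-6 * np.eye(ensemble.shape[1])
    return -np.linalg.solve(sigma, x - mu)
```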
Finally, we showcase its\napplication in the context of Bayesian inversion problems within the\ngeophysical sciences.\n","authors":["Tobias Bischoff","Bryan Riel"],"pdf_url":"https://arxiv.org/pdf/2401.17539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09882v2","updated":"2024-01-31T01:46:58Z","published":"2023-06-16T14:50:43Z","title":"Uncertainty Quantification via Spatial-Temporal Tweedie Model for\n Zero-inflated and Long-tail Travel Demand Prediction","summary":" Understanding Origin-Destination (O-D) travel demand is crucial for\ntransportation management. However, traditional spatial-temporal deep learning\nmodels grapple with addressing the sparse and long-tail characteristics in\nhigh-resolution O-D matrices and quantifying prediction uncertainty. This\ndilemma arises from the numerous zeros and over-dispersed demand patterns\nwithin these matrices, which challenge the Gaussian assumption inherent to\ndeterministic deep learning models. To address these challenges, we propose a\nnovel approach: the Spatial-Temporal Tweedie Graph Neural Network (STTD). The\nSTTD introduces the Tweedie distribution as a compelling alternative to the\ntraditional 'zero-inflated' model and leverages spatial and temporal embeddings\nto parameterize travel demand distributions. Our evaluations using real-world\ndatasets highlight STTD's superiority in providing accurate predictions and\nprecise confidence intervals, particularly in high-resolution scenarios.\n","authors":["Xinke Jiang","Dingyi Zhuang","Xianghui Zhang","Hao Chen","Jiayuan Luo","Xiaowei Gao"],"pdf_url":"https://arxiv.org/pdf/2306.09882v2.pdf","comment":"In proceeding of CIKM 2023. Doi:\n https://dl.acm.org/doi/10.1145/3583780.3615215"},{"id":"http://arxiv.org/abs/2401.11143v3","updated":"2024-01-31T01:22:43Z","published":"2024-01-20T06:42:32Z","title":"Gaussian Adaptive Attention is All You Need: Robust Contextual\n Representations Across Multiple Modalities","summary":" We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a\nnovel probabilistic attention framework, and the Gaussian Adaptive Transformer\n(GAT), designed to enhance information aggregation across multiple modalities,\nincluding Speech, Text and Vision. GAAM integrates learnable mean and variance\ninto its attention mechanism, implemented in a Multi-Headed framework enabling\nit to collectively model any Probability Distribution for dynamic recalibration\nof feature significance. This method demonstrates significant improvements,\nespecially with highly non-stationary data, surpassing the state-of-the-art\nattention techniques in model performance (up to approximately +20% in\naccuracy) by identifying key elements within the feature space. GAAM's\ncompatibility with dot-product-based attention models and relatively low number\nof parameters showcases its adaptability and potential to boost existing\nattention frameworks. Empirically, GAAM exhibits superior adaptability and\nefficacy across a diverse range of tasks, including emotion recognition in\nspeech, image classification, and text classification, thereby establishing its\nrobustness and versatility in handling multi-modal data. Furthermore, we\nintroduce the Importance Factor (IF), a new learning-based metric that enhances\nthe explainability of models trained with GAAM-based methods. 
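One possible reading of GAAM's learnable mean and variance is a Gaussian reweighting of attention scores over key positions before the softmax. The sketch below encodes that reading only; it is an interpretation, not the authors' implementation:

```python
import torch

def gaussian_adaptive_attention(scores, mu, log_var):
    """Reweight raw attention scores with a learnable Gaussian over key positions,
    then normalise. `mu` and `log_var` would be learnable scalars per head."""
    positions = torch.arange(scores.shape[-1], dtype=scores.dtype)
    gauss = torch.exp(-0.5 * (positions - mu) ** 2 / torch.exp(log_var))
    return torch.softmax(scores + torch.log(gauss + 1e-9), dim=-1)
```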
Overall, GAAM\nrepresents an advancement towards the development of better-performing and more\nexplainable attention models across multiple modalities.\n","authors":["Georgios Ioannides","Aman Chadha","Aaron Elkins"],"pdf_url":"https://arxiv.org/pdf/2401.11143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13192v2","updated":"2024-01-31T01:16:00Z","published":"2024-01-24T02:36:52Z","title":"Generative Design of Crystal Structures by Point Cloud Representations\n and Diffusion Model","summary":" Efficiently generating energetically stable crystal structures has long been\na challenge in material design, primarily due to the immense number of possible\narrangements of atoms in a crystal lattice. To facilitate the discovery of\nstable materials, we present a framework for the generation of synthesizable\nmaterials, leveraging a point cloud representation to encode intricate\nstructural information. At the heart of this framework lies the introduction of\na diffusion model as its foundational pillar. To gauge the efficacy of our\napproach, we employ it to reconstruct input structures from our training\ndatasets, rigorously validating its high reconstruction performance. Furthermore,\nwe demonstrate the profound potential of Point Cloud-Based Crystal Diffusion\n(PCCD) by generating entirely new materials, emphasizing their synthesizability.\nOur research stands as a noteworthy contribution to the advancement of materials\ndesign and synthesis through the cutting-edge avenue of generative design\ninstead of the conventional substitution or experience-based discovery.\n","authors":["Zhelin Li","Rami Mrad","Runxian Jiao","Guan Huang","Jun Shan","Shibing Chu","Yuanping Chen"],"pdf_url":"https://arxiv.org/pdf/2401.13192v2.pdf","comment":"I have submitted to a journal"},{"id":"http://arxiv.org/abs/2309.12862v3","updated":"2024-01-31T01:05:14Z","published":"2023-09-22T13:37:10Z","title":"Associative Transformer","summary":" Moving beyond the pairwise attention of conventional Transformers, there is a\ngrowing interest in sparse attention mechanisms that align more closely with\nlocalized, contextual learning in the biological brain. Existing studies such\nas the Coordination method employ iterative cross-attention mechanisms with a\nbottleneck to enable the sparse association of inputs. However, these methods\nare parameter inefficient and fail in more complex relational reasoning tasks.\nTo this end, we propose Associative Transformer (AiT) to enhance the\nassociation among sparsely attended input patches, improving parameter\nefficiency and performance in relational reasoning tasks. AiT leverages a\nlearnable explicit memory, comprised of various specialized priors, with a\nbottleneck attention to facilitate the extraction of diverse localized\nfeatures. Moreover, we propose a novel associative memory-enabled patch\nreconstruction with a Hopfield energy function. 
Extensive experiments on\nfour image classification tasks with three different sizes of AiT demonstrate\nthat AiT requires significantly fewer parameters and attention layers while\noutperforming Vision Transformers and a broad range of sparse Transformers.\nAdditionally, AiT establishes new SOTA performance in the Sort-of-CLEVR\ndataset, outperforming the previous Coordination method.\n","authors":["Yuwei Sun","Hideya Ochiai","Zhirong Wu","Stephen Lin","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2309.12862v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17523v1","updated":"2024-01-31T00:43:30Z","published":"2024-01-31T00:43:30Z","title":"Game-Theoretic Unlearnable Example Generator","summary":" Unlearnable example attacks are data poisoning attacks aiming to degrade the\nclean test accuracy of deep learning by adding imperceptible perturbations to\nthe training samples, which can be formulated as a bi-level optimization\nproblem. However, directly solving this optimization problem is intractable for\ndeep neural networks. In this paper, we investigate unlearnable example attacks\nfrom a game-theoretic perspective, by formulating the attack as a nonzero sum\nStackelberg game. First, the existence of game equilibria is proved under the\nnormal setting and the adversarial training setting. It is shown that the game\nequilibrium gives the most powerful poison attack in that the victim has the\nlowest test accuracy among all networks within the same hypothesis space, when\ncertain loss functions are used. Second, we propose a novel attack method,\ncalled the Game Unlearnable Example (GUE), which has three main ingredients. (1)\nThe poisons are obtained by directly solving the equilibrium of the Stackelberg\ngame with a first-order algorithm. (2) We employ an autoencoder-like generative\nnetwork model as the poison attacker. (3) A novel payoff function is introduced\nto evaluate the performance of the poison. Comprehensive experiments\ndemonstrate that GUE can effectively poison the model in various scenarios.\nFurthermore, GUE still works when using a relatively small percentage of the\ntraining data to train the generator, and the poison generator can generalize\nto unseen data well. Our implementation code can be found at\nhttps://github.com/hong-xian/gue.\n","authors":["Shuang Liu","Yihan Wang","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2401.17523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.01327v5","updated":"2024-01-31T00:40:51Z","published":"2022-03-02T17:38:44Z","title":"Hyperspectral Pixel Unmixing with Latent Dirichlet Variational\n Autoencoder","summary":" We present a method for hyperspectral pixel {\\it unmixing}. The proposed\nmethod assumes that (1) {\\it abundances} can be encoded as Dirichlet\ndistributions and (2) spectra of {\\it endmembers} can be represented as\nmultivariate Normal distributions. The method solves the problem of abundance\nestimation and endmember extraction within a variational autoencoder setting\nwhere a Dirichlet bottleneck layer models the abundances, and the decoder\nperforms endmember extraction. The proposed method can also leverage the transfer\nlearning paradigm, where the model is only trained on synthetic data containing\npixels that are linear combinations of one or more endmembers of interest. In\nthis case, we retrieve endmembers (spectra) from the United States Geological\nSurvey Spectral Library. 
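The Dirichlet-bottleneck idea in the unmixing abstract above can be sketched as follows, assuming a linear mixing decoder and an encoder that outputs concentration parameters; all names are illustrative rather than taken from the paper's code:

```python
import torch
from torch.distributions import Dirichlet

def dirichlet_bottleneck_decode(encoder_logits, endmembers):
    """Sample simplex-valued abundances from a Dirichlet whose concentration
    comes from the encoder, then mix them linearly with endmember spectra."""
    concentration = torch.nn.functional.softplus(encoder_logits) + 1e-3
    abundances = Dirichlet(concentration).rsample()  # differentiable, rows sum to 1
    return abundances @ endmembers  # (batch, n_bands) reconstructed pixel spectra
```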
The model thus trained can be subsequently used to\nperform pixel unmixing on \"real data\" that contains a subset of the endmembers\nused to generate the synthetic data. The model achieves state-of-the-art\nresults on several benchmarks: Cuprite, Urban Hydice and Samson. We also\npresent a new synthetic dataset, OnTech-HSI-Syn-21, that can be used to study\nhyperspectral pixel unmixing methods. We showcase the transfer learning\ncapabilities of the proposed model on Cuprite and OnTech-HSI-Syn-21 datasets.\nIn summary, the proposed method can be applied to pixel unmixing in a variety of\ndomains, including agriculture, forestry, mineralogy, analysis of materials,\nhealthcare, etc. Additionally, the proposed method eschews the need for\nlabelled data for training by leveraging the transfer learning paradigm, where\nthe model is trained on synthetic data generated using the endmembers present\nin the \"real\" data.\n","authors":["Kiran Mantripragada","Faisal Z. Qureshi"],"pdf_url":"https://arxiv.org/pdf/2203.01327v5.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.17800v1","updated":"2024-01-31T12:51:26Z","published":"2024-01-31T12:51:26Z","title":"Dance-to-Music Generation with Encoder-based Textual Inversion of\n Diffusion Models","summary":" The harmonious integration of music with dance movements is pivotal in\nvividly conveying the artistic essence of dance. This alignment also\nsignificantly elevates the immersive quality of gaming experiences and\nanimation productions. While there has been remarkable advancement in creating\nhigh-fidelity music from textual descriptions, current methodologies mainly\nconcentrate on modulating overarching characteristics such as genre and\nemotional tone. They often overlook the nuanced management of temporal rhythm,\nwhich is indispensable in crafting music for dance, since it intricately aligns\nthe musical beats with the dancers' movements. Recognizing this gap, we propose\nan encoder-based textual inversion technique for augmenting text-to-music\nmodels with visual control, facilitating personalized music generation.\nSpecifically, we develop dual-path rhythm-genre inversion to effectively\nintegrate the rhythm and genre of a dance motion sequence into the textual\nspace of a text-to-music model. Contrary to the classical textual inversion\nmethod, which directly updates text embeddings to reconstruct a single target\nobject, our approach utilizes separate rhythm and genre encoders to obtain text\nembeddings for two pseudo-words, adapting to the varying rhythms and genres. To\nachieve a more accurate evaluation, we propose improved evaluation metrics for\nrhythm alignment. We demonstrate that our approach outperforms state-of-the-art\nmethods across multiple evaluation metrics. Furthermore, our method seamlessly\nadapts to in-the-wild data and effectively integrates with the inherent\ntext-guided generation capability of the pre-trained model. 
Samples are\navailable at \\url{https://youtu.be/D7XDwtH1YwE}.\n","authors":["Sifei Li","Weiming Dong","Yuxin Zhang","Fan Tang","Chongyang Ma","Oliver Deussen","Tong-Yee Lee","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.17800v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.17773v1","updated":"2024-01-31T12:12:56Z","published":"2024-01-31T12:12:56Z","title":"SNP-S3: Shared Network Pre-training and Significant Semantic\n Strengthening for Various Video-Text Tasks","summary":" We present a framework for learning cross-modal video representations by\ndirectly pre-training on raw data to facilitate various downstream video-text\ntasks. Our main contributions lie in the pre-training framework and proxy\ntasks. First, based on the shortcomings of two mainstream pixel-level\npre-training architectures (limited applicability or lower efficiency), we propose\nShared Network Pre-training (SNP). By employing one shared BERT-type network to\nrefine textual and cross-modal features simultaneously, SNP is lightweight and\ncould support various downstream applications. Second, based on the intuition\nthat people always pay attention to several \"significant words\" when\nunderstanding a sentence, we propose the Significant Semantic Strengthening\n(S3) strategy, which includes a novel masking and matching proxy task to\npromote the pre-training performance. Experiments conducted on three downstream\nvideo-text tasks and six datasets demonstrate that we establish a new\nstate-of-the-art in pixel-level video-text pre-training; we also achieve a\nsatisfactory balance between the pre-training efficiency and the fine-tuning\nperformance. The codebase is available at\nhttps://github.com/alipay/Ant-Multi-Modal-Framework/tree/main/prj/snps3_vtp.\n","authors":["Xingning Dong","Qingpei Guo","Tian Gan","Qing Wang","Jianlong Wu","Xiangyuan Ren","Yuan Cheng","Wei Chu"],"pdf_url":"https://arxiv.org/pdf/2401.17773v1.pdf","comment":"Accepted by TCSVT (IEEE Transactions on Circuits and Systems for\n Video Technology)"}]},"2024-02-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.01057v3","updated":"2024-02-01T18:57:20Z","published":"2023-12-02T08:04:29Z","title":"RLHF and IIA: Perverse Incentives","summary":" Existing algorithms for reinforcement learning from human feedback (RLHF) can\nincentivize responses at odds with preferences because they are based on models\nthat assume independence of irrelevant alternatives (IIA). The perverse\nincentives induced by IIA hinder innovations on query formats and learning\nalgorithms.\n","authors":["Wanqiao Xu","Shi Dong","Xiuyuan Lu","Grace Lam","Zheng Wen","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2312.01057v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00861v1","updated":"2024-02-01T18:56:18Z","published":"2024-02-01T18:56:18Z","title":"Evaluating Large Language Models for Generalization and Robustness via\n Data Compression","summary":" Existing methods for evaluating large language models face challenges such as\ndata contamination, sensitivity to prompts, and the high cost of benchmark\ncreation. To address this, we propose a lossless data compression based\nevaluation approach that tests how models' predictive abilities generalize\nafter their training cutoff. Specifically, we collect comprehensive test data\nspanning 83 months from 2017 to 2023 and split the data into training and\ntesting periods according to models' training data cutoff. 
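The compression-based evaluation described above reduces to measuring code length under the model. A minimal sketch, assuming natural-log token log-probabilities from the LM under test, is:

```python
import math

def bits_per_byte(token_logprobs, n_bytes: int) -> float:
    """Arithmetic-coding view of an LM as a compressor: code length in bits is
    -sum(log2 p(token)), normalised by the raw byte count of the test text."""
    total_bits = -sum(lp / math.log(2) for lp in token_logprobs)  # nats -> bits
    return total_bits / n_bytes
```

Lower values indicate better compression; comparing pre- and post-cutoff periods then gives the generalization and robustness measures described above.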
We measure: 1) the\ncompression performance on the testing period as a measure of generalization on\nunseen data; and 2) the performance gap between the training and testing period\nas a measure of robustness. Our experiments test 14 representative large\nlanguage models with various sizes on sources including Wikipedia, news\narticles, code, arXiv papers, and multi-modal data. We find that the\ncompression rate of many models reduces significantly after their cutoff date,\nbut models such as Mistral and Llama-2 demonstrate a good balance between\nperformance and robustness. Results also suggest that models struggle to\ngeneralize on news and code data, but work especially well on arXiv papers. We\nalso find the context size and tokenization implementation have a big impact\non the overall compression performance.\n","authors":["Yucheng Li","Yunhao Guo","Frank Guerin","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2402.00861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00858v1","updated":"2024-02-01T18:55:29Z","published":"2024-02-01T18:55:29Z","title":"Can Large Language Models Understand Context?","summary":" Understanding context is key to understanding human language, an ability\nwhich Large Language Models (LLMs) have been increasingly seen to demonstrate\nto an impressive extent. However, though the evaluation of LLMs encompasses\nvarious domains within the realm of Natural Language Processing, limited\nattention has been paid to probing their linguistic capability of understanding\ncontextual features. This paper introduces a context understanding benchmark by\nadapting existing datasets to suit the evaluation of generative models. This\nbenchmark comprises four distinct tasks and nine datasets, all featuring\nprompts designed to assess the models' ability to understand context. First, we\nevaluate the performance of LLMs under the in-context learning pretraining\nscenario. Experimental results indicate that pre-trained dense models struggle\nwith understanding more nuanced contextual features when compared to\nstate-of-the-art fine-tuned models. Second, as LLM compression holds growing\nsignificance in both research and real-world applications, we assess the\ncontext understanding of quantized models under in-context-learning settings.\nWe find that 3-bit post-training quantization leads to varying degrees of\nperformance reduction on our benchmark. We conduct an extensive analysis of\nthese scenarios to substantiate our experimental results.\n","authors":["Yilun Zhu","Joel Ruben Antony Moniz","Shruti Bhargava","Jiarui Lu","Dhivya Piraviperumal","Site Li","Yuan Zhang","Hong Yu","Bo-Hsiang Tseng"],"pdf_url":"https://arxiv.org/pdf/2402.00858v1.pdf","comment":"Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2402.00856v1","updated":"2024-02-01T18:51:54Z","published":"2024-02-01T18:51:54Z","title":"Towards Efficient and Exact Optimization of Language Model Alignment","summary":" The alignment of language models with human preferences is vital for their\napplication in real-world tasks. The problem is formulated as optimizing the\nmodel's policy to maximize the expected reward that reflects human preferences\nwith minimal deviation from the initial policy. While considered a\nstraightforward solution, reinforcement learning (RL) suffers from high\nvariance in policy updates, which impedes efficient policy improvement.\nRecently, direct preference optimization (DPO) was proposed to directly\noptimize the policy from preference data. 
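For context, the standard DPO objective that the EXO paper builds on can be written in a few lines; per-sequence log-probabilities under the policy and a frozen reference model are assumed to be precomputed:

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta: float = 0.1):
    """DPO: -log sigmoid(beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))),
    where *_w / *_l are log-probs of the preferred / dispreferred responses."""
    margin = (logp_w - ref_logp_w) - (logp_l - ref_logp_l)
    return -F.logsigmoid(beta * margin).mean()
```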
Though simple to implement, DPO is\nderived based on the optimal policy that is not assured to be achieved in\npractice, which undermines its convergence to the intended solution.\n In this paper, we propose efficient exact optimization (EXO) of the alignment\nobjective. We prove that EXO is guaranteed to optimize in the same direction as\nthe RL algorithms asymptotically for arbitrary parametrization of the policy,\nwhile enabling efficient optimization by circumventing the complexities\nassociated with RL algorithms. We compare our method to DPO with both\ntheoretical and empirical analyses, and further demonstrate the advantages of\nour method over existing approaches on realistic human preference data.\n","authors":["Haozhe Ji","Cheng Lu","Yilin Niu","Pei Ke","Hongning Wang","Jun Zhu","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2402.00856v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.00841v1","updated":"2024-02-01T18:31:34Z","published":"2024-02-01T18:31:34Z","title":"Tiny Titans: Can Smaller Large Language Models Punch Above Their Weight\n in the Real World for Meeting Summarization?","summary":" Large Language Models (LLMs) have demonstrated impressive capabilities to\nsolve a wide range of tasks without being explicitly fine-tuned on\ntask-specific datasets. However, deploying LLMs in the real world is not\ntrivial, as it requires substantial computing resources. In this paper, we\ninvestigate whether smaller, compact LLMs are a good alternative to\ncomparatively larger LLMs to address the significant costs associated with\nutilizing LLMs in the real world. In this regard, we study the meeting\nsummarization task in a real-world industrial environment and conduct extensive\nexperiments by comparing the performance of fine-tuned compact LLMs (e.g.,\nFLAN-T5, TinyLLaMA, LiteLLaMA) with zero-shot larger LLMs (e.g., LLaMA-2,\nGPT-3.5, PaLM-2). We observe that most smaller LLMs, even after fine-tuning,\nfail to outperform larger zero-shot LLMs in meeting summarization datasets.\nHowever, a notable exception is FLAN-T5 (780M parameters), which performs on\npar with or even better than many zero-shot larger LLMs (from 7B to above 70B\nparameters), while being significantly smaller. This makes compact LLMs like\nFLAN-T5 a suitable cost-efficient solution for real-world industrial\ndeployment.\n","authors":["Xue-Yong Fu","Md Tahmid Rahman Laskar","Elena Khasanova","Cheng Chen","Shashi Bhushan TN"],"pdf_url":"https://arxiv.org/pdf/2402.00841v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2402.00838v1","updated":"2024-02-01T18:28:55Z","published":"2024-02-01T18:28:55Z","title":"OLMo: Accelerating the Science of Language Models","summary":" Language models (LMs) have become ubiquitous in both NLP research and in\ncommercial product offerings. As their commercial importance has surged, the\nmost powerful models have become closed off, gated behind proprietary\ninterfaces, with important details of their training data, architectures, and\ndevelopment undisclosed. Given the importance of these details in\nscientifically studying these models, including their biases and potential\nrisks, we believe it is essential for the research community to have access to\npowerful, truly open LMs. To this end, this technical report details the first\nrelease of OLMo, a state-of-the-art, truly Open Language Model and its\nframework to build and study the science of language modeling. 
Unlike most\nprior efforts that have only released model weights and inference code, we\nrelease OLMo and the whole framework, including training data and training and\nevaluation code. We hope this release will empower and strengthen the open\nresearch community and inspire a new wave of innovation.\n","authors":["Dirk Groeneveld","Iz Beltagy","Pete Walsh","Akshita Bhagia","Rodney Kinney","Oyvind Tafjord","Ananya Harsh Jha","Hamish Ivison","Ian Magnusson","Yizhong Wang","Shane Arora","David Atkinson","Russell Authur","Khyathi Raghavi Chandu","Arman Cohan","Jennifer Dumas","Yanai Elazar","Yuling Gu","Jack Hessel","Tushar Khot","William Merrill","Jacob Morrison","Niklas Muennighoff","Aakanksha Naik","Crystal Nam","Matthew E. Peters","Valentina Pyatkin","Abhilasha Ravichander","Dustin Schwenk","Saurabh Shah","Will Smith","Emma Strubell","Nishant Subramani","Mitchell Wortsman","Pradeep Dasigi","Nathan Lambert","Kyle Richardson","Luke Zettlemoyer","Jesse Dodge","Kyle Lo","Luca Soldaini","Noah A. Smith","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2402.00838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16736v2","updated":"2024-02-01T18:24:09Z","published":"2024-01-30T04:29:48Z","title":"Engineering A Large Language Model From Scratch","summary":" The proliferation of deep learning in natural language processing (NLP) has\nled to the development and release of innovative technologies capable of\nunderstanding and generating human language with remarkable proficiency.\nAtinuke, a Transformer-based neural network, optimises performance across\nvarious language tasks by utilising a unique configuration. The architecture\ninterweaves layers for processing sequential data with attention mechanisms to\ndraw meaningful affinities between inputs and outputs. Due to the configuration\nof its topology and hyperparameter tuning, it can emulate human-like language\nby extracting features and learning complex mappings. Atinuke is modular,\nextensible, and integrates seamlessly with existing machine learning pipelines.\nAdvanced matrix operations like softmax, embeddings, and multi-head attention\nenable nuanced handling of textual, acoustic, and visual signals. By unifying\nmodern deep learning techniques with software design principles and\nmathematical theory, the system achieves state-of-the-art results on natural\nlanguage tasks whilst remaining interpretable and robust.\n","authors":["Abiodun Finbarrs Oketunji"],"pdf_url":"https://arxiv.org/pdf/2401.16736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00835v1","updated":"2024-02-01T18:22:32Z","published":"2024-02-01T18:22:32Z","title":"ALISON: Fast and Effective Stylometric Authorship Obfuscation","summary":" Authorship Attribution (AA) and Authorship Obfuscation (AO) are two competing\ntasks of increasing importance in privacy research. Modern AA leverages an\nauthor's consistent writing style to match a text to its author using an AA\nclassifier. AO is the corresponding adversarial task, aiming to modify a text\nin such a way that its semantics are preserved, yet an AA model cannot\ncorrectly infer its authorship. To address privacy concerns raised by\nstate-of-the-art (SOTA) AA methods, new AO methods have been proposed but\nremain largely impractical to use due to their prohibitively slow training and\nobfuscation speed, often taking hours. 
To address this challenge, we propose a\npractical AO method, ALISON, that (1) dramatically reduces training/obfuscation\ntime, demonstrating more than 10x faster obfuscation than SOTA AO methods, (2)\nachieves better obfuscation success through attacking three transformer-based\nAA methods on two benchmark datasets, typically performing 15% better than\ncompeting methods, (3) does not require direct signals from a target AA\nclassifier during obfuscation, and (4) utilizes unique stylometric features,\nallowing sound model interpretation for explainable obfuscation. We also\ndemonstrate that ALISON can effectively prevent four SOTA AA methods from\naccurately determining the authorship of ChatGPT-generated texts, all while\nminimally changing the original text semantics. To ensure the reproducibility\nof our findings, our code and data are available at:\nhttps://github.com/EricX003/ALISON.\n","authors":["Eric Xing","Saranya Venkatraman","Thai Le","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2402.00835v1.pdf","comment":"10 pages, 6 figures, 4 tables. To be published in the Proceedings of\n the 38th Annual AAAI Conference on Artificial Intelligence (AAAI-24)"},{"id":"http://arxiv.org/abs/2306.01879v3","updated":"2024-02-01T18:22:25Z","published":"2023-06-02T19:19:43Z","title":"Revisiting the Role of Language Priors in Vision-Language Models","summary":" Vision-language models (VLMs) are impactful in part because they can be\napplied to a variety of visual understanding tasks in a zero-shot fashion,\nwithout any fine-tuning. We study $\\textit{generative VLMs}$ that are trained\nfor next-word generation given an image. We explore their zero-shot performance\non the illustrative task of image-text retrieval across 8 popular\nvision-language benchmarks. Our first observation is that they can be\nrepurposed for discriminative tasks (such as image-text retrieval) by simply\ncomputing the match score of generating a particular text string given an\nimage. We call this probabilistic score the $\\textit{Visual Generative\nPre-Training Score}$ (VisualGPTScore). While the VisualGPTScore produces\nnear-perfect accuracy on some retrieval benchmarks, it yields poor accuracy on\nothers. We analyze this behavior through a probabilistic lens, pointing out\nthat some benchmarks inadvertently capture unnatural language distributions by\ncreating adversarial but unlikely text captions. In fact, we demonstrate that\neven a \"blind\" language model that ignores any image evidence can sometimes\noutperform all prior art, reminiscent of similar challenges faced by the\nvisual-question answering (VQA) community many years ago. We derive a\nprobabilistic post-processing scheme that controls for the amount of linguistic\nbias in generative VLMs at test time without having to retrain or fine-tune the\nmodel. 
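The match score at the core of VisualGPTScore, log P(text | image) under a generative VLM, can be sketched as below; the model interface returning next-token logits is an assumption for illustration:

```python
import torch

def visual_gpt_score(model, image, caption_ids):
    """Sum of next-token log-probabilities of the caption given the image."""
    with torch.no_grad():
        logits = model(image, caption_ids[:, :-1])       # (1, T-1, vocab), assumed API
        logprobs = torch.log_softmax(logits, dim=-1)
        targets = caption_ids[:, 1:].unsqueeze(-1)       # next-token targets
        return logprobs.gather(-1, targets).sum().item()
```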
We show that the VisualGPTScore, when appropriately debiased, is a\nstrong zero-shot baseline for vision-language understanding, oftentimes\nproducing state-of-the-art accuracy.\n","authors":["Zhiqiu Lin","Xinyue Chen","Deepak Pathak","Pengchuan Zhang","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2306.01879v3.pdf","comment":"Website: https://linzhiqiu.github.io/papers/visual_gpt_score/"},{"id":"http://arxiv.org/abs/2401.11864v4","updated":"2024-02-01T18:16:04Z","published":"2024-01-22T11:37:18Z","title":"Distilling Mathematical Reasoning Capabilities into Small Language\n Models","summary":" This work addresses the challenge of democratizing advanced Large Language\nModels (LLMs) by compressing their mathematical reasoning capabilities into\nsub-billion parameter Small Language Models (SLMs) without compromising\nperformance. We introduce Equation-of-Thought Distillation (EoTD), a novel\ntechnique that encapsulates the reasoning process into equation-based\nrepresentations to construct an EoTD dataset for fine-tuning SLMs.\nAdditionally, we propose the Ensemble Thoughts Distillation (ETD) framework to\nenhance the reasoning performance of SLMs. This involves creating a reasoning\ndataset with multiple thought processes, including Chain-of-Thought (CoT),\nProgram-of-Thought (PoT), and Equation-of-Thought (EoT), and using it for\nfine-tuning. Our experimental findings demonstrate that EoTD significantly\nboosts the reasoning abilities of SLMs, while ETD enables these models to\nachieve state-of-the-art reasoning performance.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11864v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01710v2","updated":"2024-02-01T17:57:37Z","published":"2023-05-02T18:23:50Z","title":"Stars Are All You Need: A Distantly Supervised Pyramid Network for\n Unified Sentiment Analysis","summary":" Data for the Rating Prediction (RP) sentiment analysis task such as star\nreviews are readily available. However, data for aspect-category detection\n(ACD) and aspect-category sentiment analysis (ACSA) is often desired because of\nits fine-grained nature but is expensive to collect. In this work, we propose\nUnified Sentiment Analysis (Uni-SA) to understand aspect and review sentiment\nin a unified manner. Specifically, we propose a Distantly Supervised Pyramid\nNetwork (DSPN) to efficiently perform ACD, ACSA, and RP using only RP labels\nfor training. We evaluate DSPN on multi-aspect review datasets in English and\nChinese and find that in addition to the internal efficiency of sample size,\nDSPN also performs comparably to a variety of benchmark models. We also\ndemonstrate the interpretability of DSPN's outputs on reviews to show the\npyramid structure inherent in unified sentiment analysis.\n","authors":["Wenchang Li","Yixing Chen","Shuang Zheng","Lei Wang","John P. Lalor"],"pdf_url":"https://arxiv.org/pdf/2305.01710v2.pdf","comment":"15 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.14163v2","updated":"2024-02-01T17:34:22Z","published":"2023-05-23T15:27:35Z","title":"Leveraging Open Information Extraction for More Robust Domain Transfer\n of Event Trigger Detection","summary":" Event detection is a crucial information extraction task in many domains,\nsuch as Wikipedia or news. The task typically relies on trigger detection (TD)\n-- identifying token spans in the text that evoke specific events. 
While the\nnotion of triggers should ideally be universal across domains, domain transfer\nfor TD from high- to low-resource domains results in significant performance\ndrops. We address the problem of negative transfer in TD by coupling triggers\nbetween domains using subject-object relations obtained from a rule-based open\ninformation extraction (OIE) system. We demonstrate that OIE relations injected\nthrough multi-task training can act as mediators between triggers in different\ndomains, enhancing zero- and few-shot TD domain transfer and reducing\nperformance drops, in particular when transferring from a high-resource source\ndomain (Wikipedia) to a low(er)-resource target domain (news). Additionally, we\ncombine this improved transfer with masked language modeling on the target\ndomain, observing further TD transfer gains. Finally, we demonstrate that the\ngains are robust to the choice of the OIE system.\n","authors":["David Dukić","Kiril Gashteovski","Goran Glavaš","Jan Šnajder"],"pdf_url":"https://arxiv.org/pdf/2305.14163v2.pdf","comment":"Accepted at EACL 2024 Findings"},{"id":"http://arxiv.org/abs/2402.00798v1","updated":"2024-02-01T17:30:50Z","published":"2024-02-01T17:30:50Z","title":"Formal-LLM: Integrating Formal Language and Natural Language for\n Controllable LLM-based Agents","summary":" Recent advancements on Large Language Models (LLMs) enable AI Agents to\nautomatically generate and execute multi-step plans to solve complex tasks.\nHowever, since LLM's content generation process is hardly controllable, current\nLLM-based agents frequently generate invalid or non-executable plans, which\njeopardizes the performance of the generated plans and corrupts users' trust in\nLLM-based agents. In response, this paper proposes a novel ``Formal-LLM''\nframework for LLM-based agents by integrating the expressiveness of natural\nlanguage and the precision of formal language. Specifically, the framework\nallows human users to express their requirements or constraints for the\nplanning process as an automaton. A stack-based LLM plan generation process is\nthen conducted under the supervision of the automaton to ensure that the\ngenerated plan satisfies the constraints, making the planning process\ncontrollable. We conduct experiments on both benchmark tasks and practical\nreal-life tasks, and our framework achieves over 50% overall performance\nincrease, which validates the feasibility and effectiveness of employing\nFormal-LLM to guide the plan generation of agents, preventing the agents from\ngenerating invalid and unsuccessful plans. Further, more controllable LLM-based\nagents can facilitate the broader utilization of LLM in application scenarios\nwhere high validity of planning is essential. The work is open-sourced at\nhttps://github.com/agiresearch/Formal-LLM.\n","authors":["Zelong Li","Wenyue Hua","Hao Wang","He Zhu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00798v1.pdf","comment":"21 pages, 6 figures; working in process, suggestions are welcome"},{"id":"http://arxiv.org/abs/2402.00794v1","updated":"2024-02-01T17:25:51Z","published":"2024-02-01T17:25:51Z","title":"ReAGent: Towards A Model-agnostic Feature Attribution Method for\n Generative Language Models","summary":" Feature attribution methods (FAs), such as gradients and attention, are\nwidely employed approaches to derive the importance of all input features to\nthe model predictions. 
Existing work in natural language processing has mostly\nfocused on developing and testing FAs for encoder-only language models (LMs) in\nclassification tasks. However, it is unknown whether these FAs remain faithful\nfor decoder-only models on text generation, due to the inherent differences\nin model architectures and task settings. Moreover, previous\nwork has demonstrated that there is no `one-wins-all' FA across models and\ntasks. This makes the selection of an FA computationally expensive for large LMs\nsince input importance derivation often requires multiple forward and backward\npasses including gradient computations that might be prohibitive even with\naccess to large compute. To address these issues, we present a model-agnostic\nFA for generative LMs called Recursive Attribution Generator (ReAGent). Our\nmethod updates the token importance distribution in a recursive manner. For\neach update, we compute the difference in the probability distribution over the\nvocabulary for predicting the next token between using the original input and\nusing a modified version where a part of the input is replaced with RoBERTa\npredictions. Our intuition is that replacing an important token in the context\nshould have resulted in a larger change in the model's confidence in predicting\nthe token than replacing an unimportant token. Our method can be universally\napplied to any generative LM without accessing internal model weights or\nadditional training and fine-tuning, as most other FAs require. We extensively\ncompare the faithfulness of ReAGent with seven popular FAs across six\ndecoder-only LMs of various sizes. The results show that our method\nconsistently provides more faithful token importance distributions.\n","authors":["Zhixue Zhao","Boxuan Shan"],"pdf_url":"https://arxiv.org/pdf/2402.00794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00786v1","updated":"2024-02-01T17:17:55Z","published":"2024-02-01T17:17:55Z","title":"CroissantLLM: A Truly Bilingual French-English Language Model","summary":" We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T\nEnglish and French tokens, to bring to the research and industrial community a\nhigh-performance, fully open-sourced bilingual model that runs swiftly on\nconsumer-grade local hardware. To that end, we pioneer the approach of training\nan intrinsically bilingual model with a 1:1 English-to-French pretraining data\nratio, a custom tokenizer, and bilingual finetuning datasets. We release the\ntraining dataset, notably containing a French split with manually curated,\nhigh-quality, and varied data sources. To assess performance outside of\nEnglish, we craft a novel benchmark, FrenchBench, consisting of an array of\nclassification and generation tasks, covering various orthogonal aspects of\nmodel performance in the French Language. Additionally, rooted in transparency\nand to foster further Large Language Model research, we release codebases, and\ndozens of checkpoints across various model sizes, training data distributions,\nand training steps, as well as fine-tuned Chat models, and strong translation\nmodels. 
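A single replacement test of the kind ReAGent describes (from the feature-attribution abstract above) might look like the following sketch; the model interface and the source of filler tokens are assumptions:

```python
import torch

def replacement_importance(model, input_ids, filler_ids, positions, target_pos):
    """Replace a span of the input with plausible fillers (e.g. RoBERTa
    predictions) and measure the shift in the next-token distribution.
    A larger shift suggests the replaced tokens were more important."""
    with torch.no_grad():
        p_orig = torch.softmax(model(input_ids)[:, target_pos], dim=-1)
        modified = input_ids.clone()
        modified[:, positions] = filler_ids
        p_mod = torch.softmax(model(modified)[:, target_pos], dim=-1)
    return (p_orig - p_mod).abs().sum(dim=-1)
```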
We evaluate our model through the FMTI framework, and validate 81% of\nthe transparency criteria, far beyond the scores of even most open initiatives.\nThis work enriches the NLP landscape, breaking away from previous\nEnglish-centric work in order to strengthen our understanding of\nmultilinguality in language models.\n","authors":["Manuel Faysse","Patrick Fernandes","Nuno Guerreiro","António Loison","Duarte Alves","Caio Corro","Nicolas Boizard","João Alves","Ricardo Rei","Pedro Martins","Antoni Bigata Casademunt","François Yvon","André Martins","Gautier Viaud","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2402.00786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00746v1","updated":"2024-02-01T16:40:32Z","published":"2024-02-01T16:40:32Z","title":"Health-LLM: Personalized Retrieval-Augmented Disease Prediction Model","summary":" Artificial intelligence (AI) in healthcare has significantly advanced\nintelligent medical treatment. However, traditional intelligent healthcare is\nlimited by static data and unified standards, preventing full adaptation to\nindividual situations and posing other challenges. Hence, a more professional and\ndetailed intelligent healthcare method needs to be developed. To this end,\nwe propose an innovative framework named Health-LLM, which combines large-scale\nfeature extraction and medical knowledge trade-off scoring. Compared to\ntraditional health management methods, our approach has three main advantages.\nFirst, our method integrates health reports into a large model to provide\ndetailed task information. Second, professional medical expertise is used to\nadjust the weighted scores of health characteristics. Third, we use a\nsemi-automated feature extraction framework to enhance the analytical power of\nlanguage models and incorporate expert insights to improve the accuracy of\ndisease prediction. We have conducted disease prediction experiments on a large\nnumber of health reports to assess the effectiveness of Health-LLM. The results\nof the experiments indicate that the proposed method surpasses traditional\nmethods and has the potential to revolutionize disease prediction and\npersonalized health management. The code is available at\nhttps://github.com/jmyissb/HealthLLM.\n","authors":["Mingyu Jin","Qinkai Yu","Chong Zhang","Dong Shu","Suiyuan Zhu","Mengnan Du","Yongfeng Zhang","Yanda Meng"],"pdf_url":"https://arxiv.org/pdf/2402.00746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00745v1","updated":"2024-02-01T16:39:51Z","published":"2024-02-01T16:39:51Z","title":"Enhancing Ethical Explanations of Large Language Models through\n Iterative Symbolic Refinement","summary":" An increasing amount of research in Natural Language Inference (NLI) focuses\non the application and evaluation of Large Language Models (LLMs) and their\nreasoning capabilities. Despite their success, however, LLMs are still prone to\nfactual errors and inconsistencies in their explanations, offering limited\ncontrol and interpretability for inference in complex domains. In this paper,\nwe focus on ethical NLI, investigating how hybrid neuro-symbolic techniques can\nenhance the logical validity and alignment of ethical explanations produced by\nLLMs. Specifically, we present an abductive-deductive framework named\nLogic-Explainer, which integrates LLMs with an external backward-chaining\nsolver to refine step-wise natural language explanations and jointly verify\ntheir correctness, reduce incompleteness and minimise redundancy. 
An extensive\nempirical analysis demonstrates that Logic-Explainer can improve explanations\ngenerated via in-context learning methods and Chain-of-Thought (CoT) on\nchallenging ethical NLI tasks, while, at the same time, producing formal proofs\ndescribing and supporting models' reasoning. As ethical NLI requires\ncommonsense reasoning to identify underlying moral violations, our results\nsuggest the effectiveness of neuro-symbolic methods for multi-step NLI more\nbroadly, opening new opportunities to enhance the logical consistency,\nreliability, and alignment of LLMs.\n","authors":["Xin Quan","Marco Valentino","Louise A. Dennis","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2402.00745v1.pdf","comment":"Camera-ready for EACL 2024"},{"id":"http://arxiv.org/abs/2402.00744v1","updated":"2024-02-01T16:39:47Z","published":"2024-02-01T16:39:47Z","title":"BATON: Aligning Text-to-Audio Model with Human Preference Feedback","summary":" With the development of AI-Generated Content (AIGC), text-to-audio models are\ngaining widespread attention. However, it is challenging for these models to\ngenerate audio aligned with human preference due to the inherent information\ndensity of natural language and limited model understanding ability. To\nalleviate this issue, we introduce BATON, a framework designed to enhance\nthe alignment between generated audio and text prompt using human preference\nfeedback. Our BATON comprises three key stages: Firstly, we curated a dataset\ncontaining both prompts and the corresponding generated audio, which was then\nannotated based on human feedback. Secondly, we introduced a reward model using\nthe constructed dataset, which can mimic human preference by assigning rewards\nto input text-audio pairs. Finally, we employed the reward model to fine-tune\nan off-the-shelf text-to-audio model. The experiment results demonstrate that\nour BATON can significantly improve the generation quality of the original\ntext-to-audio models, concerning audio integrity, temporal relationship, and\nalignment with human preference.\n","authors":["Huan Liao","Haonan Han","Kai Yang","Tianjiao Du","Rui Yang","Zunnan Xu","Qinmei Xu","Jingquan Liu","Jiasheng Lu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2402.00744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00743v1","updated":"2024-02-01T16:39:45Z","published":"2024-02-01T16:39:45Z","title":"Benefits of Transformer: In-Context Learning in Linear Regression Tasks\n with Unstructured Data","summary":" In practice, it is observed that transformer-based models can learn concepts\nin context in the inference stage. While existing literature, e.g.,\n\\citet{zhang2023trained,huang2023context}, provides theoretical explanations of\nthis in-context learning ability, these works assume the input $x_i$ and the output\n$y_i$ for each sample are embedded in the same token (i.e., structured data).\nHowever, in reality, they are presented in two tokens (i.e., unstructured data\n\\cite{wibisono2023role}). In this case, this paper conducts experiments in\nlinear regression tasks to study the benefits of the architecture of\ntransformers and provides some corresponding theoretical intuitions to explain\nwhy the transformer can learn from unstructured data. We study the exact\ncomponents in a transformer that facilitate in-context learning. 
In\nparticular, we observe that (1) a transformer with two layers of softmax\n(self-)attention with a look-ahead attention mask can learn from the prompt if\n$y_i$ is in the token next to $x_i$ for each example; (2) positional encoding\ncan further improve the performance; and (3) multi-head attention with a high\ninput embedding dimension has a better prediction performance than single-head\nattention.\n","authors":["Yue Xing","Xiaofeng Lin","Namjoon Suh","Qifan Song","Guang Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.00743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00742v1","updated":"2024-02-01T16:39:28Z","published":"2024-02-01T16:39:28Z","title":"Transforming and Combining Rewards for Aligning Large Language Models","summary":" A common approach for aligning language models to human preferences is to\nfirst learn a reward model from preference data, and then use this reward model\nto update the language model. We study two closely related problems that arise\nin this approach. First, any monotone transformation of the reward model\npreserves preference ranking; is there a choice that is ``better'' than others?\nSecond, we often wish to align language models to multiple properties: how\nshould we combine multiple reward models? Using a probabilistic interpretation\nof the alignment procedure, we identify a natural choice for transformation for\n(the common case of) rewards learned from Bradley-Terry preference models. This\nderived transformation has two important properties. First, it emphasizes\nimproving poorly-performing outputs, rather than outputs that already score\nwell. This mitigates both underfitting (where some prompts are not improved)\nand reward hacking (where the model learns to exploit misspecification of the\nreward model). Second, it enables principled aggregation of rewards by linking\nsummation to logical conjunction: the sum of transformed rewards corresponds to\nthe probability that the output is ``good'' in all measured properties, in a\nsense we make precise. Experiments aligning language models to be both helpful\nand harmless using RLHF show substantial improvements over the baseline\n(non-transformed) approach.\n","authors":["Zihao Wang","Chirag Nagpal","Jonathan Berant","Jacob Eisenstein","Alex D'Amour","Sanmi Koyejo","Victor Veitch"],"pdf_url":"https://arxiv.org/pdf/2402.00742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11865v3","updated":"2024-02-01T16:32:38Z","published":"2023-07-21T19:09:37Z","title":"CARTIER: Cartographic lAnguage Reasoning Targeted at Instruction\n Execution for Robots","summary":" This work explores the capacity of large language models (LLMs) to address\nproblems at the intersection of spatial planning and natural language\ninterfaces for navigation. We focus on following complex instructions that are\nmore akin to natural conversation than traditional explicit procedural\ndirectives typically seen in robotics. Unlike most prior work where navigation\ndirectives are provided as simple imperative commands (e.g., \"go to the\nfridge\"), we examine implicit directives obtained through conversational\ninteractions. We leverage the 3D simulator AI2Thor to create household query\nscenarios at scale, and augment it by adding complex language queries for 40\nobject types. 
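The transform-and-sum construction from the reward-combination abstract above can be illustrated with a log-sigmoid transform: summing transformed rewards then behaves like the log-probability that an output is 'good' on all properties at once. The centering baseline here is an assumed placeholder, not a value from the paper:

```python
import math

def combined_transformed_reward(rewards, reference: float = 0.0) -> float:
    """Sum of log sigmoid(r - reference) over reward models: each term acts like
    log P("good" on that property), so the sum approximates log P(good on all)."""
    return sum(math.log(1.0 / (1.0 + math.exp(-(r - reference)))) for r in rewards)
```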
We demonstrate that a robot using our method CARTIER\n(Cartographic lAnguage Reasoning Targeted at Instruction Execution for Robots)\ncan parse descriptive language queries up to 42% more reliably than existing\nLLM-enabled methods by exploiting the ability of LLMs to interpret the user\ninteraction in the context of the objects in the scenario.\n","authors":["Dmitriy Rivkin","Nikhil Kakodkar","Francois Hogan","Bobak H. Baghi","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2307.11865v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00723v1","updated":"2024-02-01T16:14:35Z","published":"2024-02-01T16:14:35Z","title":"Improving Semantic Control in Discrete Latent Spaces with Transformer\n Quantized Variational Autoencoders","summary":" Achieving precise semantic control over the latent spaces of Variational\nAutoEncoders (VAEs) holds significant value for downstream tasks in NLP as the\nunderlying generative mechanisms could be better localised, explained and\nimproved upon. Recent research, however, has struggled to achieve consistent\nresults, primarily due to the inevitable loss of semantic information in the\nvariational bottleneck and limited control over the decoding mechanism. To\novercome these challenges, we investigate discrete latent spaces in Vector\nQuantized Variational AutoEncoders (VQVAEs) to improve semantic control and\ngeneration in Transformer-based VAEs. In particular, we propose T5VQVAE, a\nnovel model that leverages the controllability of VQVAEs to guide the\nself-attention mechanism in T5 at the token-level, exploiting its full\ngeneralization capabilities. Experimental results indicate that T5VQVAE\noutperforms existing state-of-the-art VAE models, including Optimus, in terms\nof controllability and preservation of semantic information across different\ntasks such as auto-encoding of sentences and mathematical expressions, text\ntransfer, and inference. Moreover, T5VQVAE exhibits improved inference\ncapabilities, suggesting potential applications for downstream natural language\nand symbolic reasoning tasks.\n","authors":["Yingji Zhang","Danilo S. Carvalho","Marco Valentino","Ian Pratt-Hartmann","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2402.00723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00711v1","updated":"2024-02-01T16:06:35Z","published":"2024-02-01T16:06:35Z","title":"Explaining Text Classifiers with Counterfactual Representations","summary":" One well-motivated explanation method for classifiers leverages\ncounterfactuals, which are hypothetical events identical to real observations in\nall aspects except for one categorical feature. Constructing such\ncounterfactuals poses specific challenges for texts, however, as some attribute\nvalues may not necessarily align with plausible real-world events. In this\npaper we propose a simple method for generating counterfactuals by intervening\nin the space of text representations which bypasses this limitation. We argue\nthat our interventions are minimally disruptive and that they are theoretically\nsound as they align with counterfactuals as defined in Pearl's causal inference\nframework. To validate our method, we first conduct experiments on a synthetic\ndataset of counterfactuals, allowing for a direct comparison between classifier\npredictions based on ground truth counterfactuals (obtained through explicit\ntext interventions) and our counterfactuals, derived through interventions in\nthe representation space. 
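The representation-space intervention described above can be illustrated by editing a text embedding along a single concept direction, leaving the orthogonal complement untouched; this construction is purely illustrative, not the paper's exact procedure:

```python
import numpy as np

def intervene_on_concept(h: np.ndarray, concept_dir: np.ndarray, target: float) -> np.ndarray:
    """Project out the embedding's component along a concept direction and
    set it to `target`, producing a counterfactual representation."""
    v = concept_dir / np.linalg.norm(concept_dir)
    return h - (h @ v) * v + target * v
```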
Second, we study a real-world scenario where our\ncounterfactuals can be leveraged both for explaining a classifier and for bias\nmitigation.\n","authors":["Pirmin Lemberger","Antoine Saillenfest"],"pdf_url":"https://arxiv.org/pdf/2402.00711v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.00707v1","updated":"2024-02-01T16:04:04Z","published":"2024-02-01T16:04:04Z","title":"Non-Exchangeable Conformal Language Generation with Nearest Neighbors","summary":" Quantifying uncertainty in automatically generated text is important for\nletting humans check potential hallucinations and making systems more reliable.\nConformal prediction is an attractive framework to provide predictions imbued\nwith statistical guarantees; however, its application to text generation is\nchallenging since i.i.d. assumptions are not realistic. In this paper, we\nbridge this gap by leveraging recent results on non-exchangeable conformal\nprediction, which still ensures bounds on coverage. The result,\nnon-exchangeable conformal nucleus sampling, is a novel extension of the\nconformal prediction framework to generation based on nearest neighbors. Our\nmethod can be used post-hoc for an arbitrary model without extra training and\nsupplies token-level, calibrated prediction sets equipped with statistical\nguarantees. Experiments in machine translation and language modeling show\nencouraging results in generation quality. By also producing tighter prediction\nsets with good coverage, we thus give a more theoretically principled way to\nperform sampling with conformal guarantees.\n","authors":["Dennis Ulmer","Chrysoula Zerva","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2402.00707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07303v2","updated":"2024-02-01T16:00:03Z","published":"2023-05-12T08:16:06Z","title":"Multi-Relational Hyperbolic Word Embeddings from Natural Language\n Definitions","summary":" Natural language definitions possess a recursive, self-explanatory semantic\nstructure that can support representation learning methods able to preserve\nexplicit conceptual relations and constraints in the latent space. This paper\npresents a multi-relational model that explicitly leverages such a structure to\nderive word embeddings from definitions. By automatically extracting the\nrelations linking defined and defining terms from dictionaries, we demonstrate\nhow the problem of learning word embeddings can be formalised via a\ntranslational framework in Hyperbolic space and used as a proxy to capture the\nglobal semantic structure of definitions. An extensive empirical analysis\ndemonstrates that the framework can help impose the desired structural\nconstraints while preserving the semantic mapping required for controllable and\ninterpretable traversal. Moreover, the experiments reveal the superiority of\nthe Hyperbolic word embeddings over the Euclidean counterparts and demonstrate\nthat the multi-relational approach can obtain competitive results when compared\nto state-of-the-art neural models, with the advantage of being intrinsically\nmore efficient and interpretable.\n","authors":["Marco Valentino","Danilo S. 
Carvalho","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2305.07303v2.pdf","comment":"Accepted at the 18th Conference of the European Chapter of the\n Association for Computational Linguistics (EACL 2024)"},{"id":"http://arxiv.org/abs/2402.00667v1","updated":"2024-02-01T15:30:19Z","published":"2024-02-01T15:30:19Z","title":"Improving Weak-to-Strong Generalization with Scalable Oversight and\n Ensemble Learning","summary":" This paper presents a follow-up study to OpenAI's recent superalignment work\non Weak-to-Strong Generalization (W2SG). Superalignment focuses on ensuring\nthat high-level AI systems remain consistent with human values and intentions\nwhen dealing with complex, high-risk tasks. The W2SG framework has opened new\npossibilities for empirical research in this evolving field. Our study\nsimulates two phases of superalignment under the W2SG framework: the\ndevelopment of general superhuman models and the progression towards\nsuperintelligence. In the first phase, based on human supervision, the quality\nof weak supervision is enhanced through a combination of scalable oversight and\nensemble learning, reducing the capability gap between weak teachers and strong\nstudents. In the second phase, an automatic alignment evaluator is employed as\nthe weak supervisor. By recursively updating this auto aligner, the\ncapabilities of the weak teacher models are synchronously enhanced, achieving\nweak-to-strong supervision over stronger student models. We also provide an\ninitial validation of the proposed approach for the first phase. Using the SciQ\ntask as an example, we explore ensemble learning for weak teacher models through\nbagging and boosting. Scalable oversight is explored through two auxiliary\nsettings: human-AI interaction and AI-AI debate. Additionally, the paper\ndiscusses the impact of improved weak supervision on enhancing weak-to-strong\ngeneralization based on in-context learning. Experiment code and dataset will\nbe released at https://github.com/ADaM-BJTU/W2SG.\n","authors":["Jitao Sang","Yuhang Wang","Jing Zhang","Yanxu Zhu","Chao Kong","Junhong Ye","Shuyu Wei","Jinlin Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.00667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00658v1","updated":"2024-02-01T15:18:33Z","published":"2024-02-01T15:18:33Z","title":"Learning Planning-based Reasoning by Trajectories Collection and Process\n Reward Synthesizing","summary":" Large Language Models (LLMs) have demonstrated significant potential in\nhandling complex reasoning tasks through step-by-step rationale generation.\nHowever, recent studies have raised concerns regarding the hallucination and\nflaws in their reasoning process. Substantial efforts are being made to improve\nthe reliability and faithfulness of the generated rationales. Some approaches\nmodel reasoning as planning, while others focus on annotating for process\nsupervision. Nevertheless, the planning-based search process often results in\nhigh latency due to the frequent assessment of intermediate reasoning states\nand the extensive exploration space. Additionally, supervising the reasoning\nprocess with human annotation is costly and challenging to scale for LLM\ntraining. To address these issues, in this paper, we propose a framework to\nlearn planning-based reasoning through direct preference optimization (DPO) on\ncollected trajectories, which are ranked according to synthesized process\nrewards. 
Our results on challenging logical reasoning benchmarks demonstrate\nthe effectiveness of our learning framework, showing that our 7B model can\nsurpass strong counterparts like GPT-3.5-Turbo.\n","authors":["Fangkai Jiao","Chengwei Qin","Zhengyuan Liu","Nancy F. Chen","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2402.00658v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.00632v1","updated":"2024-02-01T14:46:35Z","published":"2024-02-01T14:46:35Z","title":"Prosody in Cascade and Direct Speech-to-Text Translation: a case study\n on Korean Wh-Phrases","summary":" Speech-to-Text Translation (S2TT) has typically been addressed with cascade\nsystems, where speech recognition systems generate a transcription that is\nsubsequently passed to a translation model. While there has been a growing\ninterest in developing direct speech translation systems to avoid propagating\nerrors and losing non-verbal content, prior work in direct S2TT has struggled\nto conclusively establish the advantages of integrating the acoustic signal\ndirectly into the translation process. This work proposes using contrastive\nevaluation to quantitatively measure the ability of direct S2TT systems to\ndisambiguate utterances where prosody plays a crucial role. Specifically, we\nevaluated Korean-English translation systems on a test set containing\nwh-phrases, for which prosodic features are necessary to produce translations\nwith the correct intent, whether it is a statement, a yes/no question, a\nwh-question, or another intent type. Our results clearly demonstrate the value of direct\ntranslation systems over cascade translation models, with a notable 12.9%\nimprovement in overall accuracy in ambiguous cases, along with up to a 15.6%\nincrease in F1 scores for one of the major intent categories. To the best of\nour knowledge, this work stands as the first to provide quantitative evidence\nthat direct S2TT models can effectively leverage prosody. The code for our\nevaluation is openly accessible and freely available for review and\nutilisation.\n","authors":["Giulio Zhou","Tsz Kin Lam","Alexandra Birch","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2402.00632v1.pdf","comment":"Accepted at Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2402.00620v1","updated":"2024-02-01T14:30:39Z","published":"2024-02-01T14:30:39Z","title":"Actor Identification in Discourse: A Challenge for LLMs?","summary":" The identification of political actors who put forward claims in public\ndebate is a crucial step in the construction of discourse networks, which are\nhelpful to analyze societal debates. Actor identification is, however, rather\nchallenging: Often, the locally mentioned speaker of a claim is only a pronoun\n(\"He proposed that [claim]\"), so recovering the canonical actor name requires\ndiscourse understanding. We compare a traditional pipeline of dedicated NLP\ncomponents (similar to those applied to the related task of coreference) with an\nLLM, which appears to be a good match for this generation task. Evaluating on a\ncorpus of German actors in newspaper reports, we surprisingly find that the LLM\nperforms worse. Further analysis reveals that the LLM is very good at\nidentifying the right reference, but struggles to generate the correct\ncanonical form. This points to an underlying issue in LLMs with controlling\ngenerated output. 
Indeed, a hybrid model combining the LLM with a classifier to\nnormalize its output substantially outperforms both initial models.\n","authors":["Ana Barić","Sean Papay","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2402.00620v1.pdf","comment":"Proceedings of the EACL 2024 workshop on Computational Models of\n Discourse (St. Julian's, Malta)"},{"id":"http://arxiv.org/abs/2308.10263v2","updated":"2024-02-01T14:19:50Z","published":"2023-08-20T13:20:54Z","title":"Scaling up Discovery of Latent Concepts in Deep NLP Models","summary":" Despite the revolution caused by deep NLP models, they remain black boxes,\nnecessitating research to understand their decision-making processes. A recent\nwork by Dalvi et al. (2022) carried out representation analysis through the\nlens of clustering latent spaces within pre-trained models (PLMs), but that\napproach is limited to small scale due to the high cost of running\nAgglomerative hierarchical clustering. This paper studies clustering algorithms\nin order to scale the discovery of encoded concepts in PLM representations to\nlarger datasets and models. We propose metrics for assessing the quality of\ndiscovered latent concepts and use them to compare the studied clustering\nalgorithms. We found that K-Means-based concept discovery significantly\nenhances efficiency while maintaining the quality of the obtained concepts.\nFurthermore, we demonstrate the practicality of this newfound efficiency by\nscaling latent concept discovery to LLMs and phrasal concepts.\n","authors":["Majd Hawasly","Fahim Dalvi","Nadir Durrani"],"pdf_url":"https://arxiv.org/pdf/2308.10263v2.pdf","comment":"14 pages, accepted to The 18th Conference of the European Chapter of\n the Association for Computational Linguistics (EACL 2024)"},{"id":"http://arxiv.org/abs/2401.06034v2","updated":"2024-02-01T13:10:15Z","published":"2024-01-11T16:48:00Z","title":"LinguAlchemy: Fusing Typological and Geographical Elements for Unseen\n Language Generalization","summary":" Pretrained language models (PLMs) have shown remarkable generalization toward\nmultiple tasks and languages. Nonetheless, the generalization of PLMs towards\nunseen languages is poor, resulting in significantly worse language\nperformance, or even generating nonsensical responses that are comparable to a\nrandom baseline. This limitation has been a longstanding problem of PLMs,\nraising concerns about diversity and equal access to language modeling\ntechnology. In this work, we address this limitation by introducing LinguAlchemy,\na regularization technique that incorporates typological, geographical, and\nphylogenetic aspects of languages, constraining the resulting\nrepresentations of PLMs to better characterize the corresponding linguistic\nconstraints. LinguAlchemy significantly improves the accuracy of\nmBERT and XLM-R on unseen languages by ~18% and ~2%, respectively, compared to\nfully finetuned models, displaying a high degree of unseen-language\ngeneralization. We further introduce AlchemyScale and AlchemyTune, extensions of\nLinguAlchemy which adjust the linguistic regularization weights automatically,\nalleviating the need for hyperparameter search. 
LinguAlchemy enables better\ncross-lingual generalization to unseen languages, which is vital for better\ninclusivity and accessibility of PLMs.\n","authors":["Muhammad Farid Adilazuarda","Samuel Cahyawijaya","Alham Fikri Aji","Genta Indra Winata","Ayu Purwarianti"],"pdf_url":"https://arxiv.org/pdf/2401.06034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00559v1","updated":"2024-02-01T12:46:45Z","published":"2024-02-01T12:46:45Z","title":"A Chain-of-Thought Is as Strong as Its Weakest Link: A Benchmark for\n Verifiers of Reasoning Chains","summary":" Prompting language models to provide step-by-step answers (e.g.,\n\"Chain-of-Thought\") is the prominent approach for complex reasoning tasks,\nwhere more accurate reasoning chains typically improve downstream task\nperformance. Recent literature discusses automatic methods to verify reasoning\nsteps to evaluate and improve their correctness. However, no fine-grained\nstep-level datasets are available to enable thorough evaluation of such\nverification methods, hindering progress in this direction. We introduce\nReveal: Reasoning Verification Evaluation, a new dataset to benchmark automatic\nverifiers of complex Chain-of-Thought reasoning in open-domain question\nanswering settings. Reveal includes comprehensive labels for the relevance,\nattribution to evidence passages, and logical correctness of each reasoning\nstep in a language model's answer, across a wide variety of datasets and\nstate-of-the-art language models.\n","authors":["Alon Jacovi","Yonatan Bitton","Bernd Bohnet","Jonathan Herzig","Or Honovich","Michael Tseng","Michael Collins","Roee Aharoni","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2402.00559v1.pdf","comment":"https://huggingface.co/datasets/google/reveal"},{"id":"http://arxiv.org/abs/2402.00530v1","updated":"2024-02-01T11:57:53Z","published":"2024-02-01T11:57:53Z","title":"Superfiltering: Weak-to-Strong Data Filtering for Fast\n Instruction-Tuning","summary":" Instruction tuning is critical to improve LLMs but usually suffers from\nlow-quality and redundant data. Data filtering for instruction tuning has\nproved important in improving both the efficiency and performance of the tuning\nprocess. But it also leads to extra cost and computation due to the involvement\nof LLMs in this process. To reduce the filtering cost, we study Superfiltering:\nCan we use a smaller and weaker model to select data for finetuning a larger\nand stronger model? Despite the performance gap between weak and strong\nlanguage models, we find that they are highly consistent in perceiving\ninstruction difficulty and in their data selection results. This enables us to use a\nmuch smaller and more efficient model to filter the instruction data used to\ntrain a larger language model. Not only does it largely speed up the data\nfiltering, but the filtered-data-finetuned LLM achieves even better performance\non standard benchmarks. Extensive experiments validate the efficacy and\nefficiency of our approach.\n","authors":["Ming Li","Yong Zhang","Shwai He","Zhitao Li","Hongyu Zhao","Jianzong Wang","Ning Cheng","Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.00530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13514v2","updated":"2024-02-01T11:47:57Z","published":"2023-05-22T22:07:50Z","title":"Small Language Models Improve Giants by Rewriting Their Outputs","summary":" Despite the impressive performance of large language models (LLMs), they\noften lag behind specialized models in various tasks. 
LLMs only use a fraction\nof the existing training data for in-context learning, while task-specific\nmodels harness the full dataset for fine-tuning. In this work, we tackle the\nproblem of leveraging training data to improve the performance of LLMs without\nfine-tuning. Our approach directly targets LLM predictions without requiring\naccess to their weights. We create a pool of candidates from the LLM through\nfew-shot prompting and we employ a compact model, the LM-corrector (LMCor),\nspecifically trained to merge these candidates to produce an enhanced output.\nOur experiments on four natural language generation tasks demonstrate that even\na small LMCor model (250M) substantially improves the few-shot performance of\nLLMs (62B), matching and even outperforming standard fine-tuning. Furthermore,\nwe illustrate the robustness of LMCor against different prompts, thereby\nminimizing the need for extensive prompt engineering. Finally, we show that\nLMCor can be seamlessly integrated with different LLMs at inference, serving as\na plug-and-play module to improve their performance.\n","authors":["Giorgos Vernikos","Arthur Bražinskas","Jakub Adamek","Jonathan Mallinson","Aliaksei Severyn","Eric Malmi"],"pdf_url":"https://arxiv.org/pdf/2305.13514v2.pdf","comment":"Accepted at EACL 2024"},{"id":"http://arxiv.org/abs/2402.00518v1","updated":"2024-02-01T11:39:04Z","published":"2024-02-01T11:39:04Z","title":"EE-Tuning: An Economical yet Scalable Solution for Tuning Early-Exit\n Large Language Models","summary":" This work introduces EE-Tuning, a lightweight and economical solution to\ntraining/tuning early-exit large language models (LLMs). In contrast to the\ncommon approach of full-parameter pre-training, EE-Tuning augments any\npre-trained (and possibly fine-tuned) standard LLM with additional early-exit\nlayers that are tuned in a parameter-efficient manner, which requires\nsignificantly less computation and training data. Our\nimplementation of EE-Tuning achieves outstanding training efficiency via\nextensive performance optimizations, as well as scalability due to its full\ncompatibility with 3D parallelism. Results of systematic experiments validate\nthe efficacy of EE-Tuning, confirming that effective early-exit LLM inference\ncan be achieved with a limited training budget. In the hope of making early-exit\nLLMs accessible to the community, we release the source code of our\nimplementation of EE-Tuning at https://github.com/pan-x-c/EE-LLM.\n","authors":["Xuchen Pan","Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.00518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00474v1","updated":"2024-02-01T10:26:27Z","published":"2024-02-01T10:26:27Z","title":"SA-MDKIF: A Scalable and Adaptable Medical Domain Knowledge Injection\n Framework for Large Language Models","summary":" Recent advances in large language models (LLMs) have demonstrated exceptional\nperformance in various natural language processing (NLP) tasks. However, their\neffective application in the medical domain is hampered by a lack of medical\ndomain knowledge. In this study, we present SA-MDKIF, a scalable and adaptable\nframework that aims to inject medical knowledge into general-purpose LLMs\nthrough instruction tuning, thereby enabling adaptability for various\ndownstream tasks. SA-MDKIF consists of two stages: skill training and skill\nadaptation. 
In the first stage, we define 12 basic medical skills and use\nAdaLoRA to train these skills based on uniformly formatted instructional\ndatasets that we have constructed. In the next stage, we train the skill router\nusing task-specific downstream data and use this router to integrate the\nacquired skills with LLMs during inference. Experimental results on 9 different\nmedical tasks show that SA-MDKIF improves performance by 10-20% compared to the\noriginal LLMs. Notably, this improvement is particularly pronounced for unseen\nmedical tasks, showing an improvement of up to 30%.\n","authors":["Tianhan Xu","Zhe Hu","Ling Chen","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2402.00474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00453v1","updated":"2024-02-01T09:43:30Z","published":"2024-02-01T09:43:30Z","title":"Instruction Makes a Difference","summary":" We introduce the Instruction Document Visual Question Answering (iDocVQA) dataset\nand the Large Language Document (LLaDoc) model, for training Language-Vision (LV)\nmodels for document analysis and predictions on document images, respectively.\nUsually, deep neural networks for the DocVQA task are trained on datasets\nlacking instructions. We show that using instruction-following datasets\nimproves performance. We compare performance across document-related datasets\nusing the recent state-of-the-art (SotA) Large Language and Vision Assistant\n(LLaVA)1.5 as the base model. We also evaluate the performance of the derived\nmodels for object hallucination using the Polling-based Object Probing\nEvaluation (POPE) dataset. The results show that instruction-tuning performance\nranges from 11X to 32X the zero-shot performance and from 0.1% to 4.2% over\nnon-instruction (traditional task) finetuning. Despite the gains, these still\nfall short of human performance (94.36%), implying there's much room for\nimprovement.\n","authors":["Tosin Adewumi","Nudrat Habib","Lama Alkhaled","Elisa Barney"],"pdf_url":"https://arxiv.org/pdf/2402.00453v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.00446v1","updated":"2024-02-01T09:24:33Z","published":"2024-02-01T09:24:33Z","title":"Improving Dialog Safety using Socially Aware Contrastive Learning","summary":" State-of-the-art conversational AI systems raise concerns due to their\npotential risks of generating unsafe, toxic, unethical, or dangerous content.\nPrevious works have developed datasets to teach conversational agents the\nappropriate social paradigms to respond effectively to specifically designed\nhazardous content. However, models trained on these adversarial datasets still\nstruggle to recognize subtle unsafe situations that appear naturally in\nconversations or introduce an inappropriate response in a casual context. To\nunderstand the extent of this problem, we study prosociality in both\nadversarial and casual dialog contexts and audit the response quality of\ngeneral-purpose language models in terms of propensity to produce unsafe\ncontent. We propose a dual-step fine-tuning process to address these issues\nusing a socially aware n-pair contrastive loss. Subsequently, we train a base\nmodel that integrates prosocial behavior by leveraging datasets like Moral\nIntegrity Corpus (MIC) and ProsocialDialog. Experimental results on several\ndialog datasets demonstrate the effectiveness of our approach in generating\nsocially appropriate responses.\n","authors":["Souvik Das","Rohini K. 
Srihari"],"pdf_url":"https://arxiv.org/pdf/2402.00446v1.pdf","comment":"SCI-CHAT@EACL2024"},{"id":"http://arxiv.org/abs/2402.00421v1","updated":"2024-02-01T08:37:13Z","published":"2024-02-01T08:37:13Z","title":"From PARIS to LE-PARIS: Toward Patent Response Automation with\n Recommender Systems and Collaborative Large Language Models","summary":" In patent prosecution, timely and effective responses to Office Actions (OAs)\nare crucial for acquiring patents, yet past automation and AI research have\nscarcely addressed this aspect. To address this gap, our study introduces the\nPatent Office Action Response Intelligence System (PARIS) and its advanced\nversion, the Large Language Model Enhanced PARIS (LE-PARIS). These systems are\ndesigned to enhance the efficiency of patent attorneys in collaboratively\nhandling OA responses. The systems' key features include the construction of an\nOA Topics Database, development of Response Templates, and implementation of\nRecommender Systems and LLM-based Response Generation. Our validation involves\na multi-paradigmatic analysis using the USPTO Office Action database and\nlongitudinal data of attorney interactions with our systems over six years.\nThrough five studies, we examine the constructiveness of OA topics (studies 1\nand 2) using topic modeling and the proposed Delphi process, the efficacy of\nour proposed hybrid recommender system tailored for OA (both LLM-based and\nnon-LLM-based) (study 3), the quality of response generation (study 4), and the\npractical value of the systems in real-world scenarios via user studies (study\n5). Results demonstrate that both PARIS and LE-PARIS significantly meet key\nmetrics and positively impact attorney performance.\n","authors":["Jung-Mei Chu","Hao-Cheng Lo","Jieh Hsiang","Chun-Chieh Cho"],"pdf_url":"https://arxiv.org/pdf/2402.00421v1.pdf","comment":"14 pages, 4 figures, submitted to a journal"},{"id":"http://arxiv.org/abs/2402.00414v1","updated":"2024-02-01T08:15:28Z","published":"2024-02-01T08:15:28Z","title":"Prompt-Time Symbolic Knowledge Capture with Large Language Models","summary":" Augmenting large language models (LLMs) with user-specific knowledge is\ncrucial for real-world applications, such as personal AI assistants. However,\nLLMs inherently lack mechanisms for prompt-driven knowledge capture. This paper\ninvestigates utilizing the existing LLM capabilities to enable prompt-driven\nknowledge capture, with a particular emphasis on knowledge graphs. We address\nthis challenge by focusing on prompt-to-triple (P2T) generation. We explore\nthree methods: zero-shot prompting, few-shot prompting, and fine-tuning, and\nthen assess their performance via a specialized synthetic dataset. Our code and\ndatasets are publicly available at https://github.com/HaltiaAI/paper-PTSKC.\n","authors":["Tolga Çöplü","Arto Bendiken","Andrii Skomorokhov","Eduard Bateiko","Stephen Cobb","Joshua J. Bouw"],"pdf_url":"https://arxiv.org/pdf/2402.00414v1.pdf","comment":"8 pages, 5 figures, 1 table, preprint. Under review"},{"id":"http://arxiv.org/abs/2402.00412v1","updated":"2024-02-01T08:11:56Z","published":"2024-02-01T08:11:56Z","title":"Hidding the Ghostwriters: An Adversarial Evaluation of AI-Generated\n Student Essay Detection","summary":" Large language models (LLMs) have exhibited remarkable capabilities in text\ngeneration tasks. However, the utilization of these models carries inherent\nrisks, including but not limited to plagiarism, the dissemination of fake news,\nand issues in educational exercises. 
Although several detectors have been\nproposed to address these concerns, their effectiveness against adversarial\nperturbations, specifically in the context of student essay writing, remains\nlargely unexplored. This paper aims to bridge this gap by constructing\nAIG-ASAP, an AI-generated student essay dataset, employing a range of text\nperturbation methods that are expected to generate high-quality essays while\nevading detection. Through empirical experiments, we assess the performance of\ncurrent AIGC detectors on the AIG-ASAP dataset. The results reveal that the\nexisting detectors can be easily circumvented using straightforward automatic\nadversarial attacks. Specifically, we explore word substitution and sentence\nsubstitution perturbation methods that effectively evade detection while\nmaintaining the quality of the generated essays. This highlights the urgent\nneed for more accurate and robust methods to detect AI-generated student essays\nin the education domain.\n","authors":["Xinlin Peng","Ying Zhou","Ben He","Le Sun","Yingfei Sun"],"pdf_url":"https://arxiv.org/pdf/2402.00412v1.pdf","comment":"Accepted by EMNLP 2023 Main conference, Oral Presentation"},{"id":"http://arxiv.org/abs/2402.00402v1","updated":"2024-02-01T07:48:50Z","published":"2024-02-01T07:48:50Z","title":"Investigating Bias Representations in Llama 2 Chat via Activation\n Steering","summary":" We address the challenge of societal bias in Large Language Models (LLMs),\nfocusing on the Llama 2 7B Chat model. As LLMs are increasingly integrated into\ndecision-making processes with substantial societal impact, it becomes\nimperative to ensure these models do not reinforce existing biases. Our\napproach employs activation steering to probe for and mitigate biases related\nto gender, race, and religion. This method manipulates model activations to\ndirect responses towards or away from biased outputs, utilizing steering\nvectors derived from the StereoSet dataset and custom GPT4 generated gender\nbias prompts. Our findings reveal inherent gender bias in Llama 2 7B Chat,\npersisting even after Reinforcement Learning from Human Feedback (RLHF). We\nalso observe a predictable negative correlation between bias and the model's\ntendency to refuse responses. Significantly, our study uncovers that RLHF tends\nto increase the similarity in the model's representation of different forms of\nsocietal biases, which raises questions about the model's nuanced understanding\nof different forms of bias. This work also provides valuable insights into\neffective red-teaming strategies for LLMs using activation steering,\nparticularly emphasizing the importance of integrating a refusal vector.\n","authors":["Dawn Lu","Nina Rimsky"],"pdf_url":"https://arxiv.org/pdf/2402.00402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00396v1","updated":"2024-02-01T07:32:24Z","published":"2024-02-01T07:32:24Z","title":"Efficient Exploration for LLMs","summary":" We present evidence of substantial benefit from efficient exploration in\ngathering human feedback to improve large language models. In our experiments,\nan agent sequentially generates queries while fitting a reward model to the\nfeedback received. Our best-performing agent generates queries using double\nThompson sampling, with uncertainty represented by an epistemic neural network.\nOur results demonstrate that efficient exploration enables high levels of\nperformance with far fewer queries. 
Further, both uncertainty estimation and\nthe choice of exploration scheme play critical roles.\n","authors":["Vikranth Dwaracherla","Seyed Mohammad Asghari","Botao Hao","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2402.00396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00385v1","updated":"2024-02-01T07:05:45Z","published":"2024-02-01T07:05:45Z","title":"Computational Morphology and Lexicography Modeling of Modern Standard\n Arabic Nominals","summary":" Modern Standard Arabic (MSA) nominals present many morphological and lexical\nmodeling challenges that have not been consistently addressed previously. This\npaper attempts to define the space of such challenges, and leverage a recently\nproposed morphological framework to build a comprehensive and extensible model\nfor MSA nominals. Our model design addresses the nominals' intricate\nmorphotactics, as well as their paradigmatic irregularities. Our implementation\nshowcases enhanced accuracy and consistency compared to a commonly used MSA\nmorphological analyzer and generator. We make our models publicly available.\n","authors":["Christian Khairallah","Reham Marzouk","Salam Khalifa","Mayar Nassar","Nizar Habash"],"pdf_url":"https://arxiv.org/pdf/2402.00385v1.pdf","comment":"Findings of the Association for Computational Linguistics: EACL 2024"},{"id":"http://arxiv.org/abs/2402.00371v1","updated":"2024-02-01T06:21:19Z","published":"2024-02-01T06:21:19Z","title":"What Does the Bot Say? Opportunities and Risks of Large Language Models\n in Social Media Bot Detection","summary":" Social media bot detection has always been an arms race between advancements\nin machine learning bot detectors and adversarial bot strategies to evade\ndetection. In this work, we bring the arms race to the next level by\ninvestigating the opportunities and risks of state-of-the-art large language\nmodels (LLMs) in social bot detection. To investigate the opportunities, we\ndesign novel LLM-based bot detectors by proposing a\nmixture-of-heterogeneous-experts framework to divide and conquer diverse user\ninformation modalities. To illuminate the risks, we explore the possibility of\nLLM-guided manipulation of user textual and structured information to evade\ndetection. Extensive experiments with three LLMs on two datasets demonstrate\nthat instruction tuning on merely 1,000 annotated examples produces specialized\nLLMs that outperform state-of-the-art baselines by up to 9.1% on both datasets,\nwhile LLM-guided manipulation strategies could significantly bring down the\nperformance of existing bot detectors by up to 29.6% and harm the calibration\nand reliability of bot detection systems.\n","authors":["Shangbin Feng","Herun Wan","Ningnan Wang","Zhaoxuan Tan","Minnan Luo","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2402.00371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00367v1","updated":"2024-02-01T06:11:49Z","published":"2024-02-01T06:11:49Z","title":"Don't Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM\n Collaboration","summary":" Despite efforts to expand the knowledge of large language models (LLMs),\nknowledge gaps -- missing or outdated information in LLMs -- might always\npersist given the evolving nature of knowledge. In this work, we study\napproaches to identify LLM knowledge gaps and abstain from answering questions\nwhen knowledge gaps are present. 
We first adapt existing approaches to model\ncalibration or adaptation through fine-tuning/prompting and analyze their\nability to abstain from generating low-confidence outputs. Motivated by their\nfailures in self-reflection and over-reliance on held-out sets, we propose two\nnovel approaches that are based on model collaboration, i.e., LLMs probing\nother LLMs for knowledge gaps, either cooperatively or competitively. Extensive\nexperiments with three LLMs on four QA tasks featuring diverse knowledge\ndomains demonstrate that both cooperative and competitive approaches to\nunveiling LLM knowledge gaps achieve up to 19.3% improvements on abstain\naccuracy against the strongest baseline. Further analysis reveals that our\nproposed mechanisms could help identify failure cases in retrieval augmentation\nand pinpoint knowledge gaps in multi-hop reasoning.\n","authors":["Shangbin Feng","Weijia Shi","Yike Wang","Wenxuan Ding","Vidhisha Balachandran","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2402.00367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00533v4","updated":"2024-02-01T06:10:00Z","published":"2023-10-01T00:52:24Z","title":"SELF: Self-Evolution with Language Feedback","summary":" Large Language Models (LLMs) have demonstrated remarkable versatility across\nvarious domains. To further advance LLMs, we propose 'SELF' (Self-Evolution\nwith Language Feedback), a novel approach that enables LLMs to self-improve\nthrough self-reflection, akin to human learning processes. SELF initiates with\na meta-skill learning process that equips the LLMs with capabilities for\nself-feedback and self-refinement. Subsequently, the model undergoes an\niterative process of self-evolution. In each iteration, it utilizes an\nunlabeled dataset of instructions to generate initial responses. These\nresponses are enhanced through self-feedback and self-refinement. The model is\nthen fine-tuned using this enhanced data. The model undergoes progressive\nimprovement through this iterative self-evolution process. Moreover, the SELF\nframework enables the model to apply self-refinement during inference, which\nfurther improves response quality. Our experiments in mathematics and general\ntasks demonstrate that SELF can enhance the capabilities of LLMs without human\nintervention. The SELF framework indicates a promising direction for the\nautonomous evolution of LLMs, transitioning them from passive information\nreceivers to active participants in their development.\n","authors":["Jianqiao Lu","Wanjun Zhong","Wenyong Huang","Yufei Wang","Qi Zhu","Fei Mi","Baojun Wang","Weichao Wang","Xingshan Zeng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00533v4.pdf","comment":"20 pages, 4 figures, 11 tables"},{"id":"http://arxiv.org/abs/2402.00345v1","updated":"2024-02-01T05:20:07Z","published":"2024-02-01T05:20:07Z","title":"IndiVec: An Exploration of Leveraging Large Language Models for Media\n Bias Detection with Fine-Grained Bias Indicators","summary":" This study focuses on media bias detection, crucial in today's era of\ninfluential social media platforms shaping individual attitudes and opinions.\nIn contrast to prior work that primarily relies on training specific models\ntailored to particular datasets, resulting in limited adaptability and subpar\nperformance on out-of-domain data, we introduce a general bias detection\nframework, IndiVec, built upon large language models. 
IndiVec begins by\nconstructing a fine-grained media bias database, leveraging the robust\ninstruction-following capabilities of large language models and vector database\ntechniques. When confronted with new input for bias detection, our framework\nautomatically selects the most relevant indicator from the vector database and\nemploys majority voting to determine the input's bias label. IndiVec excels\ncompared to previous methods due to its adaptability (demonstrating consistent\nperformance across diverse datasets from various sources) and explainability\n(providing explicit top-k indicators to interpret bias predictions).\nExperimental results on four political bias datasets highlight IndiVec's\nsignificant superiority over baselines. Furthermore, additional experiments and\nanalysis provide profound insights into the framework's effectiveness.\n","authors":["Luyang Lin","Lingzhi Wang","Xiaoyan Zhao","Jing Li","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2402.00345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13063v2","updated":"2024-02-01T04:57:05Z","published":"2023-09-14T20:46:48Z","title":"Using Large Language Models to Generate, Validate, and Apply User Intent\n Taxonomies","summary":" Log data can reveal valuable information about how users interact with Web\nsearch services, what they want, and how satisfied they are. However, analyzing\nuser intents in log data is not easy, especially for emerging forms of Web\nsearch such as AI-driven chat. To understand user intents from log data, we\nneed a way to label them with meaningful categories that capture their\ndiversity and dynamics. Existing methods rely on manual or machine-learned\nlabeling, which are either expensive or inflexible for large and dynamic\ndatasets. We propose a novel solution using large language models (LLMs), which\ncan generate rich and relevant concepts, descriptions, and examples for user\nintents. However, using LLMs to generate a user intent taxonomy and apply it\nfor log analysis can be problematic for two main reasons: (1) such a taxonomy\nis not externally validated; and (2) there may be an undesirable feedback loop.\nTo address this, we propose a new methodology with human experts and assessors\nto verify the quality of the LLM-generated taxonomy. We also present an\nend-to-end pipeline that uses an LLM with human-in-the-loop to produce, refine,\nand apply labels for user intent analysis in log data. We demonstrate its\neffectiveness by uncovering new insights into user intents from search and chat\nlogs from the Microsoft Bing commercial search engine. The proposed work's\nnovelty stems from the method for generating purpose-driven user intent\ntaxonomies with strong validation. This method not only helps remove\nmethodological and practical bottlenecks from intent-focused research, but also\nprovides a new framework for generating, validating, and applying other kinds\nof taxonomies in a scalable and adaptable way with minimal human effort.\n","authors":["Chirag Shah","Ryen W. 
White","Reid Andersen","Georg Buscher","Scott Counts","Sarkar Snigdha Sarathi Das","Ali Montazer","Sathish Manivannan","Jennifer Neville","Xiaochuan Ni","Nagu Rangan","Tara Safavi","Siddharth Suri","Mengting Wan","Leijie Wang","Longqi Yang"],"pdf_url":"https://arxiv.org/pdf/2309.13063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07324v2","updated":"2024-02-01T04:34:07Z","published":"2024-01-14T16:17:07Z","title":"Small LLMs Are Weak Tool Learners: A Multi-LLM Agent","summary":" Large Language Model (LLM) agents significantly extend the capabilities of\nstandalone LLMs, empowering them to interact with external tools (e.g., APIs,\nfunctions) and complete complex tasks in a self-directed fashion. The challenge\nof tool use demands that LLMs not only understand user queries and generate\nanswers but also excel in task planning, memory management, tool invocation,\nand result summarization. While traditional approaches focus on training a\nsingle LLM with all these capabilities, performance limitations become\napparent, particularly with smaller models. Moreover, the entire LLM may\nrequire retraining when tools are updated. To overcome these challenges, we\npropose a novel strategy that decomposes the aforementioned capabilities into a\nplanner, caller, and summarizer. Each component is implemented by a single LLM\nthat focuses on a specific capability and collaborates with other components to\naccomplish the task. This modular framework facilitates individual updates and\nthe potential use of smaller LLMs for building each capability. To effectively\ntrain this framework, we introduce a two-stage training paradigm. First, we\nfine-tune a backbone LLM on the entire dataset without discriminating\nsub-tasks, providing the model with a comprehensive understanding of the task.\nSecond, the fine-tuned LLM is used to instantiate the planner, caller, and\nsummarizer respectively, which are continually fine-tuned on respective\nsub-tasks. Evaluation across various tool-use benchmarks illustrates that our\nproposed multi-LLM framework surpasses the traditional single-LLM approach,\nhighlighting its efficacy and advantages in tool learning.\n","authors":["Weizhou Shen","Chenliang Li","Hongzhan Chen","Ming Yan","Xiaojun Quan","Hehong Chen","Ji Zhang","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2401.07324v2.pdf","comment":"On progress, github repo: https://github.com/X-PLUG/Multi-LLM-Agent"},{"id":"http://arxiv.org/abs/2307.14117v2","updated":"2024-02-01T04:30:38Z","published":"2023-07-26T11:34:53Z","title":"Leveraging Implicit Feedback from Deployment Data in Dialogue","summary":" We study improving social conversational agents by learning from natural\ndialogue between users and a deployed model, without extra annotations. To\nimplicitly measure the quality of a machine-generated utterance, we leverage\nsignals like user response length, sentiment and reaction of the future human\nutterances in the collected dialogue episodes. Our experiments use the publicly\nreleased deployment data from BlenderBot (Xu et al., 2023). Human evaluation\nindicates improvements in our new models over baseline responses; however, we\nfind that some proxy signals can lead to more generations with undesirable\nproperties as well. 
For example, optimizing for conversation length can lead to\nmore controversial or unfriendly generations compared to the baseline, whereas\noptimizing for positive sentiment or reaction can decrease these behaviors.\n","authors":["Richard Yuanzhe Pang","Stephen Roller","Kyunghyun Cho","He He","Jason Weston"],"pdf_url":"https://arxiv.org/pdf/2307.14117v2.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2402.00322v1","updated":"2024-02-01T04:15:59Z","published":"2024-02-01T04:15:59Z","title":"Bias in Opinion Summarisation from Pre-training to Adaptation: A Case\n Study in Political Bias","summary":" Opinion summarisation aims to summarise the salient information and opinions\npresented in documents such as product reviews, discussion forums, and social\nmedia texts into short summaries that enable users to effectively understand\nthe opinions therein. Generating biased summaries has the risk of potentially\nswaying public opinion. Previous studies focused on studying bias in opinion\nsummarisation using extractive models, but limited research has paid attention\nto abstractive summarisation models. In this study, using political bias as a\ncase study, we first establish a methodology to quantify bias in abstractive\nmodels, then trace it from the pre-trained models to the task of summarising\nsocial media opinions using different models and adaptation methods. We find\nthat most models exhibit intrinsic bias. Using a social media text\nsummarisation dataset and contrasting various adaptation methods, we find that\ntuning a smaller number of parameters is less biased compared to standard\nfine-tuning; however, the diversity of topics in training data used for\nfine-tuning is critical.\n","authors":["Nannan Huang","Haytham Fayek","Xiuzhen Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00322v1.pdf","comment":"15 pages, 1 figure, 6 tables, Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2311.11482v4","updated":"2024-02-01T04:12:52Z","published":"2023-11-20T01:51:13Z","title":"Meta Prompting for AGI Systems","summary":" This paper presents a comprehensive study of Meta Prompting, an innovative\ntechnique reshaping the utilization of large language models (LLMs),\nmulti-modal foundation models, and AI systems in problem-solving and data\ninteraction. Grounded in type theory and category theory, Meta Prompting\nemphasizes the structure and syntax of information over traditional\ncontent-centric methods. The paper explores the formal definitions of Meta\nPrompting (MP), sets it apart from Few-Shot Prompting, and underlines its\neffectiveness in various AI applications. A key focus is applying Meta\nPrompting for complex reasoning (MP-CR) tasks, showing how it effectively\ndeconstructs intricate problems into simpler sub-problems, enhancing token\nefficiency, and enabling more equitable problem-solving comparisons, especially\nagainst few-shot prompting methods. Additionally, the paper introduces Meta\nPrompting for prompting tasks, allowing LLMs to self-generate new prompts in a\nrecursive, metaprogramming-like manner. This approach marks a significant leap\nin AI's autonomous and adaptive capabilities. The paper also introduces the\nintegration of Meta Prompting into multi-modal foundation model settings,\ntackling the challenges and opportunities of incorporating varied data types\nsuch as images, audio, and video within the structured Meta Prompting\nframework. 
Empirical experiments, including solving the Game of 24 tasks with\n100% success rate, demonstrate the MP-CR Agent's enhanced reasoning\ncapabilities, achieving high accuracy and efficiency, and showcasing Meta\nPrompting's transformative impact on AI problem-solving. (The code is available\nat https://github.com/meta-prompting/meta-prompting)\n","authors":["Yifan Zhang","Yang Yuan","Andrew Chi-Chih Yao"],"pdf_url":"https://arxiv.org/pdf/2311.11482v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14113v2","updated":"2024-02-01T03:47:28Z","published":"2024-01-25T11:47:58Z","title":"On the Affinity, Rationality, and Diversity of Hierarchical Topic\n Modeling","summary":" Hierarchical topic modeling aims to discover latent topics from a corpus and\norganize them into a hierarchy to understand documents with desirable semantic\ngranularity. However, existing work struggles with producing topic hierarchies\nof low affinity, rationality, and diversity, which hampers document\nunderstanding. To overcome these challenges, we in this paper propose Transport\nPlan and Context-aware Hierarchical Topic Model (TraCo). Instead of early\nsimple topic dependencies, we propose a transport plan dependency method. It\nconstrains dependencies to ensure their sparsity and balance, and also\nregularizes topic hierarchy building with them. This improves affinity and\ndiversity of hierarchies. We further propose a context-aware disentangled\ndecoder. Rather than previously entangled decoding, it distributes different\nsemantic granularity to topics at different levels by disentangled decoding.\nThis facilitates the rationality of hierarchies. Experiments on benchmark\ndatasets demonstrate that our method surpasses state-of-the-art baselines,\neffectively improving the affinity, rationality, and diversity of hierarchical\ntopic modeling with better performance on downstream tasks.\n","authors":["Xiaobao Wu","Fengjun Pan","Thong Nguyen","Yichao Feng","Chaoqun Liu","Cong-Duy Nguyen","Anh Tuan Luu"],"pdf_url":"https://arxiv.org/pdf/2401.14113v2.pdf","comment":"Accepted to AAAI2024 conference. Our code is available at\n https://github.com/bobxwu/TraCo"},{"id":"http://arxiv.org/abs/2401.06320v2","updated":"2024-02-01T02:08:28Z","published":"2024-01-12T01:54:08Z","title":"Zero-shot Generative Large Language Models for Systematic Review\n Screening Automation","summary":" Systematic reviews are crucial for evidence-based medicine as they\ncomprehensively analyse published research findings on specific questions.\nConducting such reviews is often resource- and time-intensive, especially in\nthe screening phase, where abstracts of publications are assessed for inclusion\nin a review. This study investigates the effectiveness of using zero-shot large\nlanguage models~(LLMs) for automatic screening. We evaluate the effectiveness\nof eight different LLMs and investigate a calibration technique that uses a\npredefined recall threshold to determine whether a publication should be\nincluded in a systematic review. 
Our comprehensive evaluation using five\nstandard test collections shows that instruction fine-tuning plays an important\nrole in screening, that calibration renders LLMs practical for achieving a\ntargeted recall, and that combining both with an ensemble of zero-shot models\nsaves significant screening time compared to state-of-the-art approaches.\n","authors":["Shuai Wang","Harrisen Scells","Shengyao Zhuang","Martin Potthast","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2401.06320v2.pdf","comment":"Accepted to ECIR2024 full paper (findings)"},{"id":"http://arxiv.org/abs/2402.00271v1","updated":"2024-02-01T01:45:46Z","published":"2024-02-01T01:45:46Z","title":"A Crucial Parameter for Rank-Frequency Relation in Natural Languages","summary":" $f \\propto r^{-\\alpha} \\cdot (r+\\gamma)^{-\\beta}$ has been empirically shown\nto be more precise than a na\\\"ive power law $f\\propto r^{-\\alpha}$ to model the\nrank-frequency ($r$-$f$) relation of words in natural languages. This work\nshows that the only crucial parameter in the formulation is $\\gamma$, which\ndepicts the resistance to vocabulary growth on a corpus. A method of parameter\nestimation by searching an optimal $\\gamma$ is proposed, where a ``zeroth\nword'' is introduced as a technical device for the calculation. The formulation and\nparameters are further discussed with several case studies.\n","authors":["Chenchen Ding"],"pdf_url":"https://arxiv.org/pdf/2402.00271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17429v2","updated":"2024-02-01T01:39:39Z","published":"2023-12-29T01:42:43Z","title":"Commonsense for Zero-Shot Natural Language Video Localization","summary":" Zero-shot Natural Language-Video Localization (NLVL) methods have exhibited\npromising results in training NLVL models exclusively with raw video data by\ndynamically generating video segments and pseudo-query annotations. However,\nexisting pseudo-queries often lack grounding in the source video, resulting in\nunstructured and disjointed content. In this paper, we investigate the\neffectiveness of commonsense reasoning in zero-shot NLVL. Specifically, we\npresent CORONET, a zero-shot NLVL framework that leverages commonsense to\nbridge the gap between videos and generated pseudo-queries via a commonsense\nenhancement module. CORONET employs Graph Convolution Networks (GCN) to encode\ncommonsense information extracted from a knowledge graph, conditioned on the\nvideo, and cross-attention mechanisms to enhance the encoded video and\npseudo-query representations prior to localization. Through empirical\nevaluations on two benchmark datasets, we demonstrate that CORONET surpasses\nboth zero-shot and weakly supervised baselines, achieving improvements up to\n32.13% across various recall thresholds and up to 6.33% in mIoU. These results\nunderscore the significance of leveraging commonsense reasoning for zero-shot\nNLVL.\n","authors":["Meghana Holla","Ismini Lourentzou"],"pdf_url":"https://arxiv.org/pdf/2312.17429v2.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.01472v2","updated":"2024-02-01T01:32:48Z","published":"2024-01-03T00:13:52Z","title":"A First Look at Information Highlighting in Stack Overflow Answers","summary":" Context: Navigating the knowledge of Stack Overflow (SO) remains challenging.\nTo make the posts vivid to users, SO allows users to write and edit posts with\nMarkdown or HTML so that users can leverage various formatting styles (e.g.,\nbold, italic, and code) to highlight the important information. 
Nonetheless,\nthere have been limited studies on the highlighted information. Objective: We\ncarried out the first large-scale exploratory study on the information\nhighlighted in SO answers in our recent study. To extend our previous study, we\ndevelop approaches to automatically recommend highlighted content with\nformatting styles using neural network architectures initially designed for the\nNamed Entity Recognition task. Method: In this paper, we studied 31,169,429\nanswers of Stack Overflow. For training recommendation models, we choose CNN\nand BERT models for each type of formatting (i.e., Bold, Italic, Code, and\nHeading) using the information highlighting dataset we collected from SO\nanswers. Results: Our models based on CNN architecture achieve precision\nranging from 0.71 to 0.82. The trained model for automatic code content\nhighlighting achieves a recall of 0.73 and an F1 score of 0.71, outperforming\nthe trained models for other formatting styles. The BERT models have even lower\nrecalls and F1 scores than the CNN models. Our analysis of failure cases\nindicates that the majority of the failure cases are missing identification\n(i.e., the model misses the content that is supposed to be highlighted) because\nthe models tend to learn the frequently highlighted words while struggling to\nlearn less frequent ones. Conclusion: Our findings suggest that it is possible\nto develop recommendation models for highlighting information for answers with\ndifferent formatting styles on Stack Overflow.\n","authors":["Shahla Shaan Ahmed","Shaowei Wang","Yuan Tian","Tse-Hsun Chen","Haoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.01472v2.pdf","comment":"This work is submitted to Information and Software Technology Journal"},{"id":"http://arxiv.org/abs/2402.00263v1","updated":"2024-02-01T01:23:07Z","published":"2024-02-01T01:23:07Z","title":"Does \\textsc{DetectGPT} Fully Utilize Perturbation? Selective\n Perturbation on Model-Based Contrastive Learning Detector would be Better","summary":" The burgeoning capabilities of large language models (LLMs) have raised\ngrowing concerns about abuse. DetectGPT, a zero-shot metric-based unsupervised\nmachine-generated text detector, first introduces perturbation and shows great\nperformance improvement. However, DetectGPT's random perturbation strategy\nmight introduce noise, limiting the distinguishability and further performance\nimprovements. Moreover, its logit regression module relies on setting the\nthreshold, which harms the generalizability and applicability to individual or\nsmall-batch inputs. Hence, we propose a novel detector, \\modelname{}, which\nuses selective strategy perturbation to mitigate the important information loss\ncaused by random masking, and multi-pair contrastive learning to capture the\nimplicit pattern information during perturbation, facilitating few-shot\nperformance. The experiments show that \\modelname{} outperforms the SOTA method\nby 1.20\\% in accuracy on average on four public datasets. 
We further analyze\nthe effectiveness, robustness, and generalization of our perturbation method.\n","authors":["Shengchao Liu","Xiaoming Liu","Yichen Wang","Zehua Cheng","Chengzhengxu Li","Zhaohan Zhang","Yu Lan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2402.00263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15316v2","updated":"2024-02-01T00:50:41Z","published":"2024-01-27T06:29:07Z","title":"UNSEE: Unsupervised Non-contrastive Sentence Embeddings","summary":" We present UNSEE: Unsupervised Non-Contrastive Sentence Embeddings, a novel\napproach that outperforms SimCSE in the Massive Text Embedding benchmark. Our\nexploration begins by addressing the challenge of representation collapse, a\nphenomenon observed when contrastive objectives in SimCSE are replaced with\nnon-contrastive objectives. To counter this issue, we propose a straightforward\nsolution known as the target network, effectively mitigating representation\ncollapse. The introduction of the target network allows us to leverage\nnon-contrastive objectives, maintaining training stability while achieving\nperformance improvements comparable to contrastive objectives. Our method has\nachieved peak performance in non-contrastive sentence embeddings through\nmeticulous fine-tuning and optimization. This comprehensive effort has yielded\nsuperior sentence representation models, showcasing the effectiveness of our\napproach.\n","authors":["Ömer Veysel Çağatan"],"pdf_url":"https://arxiv.org/pdf/2401.15316v2.pdf","comment":"Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2402.00253v1","updated":"2024-02-01T00:33:21Z","published":"2024-02-01T00:33:21Z","title":"A Survey on Hallucination in Large Vision-Language Models","summary":" Recent development of Large Vision-Language Models (LVLMs) has attracted\ngrowing attention within the AI landscape for its practical implementation\npotential. However, ``hallucination'', or more specifically, the misalignment\nbetween factual visual content and corresponding textual generation, poses a\nsignificant challenge of utilizing LVLMs. In this comprehensive survey, we\ndissect LVLM-related hallucinations in an attempt to establish an overview and\nfacilitate future mitigation. Our scrutiny starts with a clarification of the\nconcept of hallucinations in LVLMs, presenting a variety of hallucination\nsymptoms and highlighting the unique challenges inherent in LVLM\nhallucinations. Subsequently, we outline the benchmarks and methodologies\ntailored specifically for evaluating hallucinations unique to LVLMs.\nAdditionally, we delve into an investigation of the root causes of these\nhallucinations, encompassing insights from the training data and model\ncomponents. We also critically review existing methods for mitigating\nhallucinations. The open questions and future directions pertaining to\nhallucinations within LVLMs are discussed to conclude this survey.\n","authors":["Hanchao Liu","Wenyuan Xue","Yifei Chen","Dapeng Chen","Xiutian Zhao","Ke Wang","Liping Hou","Rongjun Li","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2402.00253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00251v1","updated":"2024-02-01T00:23:31Z","published":"2024-02-01T00:23:31Z","title":"Efficient Non-Parametric Uncertainty Quantification for Black-Box Large\n Language Models and Decision Planning","summary":" Step-by-step decision planning with large language models (LLMs) is gaining\nattention in AI agent development. 
This paper focuses on decision planning with\nuncertainty estimation to address the hallucination problem in language models.\nExisting approaches are either white-box or computationally demanding, limiting\nuse of black-box proprietary LLMs within budgets. The paper's first\ncontribution is a non-parametric uncertainty quantification method for LLMs,\nefficiently estimating point-wise dependencies between input and decision on the\nfly with a single inference, without access to token logits. This estimator\ninforms the statistical interpretation of decision trustworthiness. The second\ncontribution outlines a systematic design for a decision-making agent,\ngenerating actions like ``turn on the bathroom light'' based on user prompts\nsuch as ``take a bath''. Users will be asked to provide preferences when more\nthan one action has high estimated point-wise dependencies. In conclusion, our\nuncertainty estimation and decision-making agent design offer a cost-efficient\napproach for AI agent development.\n","authors":["Yao-Hung Hubert Tsai","Walter Talbott","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01065v1","updated":"2024-02-01T23:46:05Z","published":"2024-02-01T23:46:05Z","title":"Evaluation Methodology for Large Language Models for Multilingual\n Document Question and Answer","summary":" With the widespread adoption of Large Language Models (LLMs), in this paper\nwe investigate the multilingual capability of these models. Our preliminary\nresults show that translating the native language context, question, and answer\ninto a high resource language produced the best results.\n","authors":["Adar Kahana","Jaya Susan Mathew","Said Bleik","Jeremy Reynolds","Oren Elisha"],"pdf_url":"https://arxiv.org/pdf/2402.01065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01053v1","updated":"2024-02-01T22:56:39Z","published":"2024-02-01T22:56:39Z","title":"Plan-Grounded Large Language Models for Dual Goal Conversational\n Settings","summary":" Training Large Language Models (LLMs) to follow user instructions has been\nshown to supply the LLM with ample capacity to converse fluently while being\naligned with humans. Yet, it is not completely clear how an LLM can lead a\nplan-grounded conversation in mixed-initiative settings where instructions flow\nin both directions of the conversation, i.e. both the LLM and the user provide\ninstructions to one another. In this paper, we tackle a dual goal\nmixed-initiative conversational setting where the LLM not only grounds the\nconversation on an arbitrary plan but also seeks to satisfy both a procedural\nplan and user instructions. The LLM is then responsible for guiding the user\nthrough the plan and, at the same time, adapting to new circumstances,\nanswering questions, and activating safety guardrails when needed. We propose a\nnovel LLM that grounds the dialogue on a procedural plan, can take the dialogue\ninitiative, and enforces guardrails on the system's behavior, while also\nimproving the LLM's responses to unexpected user behavior. 
Experiments in\ncontrolled settings and with real users show that the best-performing model,\nwhich we call PlanLLM, achieves a 2.1x improvement over a strong baseline.\nMoreover, experiments also show good generalization to unseen domains.\n","authors":["Diogo Glória-Silva","Rafael Ferreira","Diogo Tavares","David Semedo","João Magalhães"],"pdf_url":"https://arxiv.org/pdf/2402.01053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01051v1","updated":"2024-02-01T22:54:31Z","published":"2024-02-01T22:54:31Z","title":"Generation, Distillation and Evaluation of Motivational\n Interviewing-Style Reflections with a Foundational Language Model","summary":" Large Foundational Language Models are capable of performing many tasks at a\nhigh level but are difficult to deploy in many applications because of their\nsize and proprietary ownership. Many will be motivated to distill specific\ncapabilities of foundational models into smaller models that can be owned and\ncontrolled. In the development of a therapeutic chatbot, we wish to distill a\ncapability known as reflective listening, in which a therapist produces\nreflections of client speech. These reflections either restate what a client\nhas said, or connect what was said to a relevant observation, idea or guess\nthat encourages and guides the client to continue contemplation. In this paper,\nwe present a method for distilling the generation of reflections from a\nFoundational Language Model (GPT-4) into smaller models. We first show that\nGPT-4, using zero-shot prompting, can generate reflections at near 100% success\nrate, superior to all previous methods. Using reflections generated by GPT-4,\nwe fine-tune different sizes of the GPT-2 family. The GPT-2-small model\nachieves 83% success on a hold-out test set and the GPT-2 XL achieves 90%\nsuccess. We also show that GPT-4 can help in the labor-intensive task of\nevaluating the quality of the distilled models, using it as a zero-shot\nclassifier. Using triple-human review as a guide, the classifier achieves a\nCohen-Kappa of 0.66, a substantial inter-rater reliability figure.\n","authors":["Andrew Brown","Jiading Zhu","Mohamed Abdelwahab","Alec Dong","Cindy Wang","Jonathan Rose"],"pdf_url":"https://arxiv.org/pdf/2402.01051v1.pdf","comment":"Accepted to EACL 2024 Long Paper"},{"id":"http://arxiv.org/abs/2312.02783v2","updated":"2024-02-01T22:51:24Z","published":"2023-12-05T14:14:27Z","title":"Large Language Models on Graphs: A Comprehensive Survey","summary":" Large language models (LLMs), such as GPT4 and LLaMA, are creating\nsignificant advancements in natural language processing, due to their strong\ntext encoding/decoding ability and newly found emergent capability (e.g.,\nreasoning). While LLMs are mainly designed to process pure texts, there are\nmany real-world scenarios where text data is associated with rich structure\ninformation in the form of graphs (e.g., academic networks, and e-commerce\nnetworks) or scenarios where graph data is paired with rich textual information\n(e.g., molecules with descriptions). Besides, although LLMs have shown their\npure text-based reasoning ability, it is underexplored whether such ability can\nbe generalized to graphs (i.e., graph-based reasoning). In this paper, we\nprovide a systematic review of scenarios and techniques related to large\nlanguage models on graphs. We first summarize potential scenarios of adopting\nLLMs on graphs into three categories, namely pure graphs, text-attributed\ngraphs, and text-paired graphs. 
We then discuss detailed techniques for\nutilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM\nas Aligner, and compare the advantages and disadvantages of different schools\nof models. Furthermore, we discuss the real-world applications of such methods\nand summarize open-source codes and benchmark datasets. Finally, we conclude\nwith potential future research directions in this fast-growing field. The\nrelated source can be found at\nhttps://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs.\n","authors":["Bowen Jin","Gang Liu","Chi Han","Meng Jiang","Heng Ji","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2312.02783v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2401.09407v2","updated":"2024-02-01T22:35:29Z","published":"2024-01-17T18:45:13Z","title":"Deciphering Textual Authenticity: A Generalized Strategy through the\n Lens of Large Language Semantics for Detecting Human vs. Machine-Generated\n Text","summary":" With the recent proliferation of Large Language Models (LLMs), there has been\nan increasing demand for tools to detect machine-generated text. The effective\ndetection of machine-generated text faces two pertinent problems: First, they\nare severely limited in generalizing against real-world scenarios, where\nmachine-generated text is produced by a variety of generators, including but\nnot limited to GPT-4 and Dolly, and spans diverse domains, ranging from\nacademic manuscripts to social media posts. Second, existing detection\nmethodologies treat texts produced by LLMs through a restrictive binary\nclassification lens, neglecting the nuanced diversity of artifacts generated by\ndifferent LLMs. In this work, we undertake a systematic study on the detection\nof machine-generated text in real-world scenarios. We first study the\neffectiveness of state-of-the-art approaches and find that they are severely\nlimited against text produced by diverse generators and domains in the real\nworld. Furthermore, t-SNE visualizations of the embeddings from a pretrained\nLLM's encoder show that they cannot reliably distinguish between human and\nmachine-generated text. Based on our findings, we introduce a novel system,\nT5LLMCipher, for detecting machine-generated text using a pretrained T5 encoder\ncombined with LLM embedding sub-clustering to address the text produced by\ndiverse generators and domains in the real world. We evaluate our approach\nacross 9 machine-generated text systems and 9 domains and find that our\napproach provides state-of-the-art generalization ability, with an average\nincrease in F1 score on machine-generated text of 19.6\% on unseen generators\nand domains compared to the top performing existing approaches and correctly\nattributes the generator of text with an accuracy of 93.6\%.\n","authors":["Mazal Bethany","Brandon Wherry","Emet Bethany","Nishant Vishwamitra","Anthony Rios","Peyman Najafirad"],"pdf_url":"https://arxiv.org/pdf/2401.09407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01035v1","updated":"2024-02-01T21:49:34Z","published":"2024-02-01T21:49:34Z","title":"Getting the most out of your tokenizer for pre-training and domain\n adaptation","summary":" Tokenization is an understudied and often neglected component of modern LLMs.\nMost published works use a single tokenizer for all experiments, often borrowed\nfrom another model, without performing ablations or analysis to optimize\ntokenization. Moreover, the tokenizer is generally kept unchanged when\nfine-tuning a base model. 
In this paper, we show that the size,\npre-tokenization regular expression, and training data of a tokenizer can\nsignificantly impact the model's generation speed, effective context size,\nmemory usage, and downstream performance. We train specialized Byte-Pair\nEncoding code tokenizers, and conduct extensive ablations on the impact of\ntokenizer design on the performance of LLMs for code generation tasks such as\nHumanEval and MBPP, and provide recommendations for tokenizer hyper-parameters\nselection and switching the tokenizer in a pre-trained LLM. We perform our\nexperiments on models trained from scratch and from pre-trained models,\nverifying their applicability to a wide range of use-cases. We find that when\nfine-tuning on more than 50 billion tokens, we can specialize the tokenizer of\na pre-trained LLM to obtain large gains in generation speed and effective\ncontext size.\n","authors":["Gautier Dagan","Gabriele Synnaeve","Baptiste Rozière"],"pdf_url":"https://arxiv.org/pdf/2402.01035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01032v1","updated":"2024-02-01T21:44:11Z","published":"2024-02-01T21:44:11Z","title":"Repeat After Me: Transformers are Better than State Space Models at\n Copying","summary":" Transformers are the dominant architecture for sequence modeling, but there\nis growing interest in models that use a fixed-size latent state that does not\ndepend on the sequence length, which we refer to as \"generalized state space\nmodels\" (GSSMs). In this paper we show that while GSSMs are promising in terms\nof inference-time efficiency, they are limited compared to transformer models\non tasks that require copying from the input context. We start with a\ntheoretical analysis of the simple task of string copying and prove that a two\nlayer transformer can copy strings of exponential length while GSSMs are\nfundamentally limited by their fixed-size latent state. Empirically, we find\nthat transformers outperform GSSMs in terms of efficiency and generalization on\nsynthetic tasks that require copying the context. Finally, we evaluate\npretrained large language models and find that transformer models dramatically\noutperform state space models at copying and retrieving information from\ncontext. Taken together, these results suggest a fundamental gap between\ntransformers and GSSMs on tasks of practical interest.\n","authors":["Samy Jelassi","David Brandfonbrener","Sham M. Kakade","Eran Malach"],"pdf_url":"https://arxiv.org/pdf/2402.01032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03731v4","updated":"2024-02-01T21:39:50Z","published":"2023-04-30T11:54:40Z","title":"Working Memory Capacity of ChatGPT: An Empirical Study","summary":" Working memory is a critical aspect of both human intelligence and artificial\nintelligence, serving as a workspace for the temporary storage and manipulation\nof information. In this paper, we systematically assess the working memory\ncapacity of ChatGPT, a large language model developed by OpenAI, by examining\nits performance in verbal and spatial n-back tasks under various conditions.\nOur experiments reveal that ChatGPT has a working memory capacity limit\nstrikingly similar to that of humans. Furthermore, we investigate the impact of\ndifferent instruction strategies on ChatGPT's performance and observe that the\nfundamental patterns of a capacity limit persist. 
From our empirical findings,\nwe propose that n-back tasks may serve as tools for benchmarking the working\nmemory capacity of large language models and hold potential for informing\nfuture efforts aimed at enhancing AI working memory.\n","authors":["Dongyu Gong","Xingchen Wan","Dingmin Wang"],"pdf_url":"https://arxiv.org/pdf/2305.03731v4.pdf","comment":"Accepted at the 38th AAAI Conference on Artificial Intelligence\n (AAAI-24)"},{"id":"http://arxiv.org/abs/2402.01030v1","updated":"2024-02-01T21:38:58Z","published":"2024-02-01T21:38:58Z","title":"Executable Code Actions Elicit Better LLM Agents","summary":" Large Language Model (LLM) agents, capable of performing a broad range of\nactions, such as invoking tools and controlling robots, show great potential in\ntackling real-world challenges. LLM agents are typically prompted to produce\nactions by generating JSON or text in a pre-defined format, which is usually\nlimited by constrained action space (e.g., the scope of pre-defined tools) and\nrestricted flexibility (e.g., inability to compose multiple tools). This work\nproposes to use executable Python code to consolidate LLM agents' actions into\na unified action space (CodeAct). Integrated with a Python interpreter, CodeAct\ncan execute code actions and dynamically revise prior actions or emit new\nactions upon new observations through multi-turn interactions. Our extensive\nanalysis of 17 LLMs on API-Bank and a newly curated benchmark shows that\nCodeAct outperforms widely used alternatives (up to 20% higher success rate).\nThe encouraging performance of CodeAct motivates us to build an open-source LLM\nagent that interacts with environments by executing interpretable code and\ncollaborates with users using natural language. To this end, we collect an\ninstruction-tuning dataset CodeActInstruct that consists of 7k multi-turn\ninteractions using CodeAct. We show that it can be used with existing data to\nimprove models in agent-oriented tasks without compromising their general\ncapability. CodeActAgent, finetuned from Llama2 and Mistral, is integrated with\nPython interpreter and uniquely tailored to perform sophisticated tasks (e.g.,\nmodel training) using existing libraries and autonomously self-debug.\n","authors":["Xingyao Wang","Yangyi Chen","Lifan Yuan","Yizhe Zhang","Yunzhu Li","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2402.01030v1.pdf","comment":"Code, data, model, and demo are available at\n https://github.com/xingyaoww/code-act"},{"id":"http://arxiv.org/abs/2402.01025v1","updated":"2024-02-01T21:27:19Z","published":"2024-02-01T21:27:19Z","title":"Graph-based Clustering for Detecting Semantic Change Across Time and\n Languages","summary":" Despite the predominance of contextualized embeddings in NLP, approaches to\ndetect semantic change relying on these embeddings and clustering methods\nunderperform simpler counterparts based on static word embeddings. This stems\nfrom the poor quality of the clustering methods to produce sense clusters --\nwhich struggle to capture word senses, especially those with low frequency.\nThis issue hinders the next step in examining how changes in word senses in one\nlanguage influence another. To address this issue, we propose a graph-based\nclustering approach to capture nuanced changes in both high- and low-frequency\nword senses across time and languages, including the acquisition and loss of\nthese senses over time. 
Our experimental results show that our approach\nsubstantially surpasses previous approaches in the SemEval2020 binary\nclassification task across four languages. Moreover, we showcase the ability of\nour approach as a versatile visualization tool to detect semantic changes in\nboth intra-language and inter-language setups. We make our code and data\npublicly available.\n","authors":["Xianghe Ma","Michael Strube","Wei Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01025v1.pdf","comment":"EACL2024 Camera Ready (20 pages)"},{"id":"http://arxiv.org/abs/2402.01019v1","updated":"2024-02-01T21:13:04Z","published":"2024-02-01T21:13:04Z","title":"Domain-Independent Deception: A New Taxonomy and Linguistic Analysis","summary":" Internet-based economies and societies are drowning in deceptive attacks.\nThese attacks take many forms, such as fake news, phishing, and job scams,\nwhich we call ``domains of deception.'' Machine-learning and\nnatural-language-processing researchers have been attempting to ameliorate this\nprecarious situation by designing domain-specific detectors. Only a few recent\nworks have considered domain-independent deception. We collect these disparate\nthreads of research and investigate domain-independent deception. First, we\nprovide a new computational definition of deception and break down deception\ninto a new taxonomy. Then, we analyze the debate on linguistic cues for\ndeception and supply guidelines for systematic reviews. Finally, we investigate\ncommon linguistic features and give evidence for knowledge transfer across\ndifferent forms of deception.\n","authors":["Rakesh M. Verma","Nachum Dershowitz","Victor Zeng","Dainis Boumber","Xuting Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01019v1.pdf","comment":"33 pages. arXiv admin note: text overlap with arXiv:2207.01738"},{"id":"http://arxiv.org/abs/2402.01018v1","updated":"2024-02-01T21:10:44Z","published":"2024-02-01T21:10:44Z","title":"HR-MultiWOZ: A Task Oriented Dialogue (TOD) Dataset for HR LLM Agent","summary":" Recent advancements in Large Language Models (LLMs) have been reshaping\nNatural Language Processing (NLP) tasks in several domains. Their use in the\nfield of Human Resources (HR) still has room for expansion and could be\nbeneficial for several time-consuming tasks. Examples such as time-off\nsubmissions, medical claims filing, and access requests are noteworthy, but\nthey are by no means the sole instances. However, the aforementioned\ndevelopments must grapple with the pivotal challenge of constructing a\nhigh-quality training dataset. On one hand, most conversation datasets are\nsolving problems for customers, not employees. On the other hand, gathering\nconversations with HR could raise privacy concerns. To address this, we introduce\nHR-Multiwoz, a fully-labeled dataset of 550 conversations spanning 10 HR\ndomains to evaluate LLM agents. Our work has the following contributions: (1) It\nis the first labeled open-sourced conversation dataset in the HR domain for NLP\nresearch. (2) It provides a detailed recipe for the data generation procedure\nalong with data analysis and human evaluations. The data generation pipeline is\ntransferable and can be easily adapted for labeled conversation data generation\nin other domains. (3) The proposed data-collection pipeline is mostly based on\nLLMs with minimal human involvement for annotation, which is time and\ncost-efficient.\n","authors":["Weijie Xu","Zicheng Huang","Wenxiang Hu","Xi Fang","Rajesh Kumar Cherukuri","Naumaan Nayyar","Lorenzo Malandri","Srinivasan H. 
Sengamedu"],"pdf_url":"https://arxiv.org/pdf/2402.01018v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.17498v2","updated":"2024-02-01T20:43:02Z","published":"2024-01-30T23:08:26Z","title":"Improving QA Model Performance with Cartographic Inoculation","summary":" QA models are faced with complex and open-ended contextual reasoning\nproblems, but can often learn well-performing solution heuristics by exploiting\ndataset-specific patterns in their training data. These patterns, or \"dataset\nartifacts\", reduce the model's ability to generalize to real-world QA problems.\nUtilizing an ElectraSmallDiscriminator model trained for QA, we analyze the\nimpacts and incidence of dataset artifacts using an adversarial challenge set\ndesigned to confuse models reliant on artifacts for prediction. Extending\nexisting work on methods for mitigating artifact impacts, we propose\ncartographic inoculation, a novel method that fine-tunes models on an optimized\nsubset of the challenge data to reduce model reliance on dataset artifacts. We\nshow that by selectively fine-tuning a model on ambiguous adversarial examples\nfrom a challenge set, significant performance improvements can be made on the\nfull challenge dataset with minimal loss of model generalizability to other\nchallenging environments and QA datasets.\n","authors":["Allen Chen","Okan Tanrikulu"],"pdf_url":"https://arxiv.org/pdf/2401.17498v2.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.15378v4","updated":"2024-02-01T20:28:11Z","published":"2024-01-27T10:50:11Z","title":"A RAG-based Question Answering System Proposal for Understanding Islam:\n MufassirQAS LLM","summary":" Challenges exist in learning and understanding religions, such as the\ncomplexity and depth of religious doctrines and teachings. Chatbots as\nquestion-answering systems can help in solving these challenges. LLM chatbots\nuse NLP techniques to establish connections between topics and accurately\nrespond to complex questions. These capabilities make it perfect for\nenlightenment on religion as a question-answering chatbot. However, LLMs also\ntend to generate false information, known as hallucination. Also, the chatbots'\nresponses can include content that insults personal religious beliefs,\ninterfaith conflicts, and controversial or sensitive topics. It must avoid such\ncases without promoting hate speech or offending certain groups of people or\ntheir beliefs. This study uses a vector database-based Retrieval Augmented\nGeneration (RAG) approach to enhance the accuracy and transparency of LLMs. Our\nquestion-answering system is called \"MufassirQAS\". We created a database\nconsisting of several open-access books that include Turkish context. These\nbooks contain Turkish translations and interpretations of Islam. This database\nis utilized to answer religion-related questions and ensure our answers are\ntrustworthy. The relevant part of the dataset, which LLM also uses, is\npresented along with the answer. We have put careful effort into creating\nsystem prompts that give instructions to prevent harmful, offensive, or\ndisrespectful responses to respect people's values and provide reliable\nresults. The system answers and shares additional information, such as the page\nnumber from the respective book and the articles referenced for obtaining the\ninformation. MufassirQAS and ChatGPT are also tested with sensitive questions.\nWe got better performance with our system. Study and enhancements are still in\nprogress. 
Results and future work are given.\n","authors":["Ahmet Yusuf Alan","Enis Karaarslan","Ömer Aydin"],"pdf_url":"https://arxiv.org/pdf/2401.15378v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00978v1","updated":"2024-02-01T19:49:44Z","published":"2024-02-01T19:49:44Z","title":"An Information-Theoretic Approach to Analyze NLP Classification Tasks","summary":" Understanding the importance of the inputs on the output is useful across\nmany tasks. This work provides an information-theoretic framework to analyse\nthe influence of inputs for text classification tasks. Natural language\nprocessing (NLP) tasks take either a single element input or multiple element\ninputs to predict an output variable, where an element is a block of text. Each\ntext element has two components: an associated semantic meaning and a\nlinguistic realization. Multiple-choice reading comprehension (MCRC) and\nsentiment classification (SC) are selected to showcase the framework. For MCRC,\nit is found that the context influence on the output compared to the question\ninfluence reduces on more challenging datasets. In particular, more challenging\ncontexts allow a greater variation in complexity of questions. Hence, test\ncreators need to carefully consider the choice of the context when designing\nmultiple-choice questions for assessment. For SC, it is found that the semantic\nmeaning of the input text dominates (above 80\% for all datasets considered)\ncompared to its linguistic realisation when determining the sentiment. The\nframework is made available at:\nhttps://github.com/WangLuran/nlp-element-influence\n","authors":["Luran Wang","Mark Gales","Vatsal Raina"],"pdf_url":"https://arxiv.org/pdf/2402.00978v1.pdf","comment":"21 pages, 10 figures, 11 tables"},{"id":"http://arxiv.org/abs/2402.00969v1","updated":"2024-02-01T19:38:32Z","published":"2024-02-01T19:38:32Z","title":"SPARQL Generation with Entity Pre-trained GPT for KG Question Answering","summary":" Knowledge Graphs' popularity has been growing rapidly in recent years. All that\nknowledge is available for people to query through the many online databases\non the internet. Still, it would be a great achievement if non-programmer\nusers could access whatever information they want to know. There has been a lot\nof effort oriented toward solving this task using natural language processing tools\nand creativity encouragement by way of many challenges. Our approach focuses on\nassuming a correct entity linking on the natural language questions and\ntraining a GPT model to create SPARQL queries from them. We managed to isolate\nwhich property of the task can be the most difficult to solve in few- or\nzero-shot settings and we proposed pre-training on all entities (under CWA) to improve\nthe performance. We obtained a 62.703% accuracy of exact SPARQL matches on\ntesting at 3-shots, an F1 of 0.809 on the entity linking challenge and an F1 of\n0.009 on the question answering challenge.\n","authors":["Diego Bustamante","Hideaki Takeda"],"pdf_url":"https://arxiv.org/pdf/2402.00969v1.pdf","comment":"7 pages, 1 figure, 2 tables. 
For the implementation, see\n https://github.com/DiegoEmilio01/SPARQL-generation-with-entity-pre-trained-GPT-for-KG-Question-Answering"},{"id":"http://arxiv.org/abs/2402.00956v1","updated":"2024-02-01T19:25:50Z","published":"2024-02-01T19:25:50Z","title":"Exploring Spatial Schema Intuitions in Large Language and Vision Models","summary":" Despite the ubiquity of large language models (LLMs) in AI research, the\nquestion of embodiment in LLMs remains underexplored, distinguishing them from\nembodied systems in robotics where sensory perception directly informs physical\naction. Our investigation navigates the intriguing terrain of whether LLMs,\ndespite their non-embodied nature, effectively capture implicit human\nintuitions about fundamental, spatial building blocks of language. We employ\ninsights from spatial cognitive foundations developed through early\nsensorimotor experiences, guiding our exploration through the reproduction of\nthree psycholinguistic experiments. Surprisingly, correlations between model\noutputs and human responses emerge, revealing adaptability without a tangible\nconnection to embodied experiences. Notable distinctions include polarized\nlanguage model responses and reduced correlations in vision language models.\nThis research contributes to a nuanced understanding of the interplay between\nlanguage, spatial experiences, and the computations made by large language\nmodels. More at https://cisnlp.github.io/Spatial_Schemas/\n","authors":["Philipp Wicke","Lennart Wachowiak"],"pdf_url":"https://arxiv.org/pdf/2402.00956v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.11244v2","updated":"2024-02-01T19:05:44Z","published":"2023-10-17T13:12:32Z","title":"Entity Matching using Large Language Models","summary":" Entity Matching is the task of deciding whether two entity descriptions refer\nto the same real-world entity. It is a central step in most data integration\npipelines and an enabler for many e-commerce applications which require\nmatching product offers from different vendors. State-of-the-art entity matching\nmethods rely on pre-trained language models (PLMs) such as BERT or RoBERTa. Two\nmajor drawbacks of these models for entity matching are that (i) the models\nrequire significant amounts of task-specific training data and (ii) the\nfine-tuned models are not robust concerning out-of-distribution entities. We\ninvestigate using generative large language models (LLMs) for entity matching\nas a less task-specific training data dependent and more robust alternative to\nPLM-based matchers. Our study covers hosted LLMs as well as open-source LLMs\nwhich can be run locally. We evaluate these models in a zero-shot scenario as\nwell as a scenario where task-specific training data is available. We compare\ndifferent prompt designs as well as the prompt sensitivity of the models and\nshow that there is no single best prompt but the prompt is akin to a\nhyperparameter that needs to be estimated for each model/dataset combination.\nWe further investigate (i) the selection of in-context demonstrations, (ii) the\ngeneration of matching rules, as well as (iii) fine-tuning a hosted LLM using\nthe same pool of training data. Our experiments show that the best LLMs require\nno or only a few training examples to reach a similar performance as fine-tuned\nPLMs. They further exhibit a higher robustness to unseen entities, which makes\nthem especially suited to use cases where no training data is available. 
We\nshow that for use cases that do not allow data to be shared with third parties,\nopen-source LLMs can be a viable alternative to hosted LLMs given that a small\namount of training data or matching knowledge...\n","authors":["Ralph Peeters","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2310.11244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17435v2","updated":"2024-02-01T18:57:18Z","published":"2024-01-30T20:49:47Z","title":"Can Large Language Models Replace Economic Choice Prediction Labs?","summary":" Economic choice prediction is an essential yet challenging task, often\nconstrained by the difficulties in acquiring human choice data. Indeed,\nexperimental economics studies have focused mostly on simple choice settings.\nThe AI community has recently contributed to that effort in two ways:\nconsidering whether LLMs can substitute for humans in the above-mentioned\nsimple choice prediction settings, and the study through an ML lens of more\nelaborate but still rigorous experimental economics settings, employing\nincomplete information, repetitive play, and natural language communication,\nnotably language-based persuasion games. This leaves us with a major\ninspiration: can LLMs be used to fully simulate the economic environment and\ngenerate data for efficient human choice prediction, substituting for the\nelaborate economic lab studies? We pioneer the study of this subject,\ndemonstrating its feasibility. In particular, we show that a model trained\nsolely on LLM-generated data can effectively predict human behavior in a\nlanguage-based persuasion game, and can even outperform models trained on\nactual human data.\n","authors":["Eilam Shapira","Omer Madmon","Roi Reichart","Moshe Tennenholtz"],"pdf_url":"https://arxiv.org/pdf/2401.17435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00913v1","updated":"2024-02-01T10:58:10Z","published":"2024-02-01T10:58:10Z","title":"Institutional Platform for Secure Self-Service Large Language Model\n Exploration","summary":" This paper introduces a user-friendly platform developed by the University of\nKentucky Center for Applied AI, designed to make large, customized language\nmodels (LLMs) more accessible. By capitalizing on recent advancements in\nmulti-LoRA inference, the system efficiently accommodates custom adapters for a\ndiverse range of users and projects. The paper outlines the system's\narchitecture and key features, encompassing dataset curation, model training,\nsecure inference, and text-based feature extraction.\n We illustrate the establishment of a tenant-aware computational network using\nagent-based methods, securely utilizing islands of isolated resources as a\nunified system. The platform strives to deliver secure LLM services,\nemphasizing process and data isolation, end-to-end encryption, and role-based\nresource authentication. This contribution aligns with the overarching goal of\nenabling simplified access to cutting-edge AI models and technology in support\nof scientific discovery.\n","authors":["V. K. Cody Bumgardner","Mitchell A. Klusty","W. Vaiden Logan","Samuel E. 
Armstrong","Caylin Hickey","Jeff Talbert"],"pdf_url":"https://arxiv.org/pdf/2402.00913v1.pdf","comment":"10 pages 11 figures, 5 listings, 4 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.00867v1","updated":"2024-02-01T18:59:56Z","published":"2024-02-01T18:59:56Z","title":"AToM: Amortized Text-to-Mesh using 2D Diffusion","summary":" We introduce Amortized Text-to-Mesh (AToM), a feed-forward text-to-mesh\nframework optimized across multiple text prompts simultaneously. In contrast to\nexisting text-to-3D methods that often entail time-consuming per-prompt\noptimization and commonly output representations other than polygonal meshes,\nAToM directly generates high-quality textured meshes in less than 1 second with\naround 10 times reduction in the training cost, and generalizes to unseen\nprompts. Our key idea is a novel triplane-based text-to-mesh architecture with\na two-stage amortized optimization strategy that ensures stable training and\nenables scalability. Through extensive experiments on various prompt\nbenchmarks, AToM significantly outperforms state-of-the-art amortized\napproaches with over 4 times higher accuracy (in DF415 dataset) and produces\nmore distinguishable and higher-quality 3D outputs. AToM demonstrates strong\ngeneralizability, offering finegrained 3D assets for unseen interpolated\nprompts without further optimization during inference, unlike per-prompt\nsolutions.\n","authors":["Guocheng Qian","Junli Cao","Aliaksandr Siarohin","Yash Kant","Chaoyang Wang","Michael Vasilkovsky","Hsin-Ying Lee","Yuwei Fang","Ivan Skorokhodov","Peiye Zhuang","Igor Gilitschenski","Jian Ren","Bernard Ghanem","Kfir Aberman","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2402.00867v1.pdf","comment":"19 pages with appendix and references. Webpage:\n https://snap-research.github.io/AToM/"},{"id":"http://arxiv.org/abs/2402.00868v1","updated":"2024-02-01T18:59:56Z","published":"2024-02-01T18:59:56Z","title":"We're Not Using Videos Effectively: An Updated Domain Adaptive Video\n Segmentation Baseline","summary":" There has been abundant work in unsupervised domain adaptation for semantic\nsegmentation (DAS) seeking to adapt a model trained on images from a labeled\nsource domain to an unlabeled target domain. While the vast majority of prior\nwork has studied this as a frame-level Image-DAS problem, a few Video-DAS works\nhave sought to additionally leverage the temporal signal present in adjacent\nframes. However, Video-DAS works have historically studied a distinct set of\nbenchmarks from Image-DAS, with minimal cross-benchmarking. In this work, we\naddress this gap. Surprisingly, we find that (1) even after carefully\ncontrolling for data and model architecture, state-of-the-art Image-DAS methods\n(HRDA and HRDA+MIC)} outperform Video-DAS methods on established Video-DAS\nbenchmarks (+14.5 mIoU on Viper$\\rightarrow$CityscapesSeq, +19.0 mIoU on\nSynthia$\\rightarrow$CityscapesSeq), and (2) naive combinations of Image-DAS and\nVideo-DAS techniques only lead to marginal improvements across datasets. To\navoid siloed progress between Image-DAS and Video-DAS, we open-source our\ncodebase with support for a comprehensive set of Video-DAS and Image-DAS\nmethods on a common benchmark. 
Code available at\nhttps://github.com/SimarKareer/UnifiedVideoDA\n","authors":["Simar Kareer","Vivek Vijaykumar","Harsh Maheshwari","Prithvijit Chattopadhyay","Judy Hoffman","Viraj Prabhu"],"pdf_url":"https://arxiv.org/pdf/2402.00868v1.pdf","comment":"TMLR 2024"},{"id":"http://arxiv.org/abs/2402.00865v1","updated":"2024-02-01T18:59:22Z","published":"2024-02-01T18:59:22Z","title":"Towards Optimal Feature-Shaping Methods for Out-of-Distribution\n Detection","summary":" Feature shaping refers to a family of methods that exhibit state-of-the-art\nperformance for out-of-distribution (OOD) detection. These approaches\nmanipulate the feature representation, typically from the penultimate layer of\na pre-trained deep learning model, so as to better differentiate between\nin-distribution (ID) and OOD samples. However, existing feature-shaping methods\nusually employ rules manually designed for specific model architectures and OOD\ndatasets, which consequently limit their generalization ability. To address\nthis gap, we first formulate an abstract optimization framework for studying\nfeature-shaping methods. We then propose a concrete reduction of the framework\nwith a simple piecewise constant shaping function and show that existing\nfeature-shaping methods approximate the optimal solution to the concrete\noptimization problem. Further, assuming that OOD data is inaccessible, we\npropose a formulation that yields a closed-form solution for the piecewise\nconstant shaping function, utilizing solely the ID data. Through extensive\nexperiments, we show that the feature-shaping function optimized by our method\nimproves the generalization ability of OOD detection across a large variety of\ndatasets and model architectures.\n","authors":["Qinyu Zhao","Ming Xu","Kartik Gupta","Akshay Asthana","Liang Zheng","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2402.00865v1.pdf","comment":"ICLR 2024. Project page: https://github.com/Qinyu-Allen-Zhao/OptFSOOD"},{"id":"http://arxiv.org/abs/2402.00864v1","updated":"2024-02-01T18:59:09Z","published":"2024-02-01T18:59:09Z","title":"ViCA-NeRF: View-Consistency-Aware 3D Editing of Neural Radiance Fields","summary":" We introduce ViCA-NeRF, the first view-consistency-aware method for 3D\nediting with text instructions. In addition to the implicit neural radiance\nfield (NeRF) modeling, our key insight is to exploit two sources of\nregularization that explicitly propagate the editing information across\ndifferent views, thus ensuring multi-view consistency. For geometric\nregularization, we leverage the depth information derived from NeRF to\nestablish image correspondences between different views. For learned\nregularization, we align the latent codes in the 2D diffusion model between\nedited and unedited images, enabling us to edit key views and propagate the\nupdate throughout the entire scene. Incorporating these two strategies, our\nViCA-NeRF operates in two stages. In the initial stage, we blend edits from\ndifferent views to create a preliminary 3D edit. This is followed by a second\nstage of NeRF training, dedicated to further refining the scene's appearance.\nExperimental results demonstrate that ViCA-NeRF provides more flexible,\nefficient (3 times faster) editing with higher levels of consistency and\ndetails, compared with the state of the art. 
Our code is publicly available.\n","authors":["Jiahua Dong","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.00864v1.pdf","comment":"Neurips2023; project page: https://github.com/Dongjiahua/VICA-NeRF"},{"id":"http://arxiv.org/abs/2402.00863v1","updated":"2024-02-01T18:58:44Z","published":"2024-02-01T18:58:44Z","title":"Geometry Transfer for Stylizing Radiance Fields","summary":" Shape and geometric patterns are essential in defining stylistic identity.\nHowever, current 3D style transfer methods predominantly focus on transferring\ncolors and textures, often overlooking geometric aspects. In this paper, we\nintroduce Geometry Transfer, a novel method that leverages geometric\ndeformation for 3D style transfer. This technique employs depth maps to extract\na style guide, subsequently applied to stylize the geometry of radiance fields.\nMoreover, we propose new techniques that utilize geometric cues from the 3D\nscene, thereby enhancing aesthetic expressiveness and more accurately\nreflecting intended styles. Our extensive experiments show that Geometry\nTransfer enables a broader and more expressive range of stylizations, thereby\nsignificantly expanding the scope of 3D style transfer.\n","authors":["Hyunyoung Jung","Seonghyeon Nam","Nikolaos Sarafianos","Sungjoo Yoo","Alexander Sorkine-Hornung","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2402.00863v1.pdf","comment":"project page: https://hyblue.github.io/geo-srf/"},{"id":"http://arxiv.org/abs/2311.03076v3","updated":"2024-02-01T18:47:18Z","published":"2023-11-06T13:01:17Z","title":"SugarViT -- Multi-objective Regression of UAV Images with Vision\n Transformers and Deep Label Distribution Learning Demonstrated on Disease\n Severity Prediction in Sugar Beet","summary":" Remote sensing and artificial intelligence are pivotal technologies of\nprecision agriculture nowadays. The efficient retrieval of large-scale field\nimagery combined with machine learning techniques shows success in various\ntasks like phenotyping, weeding, cropping, and disease control. This work\nintroduces a machine learning framework for automated large-scale\nplant-specific trait annotation for the use case of disease severity scoring for\nCercospora Leaf Spot (CLS) in sugar beet. With concepts of Deep Label\nDistribution Learning (DLDL), special loss functions, and a tailored model\narchitecture, we develop an efficient Vision Transformer based model for\ndisease severity scoring called SugarViT. One novelty in this work is the\ncombination of remote sensing data with environmental parameters of the\nexperimental sites for disease severity prediction. Although the model is\nevaluated on this special use case, it is held as generic as possible to also\nbe applicable to various image-based classification and regression tasks. 
With\nour framework, it is even possible to learn models on multi-objective problems\nas we show by a pretraining on environmental metadata.\n","authors":["Maurice Günder","Facundo Ramón Ispizua Yamati","Abel Andree Barreto Alcántara","Anne-Katrin Mahlein","Rafet Sifa","Christian Bauckhage"],"pdf_url":"https://arxiv.org/pdf/2311.03076v3.pdf","comment":"submitted to Computers and Electronics in Agriculture"},{"id":"http://arxiv.org/abs/2401.09923v2","updated":"2024-02-01T18:43:06Z","published":"2024-01-18T12:13:06Z","title":"MAMBA: Multi-level Aggregation via Memory Bank for Video Object\n Detection","summary":" State-of-the-art video object detection methods maintain a memory structure,\neither a sliding window or a memory queue, to enhance the current frame using\nattention mechanisms. However, we argue that these memory structures are not\nefficient or sufficient because of two implied operations: (1) concatenating\nall features in memory for enhancement, leading to a heavy computational cost;\n(2) frame-wise memory updating, preventing the memory from capturing more\ntemporal information. In this paper, we propose a multi-level aggregation\narchitecture via memory bank called MAMBA. Specifically, our memory bank\nemploys two novel operations to eliminate the disadvantages of existing\nmethods: (1) light-weight key-set construction which can significantly reduce\nthe computational cost; (2) fine-grained feature-wise updating strategy which\nenables our method to utilize knowledge from the whole video. To better enhance\nfeatures from complementary levels, i.e., feature maps and proposals, we\nfurther propose a generalized enhancement operation (GEO) to aggregate\nmulti-level features in a unified manner. We conduct extensive evaluations on\nthe challenging ImageNetVID dataset. Compared with existing state-of-the-art\nmethods, our method achieves superior performance in terms of both speed and\naccuracy. More remarkably, MAMBA achieves mAP of 83.7/84.6% at 12.6/9.1 FPS\nwith ResNet-101. Code is available at\nhttps://github.com/guanxiongsun/vfe.pytorch.\n","authors":["Guanxiong Sun","Yang Hua","Guosheng Hu","Neil Robertson"],"pdf_url":"https://arxiv.org/pdf/2401.09923v2.pdf","comment":"update code url https://github.com/guanxiongsun/vfe.pytorch"},{"id":"http://arxiv.org/abs/2402.00847v1","updated":"2024-02-01T18:38:55Z","published":"2024-02-01T18:38:55Z","title":"BootsTAP: Bootstrapped Training for Tracking-Any-Point","summary":" To endow models with greater understanding of physics and motion, it is\nuseful to enable them to perceive how solid surfaces move and deform in real\nscenes. This can be formalized as Tracking-Any-Point (TAP), which requires the\nalgorithm to be able to track any point corresponding to a solid surface in a\nvideo, potentially densely in space and time. Large-scale ground-truth training\ndata for TAP is only available in simulation, which currently has limited\nvariety of objects and motion. In this work, we demonstrate how large-scale,\nunlabeled, uncurated real-world data can improve a TAP model with minimal\narchitectural changes, using a self-supervised student-teacher setup. 
We\ndemonstrate state-of-the-art performance on the TAP-Vid benchmark surpassing\nprevious results by a wide margin: for example, TAP-Vid-DAVIS performance\nimproves from 61.3% to 66.4%, and TAP-Vid-Kinetics from 57.2% to 61.5%.\n","authors":["Carl Doersch","Yi Yang","Dilara Gokay","Pauline Luc","Skanda Koppula","Ankush Gupta","Joseph Heyward","Ross Goroshin","João Carreira","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2402.00847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01879v3","updated":"2024-02-01T18:22:25Z","published":"2023-06-02T19:19:43Z","title":"Revisiting the Role of Language Priors in Vision-Language Models","summary":" Vision-language models (VLMs) are impactful in part because they can be\napplied to a variety of visual understanding tasks in a zero-shot fashion,\nwithout any fine-tuning. We study $\\textit{generative VLMs}$ that are trained\nfor next-word generation given an image. We explore their zero-shot performance\non the illustrative task of image-text retrieval across 8 popular\nvision-language benchmarks. Our first observation is that they can be\nrepurposed for discriminative tasks (such as image-text retrieval) by simply\ncomputing the match score of generating a particular text string given an\nimage. We call this probabilistic score the $\\textit{Visual Generative\nPre-Training Score}$ (VisualGPTScore). While the VisualGPTScore produces\nnear-perfect accuracy on some retrieval benchmarks, it yields poor accuracy on\nothers. We analyze this behavior through a probabilistic lens, pointing out\nthat some benchmarks inadvertently capture unnatural language distributions by\ncreating adversarial but unlikely text captions. In fact, we demonstrate that\neven a \"blind\" language model that ignores any image evidence can sometimes\noutperform all prior art, reminiscent of similar challenges faced by the\nvisual-question answering (VQA) community many years ago. We derive a\nprobabilistic post-processing scheme that controls for the amount of linguistic\nbias in generative VLMs at test time without having to retrain or fine-tune the\nmodel. We show that the VisualGPTScore, when appropriately debiased, is a\nstrong zero-shot baseline for vision-language understanding, oftentimes\nproducing state-of-the-art accuracy.\n","authors":["Zhiqiu Lin","Xinyue Chen","Deepak Pathak","Pengchuan Zhang","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2306.01879v3.pdf","comment":"Website: https://linzhiqiu.github.io/papers/visual_gpt_score/"},{"id":"http://arxiv.org/abs/2402.00827v1","updated":"2024-02-01T18:14:42Z","published":"2024-02-01T18:14:42Z","title":"Emo-Avatar: Efficient Monocular Video Style Avatar through Texture\n Rendering","summary":" Artistic video portrait generation is a significant and sought-after task in\nthe fields of computer graphics and vision. While various methods have been\ndeveloped that integrate NeRFs or StyleGANs with instructional editing models\nfor creating and editing drivable portraits, these approaches face several\nchallenges. They often rely heavily on large datasets, require extensive\ncustomization processes, and frequently result in reduced image quality. To\naddress the above problems, we propose the Efficient Monotonic Video Style\nAvatar (Emo-Avatar) through deferred neural rendering that enhances StyleGAN's\ncapacity for producing dynamic, drivable portrait videos. We proposed a\ntwo-stage deferred neural rendering pipeline. 
In the first stage, we utilize\nfew-shot PTI initialization to initialize the StyleGAN generator through\nseveral extreme poses sampled from the video to capture the consistent\nrepresentation of aligned faces from the target portrait. In the second stage,\nwe propose a Laplacian pyramid for high-frequency texture sampling from UV maps\ndeformed by dynamic flow of expression for motion-aware texture prior\nintegration to provide torso features to enhance StyleGAN's ability to generate\ncomplete and upper body for portrait video rendering. Emo-Avatar reduces style\ncustomization time from hours to merely 5 minutes compared with existing\nmethods. In addition, Emo-Avatar requires only a single reference image for\nediting and employs region-aware contrastive learning with semantic invariant\nCLIP guidance, ensuring consistent high-resolution output and identity\npreservation. Through both quantitative and qualitative assessments, Emo-Avatar\ndemonstrates superior performance over existing methods in terms of training\nefficiency, rendering quality and editability in self- and cross-reenactment.\n","authors":["Pinxin Liu","Luchuan Song","Daoan Zhang","Hang Hua","Yunlong Tang","Huaijin Tu","Jiebo Luo","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.00827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06643v3","updated":"2024-02-01T18:05:36Z","published":"2023-12-11T18:56:03Z","title":"Gaze Detection and Analysis for Initiating Joint Activity in Industrial\n Human-Robot Collaboration","summary":" Collaborative robots (cobots) are widely used in industrial applications, yet\nextensive research is still needed to enhance human-robot collaborations and\noperator experience. A potential approach to improve the collaboration\nexperience involves adapting cobot behavior based on natural cues from the\noperator. Inspired by the literature on human-human interactions, we conducted\na wizard-of-oz study to examine whether a gaze towards the cobot can serve as a\ntrigger for initiating joint activities in collaborative sessions. In this\nstudy, 37 participants engaged in an assembly task while their gaze behavior\nwas analyzed. We employ a gaze-based attention recognition model to identify\nwhen the participants look at the cobot. Our results indicate that in most\ncases (84.88\\%), the joint activity is preceded by a gaze towards the cobot.\nFurthermore, during the entire assembly cycle, the participants tend to look at\nthe cobot around the time of the joint activity. To the best of our knowledge,\nthis is the first study to analyze the natural gaze behavior of participants\nworking on a joint activity with a robot during a collaborative assembly task.\n","authors":["Pooja Prajod","Matteo Lavit Nicora","Marta Mondellini","Giovanni Tauro","Rocco Vertechy","Matteo Malosio","Elisabeth André"],"pdf_url":"https://arxiv.org/pdf/2312.06643v3.pdf","comment":"First draft for a paper submitted to Frontiers in Robotics and AI"},{"id":"http://arxiv.org/abs/2302.05087v3","updated":"2024-02-01T17:32:03Z","published":"2023-02-10T07:11:37Z","title":"Generalized Video Anomaly Event Detection: Systematic Taxonomy and\n Comparison of Deep Models","summary":" Video Anomaly Detection (VAD) serves as a pivotal technology in the\nintelligent surveillance systems, enabling the temporal or spatial\nidentification of anomalous events within videos. 
While existing reviews\npredominantly concentrate on conventional unsupervised methods, they often\noverlook the emergence of weakly-supervised and fully-unsupervised approaches.\nTo address this gap, this survey extends the conventional scope of VAD beyond\nunsupervised methods, encompassing a broader spectrum termed Generalized Video\nAnomaly Event Detection (GVAED). By skillfully incorporating recent\nadvancements rooted in diverse assumptions and learning frameworks, this survey\nintroduces an intuitive taxonomy that seamlessly navigates through\nunsupervised, weakly-supervised, supervised and fully-unsupervised VAD\nmethodologies, elucidating the distinctions and interconnections within these\nresearch trajectories. In addition, this survey facilitates prospective\nresearchers by assembling a compilation of research resources, including public\ndatasets, available codebases, programming tools, and pertinent literature.\nFurthermore, this survey quantitatively assesses model performance, delves into\nresearch challenges and directions, and outlines potential avenues for future\nexploration.\n","authors":["Yang Liu","Dingkang Yang","Yan Wang","Jing Liu","Jun Liu","Azzedine Boukerche","Peng Sun","Liang Song"],"pdf_url":"https://arxiv.org/pdf/2302.05087v3.pdf","comment":"Accepted by ACM Computing Surveys. For more information, please see\n our project page: https://github.com/fudanyliu/GVAED"},{"id":"http://arxiv.org/abs/2402.00769v1","updated":"2024-02-01T16:58:11Z","published":"2024-02-01T16:58:11Z","title":"AnimateLCM: Accelerating the Animation of Personalized Diffusion Models\n and Adapters with Decoupled Consistency Learning","summary":" Video diffusion models have been gaining increasing attention for their ability\nto produce videos that are both coherent and of high fidelity. However, the\niterative denoising process makes it computationally intensive and\ntime-consuming, thus limiting its applications. Inspired by the Consistency\nModel (CM) that distills pretrained image diffusion models to accelerate the\nsampling with minimal steps and its successful extension Latent Consistency\nModel (LCM) on conditional image generation, we propose AnimateLCM, allowing\nfor high-fidelity video generation within minimal steps. Instead of directly\nconducting consistency learning on the raw video dataset, we propose a\ndecoupled consistency learning strategy that decouples the distillation of\nimage generation priors and motion generation priors, which improves the\ntraining efficiency and enhances the visual quality of generations. Additionally, to\nenable the combination of plug-and-play adapters in the Stable Diffusion community\nto achieve various functions (e.g., ControlNet for controllable generation), we\npropose an efficient strategy to adapt existing adapters to our distilled\ntext-conditioned video consistency model or train adapters from scratch without\nharming the sampling speed. We validate the proposed strategy in\nimage-conditioned video generation and layout-conditioned video generation, all\nachieving top-performing results. Experimental results validate the\neffectiveness of our proposed method. 
Code and weights will be made public.\nMore details are available at https://github.com/G-U-N/AnimateLCM.\n","authors":["Fu-Yun Wang","Zhaoyang Huang","Xiaoyu Shi","Weikang Bian","Guanglu Song","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2402.00769v1.pdf","comment":"Project Page: https://animatelcm.github.io/"},{"id":"http://arxiv.org/abs/2402.00763v1","updated":"2024-02-01T16:52:21Z","published":"2024-02-01T16:52:21Z","title":"360-GS: Layout-guided Panoramic Gaussian Splatting For Indoor Roaming","summary":" 3D Gaussian Splatting (3D-GS) has recently attracted great attention with\nreal-time and photo-realistic renderings. This technique typically takes\nperspective images as input and optimizes a set of 3D elliptical Gaussians by\nsplatting them onto the image planes, resulting in 2D Gaussians. However,\napplying 3D-GS to panoramic inputs presents challenges in effectively modeling\nthe projection onto the spherical surface of ${360^\circ}$ images using 2D\nGaussians. In practical applications, input panoramas are often sparse, leading\nto unreliable initialization of 3D Gaussians and subsequent degradation of\n3D-GS quality. In addition, due to the under-constrained geometry of\ntexture-less planes (e.g., walls and floors), 3D-GS struggles to model these\nflat regions with elliptical Gaussians, resulting in significant floaters in\nnovel views. To address these issues, we propose 360-GS, a novel $360^{\circ}$\nGaussian splatting for a limited set of panoramic inputs. Instead of splatting\n3D Gaussians directly onto the spherical surface, 360-GS projects them onto the\ntangent plane of the unit sphere and then maps them to the spherical\nprojections. This adaptation enables the representation of the projection using\nGaussians. We guide the optimization of 360-GS by exploiting layout priors\nwithin panoramas, which are simple to obtain and contain strong structural\ninformation about the indoor scene. Our experimental results demonstrate that\n360-GS allows panoramic rendering and outperforms state-of-the-art methods with\nfewer artifacts in novel view synthesis, thus providing immersive roaming in\nindoor scenarios.\n","authors":["Jiayang Bai","Letian Huang","Jie Guo","Wen Gong","Yuanqi Li","Yanwen Guo"],"pdf_url":"https://arxiv.org/pdf/2402.00763v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2402.00752v1","updated":"2024-02-01T16:43:58Z","published":"2024-02-01T16:43:58Z","title":"GS++: Error Analyzing and Optimal Gaussian Splatting","summary":" 3D Gaussian Splatting has garnered extensive attention and application in\nreal-time neural rendering. Concurrently, concerns have been raised about the\nlimitations of this technology in aspects such as point cloud storage,\nperformance, and robustness in sparse viewpoints, leading to various\nimprovements. However, there has been a notable lack of attention to the\nprojection errors introduced by the local affine approximation inherent in the\nsplatting itself, and the consequential impact of these errors on the quality\nof photo-realistic rendering. This paper addresses the projection error\nfunction of 3D Gaussian Splatting, commencing with the residual error from the\nfirst-order Taylor expansion of the projection function $\phi$. 
The analysis\nestablishes a correlation between the error and the Gaussian mean position.\nSubsequently, leveraging function optimization theory, this paper analyzes the\nfunction's minima to provide an optimal projection strategy for Gaussian\nSplatting, referred to as Optimal Gaussian Splatting. Experimental validation\nfurther confirms that this projection methodology reduces artifacts, resulting\nin a more convincingly realistic rendering.\n","authors":["Letian Huang","Jiayang Bai","Jie Guo","Yanwen Guo"],"pdf_url":"https://arxiv.org/pdf/2402.00752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00740v1","updated":"2024-02-01T16:38:51Z","published":"2024-02-01T16:38:51Z","title":"DRSM: efficient neural 4d decomposition for dynamic reconstruction in\n stationary monocular cameras","summary":" With the popularity of monocular videos generated by video sharing and live\nbroadcasting applications, reconstructing and editing dynamic scenes in\nstationary monocular cameras has become a special but anticipated technology.\nIn contrast to scene reconstructions that exploit multi-view observations, the\nproblem of modeling a dynamic scene from a single view is significantly more\nunder-constrained and ill-posed. Inspired by recent progress in neural\nrendering, we present a novel framework to tackle the 4D decomposition problem\nfor dynamic scenes in monocular cameras. Our framework utilizes decomposed\nstatic and dynamic feature planes to represent 4D scenes and emphasizes the\nlearning of dynamic regions through dense ray casting. Inadequate 3D clues from\na single view and occlusions are also particular challenges in scene\nreconstruction. To overcome these difficulties, we propose deep supervised\noptimization and ray casting strategies. With experiments on various videos,\nour method generates higher-fidelity results than existing methods for\nsingle-view dynamic scene representation.\n","authors":["Weixing Xie","Xiao Dong","Yong Yang","Qiqin Lin","Jingze Chen","Junfeng Yao","Xiaohu Guo"],"pdf_url":"https://arxiv.org/pdf/2402.00740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00724v1","updated":"2024-02-01T16:14:54Z","published":"2024-02-01T16:14:54Z","title":"Automatic Segmentation of the Spinal Cord Nerve Rootlets","summary":" Precise identification of spinal nerve rootlets is relevant to delineate\nspinal levels for the study of functional activity in the spinal cord. The goal\nof this study was to develop an automatic method for the semantic segmentation\nof spinal nerve rootlets from T2-weighted magnetic resonance imaging (MRI)\nscans. Images from two open-access MRI datasets were used to train a 3D\nmulti-class convolutional neural network using an active learning approach to\nsegment C2-C8 dorsal nerve rootlets. Each output class corresponds to a spinal\nlevel. The method was tested on 3T T2-weighted images from datasets unseen\nduring training to assess inter-site, inter-session, and inter-resolution\nvariability. The test Dice score was 0.67 +- 0.16 (mean +- standard deviation\nacross rootlet levels), suggesting good performance. The method also\ndemonstrated low inter-vendor and inter-site variability (coefficient of\nvariation <= 1.41 %), as well as low inter-session variability (coefficient of\nvariation <= 1.30 %), indicating stable predictions across different MRI\nvendors, sites, and sessions.
The proposed methodology is open-source and\nreadily available in the Spinal Cord Toolbox (SCT) v6.2 and higher.\n","authors":["Jan Valosek","Theo Mathieu","Raphaelle Schlienger","Olivia S. Kowalczyk","Julien Cohen-Adad"],"pdf_url":"https://arxiv.org/pdf/2402.00724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00712v1","updated":"2024-02-01T16:07:12Z","published":"2024-02-01T16:07:12Z","title":"ChaosBench: A Multi-Channel, Physics-Based Benchmark for\n Subseasonal-to-Seasonal Climate Prediction","summary":" Accurate prediction of climate at the subseasonal-to-seasonal (S2S) scale is\ncrucial for disaster readiness, reduced economic risk, and improved\npolicy-making amidst climate change. Yet, S2S prediction remains challenging\ndue to the chaotic nature of the system. At present, existing benchmarks for\nweather and climate applications tend to (1) have a shorter forecasting range\nof up to 14 days, (2) not include a wide range of operational baseline\nforecasts, and (3) lack physics-based constraints for explainability. Thus, we\npropose ChaosBench, a large-scale, multi-channel, physics-based benchmark for\nS2S prediction. ChaosBench has over 460K frames of real-world observations and\nsimulations, each with 60 variable-channels and spanning up to 45 years. We\nalso propose several physics-based metrics, in addition to vision-based ones,\nthat enable more physically-consistent models. Furthermore, we include a\ndiverse set of physics-based forecasts from 4 national weather agencies as\nbaselines to our data-driven counterpart. We establish two tasks that vary in\ncomplexity: full and sparse dynamics prediction. Our benchmark is one of the\nfirst to perform large-scale evaluation of existing models including\nPanguWeather, FourCastNetV2, GraphCast, and ClimaX, and finds that methods\noriginally developed for weather-scale applications fail on the S2S task. We\nrelease our benchmark code and datasets at\nhttps://leap-stc.github.io/ChaosBench.\n","authors":["Juan Nathaniel","Yongquan Qu","Tung Nguyen","Sungduk Yu","Julius Busecke","Aditya Grover","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2402.00712v1.pdf","comment":"45 pages, 39 figures"},{"id":"http://arxiv.org/abs/2402.00703v1","updated":"2024-02-01T15:59:16Z","published":"2024-02-01T15:59:16Z","title":"Vehicle Perception from Satellite","summary":" Satellites are capable of capturing high-resolution videos. This makes\nvehicle perception from satellites possible. Compared to street surveillance,\ndrive recorders, or other equipment, satellite videos provide a much broader\ncity-scale view, so that the global dynamic scene of the traffic is captured\nand displayed. Traffic monitoring from satellite is a new task with great\npotential applications, including traffic jam prediction, path planning,\nvehicle dispatching, \\emph{etc.}. Practically, limited by the resolution and\nview, the captured vehicles are very tiny (a few pixels) and move slowly. Worse\nstill, these satellites are in Low Earth Orbit (LEO) to capture such\nhigh-resolution videos, so the background is also moving. Under this\ncircumstance, traffic monitoring from the satellite view is an extremely\nchallenging task. To attract more researchers into this field, we build a\nlarge-scale benchmark for traffic monitoring from satellite. It supports\nseveral tasks, including tiny object detection, counting, and density\nestimation. The dataset is constructed based on 12 satellite videos and 14\nsynthetic videos recorded from GTA-V.
They are separated into 408 video clips,\nwhich contain 7,336 real satellite images and 1,960 synthetic images. In total,\n128,801 vehicles are annotated, and the number of vehicles in each image varies\nfrom 0 to 101. Several classic and state-of-the-art approaches in traditional\ncomputer vision are evaluated on the datasets, so as to compare the performance\nof different approaches, analyze the challenges in this task, and discuss\nfuture prospects. The dataset is available at:\nhttps://github.com/Chenxi1510/Vehicle-Perception-from-Satellite-Videos.\n","authors":["Bin Zhao","Pengfei Han","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2402.00703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00700v1","updated":"2024-02-01T15:57:11Z","published":"2024-02-01T15:57:11Z","title":"In-Bed Pose Estimation: A Review","summary":" Human pose estimation, the process of identifying joint positions in a\nperson's body from images or videos, represents a widely utilized technology\nacross diverse fields, including healthcare. One such healthcare application\ninvolves in-bed pose estimation, where the body pose of an individual lying\nunder a blanket is analyzed. This task, for instance, can be used to monitor a\nperson's sleep behavior and detect symptoms early for potential disease\ndiagnosis in homes and hospitals. Several studies have utilized unimodal and\nmultimodal methods to estimate in-bed human poses. The unimodal studies\ngenerally employ RGB images, whereas the multimodal studies use modalities\nincluding RGB, long-wavelength infrared, pressure map, and depth map.\nMultimodal studies have the advantage of using modalities in addition to RGB\nthat might capture information useful to cope with occlusions. Moreover, some\nmultimodal studies exclude RGB and thus better suit privacy preservation.\nTo expedite advancements in this domain, we conduct a review of existing\ndatasets and approaches. Our objectives are to show the limitations of previous\nstudies and current challenges, and to provide insights for future work in\nthe in-bed human pose estimation field.\n","authors":["Ziya Ata Yazıcı","Sara Colantonio","Hazım Kemal Ekenel"],"pdf_url":"https://arxiv.org/pdf/2402.00700v1.pdf","comment":"Accepted at HCCS24 Workshop @ International Conference on Pervasive\n Computing and Communications (PerCom 2024)"},{"id":"http://arxiv.org/abs/2402.00695v1","updated":"2024-02-01T15:51:46Z","published":"2024-02-01T15:51:46Z","title":"Approximating Optimal Morphing Attacks using Template Inversion","summary":" Recent works have demonstrated the feasibility of inverting face recognition\nsystems, enabling the recovery of convincing face images using only their\nembeddings. We leverage such template inversion models to develop a novel type\nof deep morphing attack based on inverting a theoretically optimal morph\nembedding, which is obtained as an average of the face embeddings of source\nimages. We experiment with two variants of this approach: the first one\nexploits a fully self-contained embedding-to-image inversion model, while the\nsecond leverages the synthesis network of a pretrained StyleGAN network for\nincreased morph realism. We generate morphing attacks from several source\ndatasets and study the effectiveness of those attacks against several face\nrecognition networks.
We showcase that our method can compete with and\nregularly beat the previous state of the art for deep-learning-based morph\ngeneration in terms of effectiveness, both in white-box and black-box attack\nscenarios, and is additionally much faster to run. We hope this might\nfacilitate the development of large-scale deep morph datasets for training\ndetection models.\n","authors":["Laurent Colbois","Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2402.00695v1.pdf","comment":"Published at the IEEE International Joint Conference on Biometrics\n (IJCB) 2023"},{"id":"http://arxiv.org/abs/2402.00692v1","updated":"2024-02-01T15:50:40Z","published":"2024-02-01T15:50:40Z","title":"A Framework for Building Point Cloud Cleaning, Plane Detection and\n Semantic Segmentation","summary":" This paper presents a framework to address the challenges involved in\nbuilding point cloud cleaning, plane detection, and semantic segmentation, with\nthe ultimate goal of enhancing building modeling. In the cleaning stage, we\nfocus on removing outliers from the acquired point cloud data by employing an\nadaptive threshold technique based on the z-score measure. Following the\ncleaning process, we perform plane detection using the robust RANSAC paradigm.\nThe goal is to carry out multiple plane segmentations, and to classify segments\ninto distinct categories, such as floors, ceilings, and walls. The resulting\nsegments can generate accurate and detailed point clouds representing the\nbuilding's architectural elements. Moreover, we address the problem of semantic\nsegmentation, which plays a vital role in the identification and classification\nof different components within the building, such as walls, windows, doors,\nroofs, and objects. Inspired by the PointNet architecture, we propose a deep\nlearning architecture for efficient semantic segmentation in buildings. The\nresults demonstrate the effectiveness of the proposed framework in handling\nbuilding modeling tasks, paving the way for improved accuracy and efficiency in\nthe field of building modeling.\n","authors":["Ilyass Abouelaziz","Youssef Mourchid"],"pdf_url":"https://arxiv.org/pdf/2402.00692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00680v1","updated":"2024-02-01T15:43:43Z","published":"2024-02-01T15:43:43Z","title":"LVC-LGMC: Joint Local and Global Motion Compensation for Learned Video\n Compression","summary":" Existing learned video compression models employ flow net or deformable\nconvolutional networks (DCN) to estimate motion information. However, the\nlimited receptive fields of flow net and DCN inherently direct their\nattention towards local contexts. Global contexts, such as large-scale\nmotions and global correlations among frames, are ignored, presenting a\nsignificant bottleneck for capturing accurate motions. To address this issue,\nwe propose a joint local and global motion compensation module (LGMC) for\nlearned video coding. More specifically, we adopt flow net for local motion\ncompensation. To capture global context, we employ cross attention in the\nfeature domain for motion compensation. In addition, to avoid the quadratic\ncomplexity of vanilla cross attention, we divide the softmax operations in\nattention into two independent softmax operations, leading to linear\ncomplexity. To validate the effectiveness of our proposed LGMC, we integrate it\nwith DCVC-TCM and obtain learned video compression with joint local and global\nmotion compensation (LVC-LGMC).
Extensive experiments demonstrate that our\nLVC-LGMC has significant rate-distortion performance improvements over baseline\nDCVC-TCM.\n","authors":["Wei Jiang","Junru Li","Kai Zhang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00680v1.pdf","comment":"Accepted at ICASSP 2024. The first attempt to use cross attention for\n bits-free motion estimation and motion compensation"},{"id":"http://arxiv.org/abs/2402.00676v1","updated":"2024-02-01T15:37:23Z","published":"2024-02-01T15:37:23Z","title":"Deep Robot Sketching: An application of Deep Q-Learning Networks for\n human-like sketching","summary":" The current success of Reinforcement Learning algorithms in complex\nenvironments has inspired many recent theoretical approaches to cognitive\nscience. Artistic environments are studied within the cognitive science\ncommunity as rich, natural, multi-sensory, multi-cultural environments.\nIn this work, we propose the introduction of Reinforcement Learning for\nimproving the control of artistic robot applications. Deep Q-learning Neural\nNetworks (DQN) is one of the most successful algorithms for the implementation\nof Reinforcement Learning in robotics. DQN methods generate complex control\npolicies for the execution of complex robot applications in a wide set of\nenvironments. Current art painting robot applications use simple control laws\nthat limit the adaptability of the frameworks to a set of simple environments.\nIn this work, the introduction of DQN within an art painting robot application\nis proposed. The goal is to study how the introduction of a complex control\npolicy impacts the performance of a basic art painting robot application. The\nmain expected contribution of this work is to serve as a first baseline for\nfuture works introducing DQN methods for complex art painting robot frameworks.\nExperiments consist of real-world executions of human-drawn sketches using the\nDQN-generated policy and TEO, the humanoid robot. Results are compared in terms\nof similarity and obtained reward with respect to the reference inputs.\n","authors":["Raul Fernandez-Fernandez","Juan G. Victores","Carlos Balaguer"],"pdf_url":"https://arxiv.org/pdf/2402.00676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00672v1","updated":"2024-02-01T15:33:17Z","published":"2024-02-01T15:33:17Z","title":"Exploring Homogeneous and Heterogeneous Consistent Label Associations\n for Unsupervised Visible-Infrared Person ReID","summary":" Unsupervised visible-infrared person re-identification (USL-VI-ReID) aims to\nretrieve pedestrian images of the same identity from different modalities\nwithout annotations. While prior works focus on establishing cross-modality\npseudo-label associations to bridge the modality gap, they ignore maintaining\nthe instance-level homogeneous and heterogeneous consistency in pseudo-label\nspace, resulting in coarse associations. In response, we introduce a\nModality-Unified Label Transfer (MULT) module that simultaneously accounts for\nboth homogeneous and heterogeneous fine-grained instance-level structures,\nyielding high-quality cross-modality label associations. It models both\nhomogeneous and heterogeneous affinities, leveraging them to define the\ninconsistency for the pseudo-labels and then minimize it, leading to\npseudo-labels that maintain alignment across modalities and consistency within\nintra-modality structures.
Additionally, a straightforward plug-and-play Online\nCross-memory Label Refinement (OCLR) module is proposed to further mitigate the\nimpact of noisy pseudo-labels while simultaneously aligning different\nmodalities, coupled with a Modality-Invariant Representation Learning (MIRL)\nframework. Experiments demonstrate that our proposed method outperforms\nexisting USL-VI-ReID methods, highlighting the superiority of our MULT in\ncomparison to other cross-modality association methods. The code will be\navailable.\n","authors":["Lingfeng He","De Cheng","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2402.00672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00637v1","updated":"2024-02-01T14:52:16Z","published":"2024-02-01T14:52:16Z","title":"Fisheye Camera and Ultrasonic Sensor Fusion For Near-Field Obstacle\n Perception in Bird's-Eye-View","summary":" Accurate obstacle identification represents a fundamental challenge within\nthe scope of near-field perception for autonomous driving. Conventionally,\nfisheye cameras are frequently employed for comprehensive surround-view\nperception, including rear-view obstacle localization. However, the performance\nof such cameras can significantly deteriorate in low-light conditions, during\nnighttime, or when subjected to intense sun glare. Conversely, cost-effective\nsensors like ultrasonic sensors remain largely unaffected under these\nconditions. Therefore, we present, to our knowledge, the first end-to-end\nmultimodal fusion model tailored for efficient obstacle perception in a\nbird's-eye-view (BEV) perspective, utilizing fisheye cameras and ultrasonic\nsensors. Initially, ResNeXt-50 is employed as a set of unimodal encoders to\nextract features specific to each modality. Subsequently, the feature space\nassociated with the visible spectrum undergoes transformation into BEV. The\nfusion of these two modalities is facilitated via concatenation. At the same\ntime, the ultrasonic spectrum-based unimodal feature maps pass through\ncontent-aware dilated convolution, applied to mitigate the sensor misalignment\nbetween the two sensors in the fused feature space. Finally, the fused features\nare utilized by a two-stage semantic occupancy decoder to generate grid-wise\npredictions for precise obstacle perception. We conduct a systematic\ninvestigation to determine the optimal strategy for multimodal fusion of both\nsensors. We provide insights into our dataset creation procedures and\nannotation guidelines, and perform a thorough data analysis to ensure adequate\ncoverage of all scenarios. When applied to our dataset, the experimental\nresults underscore the robustness and effectiveness of our proposed multimodal\nfusion approach.\n","authors":["Arindam Das","Sudarshan Paul","Niko Scholz","Akhilesh Kumar Malviya","Ganesh Sistu","Ujjwal Bhattacharya","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2402.00637v1.pdf","comment":"16 pages, 12 Figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.00627v1","updated":"2024-02-01T14:41:59Z","published":"2024-02-01T14:41:59Z","title":"CapHuman: Capture Your Moments in Parallel Universes","summary":" We concentrate on a novel human-centric image synthesis task: given only one\nreference facial photograph, the model is expected to generate specific\nindividual images with diverse head positions, poses, and facial expressions in\ndifferent contexts.
To accomplish this goal, we argue that our generative model\nshould possess the following favorable characteristics: (1) a strong\nvisual and semantic understanding of our world and human society for basic\nobject and human image generation; (2) generalizable identity preservation\nability; and (3) flexible and fine-grained head control. Recently, large\npre-trained text-to-image diffusion models have shown remarkable results,\nserving as a powerful generative foundation. As a basis, we aim to unleash the\nabove capabilities of the pre-trained model. In this work, we present a new\nframework named CapHuman. We embrace the ``encode then learn to align\"\nparadigm, which enables generalizable identity preservation for new individuals\nwithout cumbersome tuning at inference. CapHuman encodes identity features and\nthen learns to align them into the latent space. Moreover, we introduce the 3D\nfacial prior to equip our model with control over the human head in a flexible\nand 3D-consistent manner. Extensive qualitative and quantitative analyses\ndemonstrate that our CapHuman can produce well-identity-preserved,\nphoto-realistic, and high-fidelity portraits with content-rich representations\nand various head renditions, superior to established baselines. Code and\ncheckpoint will be released at https://github.com/VamosC/CapHuman.\n","authors":["Chao Liang","Fan Ma","Linchao Zhu","Yingying Deng","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2402.00627v1.pdf","comment":"Project page: https://caphuman.github.io/"},{"id":"http://arxiv.org/abs/2402.00626v1","updated":"2024-02-01T14:41:20Z","published":"2024-02-01T14:41:20Z","title":"Vision-LLMs Can Fool Themselves with Self-Generated Typographic Attacks","summary":" Recently, significant progress has been made on Large Vision-Language Models\n(LVLMs), a new class of VL models that make use of large pre-trained language\nmodels. Yet, their vulnerability to typographic attacks, which involve\nsuperimposing misleading text onto an image, remains unstudied. Furthermore,\nprior typographic attacks rely on sampling a random misleading class from\na predefined set of classes. However, the randomly chosen class might not be\nthe most effective attack. To address these issues, we first introduce a novel\nbenchmark uniquely designed to test LVLMs' vulnerability to typographic\nattacks. Furthermore, we introduce a new and more effective typographic attack:\nSelf-Generated typographic attacks. Indeed, our method, given an image, makes\nuse of the strong language capabilities of models like GPT-4V by simply\nprompting them to recommend a typographic attack. Using our novel benchmark, we\nuncover that typographic attacks represent a significant threat against\nLVLMs. Furthermore, we uncover that typographic attacks recommended by GPT-4V\nusing our new method are not only more effective against GPT-4V itself compared\nto prior attacks, but also against a host of less capable yet popular\nopen-source models like LLaVA, InstructBLIP, and MiniGPT4.\n","authors":["Maan Qraitem","Nazia Tasnim","Kate Saenko","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2402.00626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00608v1","updated":"2024-02-01T14:02:06Z","published":"2024-02-01T14:02:06Z","title":"Deep Clustering Using the Soft Silhouette Score: Towards Compact and\n Well-Separated Clusters","summary":" Unsupervised learning has gained prominence in the big data era, offering a\nmeans to extract valuable insights from unlabeled datasets.
Deep clustering has\nemerged as an important unsupervised category, aiming to exploit the non-linear\nmapping capabilities of neural networks in order to enhance clustering\nperformance. The majority of deep clustering literature focuses on minimizing\nthe inner-cluster variability in some embedded space while keeping the learned\nrepresentation consistent with the original high-dimensional dataset. In this\nwork, we propose soft silhouette, a probabilistic formulation of the silhouette\ncoefficient. Like the conventional silhouette coefficient, soft silhouette\nrewards compact and distinctly separated clustering solutions. When\noptimized within a deep clustering framework, soft silhouette guides the\nlearned representations towards forming compact and well-separated clusters. In\naddition, we introduce an autoencoder-based deep learning architecture that is\nsuitable for optimizing the soft silhouette objective function. The proposed\ndeep clustering method has been tested and compared with several well-studied\ndeep clustering methods on various benchmark datasets, yielding very\nsatisfactory clustering results.\n","authors":["Georgios Vardakas","Ioannis Papakostas","Aristidis Likas"],"pdf_url":"https://arxiv.org/pdf/2402.00608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00606v1","updated":"2024-02-01T13:58:32Z","published":"2024-02-01T13:58:32Z","title":"Dynamic Texture Transfer using PatchMatch and Transformers","summary":" How to automatically transfer the dynamic texture of a given video to the\ntarget still image is a challenging and ongoing problem. In this paper, we\npropose to handle this task via a simple yet effective model that utilizes both\nPatchMatch and Transformers. The key idea is to decompose the task of dynamic\ntexture transfer into two stages, where the start frame of the target video\nwith the desired dynamic texture is synthesized in the first stage via a\ndistance map guided texture transfer module based on the PatchMatch algorithm.\nThen, in the second stage, the synthesized image is decomposed into\nstructure-agnostic patches, according to which their corresponding subsequent\npatches can be predicted by exploiting the powerful capability of Transformers\nequipped with VQ-VAE for processing long discrete sequences. After obtaining\nall these patches, we apply a Gaussian weighted average merging strategy to\nsmoothly assemble them into each frame of the target stylized video.\nExperimental results demonstrate the effectiveness and superiority of the\nproposed method in dynamic texture transfer compared to the state of the art.\n","authors":["Guo Pu","Shiyao Xu","Xixin Cao","Zhouhui Lian"],"pdf_url":"https://arxiv.org/pdf/2402.00606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00593v1","updated":"2024-02-01T13:43:33Z","published":"2024-02-01T13:43:33Z","title":"Coronary Artery Disease Classification with Different Lesion Degree\n Ranges based on Deep Learning","summary":" Invasive Coronary Angiography (ICA) images are considered the gold standard\nfor assessing the state of the coronary arteries. Deep learning classification\nmethods are widely used and well-developed in different areas where medical\nimaging evaluation has an essential impact due to the development of\ncomputer-aided diagnosis systems that can support physicians in their clinical\nprocedures. In this paper, a new performance analysis of deep learning methods\nfor binary ICA classification with different lesion degrees is reported.
To\nreach this goal, an annotated dataset of ICA images containing the ground\ntruth, the location of lesions, and seven possible severity degrees ranging\nbetween 0% and 100% was employed. The ICA images were divided into 'lesion' or\n'non-lesion' patches. We aim to study how binary classification performance is\naffected by the different lesion degrees considered in the positive class.\nTherefore, five known convolutional neural network architectures were trained\nwith different input images where different lesion degree ranges were gradually\nincorporated until all seven lesion degrees were considered. In addition, four\ntypes of experiments with and without data augmentation were designed, whose\nF-measure and Area Under Curve (AUC) were computed. Reported results achieved\nan F-measure and AUC of 92.7% and 98.1%, respectively. However, lesion\nclassification is highly affected by the lesion degree intended to be\nclassified, with 15% less accuracy when <99% lesion patches are present.\n","authors":["Ariadna Jiménez-Partinen","Karl Thurnhofer-Hemsi","Esteban J. Palomo","Jorge Rodríguez-Capitán","Ana I. Molina-Ramos"],"pdf_url":"https://arxiv.org/pdf/2402.00593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00576v1","updated":"2024-02-01T13:14:38Z","published":"2024-02-01T13:14:38Z","title":"Tropical Decision Boundaries for Neural Networks Are Robust Against\n Adversarial Attacks","summary":" We introduce a simple, easy to implement, and computationally efficient\ntropical convolutional neural network architecture that is robust against\nadversarial attacks. We exploit the tropical nature of piece-wise linear neural\nnetworks by embedding the data in the tropical projective torus in a single\nhidden layer, which can be added to any model. We study the geometry of its\ndecision boundary theoretically and show its robustness against adversarial\nattacks on image datasets using computational experiments.\n","authors":["Kurt Pasque","Christopher Teska","Ruriko Yoshida","Keiji Miura","Jefferson Huang"],"pdf_url":"https://arxiv.org/pdf/2402.00576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00575v1","updated":"2024-02-01T13:13:16Z","published":"2024-02-01T13:13:16Z","title":"Diffusion-based Light Field Synthesis","summary":" Light fields (LFs), conducive to comprehensive scene radiance recorded across\nangular dimensions, find wide applications in 3D reconstruction, virtual\nreality, and computational photography. However, LF acquisition is inevitably\ntime-consuming and resource-intensive due to the mainstream acquisition\nstrategy involving manual capture or laborious software synthesis. Given such a\nchallenge, we introduce LFdiff, a straightforward yet effective diffusion-based\ngenerative framework tailored for LF synthesis, which adopts only a single RGB\nimage as input. LFdiff leverages disparity estimated by a monocular depth\nestimation network and incorporates two distinctive components: a novel\ncondition scheme and a noise estimation network tailored for LF data.\nSpecifically, we design a position-aware warping condition scheme, enhancing\ninter-view geometry learning via a robust conditional signal. We then propose\nDistgUnet, a disentanglement-based noise estimation network, to harness\ncomprehensive LF representations. Extensive experiments demonstrate that LFdiff\nexcels in synthesizing visually pleasing and disparity-controllable light\nfields with enhanced generalization capability. Additionally, comprehensive\nresults affirm the broad applicability of the
generated LF data, spanning\napplications like LF super-resolution and refocusing.\n","authors":["Ruisheng Gao","Yutong Liu","Zeyu Xiao","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2402.00575v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.14232v3","updated":"2024-02-01T13:06:51Z","published":"2023-12-21T18:46:46Z","title":"Parrot Captions Teach CLIP to Spot Text","summary":" Despite being the foundation model in numerous vision-language\napplications, CLIP suffers from a severe text spotting bias. Such bias\ncauses CLIP models to `Parrot' the visual text embedded within images while\ndisregarding the authentic visual semantics. We uncover that in the most\npopular image-text dataset LAION-2B, the captions also densely parrot (spell)\nthe text embedded in images. Our analysis shows that around 50% of images are\nembedded with visual text content, and around 30% of caption words appear in\nthis embedded visual content. Based on this observation, we thoroughly inspect\nthe different released versions of CLIP models and verify that the visual text\nis the dominant factor in measuring the LAION-style image-text similarity for\nthese models. To examine whether these parrot captions shape the text spotting\nbias, we train a series of CLIP models with LAION subsets curated by different\nparrot-caption-oriented criteria. We show that training with parrot captions\neasily shapes such bias but harms the expected visual-language representation\nlearning in CLIP models. This suggests that it is urgent to revisit either the\ndesign of CLIP-like models or the existing image-text dataset curation pipeline\nbuilt on CLIP score filtering.\n","authors":["Yiqi Lin","Conghui He","Alex Jinpeng Wang","Bin Wang","Weijia Li","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.14232v3.pdf","comment":"project page: https://linyq17.github.io/CLIP-Parrot-Bias/. Add more\n analysis and ablation studies. Update Figure 3 with a more precise metric"},{"id":"http://arxiv.org/abs/2402.00570v1","updated":"2024-02-01T13:03:13Z","published":"2024-02-01T13:03:13Z","title":"CADICA: a new dataset for coronary artery disease detection by using\n invasive coronary angiography","summary":" Coronary artery disease (CAD) remains the leading cause of death globally and\ninvasive coronary angiography (ICA) is considered the gold standard of\nanatomical imaging evaluation when CAD is suspected. However, risk evaluation\nbased on ICA has several limitations, such as visual assessment of stenosis\nseverity, which has significant interobserver variability. This motivates the\ndevelopment of a lesion classification system that can support specialists in\ntheir clinical procedures. Although deep learning classification methods are\nwell-developed in other areas of medical imaging, ICA image classification is\nstill at an early stage. One of the most important reasons is the lack of\navailable and high-quality open-access datasets. In this paper, we report a\nnew annotated ICA image dataset, CADICA, to provide the research community\nwith a comprehensive and rigorous dataset of coronary angiography consisting of\na set of acquired patient videos and associated disease-related metadata. This\ndataset can be used by clinicians to train their skills in angiographic\nassessment of CAD severity and by computer scientists to create computer-aided\ndiagnostic systems to help in such assessment.
In addition, baseline\nclassification methods are proposed and analyzed, validating the functionality\nof CADICA and giving the scientific community a starting point to improve CAD\ndetection.\n","authors":["Ariadna Jiménez-Partinen","Miguel A. Molina-Cabello","Karl Thurnhofer-Hemsi","Esteban J. Palomo","Jorge Rodríguez-Capitán","Ana I. Molina-Ramos","Manuel Jiménez-Navarro"],"pdf_url":"https://arxiv.org/pdf/2402.00570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14976v4","updated":"2024-02-01T12:51:34Z","published":"2023-09-26T14:52:51Z","title":"MoCaE: Mixture of Calibrated Experts Significantly Improves Object\n Detection","summary":" Combining the strengths of many existing predictors to obtain a Mixture of\nExperts which is superior to its individual components is an effective way to\nimprove performance without having to develop new architectures or train a\nmodel from scratch. However, surprisingly, we find that na\\"ively combining\nexpert object detectors in a similar way to Deep Ensembles can often lead to\ndegraded performance. We identify that the primary cause of this issue is that\nthe predictions of the experts do not match their performance, a term referred\nto as miscalibration. Consequently, the most confident detector dominates the\nfinal predictions, preventing the mixture from leveraging all the predictions\nfrom the experts appropriately. To address this, when constructing the Mixture\nof Experts, we propose to combine their predictions in a manner which reflects\nthe individual performance of the experts; an objective we achieve by first\ncalibrating the predictions before filtering and refining them. We term this\napproach the Mixture of Calibrated Experts and demonstrate its effectiveness\nthrough extensive experiments on 5 different detection tasks using a variety of\ndetectors, showing that it: (i) improves object detectors on COCO and instance\nsegmentation methods on LVIS by up to $\\sim 2.5$ AP; (ii) reaches\nstate-of-the-art on COCO test-dev with $65.1$ AP and on DOTA with $82.62$\n$\\mathrm{AP_{50}}$; (iii) outperforms single models consistently on recent\ndetection tasks such as Open Vocabulary Object Detection.\n","authors":["Kemal Oksuz","Selim Kuzucu","Tom Joy","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2309.14976v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00564v1","updated":"2024-02-01T12:50:48Z","published":"2024-02-01T12:50:48Z","title":"A Single Graph Convolution Is All You Need: Efficient Grayscale Image\n Classification","summary":" Image classifiers often rely on convolutional neural networks (CNNs), which\nare inherently more heavyweight than multilayer perceptrons (MLPs); this can be\nproblematic in real-time applications. Additionally, many\nimage classification models work on both RGB and grayscale datasets.\nClassifiers that operate solely on grayscale images are much less common.\nGrayscale image classification has diverse applications, including but not\nlimited to medical image classification and synthetic aperture radar (SAR)\nautomatic target recognition (ATR). Thus, we present a novel grayscale (single\nchannel) image classification approach using a vectorized view of images. We\nexploit the lightweight nature of MLPs by viewing each image as a vector and\nreducing our problem to the grayscale image classification setting. We find\nthat using a single graph convolutional layer batch-wise increases accuracy and\nreduces variance in the performance of our model.
Moreover, we develop a\ncustomized accelerator on FPGA for the proposed model with several\noptimizations to improve its performance. Our experimental results on benchmark\ngrayscale image datasets demonstrate the effectiveness of the proposed model,\nachieving vastly lower latency (up to 16$\\times$ less) and competitive or\nleading performance compared to other state-of-the-art image classification\nmodels on various domain-specific grayscale image classification datasets.\n","authors":["Jacob Fein-Ashley","Tian Ye","Sachini Wickramasinghe","Bingyi Zhang","Rajgopal Kannan","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2402.00564v1.pdf","comment":"6 pages of content, 1 page of references"},{"id":"http://arxiv.org/abs/2309.11268v3","updated":"2024-02-01T12:47:15Z","published":"2023-09-20T12:51:13Z","title":"StructChart: Perception, Structuring, Reasoning for Visual Chart\n Understanding","summary":" Charts are common in literature across different scientific fields, conveying\nrich information easily accessible to readers. Current chart-related tasks\nfocus on either chart perception, which refers to extracting information from\nvisual charts, or reasoning over the extracted data, e.g., in a\ntabular form. In this paper, we aim to establish a unified and label-efficient\nlearning paradigm for joint perception and reasoning tasks, which can be\ngenerally applicable to different downstream tasks, beyond the\nquestion-answering task as specifically studied in peer works. Specifically,\nStructChart first reformulates the chart information from the popular tabular\nform (specifically linearized CSV) to the proposed Structured Triplet\nRepresentations (STR), which is better suited to reducing the task gap between\nchart perception and reasoning due to the employed structured information\nextraction for charts. We then propose a Structuring Chart-oriented\nRepresentation Metric (SCRM) to quantitatively evaluate the performance for the\nchart perception task. To enrich the dataset for training, we further explore\nthe possibility of leveraging a Large Language Model (LLM), enhancing chart\ndiversity in terms of both visual style and statistical information. Extensive\nexperiments are conducted on various chart-related tasks, demonstrating the\neffectiveness and promising potential for a unified chart perception-reasoning\nparadigm to push the frontier of chart understanding.\n","authors":["Renqiu Xia","Bo Zhang","Haoyang Peng","Ning Liao","Peng Ye","Botian Shi","Junchi Yan","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.11268v3.pdf","comment":"SimChart9K is available for downloading at:\n https://github.com/UniModal4Reasoning/SimChart9K 26 pages, 15 figures"},{"id":"http://arxiv.org/abs/2402.00541v1","updated":"2024-02-01T12:06:55Z","published":"2024-02-01T12:06:55Z","title":"Masked Conditional Diffusion Model for Enhancing Deepfake Detection","summary":" Recent studies on deepfake detection have achieved promising results when\ntraining and testing faces are from the same dataset. However, their results\nseverely degrade when confronted with forged samples that the model has not yet\nseen during training. In this paper, we present a new insight into diffusion\nmodel-based data augmentation and propose a Masked Conditional Diffusion Model\n(MCDM) for enhancing deepfake detection.
It generates a variety of forged faces from a\nmasked pristine one, encouraging the deepfake detection model to learn generic\nand robust representations without overfitting to special artifacts. Extensive\nexperiments demonstrate that forgery images generated with our method are of\nhigh quality and helpful for improving the performance of deepfake detection\nmodels.\n","authors":["Tiewen Chen","Shanmin Yang","Shu Hu","Zhenghan Fang","Ying Fu","Xi Wu","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2402.00541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00534v1","updated":"2024-02-01T12:01:43Z","published":"2024-02-01T12:01:43Z","title":"A Manifold Representation of the Key in Vision Transformers","summary":" Vision Transformers implement multi-head self-attention (MSA) via stacking\nmultiple attention blocks. The query, key, and value are often intertwined and\ngenerated within those blocks via a single, shared linear transformation. This\npaper explores the concept of disentangling the key from the query and value,\nand adopting a manifold representation for the key. Our experiments reveal that\ndecoupling and endowing the key with a manifold structure can enhance the model\nperformance. Specifically, ViT-B exhibits a 0.87% increase in top-1 accuracy,\nwhile Swin-T sees a boost of 0.52% in top-1 accuracy on the ImageNet-1K\ndataset, with eight charts in the manifold key. Our approach also yields\npositive results in object detection and instance segmentation tasks on the\nCOCO dataset. Through detailed ablation studies, we establish that these\nperformance gains are not merely due to the simplicity of adding more\nparameters and computations. Future research may investigate strategies for\ncutting the budget of such representations and aim for further performance\nimprovements based on our findings.\n","authors":["Li Meng","Morten Goodwin","Anis Yazidi","Paal Engelstad"],"pdf_url":"https://arxiv.org/pdf/2402.00534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11098v5","updated":"2024-02-01T11:51:01Z","published":"2023-03-20T13:33:31Z","title":"Understanding the Role of the Projector in Knowledge Distillation","summary":" In this paper we revisit the efficacy of knowledge distillation as a function\nmatching and metric learning problem. In doing so we verify three important\ndesign decisions, namely the normalisation, soft maximum function, and\nprojection layers as key ingredients. We theoretically show that the projector\nimplicitly encodes information on past examples, enabling relational gradients\nfor the student. We then show that the normalisation of representations is\ntightly coupled with the training dynamics of this projector, which can have a\nlarge impact on the student's performance. Finally, we show that a simple soft\nmaximum function can be used to address any significant capacity gap problems.\nExperimental results on various benchmark datasets demonstrate that using these\ninsights can lead to superior or comparable performance to state-of-the-art\nknowledge distillation techniques, despite being much more computationally\nefficient. In particular, we obtain these results across image classification\n(CIFAR100 and ImageNet), object detection (COCO2017), and on more difficult\ndistillation objectives, such as training data-efficient transformers, whereby\nwe attain a 77.2% top-1 accuracy with DeiT-Ti on ImageNet.
Code and models are\npublicly available.\n","authors":["Roy Miles","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2303.11098v5.pdf","comment":"AAAI 2024. Code available at\n https://github.com/roymiles/Simple-Recipe-Distillation"},{"id":"http://arxiv.org/abs/2402.00525v1","updated":"2024-02-01T11:46:44Z","published":"2024-02-01T11:46:44Z","title":"StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time\n Rendering","summary":" Gaussian Splatting has emerged as a prominent model for constructing 3D\nrepresentations from images across diverse domains. However, the efficiency of\nthe 3D Gaussian Splatting rendering pipeline relies on several simplifications.\nNotably, reducing Gaussians to 2D splats with a single view-space depth\nintroduces popping and blending artifacts during view rotation. Addressing this\nissue requires accurate per-pixel depth computation, yet a full per-pixel sort\nproves excessively costly compared to a global sort operation. In this paper,\nwe present a novel hierarchical rasterization approach that systematically\nresorts and culls splats with minimal processing overhead. Our software\nrasterizer effectively eliminates popping artifacts and view inconsistencies,\nas demonstrated through both quantitative and qualitative measurements.\nSimultaneously, our method mitigates the potential for cheating view-dependent\neffects with popping, ensuring a more authentic representation. Despite the\nelimination of cheating, our approach achieves comparable quantitative results\nfor test images, while increasing the consistency for novel view synthesis in\nmotion. Due to its design, our hierarchical approach is only 4% slower on\naverage than the original Gaussian Splatting. Notably, enforcing consistency\nenables a reduction in the number of Gaussians by approximately half with\nnearly identical quality and view-consistency. Consequently, rendering\nperformance is nearly doubled, making our approach 1.6x faster than the\noriginal Gaussian Splatting, with a 50% reduction in memory requirements.\n","authors":["Lukas Radl","Michael Steiner","Mathias Parger","Alexander Weinrauch","Bernhard Kerbl","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2402.00525v1.pdf","comment":"Video: https://youtu.be/RJQlSORNkr0"},{"id":"http://arxiv.org/abs/2401.11261v2","updated":"2024-02-01T10:44:08Z","published":"2024-01-20T16:01:18Z","title":"Diffusion Model Conditioning on Gaussian Mixture Model and Negative\n Gaussian Mixture Gradient","summary":" Diffusion models (DMs) are a type of generative model that has a huge impact\non image synthesis and beyond. They achieve state-of-the-art generation results\nin various generative tasks. A great diversity of conditioning inputs, such as\ntext or bounding boxes, is accessible to control the generation. In this work,\nwe propose a conditioning mechanism utilizing Gaussian mixture models (GMMs) as\nfeature conditioning to guide the denoising process. Based on set theory, we\nprovide a comprehensive theoretical analysis showing that the conditional\nlatent distributions based on features and on classes differ significantly, so\nthat conditioning on features produces fewer defective generations than\nconditioning on classes. Two diffusion models conditioned on the Gaussian\nmixture model are trained separately for comparison. Experiments support our\nfindings.
A novel gradient function called the negative Gaussian mixture\ngradient (NGMG) is proposed and applied in diffusion model training with an\nadditional classifier. Training stability is thereby improved. We also\ntheoretically prove that NGMG shares the same benefit as the Earth Mover's\n(Wasserstein) distance as a more sensible cost function when learning\ndistributions supported by low-dimensional manifolds.\n","authors":["Weiguo Lu","Xuan Wu","Deng Ding","Jinqiao Duan","Jirong Zhuang","Gangnan Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.11261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00481v1","updated":"2024-02-01T10:37:41Z","published":"2024-02-01T10:37:41Z","title":"Bias Mitigating Few-Shot Class-Incremental Learning","summary":" Few-shot class-incremental learning (FSCIL) aims at recognizing novel classes\ncontinually with limited novel class samples. A mainstream baseline for FSCIL\nis first to train the whole model in the base session, then freeze the feature\nextractor in the incremental sessions. Despite achieving high overall accuracy,\nmost methods exhibit notably low accuracy for incremental classes. Some recent\nmethods somewhat alleviate the accuracy imbalance between base and incremental\nclasses by fine-tuning the feature extractor in the incremental sessions, but\nthey further cause the accuracy imbalance between past and current incremental\nclasses. In this paper, we study the causes of such classification accuracy\nimbalance for FSCIL, and abstract them into a unified model bias problem. Based\non the analyses, we propose a novel method to mitigate model bias of the FSCIL\nproblem during training and inference processes, which includes mapping ability\nstimulation, separately dual-feature classification, and self-optimizing\nclassifiers. Extensive experiments on three widely-used FSCIL benchmark\ndatasets show that our method significantly mitigates the model bias problem\nand achieves state-of-the-art performance.\n","authors":["Li-Jun Zhao","Zhen-Duo Chen","Zi-Chao Zhang","Xin Luo","Xin-Shun Xu"],"pdf_url":"https://arxiv.org/pdf/2402.00481v1.pdf","comment":"8 pages (not including references and checklist)"},{"id":"http://arxiv.org/abs/2402.00467v1","updated":"2024-02-01T10:14:53Z","published":"2024-02-01T10:14:53Z","title":"Can you see me now? Blind spot estimation for autonomous vehicles using\n scenario-based simulation with random reference sensors","summary":" In this paper, we introduce a method for estimating blind spots for sensor\nsetups of autonomous or automated vehicles and/or robotics applications. In\ncomparison to previous methods that rely on geometric approximations, our\npresented approach provides more realistic coverage estimates by utilizing\naccurate and detailed 3D simulation environments. Our method leverages point\nclouds from LiDAR sensors or camera depth images from high-fidelity simulations\nof target scenarios to provide accurate and actionable visibility estimates. A\nMonte Carlo-based reference sensor simulation enables us to accurately estimate\nblind spot size as a metric of coverage, as well as detection probabilities of\nobjects at arbitrary positions.\n","authors":["Marc Uecker","J.
Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2402.00467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08731v2","updated":"2024-02-01T09:59:34Z","published":"2023-06-14T20:33:49Z","title":"EPIC Fields: Marrying 3D Geometry and Video Understanding","summary":" Neural rendering is fuelling a unification of learning, 3D geometry and video\nunderstanding that has been waiting for more than two decades. Progress,\nhowever, is still hampered by a lack of suitable datasets and benchmarks. To\naddress this gap, we introduce EPIC Fields, an augmentation of EPIC-KITCHENS\nwith 3D camera information. Like other datasets for neural rendering, EPIC\nFields removes the complex and expensive step of reconstructing cameras using\nphotogrammetry, and allows researchers to focus on modelling problems. We\nillustrate the challenge of photogrammetry in egocentric videos of dynamic\nactions and propose innovations to address them. Compared to other neural\nrendering datasets, EPIC Fields is better tailored to video understanding\nbecause it is paired with labelled action segments and the recent VISOR segment\nannotations. To further motivate the community, we also evaluate two benchmark\ntasks in neural rendering and segmenting dynamic objects, with strong baselines\nthat showcase what is not possible today. We also highlight the advantage of\ngeometry in semi-supervised video object segmentations on the VISOR\nannotations. EPIC Fields reconstructs 96% of videos in EPICKITCHENS,\nregistering 19M frames in 99 hours recorded in 45 kitchens.\n","authors":["Vadim Tschernezki","Ahmad Darkhalil","Zhifan Zhu","David Fouhey","Iro Laina","Diane Larlus","Dima Damen","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2306.08731v2.pdf","comment":"Published at NeurIPS 2023. 24 pages, 15 figures. Project Webpage:\n http://epic-kitchens.github.io/epic-fields"},{"id":"http://arxiv.org/abs/2309.07973v2","updated":"2024-02-01T09:50:08Z","published":"2023-09-14T18:16:58Z","title":"M3Dsynth: A dataset of medical 3D images with AI-generated local\n manipulations","summary":" The ability to detect manipulated visual content is becoming increasingly\nimportant in many application fields, given the rapid advances in image\nsynthesis methods. Of particular concern is the possibility of modifying the\ncontent of medical images, altering the resulting diagnoses. Despite its\nrelevance, this issue has received limited attention from the research\ncommunity. One reason is the lack of large and curated datasets to use for\ndevelopment and benchmarking purposes. Here, we investigate this issue and\npropose M3Dsynth, a large dataset of manipulated Computed Tomography (CT) lung\nimages. We create manipulated images by injecting or removing lung cancer\nnodules in real CT scans, using three different methods based on Generative\nAdversarial Networks (GAN) or Diffusion Models (DM), for a total of 8,577\nmanipulated samples. Experiments show that these images easily fool automated\ndiagnostic tools. 
We also tested several state-of-the-art forensic detectors\nand demonstrated that, once trained on the proposed dataset, they are able to\naccurately detect and localize manipulated synthetic content, even when\ntraining and test sets are not aligned, showing good generalization ability.\nDataset and code are publicly available at\nhttps://grip-unina.github.io/M3Dsynth/.\n","authors":["Giada Zingarini","Davide Cozzolino","Riccardo Corvi","Giovanni Poggi","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2309.07973v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00453v1","updated":"2024-02-01T09:43:30Z","published":"2024-02-01T09:43:30Z","title":"Instruction Makes a Difference","summary":" We introduce the Instruction Document Visual Question Answering (iDocVQA)\ndataset and the Large Language Document (LLaDoc) model for training\nLanguage-Vision (LV) models for document analysis and prediction on document\nimages, respectively. Usually, deep neural networks for the DocVQA task are\ntrained on datasets lacking instructions. We show that using\ninstruction-following datasets improves performance. We compare performance\nacross document-related datasets using the recent state-of-the-art (SotA) Large\nLanguage and Vision Assistant (LLaVA) 1.5 as the base model. We also evaluate\nthe performance of the derived models for object hallucination using the\nPolling-based Object Probing Evaluation (POPE) dataset. The results show that\ninstruction-tuning performance ranges from 11X to 32X of zero-shot performance\nand from 0.1% to 4.2% over non-instruction (traditional task) finetuning.\nDespite the gains, these still fall short of human performance (94.36%),\nimplying there is much room for improvement.\n","authors":["Tosin Adewumi","Nudrat Habib","Lama Alkhaled","Elisa Barney"],"pdf_url":"https://arxiv.org/pdf/2402.00453v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.00241v2","updated":"2024-02-01T09:33:00Z","published":"2023-11-01T02:49:25Z","title":"1DFormer: a Transformer Architecture Learning 1D Landmark\n Representations for Facial Landmark Tracking","summary":" Recently, heatmap regression methods based on 1D landmark representations\nhave shown prominent performance on locating facial landmarks. However,\nprevious methods neglected to deeply explore the potential of 1D landmark\nrepresentations for sequential and structural modeling of multiple landmarks in\nfacial landmark tracking. To address this limitation, we propose a\nTransformer architecture, namely 1DFormer, which learns informative 1D landmark\nrepresentations by capturing the dynamic and the geometric patterns of\nlandmarks via token communications in both temporal and spatial dimensions for\nfacial landmark tracking.
For temporal modeling, we propose a recurrent token\nmixing mechanism, an axis-landmark-positional embedding mechanism, as well as a\nconfidence-enhanced multi-head attention mechanism to adaptively and robustly\nembed long-term landmark dynamics into their 1D representations; for structure\nmodeling, we design intra-group and inter-group structure modeling mechanisms\nto encode the component-level as well as global-level facial structure patterns\nas a refinement for the 1D representations of landmarks through token\ncommunications in the spatial dimension via 1D convolutional layers.\nExperimental results on the 300VW and the TF databases show that 1DFormer\nsuccessfully models the long-range sequential patterns as well as the inherent\nfacial structures to learn informative 1D representations of landmark\nsequences, and achieves state-of-the-art performance on facial landmark\ntracking.\n","authors":["Shi Yin","Shijie Huan","Shangfei Wang","Jinshui Hu","Tao Guo","Bing Yin","Baocai Yin","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.00241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00448v1","updated":"2024-02-01T09:32:39Z","published":"2024-02-01T09:32:39Z","title":"Dual-Student Knowledge Distillation Networks for Unsupervised Anomaly\n Detection","summary":" Due to the data imbalance and the diversity of defects, student-teacher\nnetworks (S-T) are favored in unsupervised anomaly detection, which explores\nthe discrepancy in feature representation derived from the knowledge\ndistillation process to recognize anomalies. However, the vanilla S-T network\nis not stable. Employing identical structures to construct the S-T network may\nweaken the representative discrepancy on anomalies. But using different\nstructures can increase the likelihood of divergent performance on normal data.\nTo address this problem, we propose a novel dual-student knowledge distillation\n(DSKD) architecture. Different from other S-T networks, we use two student\nnetworks and a single pre-trained teacher network, where the students have the\nsame scale but inverted structures. This framework can enhance the distillation\neffect to improve the consistency in recognition of normal data, and\nsimultaneously introduce diversity for anomaly representation. To explore\nhigh-dimensional semantic information to capture anomaly clues, we employ two\nstrategies. First, a pyramid matching mode is used to perform knowledge\ndistillation on multi-scale feature maps in the intermediate layers of\nnetworks. Second, an interaction is facilitated between the two student\nnetworks through a deep feature embedding module, which is inspired by\nreal-world group discussions. In terms of classification, we obtain pixel-wise\nanomaly segmentation maps by measuring the discrepancy between the output\nfeature maps of the teacher and student networks, from which an anomaly score\nis computed for sample-wise determination. We evaluate DSKD on three benchmark\ndatasets and probe the effects of internal modules through ablation\nexperiments. 
The results demonstrate that DSKD can achieve exceptional\nperformance on small models like ResNet18 and effectively improve vanilla S-T\nnetworks.\n","authors":["Liyi Yao","Shaobing Gao"],"pdf_url":"https://arxiv.org/pdf/2402.00448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00433v1","updated":"2024-02-01T08:58:57Z","published":"2024-02-01T08:58:57Z","title":"Merging Multi-Task Models via Weight-Ensembling Mixture of Experts","summary":" Merging various task-specific Transformer-based models trained on different\ntasks yields a single unified model that can execute all the tasks concurrently.\nPrevious methods, exemplified by task arithmetic, have been proven to be both\neffective and scalable. Existing methods have primarily focused on seeking a\nstatic optimal solution within the original model parameter space. A notable\nchallenge is mitigating the interference between parameters of different\nmodels, which can substantially deteriorate performance. In this paper, we\npropose to merge most of the parameters while upscaling the MLP of the\nTransformer layers to a weight-ensembling mixture of experts (MoE) module,\nwhich can dynamically integrate shared and task-specific knowledge based on the\ninput, thereby providing a more flexible solution that can adapt to the\nspecific needs of each instance. Our key insight is that by identifying and\nseparating shared knowledge and task-specific knowledge, and then dynamically\nintegrating them, we can mitigate the parameter interference problem to a great\nextent. We conduct the conventional multi-task model merging experiments and\nevaluate the generalization and robustness of our method. The results\ndemonstrate the effectiveness of our method and provide a comprehensive\nunderstanding of it. The code is available at\nhttps://anonymous.4open.science/r/weight-ensembling_MoE-67C9/\n","authors":["Anke Tang","Li Shen","Yong Luo","Nan Yin","Lefei Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.00433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13445v5","updated":"2024-02-01T08:47:44Z","published":"2023-04-26T11:02:04Z","title":"Neural-PBIR Reconstruction of Shape, Material, and Illumination","summary":" Reconstructing the shape and spatially varying surface appearances of a\nphysical-world object as well as its surrounding illumination based on 2D\nimages (e.g., photographs) of the object has been a long-standing problem in\ncomputer vision and graphics. In this paper, we introduce an accurate and\nhighly efficient object reconstruction pipeline combining neural based object\nreconstruction and physics-based inverse rendering (PBIR). Our pipeline first\nleverages a neural SDF based shape reconstruction to produce a high-quality but\npotentially imperfect object shape. Then, we introduce a neural material and\nlighting distillation stage to achieve high-quality predictions for material\nand illumination. In the last stage, initialized by the neural predictions, we\nperform PBIR to refine the initial results and obtain the final high-quality\nreconstruction of object shape, material, and illumination. Experimental\nresults demonstrate that our pipeline significantly outperforms existing\nmethods quality-wise and performance-wise.\n","authors":["Cheng Sun","Guangyan Cai","Zhengqin Li","Kai Yan","Cheng Zhang","Carl Marshall","Jia-Bin Huang","Shuang Zhao","Zhao Dong"],"pdf_url":"https://arxiv.org/pdf/2304.13445v5.pdf","comment":"ICCV 2023. 
Project page at https://neural-pbir.github.io/ Update\n Stanford-ORB results"},{"id":"http://arxiv.org/abs/2402.00422v1","updated":"2024-02-01T08:39:31Z","published":"2024-02-01T08:39:31Z","title":"Lightweight Pixel Difference Networks for Efficient Visual\n Representation Learning","summary":" Recently, there have been tremendous efforts in developing lightweight Deep\nNeural Networks (DNNs) with satisfactory accuracy, which can enable the\nubiquitous deployment of DNNs in edge devices. The core challenge of developing\ncompact and efficient DNNs lies in how to balance the competing goals of\nachieving high accuracy and high efficiency. In this paper, we propose two\nnovel types of convolutions, dubbed \\emph{Pixel Difference Convolution (PDC)\nand Binary PDC (Bi-PDC)}, which enjoy the following benefits: they capture\nhigher-order local differential information, are computationally efficient, and\ncan be integrated with existing DNNs. With PDC and Bi-PDC, we further present\ntwo lightweight deep networks named \\emph{Pixel Difference Networks (PiDiNet)}\nand \\emph{Binary PiDiNet (Bi-PiDiNet)} respectively to learn highly efficient\nyet more accurate representations for visual tasks including edge detection and\nobject recognition. Extensive experiments on popular datasets (BSDS500,\nImageNet, LFW, YTF, \\emph{etc.}) show that PiDiNet and Bi-PiDiNet achieve the\nbest accuracy-efficiency trade-off. For edge detection, PiDiNet is the first\nnetwork that can be trained without ImageNet, and can achieve human-level\nperformance on BSDS500 at 100 FPS and with $<$1M parameters. For object\nrecognition, among existing Binary DNNs, Bi-PiDiNet achieves the best accuracy\nand a nearly $2\\times$ reduction of computational cost on ResNet18. Code\navailable at\n\\href{https://github.com/hellozhuo/pidinet}{https://github.com/hellozhuo/pidinet}.\n","authors":["Zhuo Su","Jiehua Zhang","Longguang Wang","Hua Zhang","Zhen Liu","Matti Pietikäinen","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2402.00422v1.pdf","comment":"We design a novel lightweight convolutional operator for computer\n vision tasks. Both full-precision networks and BNNs are developed. Accepted\n by TPAMI"},{"id":"http://arxiv.org/abs/2402.00418v1","updated":"2024-02-01T08:36:16Z","published":"2024-02-01T08:36:16Z","title":"Short: Benchmarking transferable adversarial attacks","summary":" The robustness of deep learning models against adversarial attacks remains a\npivotal concern. This study presents, for the first time, an exhaustive review\nof the transferability aspect of adversarial attacks. It systematically\ncategorizes and critically evaluates various methodologies developed to augment\nthe transferability of adversarial attacks. This study encompasses a spectrum\nof techniques, including Generative Structure, Semantic Similarity, Gradient\nEditing, Target Modification, and Ensemble Approach. Concurrently, this paper\nintroduces a benchmark framework \\textit{TAA-Bench}, integrating ten leading\nmethodologies for adversarial attack transferability, thereby providing a\nstandardized and systematic platform for comparative analysis across diverse\nmodel architectures. Through comprehensive scrutiny, we delineate the efficacy\nand constraints of each method, shedding light on their underlying operational\nprinciples and practical utility. 
This review endeavors to be a quintessential\nresource for both scholars and practitioners in the field, charting the complex\nterrain of adversarial transferability and setting a foundation for future\nexplorations in this vital sector. The associated codebase is accessible at:\nhttps://github.com/KxPlaug/TAA-Bench\n","authors":["Zhibo Jin","Jiayu Zhang","Zhiyu Zhu","Huaming Chen"],"pdf_url":"https://arxiv.org/pdf/2402.00418v1.pdf","comment":"Accepted by NDSS 2024 Workshop"},{"id":"http://arxiv.org/abs/2401.02814v2","updated":"2024-02-01T08:34:46Z","published":"2024-01-05T13:54:45Z","title":"Object-Centric Instruction Augmentation for Robotic Manipulation","summary":" Humans interpret scenes by recognizing both the identities and positions of\nobjects in their observations. For a robot to perform tasks such as\n\\enquote{pick and place}, understanding both what the objects are and where\nthey are located is crucial. While the former has been extensively discussed in\nthe literature that uses the large language model to enrich the text\ndescriptions, the latter remains underexplored. In this work, we introduce the\n\\textit{Object-Centric Instruction Augmentation (OCI)} framework to augment\nhighly semantic and information-dense language instruction with position cues.\nWe utilize a Multi-modal Large Language Model (MLLM) to weave knowledge of\nobject locations into natural language instruction, thus aiding the policy\nnetwork in mastering actions for versatile manipulation. Additionally, we\npresent a feature reuse mechanism to integrate the vision-language features\nfrom off-the-shelf pre-trained MLLM into policy networks. Through a series of\nsimulated and real-world robotic tasks, we demonstrate that robotic manipulator\nimitation policies trained with our enhanced instructions outperform those\nrelying solely on traditional language instructions.\n","authors":["Junjie Wen","Yichen Zhu","Minjie Zhu","Jinming Li","Zhiyuan Xu","Zhengping Che","Chaomin Shen","Yaxin Peng","Dong Liu","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2401.02814v2.pdf","comment":"accepted to ICRA2024"},{"id":"http://arxiv.org/abs/2401.04181v2","updated":"2024-02-01T08:32:33Z","published":"2024-01-08T19:00:32Z","title":"Language-Conditioned Robotic Manipulation with Fast and Slow Thinking","summary":" The language-conditioned robotic manipulation aims to transfer natural\nlanguage instructions into executable actions, from simple pick-and-place to\ntasks requiring intent recognition and visual reasoning. Inspired by the dual\nprocess theory in cognitive science, which suggests two parallel systems of\nfast and slow thinking in human decision-making, we introduce Robotics with\nFast and Slow Thinking (RFST), a framework that mimics human cognitive\narchitecture to classify tasks and makes decisions on two systems based on\ninstruction types. Our RFST consists of two key components: 1) an instruction\ndiscriminator to determine which system should be activated based on the\ncurrent user instruction, and 2) a slow-thinking system that is comprised of a\nfine-tuned vision language model aligned with the policy networks, which allows\nthe robot to recognize user intention or perform reasoning tasks. To assess our\nmethodology, we built a dataset featuring real-world trajectories, capturing\nactions ranging from spontaneous impulses to tasks requiring deliberate\ncontemplation. 
Our results, both in simulation and real-world scenarios,\nconfirm that our approach adeptly manages intricate tasks that demand intent\nrecognition and reasoning. The project is available at\nhttps://jlm-z.github.io/RSFT/\n","authors":["Minjie Zhu","Yichen Zhu","Jinming Li","Junjie Wen","Zhiyuan Xu","Zhengping Che","Chaomin Shen","Yaxin Peng","Dong Liu","Feifei Feng","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2401.04181v2.pdf","comment":"accepted to ICRA2024"},{"id":"http://arxiv.org/abs/2306.15880v4","updated":"2024-02-01T08:31:59Z","published":"2023-06-28T02:33:06Z","title":"Towards Open Vocabulary Learning: A Survey","summary":" In the field of visual scene understanding, deep neural networks have made\nimpressive advancements in various core tasks like segmentation, tracking, and\ndetection. However, most approaches operate on the close-set assumption,\nmeaning that the model can only identify pre-defined categories that are\npresent in the training set. Recently, open vocabulary settings were proposed\ndue to the rapid progress of vision language pre-training. These new approaches\nseek to locate and recognize categories beyond the annotated label space. The\nopen vocabulary approach is more general, practical, and effective compared to\nweakly supervised and zero-shot settings. This paper provides a thorough review\nof open vocabulary learning, summarizing and analyzing recent developments in\nthe field. In particular, we begin by comparing it to related concepts such as\nzero-shot learning, open-set recognition, and out-of-distribution detection.\nThen, we review several closely related tasks in the case of segmentation and\ndetection, including long-tail problems, few-shot, and zero-shot settings. For\nthe method survey, we first present the basic knowledge of detection and\nsegmentation in close-set as the preliminary knowledge. Next, we examine\nvarious scenarios in which open vocabulary learning is used, identifying common\ndesign elements and core ideas. Then, we compare the recent detection and\nsegmentation approaches in commonly used datasets and benchmarks. Finally, we\nconclude with insights, issues, and discussions regarding future research\ndirections. To our knowledge, this is the first comprehensive literature review\nof open vocabulary learning. We keep tracing related works at\nhttps://github.com/jianzongwu/Awesome-Open-Vocabulary.\n","authors":["Jianzong Wu","Xiangtai Li","Shilin Xu","Haobo Yuan","Henghui Ding","Yibo Yang","Xia Li","Jiangning Zhang","Yunhai Tong","Xudong Jiang","Bernard Ghanem","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2306.15880v4.pdf","comment":"Accepted by IEEE T-PAMI. Project page:\n https://github.com/jianzongwu/Awesome-Open-Vocabulary"},{"id":"http://arxiv.org/abs/2401.17759v2","updated":"2024-02-01T08:26:54Z","published":"2024-01-31T11:36:12Z","title":"Tiered approach for rapid damage characterisation of infrastructure\n enabled by remote sensing and deep learning technologies","summary":" Critical infrastructure such as bridges are systematically targeted during\nwars and conflicts. This is because critical infrastructure is vital for\nenabling connectivity and transportation of people and goods, and hence,\nunderpinning the national and international defence planning and economic\ngrowth. Mass destruction of bridges, along with minimal or no accessibility to\nthese assets during natural and anthropogenic disasters, prevents us from\ndelivering rapid recovery. As a result, systemic resilience is drastically\nreduced. 
A solution to this challenge is to use technology for stand-off\nobservations. Yet, no method exists to characterise damage at different scales,\ni.e. regional, asset, and structural (component); moreover, there is little\nor no systematic correlation between assessments across scales. We propose an\nintegrated three-level tiered approach to fill this capability gap, and we\ndemonstrate the methods for damage characterisation enabled by fit-for-purpose\ndigital technologies. Next, this method is applied to and validated on a case\nstudy in Ukraine that includes 17 bridges. From macro to micro, we deploy\ntechnology at scale, from Sentinel-1 SAR images, crowdsourced information, and\nhigh-resolution images to deep learning for damaged infrastructure. For the\nfirst time, the interferometric coherence difference and semantic segmentation\nof images were deployed to improve the reliability of damage characterisations\nfrom regional to infrastructure component level, when enhanced assessment\naccuracy is required. This integrated method improves the speed of\ndecision-making, and thus, enhances resilience. Keywords: critical\ninfrastructure, damage characterisation, targeted attacks, restoration\n","authors":["Nadiia Kopiika","Andreas Karavias","Pavlos Krassakis","Zehao Ye","Jelena Ninic","Nataliya Shakhovska","Nikolaos Koukouzas","Sotirios Argyroudis","Stergios-Aristoteles Mitoulis"],"pdf_url":"https://arxiv.org/pdf/2401.17759v2.pdf","comment":"Main text (34 pages,18 figures); Supplementary materials (13 pages)"},{"id":"http://arxiv.org/abs/2402.00411v1","updated":"2024-02-01T08:10:39Z","published":"2024-02-01T08:10:39Z","title":"LM-HT SNN: Enhancing the Performance of SNN to ANN Counterpart through\n Learnable Multi-hierarchical Threshold Model","summary":" Compared to traditional Artificial Neural Network (ANN), Spiking Neural\nNetwork (SNN) has garnered widespread academic interest for its intrinsic\nability to transmit information in a more biologically inspired and\nenergy-efficient manner. However, despite previous efforts to optimize the\nlearning gradients and model structure of SNNs through various methods, SNNs\nstill lag behind ANNs in terms of performance to some extent. The recently\nproposed multi-threshold model provides more possibilities for further\nenhancing the learning capability of SNNs. In this paper, we rigorously analyze\nthe relationship among the multi-threshold model, vanilla spiking model and\nquantized ANNs from a mathematical perspective, then propose a novel LM-HT\nmodel, which is an equidistant multi-hierarchical model that can dynamically\nregulate the global input current and membrane potential leakage on the time\ndimension. In addition, we note that the direct training algorithm based on the\nLM-HT model can seamlessly integrate with the traditional ANN-SNN Conversion\nframework. This novel hybrid learning framework can effectively improve the\nrelatively poor performance of converted SNNs under low time latency. 
Extensive\nexperimental results have demonstrated that our LM-HT model can significantly\noutperform previous state-of-the-art works on various types of datasets,\nhelping SNNs achieve a new level of performance comparable to\nquantized ANNs.\n","authors":["Zecheng Hao","Xinyu Shi","Zhiyu Pan","Yujia Liu","Zhaofei Yu","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2402.00411v1.pdf","comment":"15 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.08083v2","updated":"2024-02-01T08:05:26Z","published":"2024-01-16T03:21:42Z","title":"UV-SAM: Adapting Segment Anything Model for Urban Village Identification","summary":" Urban villages, defined as informal residential areas in or around urban\ncenters, are characterized by inadequate infrastructures and poor living\nconditions, closely related to the Sustainable Development Goals (SDGs) on\npoverty, adequate housing, and sustainable cities. Traditionally, governments\nheavily depend on field survey methods to monitor the urban villages, which\nhowever are time-consuming, labor-intensive, and possibly delayed. Thanks to\nwidely available and timely updated satellite images, recent studies develop\ncomputer vision techniques to detect urban villages efficiently. However,\nexisting studies either focus on simple urban village image classification or\nfail to provide accurate boundary information. To accurately identify urban\nvillage boundaries from satellite images, we harness the power of the vision\nfoundation model and adapt the Segment Anything Model (SAM) to urban village\nsegmentation, named UV-SAM. Specifically, UV-SAM first leverages a small-sized\nsemantic segmentation model to produce mixed prompts for urban villages,\nincluding mask, bounding box, and image representations, which are then fed\ninto SAM for fine-grained boundary identification. Extensive experimental\nresults on two datasets in China demonstrate that UV-SAM outperforms existing\nbaselines, and identification results over multiple years show that both the\nnumber and area of urban villages are decreasing over time, providing deeper\ninsights into the development trends of urban villages and shedding light on\nvision foundation models for sustainable cities. The dataset and codes of this\nstudy are available at https://github.com/tsinghua-fib-lab/UV-SAM.\n","authors":["Xin Zhang","Yu Liu","Yuming Lin","Qingmin Liao","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2401.08083v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2402.00407v1","updated":"2024-02-01T08:02:10Z","published":"2024-02-01T08:02:10Z","title":"InfMAE: A Foundation Model in Infrared Modality","summary":" In recent years, foundation models have swept the computer vision field\nand facilitated the development of various tasks within different modalities.\nHowever, it remains an open question how to design an infrared foundation\nmodel. In this paper, we propose InfMAE, a foundation model in infrared\nmodality. We release an infrared dataset, called Inf30, to address the problem\nof lacking large-scale data for self-supervised learning in the infrared vision\ncommunity. Besides, we design an information-aware masking strategy, which is\nsuitable for infrared images. This masking strategy allows for a greater\nemphasis on the regions with richer information in infrared images during the\nself-supervised learning process, which is conducive to learning the\ngeneralized representation. 
In addition, we adopt a multi-scale encoder to\nenhance the performance of the pre-trained encoders in downstream tasks.\nFinally, based on the fact that infrared images contain little detail and\ntexture information, we design an infrared decoder module, which further\nimproves the performance of downstream tasks. Extensive experiments show that\nour proposed method InfMAE outperforms other supervised methods and\nself-supervised learning methods in three downstream tasks. Our code will be\nmade public at https://github.com/liufangcen/InfMAE.\n","authors":["Fangcen Liu","Chenqiang Gao","Yaming Zhang","Junjie Guo","Jinhao Wang","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2402.00407v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2212.08414v2","updated":"2024-02-01T07:55:03Z","published":"2022-12-16T11:27:44Z","title":"Deep Learning Methods for Calibrated Photometric Stereo and Beyond","summary":" Photometric stereo recovers the surface normals of an object from multiple\nimages with varying shading cues, i.e., modeling the relationship between\nsurface orientation and intensity at each pixel. Photometric stereo offers\nsuperior per-pixel resolution and fine reconstruction details. However, it is a\ncomplicated problem because of the non-linear relationship caused by\nnon-Lambertian surface reflectance. Recently, various deep learning methods\nhave shown a powerful ability in the context of photometric stereo against\nnon-Lambertian surfaces. This paper provides a comprehensive review of existing\ndeep learning-based calibrated photometric stereo methods. We first analyze\nthese methods from different perspectives, including input processing,\nsupervision, and network architecture. We summarize the performance of deep\nlearning photometric stereo models on the most widely-used benchmark data set.\nThis demonstrates the advanced performance of deep learning-based photometric\nstereo methods. Finally, we give suggestions and propose future research trends\nbased on the limitations of existing models.\n","authors":["Yakun Ju","Kin-Man Lam","Wuyuan Xie","Huiyu Zhou","Junyu Dong","Boxin Shi"],"pdf_url":"https://arxiv.org/pdf/2212.08414v2.pdf","comment":"19 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2303.09117v3","updated":"2024-02-01T07:41:47Z","published":"2023-03-16T07:23:55Z","title":"Cross-Modal Causal Intervention for Medical Report Generation","summary":" Medical report generation (MRG) is essential for computer-aided diagnosis and\nmedication guidance, which can relieve the heavy burden of radiologists by\nautomatically generating the corresponding medical reports according to the\ngiven radiology image. However, due to the spurious correlations within\nimage-text data induced by visual and linguistic biases, it is challenging to\ngenerate accurate reports reliably describing lesion areas. Moreover, the\ncross-modal confounders are usually unobservable and challenging to\neliminate explicitly. In this paper, we aim to mitigate the cross-modal data\nbias for MRG from a new perspective, i.e., cross-modal causal intervention, and\npropose a novel Visual-Linguistic Causal Intervention (VLCI) framework for MRG,\nwhich consists of a visual deconfounding module (VDM) and a linguistic\ndeconfounding module (LDM), to implicitly mitigate the visual-linguistic\nconfounders by causal front-door intervention. 
Specifically, due to the absence\nof a generalized semantic extractor, the VDM explores and disentangles the\nvisual confounders from the patch-based local and global features without\nexpensive fine-grained annotations. Simultaneously, due to the lack of\nknowledge encompassing the entire field of medicine, the LDM eliminates the\nlinguistic confounders caused by salient visual features and high-frequency\ncontext without constructing a terminology database. Extensive experiments on\nIU-Xray and MIMIC-CXR datasets show that our VLCI significantly outperforms the\nstate-of-the-art MRG methods. The code and models are available at\nhttps://github.com/WissingChen/VLCI.\n","authors":["Weixing Chen","Yang Liu","Ce Wang","Jiarui Zhu","Shen Zhao","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2303.09117v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07921v2","updated":"2024-02-01T07:01:19Z","published":"2023-09-14T17:59:53Z","title":"OpenIllumination: A Multi-Illumination Dataset for Inverse Rendering\n Evaluation on Real Objects","summary":" We introduce OpenIllumination, a real-world dataset containing over 108K\nimages of 64 objects with diverse materials, captured under 72 camera views and\na large number of different illuminations. For each image in the dataset, we\nprovide accurate camera parameters, illumination ground truth, and foreground\nsegmentation masks. Our dataset enables the quantitative evaluation of most\ninverse rendering and material decomposition methods for real objects. We\nexamine several state-of-the-art inverse rendering methods on our dataset and\ncompare their performances. The dataset and code can be found on the project\npage: https://oppo-us-research.github.io/OpenIllumination.\n","authors":["Isabella Liu","Linghao Chen","Ziyang Fu","Liwen Wu","Haian Jin","Zhong Li","Chin Ming Ryan Wong","Yi Xu","Ravi Ramamoorthi","Zexiang Xu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2309.07921v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00376v1","updated":"2024-02-01T06:47:56Z","published":"2024-02-01T06:47:56Z","title":"Image2Points:A 3D Point-based Context Clusters GAN for High-Quality PET\n Image Reconstruction","summary":" To obtain high-quality Positron emission tomography (PET) images while\nminimizing radiation exposure, numerous methods have been proposed to\nreconstruct standard-dose PET (SPET) images from the corresponding low-dose PET\n(LPET) images. However, these methods heavily rely on voxel-based\nrepresentations, which fall short of adequately accounting for the precise\nstructure and fine-grained context, leading to compromised reconstruction. In\nthis paper, we propose a 3D point-based context clusters GAN, namely PCC-GAN,\nto reconstruct high-quality SPET images from LPET. Specifically, inspired by\nthe geometric representation power of points, we resort to a point-based\nrepresentation to enhance the explicit expression of the image structure, thus\nfacilitating the reconstruction with finer details. Moreover, a context\nclustering strategy is applied to explore the contextual relationships among\npoints, which mitigates the ambiguities of small structures in the\nreconstructed images. Experiments on both clinical and phantom datasets\ndemonstrate that our PCC-GAN outperforms the state-of-the-art reconstruction\nmethods qualitatively and quantitatively. 
Code is available at\nhttps://github.com/gluucose/PCCGAN.\n","authors":["Jiaqi Cui","Yan Wang","Lu Wen","Pinxian Zeng","Xi Wu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2402.00376v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.00375v1","updated":"2024-02-01T06:34:35Z","published":"2024-02-01T06:34:35Z","title":"Disentangled Multimodal Brain MR Image Translation via Transformer-based\n Modality Infuser","summary":" Multimodal Magnetic Resonance (MR) Imaging plays a crucial role in disease\ndiagnosis due to its ability to provide complementary information by analyzing\na relationship between multimodal images on the same subject. Acquiring all MR\nmodalities, however, can be expensive, and, during a scanning session, certain\nMR images may be missed depending on the study protocol. The typical solution\nwould be to synthesize the missing modalities from the acquired images such as\nusing generative adversarial networks (GANs). Yet, GANs constructed with\nconvolutional neural networks (CNNs) are likely to suffer from a lack of global\nrelationships and mechanisms to condition the desired modality. To address\nthis, in this work, we propose a transformer-based modality infuser designed to\nsynthesize multimodal brain MR images. In our method, we extract\nmodality-agnostic features from the encoder and then transform them into\nmodality-specific features using the modality infuser. Furthermore, the\nmodality infuser captures long-range relationships among all brain structures,\nleading to the generation of more realistic images. We carried out experiments\non the BraTS 2018 dataset, translating between four MR modalities, and our\nexperimental results demonstrate the superiority of our proposed method in\nterms of synthesis quality. In addition, we conducted experiments on a brain\ntumor segmentation task and different conditioning methods.\n","authors":["Jihoon Cho","Xiaofeng Liu","Fangxu Xing","Jinsong Ouyang","Georges El Fakhri","Jinah Park","Jonghye Woo"],"pdf_url":"https://arxiv.org/pdf/2402.00375v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2402.00357v1","updated":"2024-02-01T05:57:10Z","published":"2024-02-01T05:57:10Z","title":"Safety of Multimodal Large Language Models on Images and Text","summary":" Attracted by the impressive power of Multimodal Large Language Models\n(MLLMs), the public is increasingly utilizing them to improve the efficiency of\ndaily work. Nonetheless, the vulnerabilities of MLLMs to unsafe instructions\nbring huge safety risks when these models are deployed in real-world scenarios.\nIn this paper, we systematically survey current efforts on the evaluation,\nattack, and defense of MLLMs' safety on images and text. We begin with\nintroducing the overview of MLLMs on images and text and understanding of\nsafety, which helps researchers know the detailed scope of our survey. Then, we\nreview the evaluation datasets and metrics for measuring the safety of MLLMs.\nNext, we comprehensively present attack and defense techniques related to\nMLLMs' safety. 
Finally, we analyze several unsolved issues and discuss\npromising research directions.\n","authors":["Xin Liu","Yichen Zhu","Yunshi Lan","Chao Yang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2402.00357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00353v1","updated":"2024-02-01T05:51:03Z","published":"2024-02-01T05:51:03Z","title":"High-Quality Medical Image Generation from Free-hand Sketch","summary":" Generating medical images from human-drawn free-hand sketches holds promise\nfor various important medical imaging applications. Due to the extreme\ndifficulty in collecting free-hand sketch data in the medical domain, most deep\nlearning-based methods have been proposed to generate medical images from\nsynthesized sketches (e.g., edge maps or contours of segmentation masks from\nreal images). However, these models often fail to generalize on the free-hand\nsketches, leading to unsatisfactory results. In this paper, we propose a\npractical free-hand sketch-to-image generation model called Sketch2MedI that\nlearns to represent sketches in StyleGAN's latent space and generate medical\nimages from it. Thanks to the ability to encode sketches into this meaningful\nrepresentation space, Sketch2MedI only requires synthesized sketches for\ntraining, enabling a cost-effective learning process. Our Sketch2MedI\ndemonstrates a robust generalization to free-hand sketches, resulting in\nhigh-quality and realistic medical image generations. Comparative evaluations\nof Sketch2MedI against the pix2pix, CycleGAN, UNIT, and U-GAT-IT models show\nsuperior performance in generating pharyngeal images, both quantitatively and\nqualitatively across various metrics.\n","authors":["Quan Huu Cap","Atsushi Fukuda"],"pdf_url":"https://arxiv.org/pdf/2402.00353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11133v4","updated":"2024-02-01T05:39:07Z","published":"2022-11-21T00:38:59Z","title":"Enhancing Accuracy and Robustness of Steering Angle Prediction with\n Attention Mechanism","summary":" In this paper, our focus is on enhancing steering angle prediction for\nautonomous driving tasks. We initiate our exploration by investigating two\nveins of widely adopted deep neural architectures, namely ResNets and\nInceptionNets. Within both families, we systematically evaluate various model\nsizes to understand their impact on performance. Notably, our key contribution\nlies in the incorporation of an attention mechanism to augment steering angle\nprediction accuracy and robustness. By introducing attention, our models gain\nthe ability to selectively focus on crucial regions within the input data,\nleading to improved predictive outcomes. Our findings showcase that our\nattention-enhanced models not only achieve state-of-the-art results in terms of\nsteering angle Mean Squared Error (MSE) but also exhibit enhanced adversarial\nrobustness, addressing critical concerns in real-world deployment. For example,\nin our experiments on the Kaggle SAP and our created publicly available\ndatasets, attention can lead to over 6% error reduction in steering angle\nprediction and boost model robustness by up to 56.09%.\n","authors":["Swetha Nadella","Pramiti Barua","Jeremy C. Hagler","David J. 
Lamb","Qing Tian"],"pdf_url":"https://arxiv.org/pdf/2211.11133v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00351v1","updated":"2024-02-01T05:35:25Z","published":"2024-02-01T05:35:25Z","title":"Machine Unlearning for Image-to-Image Generative Models","summary":" Machine unlearning has emerged as a new paradigm to deliberately forget data\nsamples from a given model in order to adhere to stringent regulations.\nHowever, existing machine unlearning methods have been primarily focused on\nclassification models, leaving the landscape of unlearning for generative\nmodels relatively unexplored. This paper serves as a bridge, addressing the gap\nby providing a unifying framework of machine unlearning for image-to-image\ngenerative models. Within this framework, we propose a\ncomputationally-efficient algorithm, underpinned by rigorous theoretical\nanalysis, that demonstrates negligible performance degradation on the retain\nsamples, while effectively removing the information from the forget samples.\nEmpirical studies on two large-scale datasets, ImageNet-1K and Places-365,\nfurther show that our algorithm does not rely on the availability of the retain\nsamples, which further complies with data retention policy. To our best\nknowledge, this work is the first that represents systemic, theoretical,\nempirical explorations of machine unlearning specifically tailored for\nimage-to-image generative models. Our code is available at\nhttps://github.com/jpmorganchase/l2l-generator-unlearning.\n","authors":["Guihong Li","Hsiang Hsu"," Chun-Fu"," Chen","Radu Marculescu"],"pdf_url":"https://arxiv.org/pdf/2402.00351v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2301.03182v2","updated":"2024-02-01T05:21:20Z","published":"2023-01-09T06:31:52Z","title":"Structure-Informed Shadow Removal Networks","summary":" Existing deep learning-based shadow removal methods still produce images with\nshadow remnants. These shadow remnants typically exist in homogeneous regions\nwith low-intensity values, making them untraceable in the existing\nimage-to-image mapping paradigm. We observe that shadows mainly degrade images\nat the image-structure level (in which humans perceive object shapes and\ncontinuous colors). Hence, in this paper, we propose to remove shadows at the\nimage structure level. Based on this idea, we propose a novel\nstructure-informed shadow removal network (StructNet) to leverage the\nimage-structure information to address the shadow remnant problem.\nSpecifically, StructNet first reconstructs the structure information of the\ninput image without shadows and then uses the restored shadow-free structure\nprior to guiding the image-level shadow removal. StructNet contains two main\nnovel modules: (1) a mask-guided shadow-free extraction (MSFE) module to\nextract image structural features in a non-shadow-to-shadow directional manner,\nand (2) a multi-scale feature & residual aggregation (MFRA) module to leverage\nthe shadow-free structure information to regularize feature consistency. In\naddition, we also propose to extend StructNet to exploit multi-level structure\ninformation (MStructNet), to further boost the shadow removal performance with\nminimum computational overheads. Extensive experiments on three shadow removal\nbenchmarks demonstrate that our method outperforms existing shadow removal\nmethods, and our StructNet can be integrated with existing methods to improve\nthem further.\n","authors":["Yuhao Liu","Qing Guo","Lan Fu","Zhanghan Ke","Ke Xu","Wei Feng","Ivor W. 
Tsang","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2301.03182v2.pdf","comment":"IEEE TIP"},{"id":"http://arxiv.org/abs/2311.09680v5","updated":"2024-02-01T05:15:52Z","published":"2023-11-16T08:49:46Z","title":"Trustworthy Large Models in Vision: A Survey","summary":" The rapid progress of Large Models (LMs) has recently revolutionized various\nfields of deep learning with remarkable grades, ranging from Natural Language\nProcessing (NLP) to Computer Vision (CV). However, LMs are increasingly\nchallenged and criticized by academia and industry due to their powerful\nperformance but untrustworthy behavior, which urgently needs to be alleviated\nby reliable methods. Despite the abundance of literature on trustworthy LMs in\nNLP, a systematic survey specifically delving into the trustworthiness of LMs\nin CV remains absent. In order to mitigate this gap, we summarize four relevant\nconcerns that obstruct the trustworthy usage in vision of LMs in this survey,\nincluding 1) human misuse, 2) vulnerability, 3) inherent issue and 4)\ninterpretability. By highlighting corresponding challenge, countermeasures, and\ndiscussion in each topic, we hope this survey will facilitate readers'\nunderstanding of this field, promote alignment of LMs with human expectations\nand enable trustworthy LMs to serve as welfare rather than disaster for human\nsociety.\n","authors":["Ziyan Guo","Li Xu","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.09680v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00341v1","updated":"2024-02-01T05:08:39Z","published":"2024-02-01T05:08:39Z","title":"Recasting Regional Lighting for Shadow Removal","summary":" Removing shadows requires an understanding of both lighting conditions and\nobject textures in a scene. Existing methods typically learn pixel-level color\nmappings between shadow and non-shadow images, in which the joint modeling of\nlighting and object textures is implicit and inadequate. We observe that in a\nshadow region, the degradation degree of object textures depends on the local\nillumination, while simply enhancing the local illumination cannot fully\nrecover the attenuated textures. Based on this observation, we propose to\ncondition the restoration of attenuated textures on the corrected local\nlighting in the shadow region. Specifically, We first design a shadow-aware\ndecomposition network to estimate the illumination and reflectance layers of\nshadow regions explicitly. We then propose a novel bilateral correction network\nto recast the lighting of shadow regions in the illumination layer via a novel\nlocal lighting correction module, and to restore the textures conditioned on\nthe corrected illumination layer via a novel illumination-guided texture\nrestoration module. We further annotate pixel-wise shadow masks for the public\nSRD dataset, which originally contains only image pairs. Experiments on three\nbenchmarks show that our method outperforms existing state-of-the-art shadow\nremoval methods.\n","authors":["Yuhao Liu","Zhanghan Ke","Ke Xu","Fang Liu","Zhenwei Wang","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2402.00341v1.pdf","comment":"AAAI 2024 (Oral)"},{"id":"http://arxiv.org/abs/2401.17857v2","updated":"2024-02-01T05:05:36Z","published":"2024-01-31T14:19:03Z","title":"Segment Anything in 3D Gaussians","summary":" 3D Gaussian Splatting has emerged as an alternative 3D representation of\nNeural Radiance Fields (NeRFs), benefiting from its high-quality rendering\nresults and real-time rendering speed. 
Considering the 3D Gaussian\nrepresentation remains unparsed, it is necessary first to execute object\nsegmentation within this domain. Subsequently, scene editing and collision\ndetection can be performed, proving vital to a multitude of applications, such\nas virtual reality (VR), augmented reality (AR), game/movie production, etc. In\nthis paper, we propose a novel approach to achieve object segmentation in 3D\nGaussian via an interactive procedure without any training process and learned\nparameters. We refer to the proposed method as SA-GS, for Segment Anything in\n3D Gaussians. Given a set of clicked points in a single input view, SA-GS can\ngeneralize SAM to achieve 3D consistent segmentation via the proposed\nmulti-view mask generation and view-wise label assignment methods. We also\npropose a cross-view label-voting approach to assign labels from different\nviews. In addition, in order to address the boundary roughness issue of\nsegmented objects resulting from the non-negligible spatial sizes of 3D\nGaussian located at the boundary, SA-GS incorporates the simple but effective\nGaussian Decomposition scheme. Extensive experiments demonstrate that SA-GS\nachieves high-quality 3D segmentation results, which can also be easily applied\nfor scene editing and collision detection tasks. Codes will be released soon.\n","authors":["Xu Hu","Yuxi Wang","Lue Fan","Junsong Fan","Junran Peng","Zhen Lei","Qing Li","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08984v2","updated":"2024-02-01T04:53:30Z","published":"2023-12-14T14:29:53Z","title":"CL2CM: Improving Cross-Lingual Cross-Modal Retrieval via Cross-Lingual\n Knowledge Transfer","summary":" Cross-lingual cross-modal retrieval has garnered increasing attention\nrecently, which aims to achieve the alignment between vision and target\nlanguage (V-T) without using any annotated V-T data pairs. Current methods\nemploy machine translation (MT) to construct pseudo-parallel data pairs, which\nare then used to learn a multi-lingual and multi-modal embedding space that\naligns visual and target-language representations. However, the large\nheterogeneous gap between vision and text, along with the noise present in\ntarget language translations, poses significant challenges in effectively\naligning their representations. To address these challenges, we propose a\ngeneral framework, Cross-Lingual to Cross-Modal (CL2CM), which improves the\nalignment between vision and target language using cross-lingual transfer. This\napproach allows us to fully leverage the merits of multi-lingual pre-trained\nmodels (e.g., mBERT) and the benefits of the same modality structure, i.e.,\nsmaller gap, to provide reliable and comprehensive semantic correspondence\n(knowledge) for the cross-modal network. We evaluate our proposed approach on\ntwo multilingual image-text datasets, Multi30K and MSCOCO, and one video-text\ndataset, VATEX. 
The results clearly demonstrate the effectiveness of our\nproposed method and its high potential for large-scale retrieval.\n","authors":["Yabing Wang","Fan Wang","Jianfeng Dong","Hao Luo"],"pdf_url":"https://arxiv.org/pdf/2312.08984v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2402.00321v1","updated":"2024-02-01T04:15:39Z","published":"2024-02-01T04:15:39Z","title":"SmartCooper: Vehicular Collaborative Perception with Adaptive Fusion and\n Judger Mechanism","summary":" In recent years, autonomous driving has garnered significant attention due to\nits potential for improving road safety through collaborative perception among\nconnected and autonomous vehicles (CAVs). However, time-varying channel\nvariations in vehicular transmission environments demand dynamic allocation of\ncommunication resources. Moreover, in the context of collaborative perception,\nit is important to recognize that not all CAVs contribute valuable data, and\nsome CAV data even have detrimental effects on collaborative perception. In\nthis paper, we introduce SmartCooper, an adaptive collaborative perception\nframework that incorporates communication optimization and a judger mechanism\nto facilitate CAV data fusion. Our approach begins with optimizing the\nconnectivity of vehicles while considering communication constraints. We then\ntrain a learnable encoder to dynamically adjust the compression ratio based on\nthe channel state information (CSI). Subsequently, we devise a judger mechanism\nto filter the detrimental image data reconstructed by adaptive decoders. We\nevaluate the effectiveness of our proposed algorithm on the OpenCOOD platform.\nOur results demonstrate a substantial reduction in communication costs by\n23.10\\% compared to the non-judger scheme. Additionally, we achieve a\nsignificant improvement on the average precision of Intersection over Union\n(AP@IoU) by 7.15\\% compared with state-of-the-art schemes.\n","authors":["Yuang Zhang","Haonan An","Zhengru Fang","Guowen Xu","Yuan Zhou","Xianhao Chen","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2402.00321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00319v1","updated":"2024-02-01T04:09:17Z","published":"2024-02-01T04:09:17Z","title":"SCO-VIST: Social Interaction Commonsense Knowledge-based Visual\n Storytelling","summary":" Visual storytelling aims to automatically generate a coherent story based on\na given image sequence. Unlike tasks like image captioning, visual stories\nshould contain factual descriptions, worldviews, and human social commonsense\nto put disjointed elements together to form a coherent and engaging\nhuman-writeable story. However, most models mainly focus on applying factual\ninformation and using taxonomic/lexical external knowledge when attempting to\ncreate stories. This paper introduces SCO-VIST, a framework representing the\nimage sequence as a graph with objects and relations that includes human action\nmotivation and its social interaction commonsense knowledge. SCO-VIST then\ntakes this graph representing plot points and creates bridges between plot\npoints with semantic and occurrence-based edge weights. This weighted story\ngraph produces the storyline in a sequence of events using Floyd-Warshall's\nalgorithm. 
Our proposed framework produces stories that are superior across multiple\nmetrics in terms of visual grounding, coherence, diversity, and humanness, per\nboth automatic and human evaluations.\n","authors":["Eileen Wang","Soyeon Caren Han","Josiah Poon"],"pdf_url":"https://arxiv.org/pdf/2402.00319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2001.04086v3","updated":"2024-02-01T03:54:08Z","published":"2020-01-13T07:27:05Z","title":"GridMask Data Augmentation","summary":" We propose a novel data augmentation method `GridMask' in this paper. It\nutilizes information removal to achieve state-of-the-art results in a variety\nof computer vision tasks. We analyze the requirement of information dropping.\nThen we show the limitations of existing information dropping algorithms and\npropose our structured method, which is simple and yet very effective. It is\nbased on the deletion of regions of the input image. Our extensive experiments\nshow that our method outperforms the latest AutoAugment, which is far more\ncomputationally expensive due to the use of reinforcement learning to find the\nbest policies. On the ImageNet dataset for recognition, the COCO2017 dataset\nfor object detection, and the Cityscapes dataset for semantic segmentation, our\nmethod notably improves performance over baselines. The extensive experiments\nmanifest the effectiveness and generality of the new method.\n","authors":["Pengguang Chen","Shu Liu","Hengshuang Zhao","Xingquan Wang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2001.04086v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13325v2","updated":"2024-02-01T03:38:22Z","published":"2024-01-24T09:39:45Z","title":"Memory Consistency Guided Divide-and-Conquer Learning for Generalized\n Category Discovery","summary":" Generalized category discovery (GCD) aims at addressing a more realistic and\nchallenging setting of semi-supervised learning, where only part of the\ncategory labels are assigned to certain training samples. Previous methods\ngenerally employ naive contrastive learning or an unsupervised clustering\nscheme for all the samples. Nevertheless, they usually ignore the inherent\ncritical information within the historical predictions of the model being\ntrained. Specifically, we empirically reveal that a significant number of\nsalient unlabeled samples yield consistent historical predictions corresponding\nto their ground truth category. From this observation, we propose a Memory\nConsistency guided Divide-and-conquer Learning framework (MCDL). In this\nframework, we introduce two memory banks to record historical predictions of\nunlabeled data, which are exploited to measure the credibility of each sample\nin terms of its prediction consistency. With the guidance of credibility, we\ncan design a divide-and-conquer learning strategy to fully utilize the\ndiscriminative information of unlabeled data while alleviating the negative\ninfluence of noisy labels. 
Extensive\nexperimental results on multiple\nbenchmarks demonstrate the generality and superiority of our method, where our\nmethod outperforms state-of-the-art models by a large margin on both seen and\nunseen classes of the generic image recognition and challenging semantic shift\nsettings (i.e., with +8.4% gain on CUB and +8.1% on Stanford Cars).\n","authors":["Yuanpeng Tu","Zhun Zhong","Yuxi Li","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.13325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00304v1","updated":"2024-02-01T03:34:48Z","published":"2024-02-01T03:34:48Z","title":"Invariance-powered Trustworthy Defense via Remove Then Restore","summary":" Adversarial attacks pose a challenge to the deployment of deep neural\nnetworks (DNNs), while previous defense models overlook the generalization to\nvarious attacks. Inspired by targeted therapies for cancer, we view adversarial\nsamples as local lesions of natural benign samples, because a key finding is\nthat salient attack in an adversarial sample dominates the attacking process,\nwhile trivial attack unexpectedly provides trustworthy evidence for obtaining\ngeneralizable robustness. Based on this finding, a Pixel Surgery and Semantic\nRegeneration (PSSR) model following the targeted therapy mechanism is\ndeveloped, which has three merits: 1) To remove the salient attack, a\nscore-based Pixel Surgery module is proposed, which retains the trivial attack\nas a kind of invariance information. 2) To restore the discriminative content,\na Semantic Regeneration module based on a conditional alignment extrapolator is\nproposed, which achieves pixel and semantic consistency. 3) To further\nharmonize robustness and accuracy, an intractable problem, a self-augmentation\nregularizer with adversarial R-drop is designed. Experiments on numerous\nbenchmarks show the superiority of PSSR.\n","authors":["Xiaowei Fu","Yuhang Zhou","Lina Ma","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17910v2","updated":"2024-02-01T03:34:44Z","published":"2024-01-31T15:15:41Z","title":"Controllable Dense Captioner with Multimodal Embedding Bridging","summary":" In this paper, we propose a controllable dense captioner (ControlCap), which\naccommodates the user's intention to dense captioning by introducing linguistic\nguidance. ControlCap is defined as a multimodal embedding bridging\narchitecture, which comprises a multimodal embedding generation (MEG) module\nand a bi-directional embedding bridging (BEB) module. While the MEG module\nrepresents objects/regions by combining embeddings of detailed information with\ncontext-aware ones, it also endows ControlCap with the adaptability to\nspecialized controls by utilizing them as linguistic guidance. The BEB module\naligns the linguistic guidance with visual embeddings through\nborrowing/returning features from/to the visual domain and gathering such\nfeatures to predict text descriptions. Experiments on the Visual Genome and\nVG-COCO datasets show that ControlCap respectively outperforms the\nstate-of-the-art methods by 1.5% and 3.7% (mAP). Last but not least, with the\ncapability of converting region-category pairs to region-text pairs, ControlCap\nis able to act as a powerful data engine for dense captioning. 
Code is available at\nhttps://github.com/callsys/ControlCap.\n","authors":["Yuzhong Zhao","Yue Liu","Zonghao Guo","Weijia Wu","Chen Gong","Fang Wan","Qixiang Ye"],"pdf_url":"https://arxiv.org/pdf/2401.17910v2.pdf","comment":"https://github.com/callsys/ControlCap"},{"id":"http://arxiv.org/abs/2402.00300v1","updated":"2024-02-01T03:27:26Z","published":"2024-02-01T03:27:26Z","title":"Self-supervised learning of video representations from a child's\n perspective","summary":" Children learn powerful internal models of the world around them from a few\nyears of egocentric visual experience. Can such internal models be learned from\na child's visual experience with highly generic learning algorithms or do they\nrequire strong inductive biases? Recent advances in collecting large-scale,\nlongitudinal, developmentally realistic video datasets and generic\nself-supervised learning (SSL) algorithms are allowing us to begin to tackle\nthis nature vs. nurture question. However, existing work typically focuses on\nimage-based SSL algorithms and visual capabilities that can be learned from\nstatic images (e.g. object recognition), thus ignoring temporal aspects of the\nworld. To close this gap, here we train self-supervised video models on\nlongitudinal, egocentric headcam recordings collected from a child over a two\nyear period in their early development (6-31 months). The resulting models are\nhighly effective at facilitating the learning of action concepts from a small\nnumber of labeled examples; they have favorable data size scaling properties;\nand they display emergent video interpolation capabilities. Video models also\nlearn more robust object representations than image-based models trained with\nthe exact same data. These results suggest that important temporal aspects of a\nchild's internal model of the world may be learnable from their visual\nexperience using highly generic learning algorithms and without strong\ninductive biases.\n","authors":["A. Emin Orhan","Wentao Wang","Alex N. Wang","Mengye Ren","Brenden M. Lake"],"pdf_url":"https://arxiv.org/pdf/2402.00300v1.pdf","comment":"7 pages, 6 figures; code & models available from\n https://github.com/eminorhan/video-models"},{"id":"http://arxiv.org/abs/2402.00295v1","updated":"2024-02-01T02:54:49Z","published":"2024-02-01T02:54:49Z","title":"Comparative Evaluation of Traditional and Deep Learning-Based\n Segmentation Methods for Spoil Pile Delineation Using UAV Images","summary":" The stability of mine dumps is contingent upon the precise arrangement of\nspoil piles, taking into account their geological and geotechnical attributes.\nYet, on-site characterisation of individual piles poses a formidable challenge.\nThe utilisation of image-based techniques for spoil pile characterisation,\nemploying remotely acquired data through unmanned aerial systems, is a\npromising complementary solution. Image processing, such as object-based\nclassification and feature extraction, are dependent upon effective\nsegmentation. This study refines and juxtaposes various segmentation\napproaches, specifically colour-based and morphology-based techniques. The\nobjective is to enhance and evaluate avenues for object-based analysis for\nspoil characterisation within the context of mining environments. Furthermore,\na comparative analysis is conducted between conventional segmentation\napproaches and those rooted in deep learning methodologies. 
Among the diverse\nsegmentation approaches evaluated, the morphology-based deep learning\nsegmentation approach, Segment Anything Model (SAM), exhibited superior\nperformance in comparison to other approaches. This outcome underscores the\nefficacy of incorporating advanced morphological and deep learning techniques\nfor accurate and efficient spoil pile characterisation. The findings of this\nstudy contribute valuable insights to the optimisation of segmentation\nstrategies, thereby advancing the application of image-based techniques for the\ncharacterisation of spoil piles in mining environments.\n","authors":["Sureka Thiruchittampalam","Bikram P. Banerjee","Nancy F. Glenn","Simit Raval"],"pdf_url":"https://arxiv.org/pdf/2402.00295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00293v1","updated":"2024-02-01T02:47:39Z","published":"2024-02-01T02:47:39Z","title":"FineBio: A Fine-Grained Video Dataset of Biological Experiments with\n Hierarchical Annotation","summary":" In the development of science, accurate and reproducible documentation of the\nexperimental process is crucial. Automatic recognition of the actions in\nexperiments from videos would help experimenters by complementing the recording\nof experiments. Towards this goal, we propose FineBio, a new fine-grained video\ndataset of people performing biological experiments. The dataset consists of\nmulti-view videos of 32 participants performing mock biological experiments\nwith a total duration of 14.5 hours. One experiment forms a hierarchical\nstructure, where a protocol consists of several steps, each further decomposed\ninto a set of atomic operations. The uniqueness of biological experiments is\nthat while they require strict adherence to steps described in each protocol,\nthere is freedom in the order of atomic operations. We provide hierarchical\nannotation on protocols, steps, atomic operations, object locations, and their\nmanipulation states, providing new challenges for structured activity\nunderstanding and hand-object interaction recognition. To find out challenges\non activity understanding in biological experiments, we introduce baseline\nmodels and results on four different tasks, including (i) step segmentation,\n(ii) atomic operation detection (iii) object detection, and (iv)\nmanipulated/affected object detection. Dataset and code are available from\nhttps://github.com/aistairc/FineBio.\n","authors":["Takuma Yagi","Misaki Ohashi","Yifei Huang","Ryosuke Furuta","Shungo Adachi","Toutai Mitsuyama","Yoichi Sato"],"pdf_url":"https://arxiv.org/pdf/2402.00293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00290v1","updated":"2024-02-01T02:43:20Z","published":"2024-02-01T02:43:20Z","title":"Multimodal Embodied Interactive Agent for Cafe Scene","summary":" With the surge in the development of large language models, embodied\nintelligence has attracted increasing attention. Nevertheless, prior works on\nembodied intelligence typically encode scene or historical memory in an\nunimodal manner, either visual or linguistic, which complicates the alignment\nof the model's action planning with embodied control. To overcome this\nlimitation, we introduce the Multimodal Embodied Interactive Agent (MEIA),\ncapable of translating high-level tasks expressed in natural language into a\nsequence of executable actions. Specifically, we propose a novel Multimodal\nEnvironment Memory (MEM) module, facilitating the integration of embodied\ncontrol with large models through the visual-language memory of scenes. 
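For readers wanting to reproduce a SAM baseline like the one that performed best in the spoil-pile comparison above, the `segment-anything` package exposes an automatic mask generator. A rough sketch under stated assumptions: the checkpoint file name is the publicly released ViT-B weight, the input image path and the area threshold for keeping candidate pile segments are placeholders, and the paper's actual pre- and post-processing is not reproduced here.

```python
import cv2
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator

# Load a SAM backbone (ViT-B; the checkpoint path is a placeholder).
sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
mask_generator = SamAutomaticMaskGenerator(sam, points_per_side=32)

# UAV image of a spoil pile; any RGB uint8 image works for the sketch.
image = cv2.cvtColor(cv2.imread("spoil_pile.jpg"), cv2.COLOR_BGR2RGB)
masks = mask_generator.generate(image)  # dicts with 'segmentation', 'area', ...

# Keep larger segments as candidate pile regions for downstream
# object-based feature extraction (the size threshold is illustrative).
piles = [m for m in masks if m["area"] > 5000]
print(f"{len(piles)} candidate pile segments out of {len(masks)}")
```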
This\ncapability enables MEIA to generate executable action plans based on diverse\nrequirements and the robot's capabilities. We conduct experiments in a dynamic\nvirtual cafe environment, utilizing multiple large models through zero-shot\nlearning, and carefully design scenarios for various situations. The\nexperimental results showcase the promising performance of our MEIA in various\nembodied interactive tasks.\n","authors":["Yang Liu","Xinshuai Song","Kaixuan Jiang","Weixing Chen","Jingzhou Luo","Guanbin Li","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2402.00290v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.15583v2","updated":"2024-02-01T02:29:54Z","published":"2024-01-28T06:41:15Z","title":"SCTransNet: Spatial-channel Cross Transformer Network for Infrared Small\n Target Detection","summary":" Infrared small target detection (IRSTD) has recently benefitted greatly from\nU-shaped neural models. However, largely overlooking effective global\ninformation modeling, existing techniques struggle when the target has high\nsimilarities with the background. We present a Spatial-channel Cross\nTransformer Network (SCTransNet) that leverages spatial-channel cross\ntransformer blocks (SCTBs) on top of long-range skip connections to address the\naforementioned challenge. In the proposed SCTBs, the outputs of all encoders\nare interacted with cross transformer to generate mixed features, which are\nredistributed to all decoders to effectively reinforce semantic differences\nbetween the target and clutter at full scales. Specifically, SCTB contains the\nfollowing two key elements: (a) spatial-embedded single-head channel-cross\nattention (SSCA) for exchanging local spatial features and full-level global\nchannel information to eliminate ambiguity among the encoders and facilitate\nhigh-level semantic associations of the images, and (b) a complementary\nfeed-forward network (CFN) for enhancing the feature discriminability via a\nmulti-scale strategy and cross-spatial-channel information interaction to\npromote beneficial information transfer. Our SCTransNet effectively encodes the\nsemantic differences between targets and backgrounds to boost its internal\nrepresentation for detecting small infrared targets accurately. Extensive\nexperiments on three public datasets, NUDT-SIRST, NUAA-SIRST, and IRSTD-1k,\ndemonstrate that the proposed SCTransNet outperforms existing IRSTD methods.\nOur code will be made public at https://github.com/xdFai.\n","authors":["Shuai Yuan","Hanlin Qin","Xiang Yan","Naveed AKhtar","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2401.15583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13209v2","updated":"2024-02-01T02:27:03Z","published":"2023-11-22T07:47:39Z","title":"Test-time Adaptive Vision-and-Language Navigation","summary":" Vision-and-Language Navigation (VLN) has witnessed significant advancements\nin recent years, largely attributed to meticulously curated datasets and\nproficiently trained models. Nevertheless, when tested in diverse environments,\nthe trained models inevitably encounter significant shifts in data\ndistribution, highlighting that relying solely on pre-trained and fixed\nnavigation models is insufficient. To enhance models' generalization ability,\ntest-time adaptation (TTA) demonstrates significant potential in the computer\nvision field by leveraging unlabeled test samples for model updates. 
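The spatial-embedded single-head channel-cross attention (SSCA) in the SCTransNet abstract above exchanges information between encoder and decoder features. The PyTorch block below is a generic single-head channel-wise cross-attention sketch in that spirit, not the paper's exact SSCA: queries come from one feature map, keys and values from another, and the attention matrix is C x C rather than spatial.

```python
import torch
import torch.nn as nn

class ChannelCrossAttention(nn.Module):
    """Single-head cross-attention over channels: queries from decoder
    features, keys/values from encoder features, giving a C x C attention map."""

    def __init__(self, channels):
        super().__init__()
        self.q = nn.Conv2d(channels, channels, kernel_size=1)
        self.k = nn.Conv2d(channels, channels, kernel_size=1)
        self.v = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, x_dec, x_enc):
        b, c, h, w = x_dec.shape
        q = self.q(x_dec).flatten(2)                    # (B, C, HW)
        k = self.k(x_enc).flatten(2)                    # (B, C, HW)
        v = self.v(x_enc).flatten(2)                    # (B, C, HW)
        attn = torch.softmax(q @ k.transpose(1, 2) / (h * w) ** 0.5, dim=-1)
        return (attn @ v).view(b, c, h, w) + x_dec      # residual connection

dec, enc = torch.randn(2, 64, 32, 32), torch.randn(2, 64, 32, 32)
print(ChannelCrossAttention(64)(dec, enc).shape)        # torch.Size([2, 64, 32, 32])
```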
However,\nsimply applying existing TTA methods to the VLN task cannot adequately handle the\nadaptability-stability dilemma of VLN models, i.e., frequent updates can result\nin drastic changes in model parameters, while occasional updates can make the\nmodels ill-equipped to handle dynamically changing environments. Therefore, we\npropose a Fast-Slow Test-Time Adaptation (FSTTA) approach for VLN by performing\ndecomposition-accumulation analysis for both gradients and parameters in a\nunified framework. Specifically, in the fast update phase, gradients generated\nduring the recent multi-step navigation process are decomposed into components\nwith varying levels of consistency. Then, these components are adaptively\naccumulated to pinpoint a concordant direction for fast model adaptation. In\nthe slow update phase, historically recorded parameters are gathered, and a\nsimilar decomposition-accumulation analysis is conducted to revert the model to\na stable state. Extensive experiments show that our method obtains impressive\nperformance gains on four popular benchmarks.\n","authors":["Junyu Gao","Xuan Yao","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.13209v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2402.00281v1","updated":"2024-02-01T02:13:49Z","published":"2024-02-01T02:13:49Z","title":"Guided Interpretable Facial Expression Recognition via Spatial Action\n Unit Cues","summary":" While state-of-the-art facial expression recognition (FER) classifiers\nachieve a high level of accuracy, they lack interpretability, an important\naspect for end-users. To recognize basic facial expressions, experts resort to\na codebook associating a set of spatial action units to a facial expression. In\nthis paper, we follow in the experts' footsteps and propose a learning\nstrategy that allows us to explicitly incorporate spatial action unit (aus)\ncues into the classifier's training to build a deep interpretable model. In\nparticular, using this aus codebook, the input image expression label, and facial\nlandmarks, a single action-unit heatmap is built to indicate the most\ndiscriminative regions of interest in the image w.r.t. the facial expression. We\nleverage this valuable spatial cue to train a deep interpretable classifier for\nFER. This is achieved by constraining the spatial layer features of a\nclassifier to be correlated with the aus maps. Using a composite loss, the\nclassifier is trained to correctly classify an image while yielding\ninterpretable visual layer-wise attention correlated with aus maps, simulating\nthe experts' decision process. This is achieved using only the image class\nexpression as supervision and without any extra manual annotations. Moreover,\nour method is generic. It can be applied to any CNN- or transformer-based deep\nclassifier without the need for architectural change or adding significant\ntraining time. Our extensive evaluation on two public benchmarks, the RAF-DB and\nAFFECTNET datasets, shows that our proposed strategy can improve layer-wise\ninterpretability without degrading classification performance. 
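The fast-update phase described for FSTTA above (decompose recent gradients by their consistency, then accumulate the concordant part) can be sketched loosely in NumPy. This is an illustrative reading of the abstract, not the authors' algorithm: gradients whose direction agrees with the mean direction are kept and weighted by their cosine similarity, and conflicting components are dropped.

```python
import numpy as np

def fast_update_direction(grads, eps=1e-8):
    """Aggregate recent per-step gradients into one concordant direction.

    grads: list of 1-D arrays (flattened gradients from recent navigation steps).
    Gradients that agree with the mean direction (positive cosine similarity)
    are accumulated with that similarity as weight; conflicting ones are dropped.
    """
    G = np.stack(grads)                                       # (T, D)
    mean_dir = G.mean(axis=0)
    mean_dir /= (np.linalg.norm(mean_dir) + eps)
    cos = G @ mean_dir / (np.linalg.norm(G, axis=1) + eps)    # (T,)
    weights = np.clip(cos, 0.0, None)                         # keep consistent parts
    return (weights[:, None] * G).sum(axis=0) / (weights.sum() + eps)

steps = [np.random.randn(10) + 2.0 for _ in range(5)]  # mostly aligned toy grads
print(fast_update_direction(steps))
```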
In addition, we\nexplore a common type of interpretable classifiers that rely on\nClass-Activation Mapping methods (CAMs), and we show that our training\ntechnique improves the CAM interpretability.\n","authors":["Soufiane Belharbi","Marco Pedersoli","Alessandro Lameiras Koerich","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2402.00281v1.pdf","comment":"11"},{"id":"http://arxiv.org/abs/2401.14895v2","updated":"2024-02-01T02:05:02Z","published":"2024-01-26T14:25:15Z","title":"MPTQ-ViT: Mixed-Precision Post-Training Quantization for Vision\n Transformer","summary":" While vision transformers (ViTs) have shown great potential in computer\nvision tasks, their intense computation and memory requirements pose challenges\nfor practical applications. Existing post-training quantization methods\nleverage value redistribution or specialized quantizers to address the\nnon-normal distribution in ViTs. However, without considering the asymmetry in\nactivations and relying on hand-crafted settings, these methods often struggle\nto maintain performance under low-bit quantization. To overcome these\nchallenges, we introduce SmoothQuant with bias term (SQ-b) to alleviate the\nasymmetry issue and reduce the clamping loss. We also introduce optimal scaling\nfactor ratio search (OPT-m) to determine quantization parameters by a\ndata-dependent mechanism automatically. To further enhance the compressibility,\nwe incorporate the above-mentioned techniques and propose a mixed-precision\npost-training quantization framework for vision transformers (MPTQ-ViT). We\ndevelop greedy mixed-precision quantization (Greedy MP) to allocate layer-wise\nbit-width considering both model performance and compressibility. Our\nexperiments on ViT, DeiT, and Swin demonstrate significant accuracy\nimprovements compared with SOTA on the ImageNet dataset. Specifically, our\nproposed methods achieve accuracy improvements ranging from 0.90% to 23.35% on\n4-bit ViTs with single-precision and from 3.82% to 78.14% on 5-bit fully\nquantized ViTs with mixed-precision.\n","authors":["Yu-Shan Tai","An-Yeu Wu"],"pdf_url":"https://arxiv.org/pdf/2401.14895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17429v2","updated":"2024-02-01T01:39:39Z","published":"2023-12-29T01:42:43Z","title":"Commonsense for Zero-Shot Natural Language Video Localization","summary":" Zero-shot Natural Language-Video Localization (NLVL) methods have exhibited\npromising results in training NLVL models exclusively with raw video data by\ndynamically generating video segments and pseudo-query annotations. However,\nexisting pseudo-queries often lack grounding in the source video, resulting in\nunstructured and disjointed content. In this paper, we investigate the\neffectiveness of commonsense reasoning in zero-shot NLVL. Specifically, we\npresent CORONET, a zero-shot NLVL framework that leverages commonsense to\nbridge the gap between videos and generated pseudo-queries via a commonsense\nenhancement module. CORONET employs Graph Convolution Networks (GCN) to encode\ncommonsense information extracted from a knowledge graph, conditioned on the\nvideo, and cross-attention mechanisms to enhance the encoded video and\npseudo-query representations prior to localization. Through empirical\nevaluations on two benchmark datasets, we demonstrate that CORONET surpasses\nboth zero-shot and weakly supervised baselines, achieving improvements up to\n32.13% across various recall thresholds and up to 6.33% in mIoU. 
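Returning to the action-unit-guided FER training described earlier, a composite objective of the kind the abstract sketches (classification loss plus a term correlating spatial activations with the aus heatmap) might look as follows in PyTorch. This is a hedged sketch, not the paper's formulation: the activation map here is a simple squared-channel mean, the alignment term is one minus cosine similarity, and the weight `lam` is arbitrary.

```python
import torch
import torch.nn.functional as F

def au_alignment_loss(feat_maps, au_map):
    """1 - cosine similarity between a classifier's spatial activation map
    and an action-unit heatmap. feat_maps: (B, C, H, W); au_map: (B, h, w)."""
    attn = feat_maps.pow(2).mean(dim=1, keepdim=True)          # (B, 1, H, W)
    attn = F.interpolate(attn, size=au_map.shape[-2:], mode="bilinear",
                         align_corners=False)
    a = F.normalize(attn.flatten(1), dim=1)
    b = F.normalize(au_map.flatten(1), dim=1)
    return 1.0 - (a * b).sum(dim=1).mean()

def composite_loss(logits, labels, feat_maps, au_map, lam=1.0):
    """Classification term plus the spatial alignment term."""
    return F.cross_entropy(logits, labels) + lam * au_alignment_loss(feat_maps, au_map)

logits, labels = torch.randn(4, 7), torch.randint(0, 7, (4,))
feats, au = torch.randn(4, 64, 14, 14), torch.rand(4, 56, 56)
print(composite_loss(logits, labels, feats, au))
```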
These results\nunderscore the significance of leveraging commonsense reasoning for zero-shot\nNLVL.\n","authors":["Meghana Holla","Ismini Lourentzou"],"pdf_url":"https://arxiv.org/pdf/2312.17429v2.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.08426v2","updated":"2024-02-01T01:17:45Z","published":"2024-01-16T15:11:29Z","title":"GD doesn't make the cut: Three ways that non-differentiability affects\n neural network training","summary":" This paper investigates the distinctions between gradient methods applied to\nnon-differentiable functions (NGDMs) and classical gradient descents (GDs)\ndesigned for differentiable functions. First, we demonstrate significant\ndifferences in the convergence properties of NGDMs compared to GDs, challenging\nthe applicability of the extensive neural network convergence literature based\non $L$-smoothness to non-smooth neural networks. Next, we demonstrate the\nparadoxical nature of NGDM solutions for $L_{1}$-regularized problems, showing\nthat increasing the regularization penalty leads to an increase in the $L_{1}$\nnorm of optimal solutions in NGDMs. Consequently, we show that widely adopted\n$L_{1}$ penalization-based techniques for network pruning do not yield expected\nresults. Finally, we explore the Edge of Stability phenomenon, indicating its\ninapplicability even to Lipschitz continuous convex differentiable functions,\nleaving its relevance to non-convex non-differentiable neural networks\ninconclusive. Our analysis exposes misguided interpretations of NGDMs in widely\nreferenced papers and texts due to an overreliance on strong smoothness\nassumptions, emphasizing the necessity for a nuanced understanding of\nfoundational assumptions in the analysis of these systems.\n","authors":["Siddharth Krishna Kumar"],"pdf_url":"https://arxiv.org/pdf/2401.08426v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00261v1","updated":"2024-02-01T01:11:15Z","published":"2024-02-01T01:11:15Z","title":"Understanding Neural Network Systems for Image Analysis using Vector\n Spaces and Inverse Maps","summary":" There is strong interest in developing mathematical methods that can be used\nto understand complex neural networks used in image analysis. In this paper, we\nintroduce techniques from Linear Algebra to model neural network layers as maps\nbetween signal spaces. First, we demonstrate how signal spaces can be used to\nvisualize weight spaces and convolutional layer kernels. We also demonstrate\nhow residual vector spaces can be used to further visualize information lost at\neach layer. Second, we introduce the concept of invertible networks and an\nalgorithm for computing input images that yield specific outputs. We\ndemonstrate our approach on two invertible networks and ResNet18.\n","authors":["Rebecca Pattichis","Marios S. Pattichis"],"pdf_url":"https://arxiv.org/pdf/2402.00261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00253v1","updated":"2024-02-01T00:33:21Z","published":"2024-02-01T00:33:21Z","title":"A Survey on Hallucination in Large Vision-Language Models","summary":" Recent development of Large Vision-Language Models (LVLMs) has attracted\ngrowing attention within the AI landscape for their practical implementation\npotential. However, ``hallucination'', or more specifically, the misalignment\nbetween factual visual content and corresponding textual generation, poses a\nsignificant challenge to utilizing LVLMs. 
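The $L_{1}$ paradox noted in the NGDM entry above is easy to observe numerically. The toy below runs plain fixed-step subgradient descent on a one-dimensional lasso-style objective; once the penalty is large enough that the true minimizer is exactly zero, the iterate instead oscillates at a magnitude that grows with the penalty, so the "solution" never becomes sparse. This is a standard property of subgradient methods with a fixed step, shown here as a self-contained illustration rather than the paper's experiment.

```python
import numpy as np

def subgradient_descent(lam, lr=0.1, steps=200, w0=1.0):
    """Plain (sub)gradient descent on f(w) = 0.5*(w - 1)^2 + lam*|w|.
    Away from 0 the subgradient of |w| is sign(w), so with a fixed step the
    iterate oscillates around 0 instead of settling exactly at 0."""
    w = w0
    for _ in range(steps):
        w -= lr * ((w - 1.0) + lam * np.sign(w))
    return w

for lam in [0.5, 2.0, 8.0]:
    w = subgradient_descent(lam)
    print(f"lambda={lam:4.1f}  final w={w:+.4f}  |w|={abs(w):.4f}")
# For lam >= 1 the analytic (proximal) minimizer is exactly 0, yet the
# subgradient iterate keeps |w| on the order of lr*lam -- no exact sparsity,
# and the leftover magnitude grows with the penalty.
```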
In this comprehensive survey, we\ndissect LVLM-related hallucinations in an attempt to establish an overview and\nfacilitate future mitigation. Our scrutiny starts with a clarification of the\nconcept of hallucinations in LVLMs, presenting a variety of hallucination\nsymptoms and highlighting the unique challenges inherent in LVLM\nhallucinations. Subsequently, we outline the benchmarks and methodologies\ntailored specifically for evaluating hallucinations unique to LVLMs.\nAdditionally, we delve into an investigation of the root causes of these\nhallucinations, encompassing insights from the training data and model\ncomponents. We also critically review existing methods for mitigating\nhallucinations. The open questions and future directions pertaining to\nhallucinations within LVLMs are discussed to conclude this survey.\n","authors":["Hanchao Liu","Wenyuan Xue","Yifei Chen","Dapeng Chen","Xiutian Zhao","Ke Wang","Liping Hou","Rongjun Li","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2402.00253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00250v1","updated":"2024-02-01T00:19:57Z","published":"2024-02-01T00:19:57Z","title":"LRDif: Diffusion Models for Under-Display Camera Emotion Recognition","summary":" This study introduces LRDif, a novel diffusion-based framework designed\nspecifically for facial expression recognition (FER) within the context of\nunder-display cameras (UDC). To address the inherent challenges posed by UDC's\nimage degradation, such as reduced sharpness and increased noise, LRDif employs\na two-stage training strategy that integrates a condensed preliminary\nextraction network (FPEN) and an agile transformer network (UDCformer) to\neffectively identify emotion labels from UDC images. By harnessing the robust\ndistribution mapping capabilities of Diffusion Models (DMs) and the spatial\ndependency modeling strength of transformers, LRDif effectively overcomes the\nobstacles of noise and distortion inherent in UDC environments. In comprehensive\nexperiments on standard FER datasets, including RAF-DB, KDEF, and FERPlus, LRDif\ndemonstrates state-of-the-art performance, underscoring its potential in\nadvancing FER applications. This work not only addresses a significant gap in\nthe literature by tackling the UDC challenge in FER but also sets a new\nbenchmark for future research in the field.\n","authors":["Zhifeng Wang","Kaihao Zhang","Ramesh Sankaranarayana"],"pdf_url":"https://arxiv.org/pdf/2402.00250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01067v1","updated":"2024-02-01T23:53:12Z","published":"2024-02-01T23:53:12Z","title":"Assessing Patient Eligibility for Inspire Therapy through Machine\n Learning and Deep Learning Models","summary":" Inspire therapy is an FDA-approved internal neurostimulation treatment for\nobstructive sleep apnea. However, not all patients respond to this therapy,\nposing a challenge even for experienced otolaryngologists to determine\ncandidacy. This paper makes the first attempt to leverage both machine learning\nand deep learning techniques in discerning patient responsiveness to Inspire\ntherapy using medical data and videos captured through Drug-Induced Sleep\nEndoscopy (DISE), an essential procedure for Inspire therapy. To achieve this,\nwe gathered and annotated three datasets from 127 patients. Two of these\ndatasets comprise endoscopic videos focused on the Base of the Tongue and\nVelopharynx. The third dataset comprises the patients' clinical information. 
By\nutilizing these datasets, we benchmarked and compared the performance of six\ndeep learning models and five classical machine learning algorithms. The\nresults demonstrate the potential of employing machine learning and deep\nlearning techniques to determine a patient's eligibility for Inspire therapy,\npaving the way for future advancements in this field.\n","authors":["Mohsena Chowdhury","Tejas Vyas","Rahul Alapati","Andrés M Bur","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2402.01067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02535v2","updated":"2024-02-01T23:45:28Z","published":"2023-11-05T01:09:07Z","title":"TokenMotion: Motion-Guided Vision Transformer for Video Camouflaged\n Object Detection Via Learnable Token Selection","summary":" The area of Video Camouflaged Object Detection (VCOD) presents unique\nchallenges in the field of computer vision due to texture similarities between\ntarget objects and their surroundings, as well as irregular motion patterns\ncaused by both objects and camera movement. In this paper, we introduce\nTokenMotion (TMNet), which employs a transformer-based model to enhance VCOD by\nextracting motion-guided features using a learnable token selection. Evaluated\non the challenging MoCA-Mask dataset, TMNet achieves state-of-the-art\nperformance in VCOD. It outperforms the existing state-of-the-art method by a\n12.8% improvement in weighted F-measure, an 8.4% enhancement in S-measure, and\na 10.7% boost in mean IoU. The results demonstrate the benefits of utilizing\nmotion-guided features via learnable token selection within a transformer-based\nframework to tackle the intricate task of VCOD.\n","authors":["Zifan Yu","Erfan Bank Tavakoli","Meida Chen","Suya You","Raghuveer Rao","Sanjeev Agarwal","Fengbo Ren"],"pdf_url":"https://arxiv.org/pdf/2311.02535v2.pdf","comment":"Revising Needed"},{"id":"http://arxiv.org/abs/2312.05632v2","updated":"2024-02-01T23:07:23Z","published":"2023-12-09T18:40:37Z","title":"Subject-Based Domain Adaptation for Facial Expression Recognition","summary":" Adapting a deep learning (DL) model to a specific target individual is a\nchallenging task in facial expression recognition (FER) that may be achieved\nusing unsupervised domain adaptation (UDA) methods. Although several UDA\nmethods have been proposed to adapt deep FER models across source and target\ndata sets, multiple subject-specific source domains are needed to accurately\nrepresent the intra- and inter-person variability in subject-based adaption. In\nthis paper, we consider the setting where domains correspond to individuals,\nnot entire datasets. Unlike UDA, multi-source domain adaptation (MSDA) methods\ncan leverage multiple source datasets to improve the accuracy and robustness of\nthe target model. However, previous methods for MSDA adapt image classification\nmodels across datasets and do not scale well to a larger number of source\ndomains. In this paper, a new MSDA method is introduced for subject-based\ndomain adaptation in FER. It efficiently leverages information from multiple\nsource subjects (labeled source domain data) to adapt a deep FER model to a\nsingle target individual (unlabeled target domain data). During adaptation, our\nSubject-based MSDA first computes a between-source discrepancy loss to mitigate\nthe domain shift among data from several source subjects. 
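One common instantiation of a between-source discrepancy loss, as in the subject-based MSDA setup above, is the maximum mean discrepancy (MMD) between feature batches from different source subjects. The sketch below uses an RBF-kernel MMD averaged over subject pairs; the kernel choice, bandwidth `sigma`, and pairwise averaging are illustrative assumptions, not the paper's exact loss.

```python
import torch

def rbf_mmd2(x, y, sigma=1.0):
    """Squared MMD with an RBF kernel between two feature batches
    of shapes (N, D) and (M, D); a common distribution-alignment term."""
    def k(a, b):
        return torch.exp(-torch.cdist(a, b).pow(2) / (2 * sigma ** 2))
    return k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean()

def between_source_loss(subject_feats, sigma=1.0):
    """Average pairwise MMD^2 over all pairs of source subjects' features."""
    loss, n = 0.0, 0
    for i in range(len(subject_feats)):
        for j in range(i + 1, len(subject_feats)):
            loss = loss + rbf_mmd2(subject_feats[i], subject_feats[j], sigma)
            n += 1
    return loss / max(n, 1)

feats = [torch.randn(32, 128) + s for s in range(3)]  # 3 toy source subjects
print(between_source_loss(feats))
```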
Then, a new strategy\nis employed to generate augmented confident pseudo-labels for the target\nsubject, allowing a reduction in the domain shift between source and target\nsubjects. Experiments on the challenging BioVid heat and pain dataset (PartA)\nwith 87 subjects, and the UNBC-McMaster shoulder pain dataset with 25 subjects\nshow that our Subject-based MSDA can outperform state-of-the-art methods yet\nscale well to multiple subject-based source domains.\n","authors":["Muhammad Osama Zeeshan","Muhammad Haseeb Aslam","Soufiane Belharbi","Alessandro L. Koerich","Marco Pedersoli","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2312.05632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01054v1","updated":"2024-02-01T22:58:21Z","published":"2024-02-01T22:58:21Z","title":"Unconditional Latent Diffusion Models Memorize Patient Imaging Data","summary":" Generative latent diffusion models hold a wide range of applications in the\nmedical imaging domain. A noteworthy application is privacy-preserved open-data\nsharing by proposing synthetic data as surrogates of real patient data. Despite\nthe promise, these models are susceptible to patient data memorization, where\nmodels generate patient data copies instead of novel synthetic samples. This\nundermines the whole purpose of preserving patient data and may even result in\npatient re-identification. Considering the importance of the problem,\nsurprisingly it has received relatively little attention in the medical imaging\ncommunity. To this end, we assess memorization in latent diffusion models for\nmedical image synthesis. We train 2D and 3D latent diffusion models on CT, MR,\nand X-ray datasets for synthetic data generation. Afterwards, we examine the\namount of training data memorized utilizing self-supervised models and further\ninvestigate various factors that can possibly lead to memorization by training\nmodels in different settings. We observe a surprisingly large amount of data\nmemorization among all datasets, with up to 41.7%, 19.6%, and 32.6% of the\ntraining data memorized in CT, MRI, and X-ray datasets respectively. Further\nanalyses reveal that increasing training data size and using data augmentation\nreduce memorization, while over-training enhances it. Overall, our results\nsuggest a call for memorization-informed evaluation of synthetic data prior to\nopen-data sharing.\n","authors":["Salman Ul Hassan Dar","Marvin Seyfarth","Jannik Kahmann","Isabelle Ayx","Theano Papavassiliu","Stefan O. Schoenberg","Sandy Engelhardt"],"pdf_url":"https://arxiv.org/pdf/2402.01054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01052v1","updated":"2024-02-01T22:54:45Z","published":"2024-02-01T22:54:45Z","title":"Weakly Convex Regularisers for Inverse Problems: Convergence of Critical\n Points and Primal-Dual Optimisation","summary":" Variational regularisation is the primary method for solving inverse\nproblems, and recently there has been considerable work leveraging deeply\nlearned regularisation for enhanced performance. However, few results exist\naddressing the convergence of such regularisation, particularly within the\ncontext of critical points as opposed to global minima. In this paper, we\npresent a generalised formulation of convergent regularisation in terms of\ncritical points, and show that this is achieved by a class of weakly convex\nregularisers. 
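The memorization assessment described in the latent-diffusion entry above (comparing synthetic samples against training data in a self-supervised embedding space) reduces to a nearest-neighbour similarity test. A minimal sketch, assuming L2-normalized embeddings already extracted by some encoder and an arbitrary similarity threshold `tau`:

```python
import numpy as np

def flag_memorized(train_emb, synth_emb, tau=0.95):
    """Flag synthetic samples whose nearest training embedding is suspiciously
    close. With L2-normalized rows, cosine similarity is a plain dot product."""
    sims = synth_emb @ train_emb.T            # (S, N) cosine similarities
    nn_sim = sims.max(axis=1)                 # best match per synthetic sample
    return nn_sim, nn_sim > tau

rng = np.random.default_rng(0)
train = rng.normal(size=(1000, 64))
train /= np.linalg.norm(train, axis=1, keepdims=True)
synth = np.vstack([train[:5] + 1e-3, rng.normal(size=(95, 64))])  # 5 near-copies
synth /= np.linalg.norm(synth, axis=1, keepdims=True)
nn_sim, flags = flag_memorized(train, synth)
print(f"flagged {flags.sum()} / {len(synth)} as potential training-data copies")
```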
We prove convergence of the primal-dual hybrid gradient method\nfor the associated variational problem, and, given a Kurdyka-Lojasiewicz\ncondition, an $\\mathcal{O}(\\log{k}/k)$ ergodic convergence rate. Finally,\napplying this theory to learned regularisation, we prove universal\napproximation for input weakly convex neural networks (IWCNN), and show\nempirically that IWCNNs can lead to improved performance of learned adversarial\nregularisers for computed tomography (CT) reconstruction.\n","authors":["Zakhar Shumaylov","Jeremy Budd","Subhadip Mukherjee","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2402.01052v1.pdf","comment":"26 pages, 4 figures, preprint"},{"id":"http://arxiv.org/abs/2309.05388v2","updated":"2024-02-01T22:49:08Z","published":"2023-09-11T11:35:17Z","title":"Robust Single Rotation Averaging Revisited","summary":" In this work, we propose a novel method for robust single rotation averaging\nthat can efficiently handle an extremely large fraction of outliers. Our\napproach is to minimize the total truncated least unsquared deviations (TLUD)\ncost of geodesic distances. The proposed algorithm consists of three steps:\nFirst, we consider each input rotation as a potential initial solution and\nchoose the one that yields the least sum of truncated chordal deviations. Next,\nwe obtain the inlier set using the initial solution and compute its chordal\n$L_2$-mean. Finally, starting from this estimate, we iteratively compute the\ngeodesic $L_1$-mean of the inliers using the Weiszfeld algorithm on $SO(3)$. An\nextensive evaluation shows that our method is robust against up to 99% outliers\ngiven a sufficient number of accurate inliers, outperforming the current state\nof the art.\n","authors":["Seong Hun Lee","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2309.05388v2.pdf","comment":"Added the url to the code\n (https://github.com/sunghoon031/SingleRotationAveraging_TLUD)"},{"id":"http://arxiv.org/abs/2402.01049v1","updated":"2024-02-01T22:37:33Z","published":"2024-02-01T22:37:33Z","title":"IMUGPT 2.0: Language-Based Cross Modality Transfer for Sensor-Based\n Human Activity Recognition","summary":" One of the primary challenges in the field of human activity recognition\n(HAR) is the lack of large labeled datasets. This hinders the development of\nrobust and generalizable models. Recently, cross modality transfer approaches\nhave been explored that can alleviate the problem of data scarcity. These\napproaches convert existing datasets from a source modality, such as video, to\na target modality (IMU). With the emergence of generative AI models such as\nlarge language models (LLMs) and text-driven motion synthesis models, language\nhas become a promising source data modality as well as shown in proof of\nconcepts such as IMUGPT. In this work, we conduct a large-scale evaluation of\nlanguage-based cross modality transfer to determine their effectiveness for\nHAR. Based on this study, we introduce two new extensions for IMUGPT that\nenhance its use for practical HAR application scenarios: a motion filter\ncapable of filtering out irrelevant motion sequences to ensure the relevance of\nthe generated virtual IMU data, and a set of metrics that measure the diversity\nof the generated data facilitating the determination of when to stop generating\nvirtual IMU data for both effective and efficient processing. 
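The third step of the rotation-averaging pipeline above, the geodesic $L_1$ mean of the inliers via the Weiszfeld algorithm on $SO(3)$, is compact enough to sketch with SciPy. The sketch below initializes from the chordal $L_2$ mean of all inputs rather than the paper's truncated-deviation initializer and inlier selection, so it illustrates the Weiszfeld iteration only.

```python
import numpy as np
from scipy.spatial.transform import Rotation as R

def geodesic_l1_mean(rotations, iters=100, eps=1e-9):
    """Weiszfeld iteration for the geodesic L1 mean on SO(3)."""
    mean = rotations.mean()                         # chordal L2 mean as init
    for _ in range(iters):
        v = (rotations * mean.inv()).as_rotvec()    # tangent vectors at mean
        d = np.linalg.norm(v, axis=1)               # geodesic distances
        w = 1.0 / np.maximum(d, eps)                # Weiszfeld weights
        step = (w[:, None] * v).sum(axis=0) / w.sum()
        if np.linalg.norm(step) < 1e-12:
            break
        mean = R.from_rotvec(step) * mean
    return mean

inliers = R.from_rotvec(0.2 * np.random.randn(20, 3))    # near the identity
outliers = R.from_rotvec(np.pi * np.random.randn(3, 3))  # gross outliers
est = geodesic_l1_mean(R.concatenate([inliers, outliers]))
print(est.magnitude())   # small angle: the estimate stays near the identity
```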
We demonstrate\nthat our diversity metrics can reduce the effort needed for the generation of\nvirtual IMU data by at least 50%, which opens up IMUGPT for practical use cases\nbeyond a mere proof of concept.\n","authors":["Zikang Leng","Amitrajit Bhattacharjee","Hrudhai Rajasekhar","Lizhe Zhang","Elizabeth Bruda","Hyeokhyen Kwon","Thomas Plötz"],"pdf_url":"https://arxiv.org/pdf/2402.01049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12050v2","updated":"2024-02-01T22:22:17Z","published":"2023-03-21T17:41:36Z","title":"CurveCloudNet: Processing Point Clouds with 1D Structure","summary":" Modern depth sensors such as LiDAR operate by sweeping laser-beams across the\nscene, resulting in a point cloud with notable 1D curve-like structures. In\nthis work, we introduce a new point cloud processing scheme and backbone,\ncalled CurveCloudNet, which takes advantage of the curve-like structure\ninherent to these sensors. While existing backbones discard the rich 1D\ntraversal patterns and rely on generic 3D operations, CurveCloudNet\nparameterizes the point cloud as a collection of polylines (dubbed a \"curve\ncloud\"), establishing a local surface-aware ordering on the points. By\nreasoning along curves, CurveCloudNet captures lightweight curve-aware priors\nto efficiently and accurately reason in several diverse 3D environments. We\nevaluate CurveCloudNet on multiple synthetic and real datasets that exhibit\ndistinct 3D size and structure. We demonstrate that CurveCloudNet outperforms\nboth point-based and sparse-voxel backbones in various segmentation settings,\nnotably scaling to large scenes better than point-based alternatives while\nexhibiting improved single-object performance over sparse-voxel alternatives.\nIn all, CurveCloudNet is an efficient and accurate backbone that can handle a\nlarger variety of 3D environments than past works.\n","authors":["Colton Stearns","Davis Rempe","Jiateng Liu","Alex Fu","Sebastien Mascha","Jeong Joon Park","Despoina Paschalidou","Leonidas J. Guibas"],"pdf_url":"https://arxiv.org/pdf/2303.12050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15679v2","updated":"2024-02-01T22:08:07Z","published":"2023-11-27T10:10:25Z","title":"Model-agnostic Body Part Relevance Assessment for Pedestrian Detection","summary":" Model-agnostic explanation methods for deep learning models are flexible\nregarding usability and availability. However, because they can only manipulate\ninputs and observe changes in outputs, they suffer from weak\nperformance when used with complex model architectures. For models with large\ninputs, as in object detection for instance, sampling-based methods like\nKernelSHAP are inefficient due to many computation-heavy forward passes through\nthe model. In this work, we present a framework for using sampling-based\nexplanation models in a computer vision context by body part relevance\nassessment for pedestrian detection. 
Furthermore, we introduce a novel\nsampling-based method similar to KernelSHAP that shows more robustness for\nlower sampling sizes and, thus, is more efficient for explainability analyses\non large-scale datasets.\n","authors":["Maurice Günder","Sneha Banerjee","Rafet Sifa","Christian Bauckhage"],"pdf_url":"https://arxiv.org/pdf/2311.15679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02524v2","updated":"2024-02-01T22:06:51Z","published":"2024-01-04T20:23:51Z","title":"Comprehensive Exploration of Synthetic Data Generation: A Survey","summary":" Recent years have witnessed a surge in the popularity of Machine Learning\n(ML), applied across diverse domains. However, progress is impeded by the\nscarcity of training data due to expensive acquisition and privacy legislation.\nSynthetic data emerges as a solution, but the abundance of released models and\nlimited overview literature pose challenges for decision-making. This work\nsurveys 417 Synthetic Data Generation (SDG) models over the last decade,\nproviding a comprehensive overview of model types, functionality, and\nimprovements. Common attributes are identified, leading to a classification and\ntrend analysis. The findings reveal increased model performance and complexity,\nwith neural network-based approaches prevailing, except for privacy-preserving\ndata generation. Computer vision dominates, with GANs as primary generative\nmodels, while diffusion models, transformers, and RNNs compete. Implications\nfrom our performance evaluation highlight the scarcity of common metrics and\ndatasets, making comparisons challenging. Additionally, the neglect of training\nand computational costs in literature necessitates attention in future\nresearch. This work serves as a guide for SDG model selection and identifies\ncrucial areas for future exploration.\n","authors":["André Bauer","Simon Trapp","Michael Stenger","Robert Leppich","Samuel Kounev","Mark Leznik","Kyle Chard","Ian Foster"],"pdf_url":"https://arxiv.org/pdf/2401.02524v2.pdf","comment":"Fixed bug in Figure 44"},{"id":"http://arxiv.org/abs/2402.01034v1","updated":"2024-02-01T21:45:12Z","published":"2024-02-01T21:45:12Z","title":"VISION-MAE: A Foundation Model for Medical Image Segmentation and\n Classification","summary":" Artificial Intelligence (AI) has the potential to revolutionize diagnosis and\nsegmentation in medical imaging. However, development and clinical\nimplementation face multiple challenges including limited data availability,\nlack of generalizability, and the necessity to incorporate multi-modal data\neffectively. A foundation model, which is a large-scale pre-trained AI model,\noffers a versatile base that can be adapted to a variety of specific tasks and\ncontexts. Here, we present a novel foundation model, VISION-MAE, specifically\ndesigned for medical imaging. Specifically, VISION-MAE is trained on a dataset\nof 2.5 million unlabeled images from various modalities (CT, MR, PET, X-rays,\nand ultrasound), using self-supervised learning techniques. It is then adapted\nto classification and segmentation tasks using explicit labels. VISION-MAE has\nhigh label efficiency, outperforming several benchmark models in both in-domain\nand out-of-domain applications, and achieves high performance even with reduced\navailability of labeled data. 
This model represents a significant advancement\nin medical imaging AI, offering a generalizable and robust solution for\nimproving segmentation and classification tasks while reducing the data\nannotation workload.\n","authors":["Zelong Liu","Andrew Tieu","Nikhil Patel","Alexander Zhou","George Soultanidis","Zahi A. Fayad","Timothy Deyer","Xueyan Mei"],"pdf_url":"https://arxiv.org/pdf/2402.01034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01031v1","updated":"2024-02-01T21:43:27Z","published":"2024-02-01T21:43:27Z","title":"MRAnnotator: A Multi-Anatomy Deep Learning Model for MRI Segmentation","summary":" Purpose To develop a deep learning model for multi-anatomy and many-class\nsegmentation of diverse anatomic structures on MRI imaging.\n Materials and Methods In this retrospective study, two datasets were curated\nand annotated for model development and evaluation. An internal dataset of 1022\nMRI sequences from various clinical sites within a health system and an\nexternal dataset of 264 MRI sequences from an independent imaging center were\ncollected. In both datasets, 49 anatomic structures were annotated as the\nground truth. The internal dataset was divided into training, validation, and\ntest sets and used to train and evaluate an nnU-Net model. The external dataset\nwas used to evaluate nnU-Net model generalizability and performance in all\nclasses on independent imaging data. Dice scores were calculated to evaluate\nmodel segmentation performance.\n Results The model achieved an average Dice score of 0.801 on the internal\ntest set, and an average score of 0.814 on the complete external dataset across\n49 classes.\n Conclusion The developed model achieves robust and generalizable segmentation\nof 49 anatomic structures on MRI imaging. A future direction is focused on the\nincorporation of additional anatomic regions and structures into the datasets\nand model.\n","authors":["Alexander Zhou","Zelong Liu","Andrew Tieu","Nikhil Patel","Sean Sun","Anthony Yang","Peter Choi","Valentin Fauveau","George Soultanidis","Mingqian Huang","Amish Doshi","Zahi A. Fayad","Timothy Deyer","Xueyan Mei"],"pdf_url":"https://arxiv.org/pdf/2402.01031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09196v2","updated":"2024-02-01T20:40:44Z","published":"2023-12-14T18:18:34Z","title":"DIRECT: Deep Active Learning under Imbalance and Label Noise","summary":" Class imbalance is a prevalent issue in real world machine learning\napplications, often leading to poor performance in rare and minority classes.\nWith an abundance of wild unlabeled data, active learning is perhaps the most\neffective technique in solving the problem at its root -- collecting a more\nbalanced and informative set of labeled examples during annotation. Label noise\nis another common issue in data annotation jobs, which is especially\nchallenging for active learning methods. In this work, we conduct the first\nstudy of active learning under both class imbalance and label noise. We propose\na novel algorithm that robustly identifies the class separation threshold and\nannotates the most uncertain examples that are closest from it. Through a novel\nreduction to one-dimensional active learning, our algorithm DIRECT is able to\nleverage the classic active learning literature to address issues such as batch\nlabeling and tolerance towards label noise. We present extensive experiments on\nimbalanced datasets with and without label noise. 
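The Dice scores used to evaluate MRAnnotator above are straightforward to compute per class and then average. A minimal NumPy version follows; treating label 0 as background is an assumption for the sketch, not a detail stated in the abstract.

```python
import numpy as np

def dice_score(pred, gt, eps=1e-7):
    """Dice coefficient between two binary masks: 2|A n B| / (|A| + |B|)."""
    pred, gt = pred.astype(bool), gt.astype(bool)
    inter = np.logical_and(pred, gt).sum()
    return (2.0 * inter + eps) / (pred.sum() + gt.sum() + eps)

def mean_dice(pred_labels, gt_labels, num_classes):
    """Average Dice over classes 1..num_classes-1 (class 0 = background)."""
    return float(np.mean([dice_score(pred_labels == c, gt_labels == c)
                          for c in range(1, num_classes)]))

pred = np.random.randint(0, 4, (64, 64))
gt = np.random.randint(0, 4, (64, 64))
print(f"mean Dice: {mean_dice(pred, gt, 4):.3f}")
```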
Our results demonstrate that\nDIRECT can save more than 60% of the annotation budget compared to state-of-art\nactive learning algorithms and more than 80% of annotation budget compared to\nrandom sampling.\n","authors":["Shyam Nuggehalli","Jifan Zhang","Lalit Jain","Robert Nowak"],"pdf_url":"https://arxiv.org/pdf/2312.09196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01002v1","updated":"2024-02-01T20:32:14Z","published":"2024-02-01T20:32:14Z","title":"AI-generated faces free from racial and gender stereotypes","summary":" Text-to-image generative AI models such as Stable Diffusion are used daily by\nmillions worldwide. However, many have raised concerns regarding how these\nmodels amplify racial and gender stereotypes. To study this phenomenon, we\ndevelop a classifier to predict the race, gender, and age group of any given\nface image, and show that it achieves state-of-the-art performance. Using this\nclassifier, we quantify biases in Stable Diffusion across six races, two\ngenders, five age groups, 32 professions, and eight attributes. We then propose\nnovel debiasing solutions that outperform state-of-the-art alternatives.\nAdditionally, we examine the degree to which Stable Diffusion depicts\nindividuals of the same race as being similar to one another. This analysis\nreveals a high degree of stereotyping, e.g., depicting most middle eastern\nmales as being dark-skinned, bearded, and wearing a traditional headdress. We\naddress these limitations by proposing yet another novel solution that\nincreases facial diversity across genders and racial groups. Our solutions are\nopen-sourced and made publicly available.\n","authors":["Nouar AlDahoul","Talal Rahwan","Yasir Zaki"],"pdf_url":"https://arxiv.org/pdf/2402.01002v1.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.00996v1","updated":"2024-02-01T20:19:38Z","published":"2024-02-01T20:19:38Z","title":"mmID: High-Resolution mmWave Imaging for Human Identification","summary":" Achieving accurate human identification through RF imaging has been a\npersistent challenge, primarily attributed to the limited aperture size and its\nconsequent impact on imaging resolution. The existing imaging solution enables\ntasks such as pose estimation, activity recognition, and human tracking based\non deep neural networks by estimating skeleton joints. In contrast to\nestimating joints, this paper proposes to improve imaging resolution by\nestimating the human figure as a whole using conditional generative adversarial\nnetworks (cGAN). In order to reduce training complexity, we use an estimated\nspatial spectrum using the MUltiple SIgnal Classification (MUSIC) algorithm as\ninput to the cGAN. Our system generates environmentally independent,\nhigh-resolution images that can extract unique physical features useful for\nhuman identification. We use a simple convolution layers-based classification\nnetwork to obtain the final identification result. From the experimental\nresults, we show that resolution of the image produced by our trained generator\nis high enough to enable human identification. Our finding indicates\nhigh-resolution accuracy with 5% mean silhouette difference to the Kinect\ndevice. Extensive experiments in different environments on multiple testers\ndemonstrate that our system can achieve 93% overall test accuracy in unseen\nenvironments for static human target identification.\n","authors":["Sakila S. Jayaweera","Sai Deepika Regani","Yuqian Hu","Beibei Wang","K. J. 
Ray Liu"],"pdf_url":"https://arxiv.org/pdf/2402.00996v1.pdf","comment":"This paper was published in the IEEE 9th World Forum on Internet of\n Things"},{"id":"http://arxiv.org/abs/2402.00994v1","updated":"2024-02-01T20:18:06Z","published":"2024-02-01T20:18:06Z","title":"A Cost-Efficient Approach for Creating Virtual Fitting Room using\n Generative Adversarial Networks (GANs)","summary":" Customers all over the world want to see how the clothes fit them or not\nbefore purchasing. Therefore, customers by nature prefer brick-and-mortar\nclothes shopping so they can try on products before purchasing them. But after\nthe Pandemic of COVID19 many sellers either shifted to online shopping or\nclosed their fitting rooms which made the shopping process hesitant and\ndoubtful. The fact that the clothes may not be suitable for their buyers after\npurchase led us to think about using new AI technologies to create an online\nplatform or a virtual fitting room (VFR) in the form of a mobile application\nand a deployed model using a webpage that can be embedded later to any online\nstore where they can try on any number of cloth items without physically trying\nthem. Besides, it will save much searching time for their needs. Furthermore,\nit will reduce the crowding and headache in the physical shops by applying the\nsame technology using a special type of mirror that will enable customers to\ntry on faster. On the other hand, from business owners' perspective, this\nproject will highly increase their online sales, besides, it will save the\nquality of the products by avoiding physical trials issues. The main approach\nused in this work is applying Generative Adversarial Networks (GANs) combined\nwith image processing techniques to generate one output image from two input\nimages which are the person image and the cloth image. This work achieved\nresults that outperformed the state-of-the-art approaches found in literature.\n","authors":["Kirolos Attallah","Girgis Zaky","Nourhan Abdelrhim","Kyrillos Botros","Amjad Dife","Nermin Negied"],"pdf_url":"https://arxiv.org/pdf/2402.00994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12946v5","updated":"2024-02-01T20:15:04Z","published":"2024-01-23T18:07:07Z","title":"Coverage Axis++: Efficient Inner Point Selection for 3D Shape\n Skeletonization","summary":" We introduce Coverage Axis++, a novel and efficient approach to 3D shape\nskeletonization. The current state-of-the-art approaches for this task often\nrely on the watertightness of the input or suffer from substantial\ncomputational costs, thereby limiting their practicality. To address this\nchallenge, Coverage Axis++ proposes a heuristic algorithm to select skeletal\npoints, offering a high-accuracy approximation of the Medial Axis Transform\n(MAT) while significantly mitigating computational intensity for various shape\nrepresentations. We introduce a simple yet effective strategy that considers\nboth shape coverage and uniformity to derive skeletal points. The selection\nprocedure enforces consistency with the shape structure while favoring the\ndominant medial balls, which thus introduces a compact underlying shape\nrepresentation in terms of MAT. As a result, Coverage Axis++ allows for\nskeletonization for various shape representations (e.g., water-tight meshes,\ntriangle soups, point clouds), specification of the number of skeletal points,\nfew hyperparameters, and highly efficient computation with improved\nreconstruction accuracy. 
Extensive experiments across a wide range of 3D shapes\nvalidate the efficiency and effectiveness of Coverage Axis++. The code will be\npublicly available once the paper is published.\n","authors":["Zimeng Wang","Zhiyang Dou","Rui Xu","Cheng Lin","Yuan Liu","Xiaoxiao Long","Shiqing Xin","Lingjie Liu","Taku Komura","Xiaoming Yuan","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12946v5.pdf","comment":"This paper needs major revisions in layout/content"},{"id":"http://arxiv.org/abs/2402.00993v1","updated":"2024-02-01T20:12:26Z","published":"2024-02-01T20:12:26Z","title":"Compressed image quality assessment using stacking","summary":" It is well-known that there is no universal metric for image quality\nevaluation. In this case, distortion-specific metrics can be more reliable. The\nartifact imposed by image compression can be considered as a combination of\nvarious distortions. Depending on the image context, this combination can be\ndifferent. As a result, generalization can be regarded as the major challenge\nin compressed image quality assessment. In this work, stacking is employed\nto provide a reliable method. Both semantic and low-level information are\nemployed in the presented IQA to predict the response of the human visual system. Moreover, the\nresults of the Full-Reference (FR) and No-Reference (NR) models are aggregated\nto improve the proposed Full-Reference method for compressed image quality\nevaluation. An accuracy of 79.6\\% was achieved on the quality benchmark of the CLIC 2024 perceptual\nimage challenge, which illustrates the effectiveness of the\nproposed fusion-based approach.\n","authors":["S. Farhad Hosseini-Benvidi","Hossein Motamednia","Azadeh Mansouri","Mohammadreza Raei","Ahmad Mahmoudi-Aznaveh"],"pdf_url":"https://arxiv.org/pdf/2402.00993v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.00989v1","updated":"2024-02-01T20:10:01Z","published":"2024-02-01T20:10:01Z","title":"YOLinO++: Single-Shot Estimation of Generic Polylines for Mapless\n Automated Driving","summary":" In automated driving, highly accurate maps are commonly used to support and\ncomplement perception. These maps are costly to create and quickly become\noutdated as the traffic world is constantly changing. In order to support or\nreplace the map of an automated system with detections from sensor data, a\nperception module must be able to detect the map features. We propose a neural\nnetwork that follows the one-shot philosophy of YOLO but is designed for\ndetection of 1D structures in images, such as lane boundaries.\n We extend previous ideas by a midpoint-based line representation and anchor\ndefinitions. This representation can be used to describe lane borders,\nmarkings, but also implicit features such as centerlines of lanes. 
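Stacking in the sense used by the compressed-IQA entry above, base predictors whose outputs are aggregated by a trained meta-learner, maps directly onto scikit-learn's `StackingRegressor`. The sketch below is generic: the four synthetic feature columns stand in for scores from FR/NR base metrics, the synthetic target stands in for subjective quality ratings, and none of the estimators correspond to the paper's actual models.

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge

# Toy stand-ins: each column imitates a score from one base quality model
# (e.g., FR metrics and an NR predictor); the target imitates a MOS rating.
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
y = X @ np.array([0.5, 0.3, 0.15, 0.05]) + 0.1 * rng.normal(size=500)

stack = StackingRegressor(
    estimators=[("rf", RandomForestRegressor(n_estimators=50, random_state=0)),
                ("ridge", Ridge())],
    final_estimator=Ridge(),   # meta-learner aggregates base predictions
)
stack.fit(X[:400], y[:400])
print("held-out R^2:", round(stack.score(X[400:], y[400:]), 3))
```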
The broad\napplicability of the approach is shown with the detection performance on lane\ncenterlines, lane borders as well as the markings both on highways and in urban\nareas.\n Versatile lane boundaries are detected and can be inherently classified as\ndashed or solid lines, curb, road boundaries, or implicit delimitation.\n","authors":["Annika Meyer","Christoph Stiller"],"pdf_url":"https://arxiv.org/pdf/2402.00989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00977v1","updated":"2024-02-01T19:47:34Z","published":"2024-02-01T19:47:34Z","title":"Enhanced fringe-to-phase framework using deep learning","summary":" In Fringe Projection Profilometry (FPP), achieving robust and accurate 3D\nreconstruction with a limited number of fringe patterns remains a challenge in\nstructured light 3D imaging. Conventional methods require a set of fringe\nimages, but using only one or two patterns complicates phase recovery and\nunwrapping. In this study, we introduce SFNet, a symmetric fusion network that\ntransforms two fringe images into an absolute phase. To enhance output\nreliability, Our framework predicts refined phases by incorporating information\nfrom fringe images of a different frequency than those used as input. This\nallows us to achieve high accuracy with just two images. Comparative\nexperiments and ablation studies validate the effectiveness of our proposed\nmethod. The dataset and code are publicly accessible on our project page\nhttps://wonhoe-kim.github.io/SFNet.\n","authors":["Won-Hoe Kim","Bongjoong Kim","Hyung-Gun Chi","Jae-Sang Hyun"],"pdf_url":"https://arxiv.org/pdf/2402.00977v1.pdf","comment":"35 pages, 13 figures, 6 tables"},{"id":"http://arxiv.org/abs/2303.18240v2","updated":"2024-02-01T19:42:05Z","published":"2023-03-31T17:56:33Z","title":"Where are we in the search for an Artificial Visual Cortex for Embodied\n Intelligence?","summary":" We present the largest and most comprehensive empirical study of pre-trained\nvisual representations (PVRs) or visual 'foundation models' for Embodied AI.\nFirst, we curate CortexBench, consisting of 17 different tasks spanning\nlocomotion, navigation, dexterous, and mobile manipulation. Next, we\nsystematically evaluate existing PVRs and find that none are universally\ndominant. To study the effect of pre-training data size and diversity, we\ncombine over 4,000 hours of egocentric videos from 7 different sources (over\n4.3M images) and ImageNet to train different-sized vision transformers using\nMasked Auto-Encoding (MAE) on slices of this data. Contrary to inferences from\nprior work, we find that scaling dataset size and diversity does not improve\nperformance universally (but does so on average). Our largest model, named\nVC-1, outperforms all prior PVRs on average but does not universally dominate\neither. Next, we show that task- or domain-specific adaptation of VC-1 leads to\nsubstantial gains, with VC-1 (adapted) achieving competitive or superior\nperformance than the best known results on all of the benchmarks in\nCortexBench. Finally, we present real-world hardware experiments, in which VC-1\nand VC-1 (adapted) outperform the strongest pre-existing PVR. 
Overall, this\npaper presents no new techniques but a rigorous systematic evaluation, a broad\nset of findings about PVRs (that in some cases, refute those made in narrow\ndomains in prior work), and open-sourced code and models (that required over\n10,000 GPU-hours to train) for the benefit of the research community.\n","authors":["Arjun Majumdar","Karmesh Yadav","Sergio Arnaud","Yecheng Jason Ma","Claire Chen","Sneha Silwal","Aryan Jain","Vincent-Pierre Berges","Pieter Abbeel","Jitendra Malik","Dhruv Batra","Yixin Lin","Oleksandr Maksymets","Aravind Rajeswaran","Franziska Meier"],"pdf_url":"https://arxiv.org/pdf/2303.18240v2.pdf","comment":"Project website: https://eai-vc.github.io"},{"id":"http://arxiv.org/abs/2402.00971v1","updated":"2024-02-01T19:40:39Z","published":"2024-02-01T19:40:39Z","title":"FuseFormer: A Transformer for Visual and Thermal Image Fusion","summary":" Image fusion is the process of combining images from different sensors into a\nsingle image that incorporates all relevant information. The majority of\nstate-of-the-art image fusion techniques use deep learning methods to extract\nmeaningful features; however, they primarily integrate local features without\nconsidering the image's broader context. To overcome this limitation,\nTransformer-based models have emerged as a promising solution, aiming to\ncapture general context dependencies through attention mechanisms. Since there\nis no ground truth for image fusion, the loss functions are structured based on\nevaluation metrics, such as the structural similarity index measure (SSIM). By\ndoing so, we create a bias towards the SSIM and, therefore, the input visual\nband image. The objective of this study is to propose a novel methodology for\nimage fusion that mitigates the limitations associated with using evaluation\nmetrics as loss functions. Our approach integrates a transformer-based\nmulti-scale fusion strategy, which adeptly addresses both local and global\ncontext information. This integration not only refines the individual\ncomponents of the image fusion process but also significantly enhances the\noverall efficacy of the method. Our proposed method follows a two-stage\ntraining approach, where an auto-encoder is initially trained to extract deep\nfeatures at multiple scales at the first stage. For the second stage, we\nintegrate our fusion block and change the loss function as mentioned. The\nmulti-scale features are fused using a combination of Convolutional Neural\nNetworks (CNNs) and Transformers. The CNNs are utilized to capture local\nfeatures, while the Transformer handles the integration of general context\nfeatures.\n","authors":["Aytekin Erdogan","Erdem Akagunduz"],"pdf_url":"https://arxiv.org/pdf/2402.00971v1.pdf","comment":"9 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.00965v1","updated":"2024-02-01T19:31:51Z","published":"2024-02-01T19:31:51Z","title":"Multi-Modal Machine Learning Framework for Automated Seizure Detection\n in Laboratory Rats","summary":" A multi-modal machine learning system uses multiple unique data sources and\ntypes to improve its performance. This article proposes a system that combines\nresults from several types of models, all of which are trained on different\ndata signals. As an example to illustrate the efficacy of the system, an\nexperiment is described in which multiple types of data are collected from rats\nsuffering from seizures. This data includes electrocorticography readings,\npiezoelectric motion sensor data, and video recordings. 
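For the multi-modal seizure detector introduced above, the simplest way per-signal predictions can later be combined to filter false positives is frame-wise voting, since a spurious detection rarely fires in every signal at once. A toy sketch follows; the 2-of-3 vote threshold is an assumption, as the abstract does not specify the actual postprocessing.

```python
import numpy as np

def fuse_predictions(preds, min_votes=2):
    """Late fusion over modalities: a frame counts as seizure only when at
    least `min_votes` per-modality classifiers agree. With heavy class
    imbalance, this suppresses false positives seen in a single signal."""
    votes = np.sum(preds, axis=0)          # (T,) agreement count per frame
    return (votes >= min_votes).astype(int)

# Toy per-frame predictions: ECoG, motion sensor, video (1 = seizure).
ecog   = np.array([0, 1, 1, 0, 1, 0, 0, 1])
motion = np.array([0, 1, 1, 0, 0, 0, 1, 1])
video  = np.array([0, 1, 0, 0, 0, 0, 0, 1])
print(fuse_predictions(np.stack([ecog, motion, video])))  # [0 1 1 0 0 0 0 1]
```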
Separate models are\ntrained on each type of data, with the goal of classifying each time frame as\neither containing a seizure or not. After each model has generated its\nclassification predictions, these results are combined. While each data signal\nworks adequately on its own for prediction purposes, the significant imbalance\nin class labels leads to increased numbers of false positives, which can be\nfiltered and removed by utilizing all data sources. This paper will demonstrate\nthat, after postprocessing and combination techniques, classification accuracy\nis improved with this multi-modal system when compared to the performance of\neach individual data source.\n","authors":["Aaron Mullen","Samuel E. Armstrong","Jasmine Perdeh","Bjorn Bauer","Jeffrey Talbert","V. K. Cody Bumgardner"],"pdf_url":"https://arxiv.org/pdf/2402.00965v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2402.00485v1","updated":"2024-02-01T10:42:05Z","published":"2024-02-01T10:42:05Z","title":"A Personalized Framework for Consumer and Producer Group Fairness\n Optimization in Recommender Systems","summary":" In recent years, there has been an increasing recognition that when machine\nlearning (ML) algorithms are used to automate decisions, they may mistreat\nindividuals or groups, with legal, ethical, or economic implications.\nRecommender systems are prominent examples of these machine learning (ML)\nsystems that aid users in making decisions. The majority of past research on\nRS fairness treats user and item fairness concerns independently, ignoring the\nfact that recommender systems function in a two-sided marketplace.\nIn this paper, we propose CP-FairRank, an optimization-based re-ranking\nalgorithm that seamlessly integrates fairness constraints from both the\nconsumer and producer side in a joint objective framework. The framework is\ngeneralizable and may take into account varied fairness settings based on group\nsegmentation, recommendation model selection, and domain, which is one of its\nkey characteristics. For instance, we demonstrate that the system may jointly\nincrease consumer and producer fairness when (un)protected consumer groups are\ndefined on the basis of their activity level and main-streamness, while\nproducer groups are defined according to their popularity level. For empirical\nvalidation, through large-scale experiments on eight datasets and four mainstream\ncollaborative filtering (CF) recommendation models, we demonstrate that our\nproposed strategy is able to improve both consumer and producer fairness\nwithout compromising, or only minimally reducing, overall recommendation quality,\ndemonstrating the role algorithms may play in avoiding data biases.\n","authors":["Hossein A. Rahmani","Mohammadmehdi Naghiaei","Yashar Deldjoo"],"pdf_url":"https://arxiv.org/pdf/2402.00485v1.pdf","comment":"TORS. arXiv admin note: substantial text overlap with\n arXiv:2204.08085"},{"id":"http://arxiv.org/abs/2402.00421v1","updated":"2024-02-01T08:37:13Z","published":"2024-02-01T08:37:13Z","title":"From PARIS to LE-PARIS: Toward Patent Response Automation with\n Recommender Systems and Collaborative Large Language Models","summary":" In patent prosecution, timely and effective responses to Office Actions (OAs)\nare crucial for acquiring patents, yet past automation and AI research have\nscarcely addressed this aspect.
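The late-fusion step described in the multi-modal seizure detection entry above is easy to make concrete. Below is a minimal, hypothetical sketch (not the authors' code): each modality-specific model emits per-time-frame binary labels, and a frame counts as a seizure only when enough modalities agree, which is one simple way to filter single-source false positives.

```python
import numpy as np

# Hypothetical late-fusion of per-modality seizure predictions: keep a frame
# as a detection only when at least `min_votes` modalities flagged it.
def fuse_predictions(per_modality_preds: np.ndarray, min_votes: int = 2) -> np.ndarray:
    """per_modality_preds: (n_modalities, n_frames) array of 0/1 labels."""
    votes = per_modality_preds.sum(axis=0)
    return (votes >= min_votes).astype(int)

ecog  = np.array([0, 1, 1, 0, 1, 0])  # e.g. electrocorticography model
piezo = np.array([0, 1, 1, 0, 0, 0])  # motion-sensor model
video = np.array([0, 1, 0, 0, 0, 1])  # video model
print(fuse_predictions(np.stack([ecog, piezo, video])))
# [0 1 1 0 0 0] -- isolated single-source positives are filtered out
```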
To address this gap, our study introduces the\nPatent Office Action Response Intelligence System (PARIS) and its advanced\nversion, the Large Language Model Enhanced PARIS (LE-PARIS). These systems are\ndesigned to improve the efficiency of patent attorneys in collaboratively\nhandling OA responses. The systems' key features include the construction of an\nOA Topics Database, development of Response Templates, and implementation of\nRecommender Systems and LLM-based Response Generation. Our validation involves\na multi-paradigmatic analysis using the USPTO Office Action database and\nlongitudinal data of attorney interactions with our systems over six years.\nThrough five studies, we examine the constructiveness of OA topics (studies 1\nand 2) using topic modeling and the proposed Delphi process, the efficacy of\nour proposed hybrid recommender system tailored for OA (both LLM-based and\nnon-LLM-based) (study 3), the quality of response generation (study 4), and the\npractical value of the systems in real-world scenarios via user studies (study\n5). Results demonstrate that both PARIS and LE-PARIS significantly meet key\nmetrics and positively impact attorney performance.\n","authors":["Jung-Mei Chu","Hao-Cheng Lo","Jieh Hsiang","Chun-Chieh Cho"],"pdf_url":"https://arxiv.org/pdf/2402.00421v1.pdf","comment":"14 pages, 4 figures, submitted to a journal"},{"id":"http://arxiv.org/abs/2402.00390v1","updated":"2024-02-01T07:22:52Z","published":"2024-02-01T07:22:52Z","title":"EASRec: Elastic Architecture Search for Efficient Long-term Sequential\n Recommender Systems","summary":" In this age where data is abundant, the ability to distill meaningful\ninsights from the sea of information is essential. Our research addresses the\ncomputational and resource inefficiencies that current Sequential Recommender\nSystems (SRSs) suffer from, especially those employing attention-based models\nlike SASRec. These systems are designed for next-item recommendations in\nvarious applications, from e-commerce to social networks. However, such systems\nsuffer from substantial computational costs and resource consumption during the\ninference stage. To tackle these issues, our research proposes a novel method\nthat combines automatic pruning techniques with advanced model architectures.\nWe also explore the potential of resource-constrained Neural Architecture\nSearch (NAS), a technique prevalent in the realm of recommendation systems, to\nfine-tune models for reduced FLOPs, latency, and energy usage while retaining\nor even enhancing accuracy. The main contribution of our work is developing the\nElastic Architecture Search for Efficient Long-term Sequential Recommender\nSystems (EASRec). This approach aims to find optimal compact architectures for\nattention-based SRSs, ensuring accuracy retention. EASRec introduces data-aware\ngates that leverage historical information from the input data batch to improve the\nperformance of the recommendation network. Additionally, it utilizes a dynamic\nresource constraint approach, which standardizes the search process and results\nin more appropriate architectures. The effectiveness of our methodology is\nvalidated through exhaustive experiments on three benchmark datasets, which\ndemonstrate EASRec's superiority in SRSs.
Our research set a new standard for\nfuture exploration into efficient and accurate recommender systems, signifying\na substantial advancement within this swiftly advancing field.\n","authors":["Sheng Zhang","Maolin Wang","Yao Zhao","Chenyi Zhuang","Jinjie Gu","Ruocheng Guo","Xiangyu Zhao","Zijian Zhang","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2402.00390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13063v2","updated":"2024-02-01T04:57:05Z","published":"2023-09-14T20:46:48Z","title":"Using Large Language Models to Generate, Validate, and Apply User Intent\n Taxonomies","summary":" Log data can reveal valuable information about how users interact with Web\nsearch services, what they want, and how satisfied they are. However, analyzing\nuser intents in log data is not easy, especially for emerging forms of Web\nsearch such as AI-driven chat. To understand user intents from log data, we\nneed a way to label them with meaningful categories that capture their\ndiversity and dynamics. Existing methods rely on manual or machine-learned\nlabeling, which are either expensive or inflexible for large and dynamic\ndatasets. We propose a novel solution using large language models (LLMs), which\ncan generate rich and relevant concepts, descriptions, and examples for user\nintents. However, using LLMs to generate a user intent taxonomy and apply it\nfor log analysis can be problematic for two main reasons: (1) such a taxonomy\nis not externally validated; and (2) there may be an undesirable feedback loop.\nTo address this, we propose a new methodology with human experts and assessors\nto verify the quality of the LLM-generated taxonomy. We also present an\nend-to-end pipeline that uses an LLM with human-in-the-loop to produce, refine,\nand apply labels for user intent analysis in log data. We demonstrate its\neffectiveness by uncovering new insights into user intents from search and chat\nlogs from the Microsoft Bing commercial search engine. The proposed work's\nnovelty stems from the method for generating purpose-driven user intent\ntaxonomies with strong validation. This method not only helps remove\nmethodological and practical bottlenecks from intent-focused research, but also\nprovides a new framework for generating, validating, and applying other kinds\nof taxonomies in a scalable and adaptable way with minimal human effort.\n","authors":["Chirag Shah","Ryen W. White","Reid Andersen","Georg Buscher","Scott Counts","Sarkar Snigdha Sarathi Das","Ali Montazer","Sathish Manivannan","Jennifer Neville","Xiaochuan Ni","Nagu Rangan","Tara Safavi","Siddharth Suri","Mengting Wan","Leijie Wang","Longqi Yang"],"pdf_url":"https://arxiv.org/pdf/2309.13063v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10776v4","updated":"2024-02-01T04:19:41Z","published":"2023-11-16T01:21:33Z","title":"Chemist-X: Large Language Model-empowered Agent for Reaction Condition\n Recommendation in Chemical Synthesis","summary":" Recent AI research plots a promising future of automatic chemical reactions\nwithin the chemistry society. This study proposes Chemist-X, a transformative\nAI agent that automates the reaction condition recommendation (RCR) task in\nchemical synthesis with retrieval-augmented generation (RAG) technology. To\nemulate expert chemists' strategies when solving RCR tasks, Chemist-X utilizes\nadvanced RAG schemes to interrogate online molecular databases and distill\ncritical data from the latest literature database. 
Further, the agent leverages\nstate-of-the-art computer-aided design (CAD) tools with a large language model\n(LLM) supervised programming interface. With the ability to utilize updated\nchemical knowledge and CAD tools, our agent significantly outperforms\nconventional synthesis AIs confined to the fixed knowledge within their training\ndata. Chemist-X considerably reduces chemists' workload and allows them to\nfocus on more fundamental and creative problems, thereby bringing computational\ntechniques and chemical research closer together and making a remarkable leap\ntoward harnessing AI's full capabilities in scientific discovery.\n","authors":["Kexin Chen","Junyou Li","Kunyi Wang","Yuyang Du","Jiahui Yu","Jiamin Lu","Lanqing Li","Jiezhong Qiu","Jianzhang Pan","Yi Huang","Qun Fang","Pheng Ann Heng","Guangyong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.10776v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16868v2","updated":"2024-02-01T03:55:02Z","published":"2023-12-28T07:37:30Z","title":"Pareto-based Multi-Objective Recommender System with Forgetting Curve","summary":" Recommender systems with cascading architecture play an increasingly\nsignificant role in online recommendation platforms, where the approach to\ndealing with negative feedback is a vital issue. For instance, in short video\nplatforms, users tend to quickly slip away from candidates that they find\naversive, and recommender systems are expected to receive this explicit\nnegative feedback and make adjustments to avoid such recommendations.\nConsidering the recency effect in memory, we propose a forgetting model based on\nthe Ebbinghaus Forgetting Curve to cope with negative feedback. In addition, we\nintroduce a Pareto optimization solver to guarantee a better trade-off between\nrecency and model performance. In conclusion, we propose the Pareto-based\nMulti-Objective Recommender System with forgetting curve (PMORS), which can be\napplied to any multi-objective recommendation and shows clear superiority\nwhen facing explicit negative feedback. We have conducted evaluations of PMORS\nand achieved favorable outcomes in short-video scenarios on both public dataset\nand industrial dataset. After being deployed on an online short video platform\nnamed WeChat Channels in May 2023, PMORS has not only demonstrated promising\nresults for both consistency and recency but also achieved an improvement of up\nto +1.45% GMV.\n","authors":["Jipeng Jin","Zhaoxiang Zhang","Zhiheng Li","Xiaofeng Gao","Xiongwen Yang","Lei Xiao","Jie Jiang"],"pdf_url":"https://arxiv.org/pdf/2312.16868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00309v1","updated":"2024-02-01T03:46:11Z","published":"2024-02-01T03:46:11Z","title":"An Exam-based Evaluation Approach Beyond Traditional Relevance Judgments","summary":" Current IR evaluation is based on relevance judgments, created either\nmanually or automatically, with decisions outsourced to Large Language Models\n(LLMs). We offer an alternative paradigm that never relies on relevance\njudgments in any form. Instead, a text is defined as relevant if it contains\ninformation that enables the answering of key questions. We use this idea to\ndesign the EXAM Answerability Metric to evaluate information\nretrieval/generation systems for their ability to provide topically relevant\ninformation.\n We envision the role of a human judge to edit and define an exam question\nbank that will test for the presence of relevant information in text.
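The forgetting model in the PMORS entry above is built on the Ebbinghaus Forgetting Curve, under which retention decays exponentially with elapsed time. A minimal sketch follows, assuming the standard exponential form R = exp(-t/s); the strength parameter `s` is a hypothetical tunable, not a value from the paper.

```python
import math

# Ebbinghaus-style retention: the weight attached to an explicit negative
# feedback event decays exponentially with the time elapsed since it occurred.
def forgetting_weight(elapsed: float, s: float = 5.0) -> float:
    """Retention R = exp(-t / s); `s` controls how quickly memory fades."""
    return math.exp(-elapsed / s)

for t in (0, 1, 5, 20):
    print(t, round(forgetting_weight(t), 3))  # 1.0, 0.819, 0.368, 0.018
```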
We\nsupport this step by generating an initial set of exam questions. In the next\nphase, an LLM-based question answering system will automatically grade system\nresponses by tracking which exam questions are answerable with which system\nresponses. We propose two evaluation measures, the recall-oriented EXAM Cover\nmetric, and the precision-oriented EXAM Qrels metric, the latter of which can be\nimplemented with trec_eval. This paradigm not only allows for the expansion of\nthe exam question set post-hoc but also facilitates the ongoing evaluation of\nfuture information systems, whether they focus on retrieval, generation, or\nboth.\n","authors":["Naghmeh Farzi","Laura Dietz"],"pdf_url":"https://arxiv.org/pdf/2402.00309v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2402.00284v1","updated":"2024-02-01T02:29:16Z","published":"2024-02-01T02:29:16Z","title":"PAP-REC: Personalized Automatic Prompt for Recommendation Language Model","summary":" Recently emerged prompt-based Recommendation Language Models (RLM) can solve\nmultiple recommendation tasks uniformly. The RLMs make full use of the\ninherited knowledge learned from the abundant pre-training data to solve the\ndownstream recommendation tasks by prompts, without introducing additional\nparameters or network training. However, handcrafted prompts require\nsignificant expertise and human effort since slightly rewriting prompts may\ncause massive performance changes. In this paper, we propose PAP-REC, a\nframework to generate the Personalized Automatic Prompt for RECommendation\nlanguage models to mitigate the inefficiency and ineffectiveness problems\nderived from manually designed prompts. Specifically, personalized automatic\nprompts allow different users to have different prompt tokens for the same\ntask, automatically generated using a gradient-based method. One challenge for\npersonalized automatic prompt generation for recommendation language models is\nthe extremely large search space, leading to a long convergence time. To\neffectively and efficiently address the problem, we develop surrogate metrics\nand leverage an alternative updating schedule for prompting recommendation\nlanguage models. Experimental results show that our PAP-REC framework manages\nto generate personalized prompts, and the automatically generated prompts\noutperform manually constructed prompts and also outperform various baseline\nrecommendation models. The source code of the work is available at\nhttps://github.com/rutgerswiselab/PAP-REC.\n","authors":["Zelong Li","Jianchao Ji","Yingqiang Ge","Wenyue Hua","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06320v2","updated":"2024-02-01T02:08:28Z","published":"2024-01-12T01:54:08Z","title":"Zero-shot Generative Large Language Models for Systematic Review\n Screening Automation","summary":" Systematic reviews are crucial for evidence-based medicine as they\ncomprehensively analyse published research findings on specific questions.\nConducting such reviews is often resource- and time-intensive, especially in\nthe screening phase, where abstracts of publications are assessed for inclusion\nin a review. This study investigates the effectiveness of using zero-shot large\nlanguage models~(LLMs) for automatic screening. We evaluate the effectiveness\nof eight different LLMs and investigate a calibration technique that uses a\npredefined recall threshold to determine whether a publication should be\nincluded in a systematic review.
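The recall-oriented EXAM Cover metric described above can be read as the fraction of exam questions made answerable by a system's responses. A small illustrative sketch, where `is_answerable` is a hypothetical stand-in for the LLM-based question-answering grader:

```python
# Fraction of exam questions that at least one system response makes
# answerable; `is_answerable(question, response)` would be an LLM grader.
def exam_cover(responses, questions, is_answerable) -> float:
    answered = {q for q in questions
                if any(is_answerable(q, r) for r in responses)}
    return len(answered) / len(questions)

# Toy usage with a trivially simple answerability check (substring match).
questions = ["capital of France", "largest planet"]
responses = ["Paris is the capital of France."]
print(exam_cover(responses, questions,
                 lambda q, r: q.split()[-1].lower() in r.lower()))  # 0.5
```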
Our comprehensive evaluation using five\nstandard test collections shows that instruction fine-tuning plays an important\nrole in screening, that calibration renders LLMs practical for achieving a\ntargeted recall, and that combining both with an ensemble of zero-shot models\nsaves significant screening time compared to state-of-the-art approaches.\n","authors":["Shuai Wang","Harrisen Scells","Shengyao Zhuang","Martin Potthast","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2401.06320v2.pdf","comment":"Accepted to ECIR2024 full paper (findings)"},{"id":"http://arxiv.org/abs/2401.01472v2","updated":"2024-02-01T01:32:48Z","published":"2024-01-03T00:13:52Z","title":"A First Look at Information Highlighting in Stack Overflow Answers","summary":" Context: Navigating the knowledge of Stack Overflow (SO) remains challenging.\nTo make the posts vivid to users, SO allows users to write and edit posts with\nMarkdown or HTML so that users can leverage various formatting styles (e.g.,\nbold, italic, and code) to highlight the important information. Nonetheless,\nthere have been limited studies on the highlighted information. Objective: We\ncarried out the first large-scale exploratory study on the information\nhighlighted in SO answers in our recent study. To extend our previous study, we\ndevelop approaches to automatically recommend highlighted content with\nformatting styles using neural network architectures initially designed for the\nNamed Entity Recognition task. Method: In this paper, we studied 31,169,429\nanswers of Stack Overflow. For training recommendation models, we choose CNN\nand BERT models for each type of formatting (i.e., Bold, Italic, Code, and\nHeading) using the information highlighting dataset we collected from SO\nanswers. Results: Our models based on CNN architecture achieve precision\nranging from 0.71 to 0.82. The trained model for automatic code content\nhighlighting achieves a recall of 0.73 and an F1 score of 0.71, outperforming\nthe trained models for other formatting styles. The BERT models have even lower\nrecalls and F1 scores than the CNN models. Our analysis of failure cases\nindicates that the majority of the failure cases are missing identification\n(i.e., the model misses the content that is supposed to be highlighted) because\nthe models tend to learn the frequently highlighted words while struggling to\nlearn less frequent words. Conclusion: Our findings suggest that it is possible\nto develop recommendation models for highlighting information for answers with\ndifferent formatting styles on Stack Overflow.\n","authors":["Shahla Shaan Ahmed","Shaowei Wang","Yuan Tian","Tse-Hsun Chen","Haoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.01472v2.pdf","comment":"This work is submitted to Information and Software Technology Journal"},{"id":"http://arxiv.org/abs/2402.01008v1","updated":"2024-02-01T20:46:07Z","published":"2024-02-01T20:46:07Z","title":"CF4J: Collaborative Filtering for Java","summary":" Recommender Systems (RS) provide a relevant tool to mitigate the information\noverload problem. A large number of researchers have published hundreds of\npapers to improve different RS features. It is advisable to use RS frameworks\nthat make it easier for RS researchers: a) to design and implement recommendation\nmethods and, b) to speed up the execution time of the experiments. In this\npaper, we present CF4J, a Java library designed to carry out Collaborative\nFiltering based RS research experiments.
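The calibration technique in the zero-shot screening entry above, which uses a predefined recall threshold, can be sketched as choosing the lowest score threshold that still attains the target recall on labeled calibration data. This is an illustrative simplification under that assumption, not the paper's exact procedure:

```python
import numpy as np

# Pick the lowest score threshold that keeps at least `target_recall` of the
# known-relevant calibration items; publications scoring above it are included.
def calibrate_threshold(scores: np.ndarray, labels: np.ndarray,
                        target_recall: float = 0.95) -> float:
    pos_scores = np.sort(scores[labels == 1])[::-1]  # descending
    k = int(np.ceil(target_recall * len(pos_scores)))
    return pos_scores[k - 1]  # lowest score we must still include

scores = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.1])
labels = np.array([1,   1,   0,   1,   0,   0])
tau = calibrate_threshold(scores, labels)
print(tau, (scores >= tau).astype(int))  # 0.4 -> screen in the first four items
```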
CF4J has been designed by\nresearchers, for researchers. It allows: a) reading RS datasets, b) full and easy\naccess to data and intermediate or final results, c) extending its main\nfunctionalities, d) concurrently executing the implemented methods, and e)\nthorough evaluation of the implementations by quality measures. In\nsummary, CF4J serves as a library specifically designed for the research trial\nand error process.\n","authors":["Fernando Ortega","Bo Zhu","Jesus Bobadilla","Antonio Hernando"],"pdf_url":"https://arxiv.org/pdf/2402.01008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00969v1","updated":"2024-02-01T19:38:32Z","published":"2024-02-01T19:38:32Z","title":"SPARQL Generation with Entity Pre-trained GPT for KG Question Answering","summary":" The popularity of Knowledge Graphs has been growing rapidly in recent years.\nAll that knowledge is available for people to query through the many online\ndatabases on the internet. Still, it would be a great achievement if\nnon-programmer users could access whatever information they want to know. There\nhas been a lot of effort directed at solving this task using natural language\nprocessing tools, with creativity encouraged by way of many challenges. Our\napproach focuses on assuming a correct entity linking on the natural language\nquestions and training a GPT model to create SPARQL queries from them. We\nmanaged to isolate which property of the task can be the most difficult to\nsolve in few- or zero-shot settings and we proposed pre-training on all\nentities (under CWA) to improve the performance. We obtained a 62.703% accuracy\nof exact SPARQL matches on testing at 3 shots, an F1 of 0.809 on the entity\nlinking challenge and an F1 of 0.009 on the question answering challenge.\n","authors":["Diego Bustamante","Hideaki Takeda"],"pdf_url":"https://arxiv.org/pdf/2402.00969v1.pdf","comment":"7 pages, 1 figure, 2 tables. For the implementation, see\n https://github.com/DiegoEmilio01/SPARQL-generation-with-entity-pre-trained-GPT-for-KG-Question-Answering"},{"id":"http://arxiv.org/abs/2402.00943v1","updated":"2024-02-01T19:00:40Z","published":"2024-02-01T19:00:40Z","title":"Approximate Nearest Neighbor Search with Window Filters","summary":" We define and investigate the problem of $\\textit{c-approximate window\nsearch}$: approximate nearest neighbor search where each point in the dataset\nhas a numeric label, and the goal is to find nearest neighbors to queries\nwithin arbitrary label ranges. Many semantic search problems, such as image and\ndocument search with timestamp filters, or product search with cost filters,\nare natural examples of this problem. We propose and theoretically analyze a\nmodular tree-based framework for transforming an index that solves the\ntraditional c-approximate nearest neighbor problem into a data structure that\nsolves window search.
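The modular tree-based framework in the window-search entry above can be illustrated with a toy version: sort points by label, build a balanced binary tree whose nodes each own an index over a contiguous label range, and answer a window query by searching only the nodes that exactly cover the query range. This is a conceptual sketch; a brute-force scan stands in for a real c-approximate nearest neighbor index.

```python
import numpy as np

# Toy label-range tree: each node owns an "index" over a contiguous label
# range; a query range is covered by O(log n) fully-contained nodes.
class Node:
    def __init__(self, pts, labels):
        self.pts, self.labels = pts, labels
        self.lo, self.hi = labels[0], labels[-1]
        self.left = self.right = None
        if len(pts) > 1:
            m = len(pts) // 2
            self.left = Node(pts[:m], labels[:m])
            self.right = Node(pts[m:], labels[m:])

    def query(self, q, lo, hi, out):
        if hi < self.lo or lo > self.hi:
            return                                   # disjoint: skip
        if lo <= self.lo and self.hi <= hi:          # fully covered: search index
            d = np.linalg.norm(self.pts - q, axis=1)
            out.append((float(d.min()), self.labels[int(d.argmin())]))
            return
        self.left.query(q, lo, hi, out)              # partial overlap: recurse
        self.right.query(q, lo, hi, out)

rng = np.random.default_rng(0)
pts, labels = rng.normal(size=(16, 4)), list(range(16))  # sorted by label
root, out = Node(pts, labels), []
root.query(rng.normal(size=4), lo=3, hi=11, out=out)
print(min(out))  # (distance, label) of the nearest point with label in [3, 11]
```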
On standard nearest neighbor benchmark datasets equipped\nwith random label values, adversarially constructed embeddings, and image\nsearch embeddings with real timestamps, we obtain up to a $75\\times$ speedup\nover existing solutions at the same level of recall.\n","authors":["Joshua Engels","Benjamin Landrum","Shangdi Yu","Laxman Dhulipala","Julian Shun"],"pdf_url":"https://arxiv.org/pdf/2402.00943v1.pdf","comment":"Code available: https://github.com/JoshEngels/RangeFilteredANN"},{"id":"http://arxiv.org/abs/2402.03368v1","updated":"2024-02-01T23:51:29Z","published":"2024-02-01T23:51:29Z","title":"Empirical and Experimental Perspectives on Big Data in Recommendation\n Systems: A Comprehensive Survey","summary":" This survey paper provides a comprehensive analysis of big data algorithms in\nrecommendation systems, addressing the lack of depth and precision in existing\nliterature. It proposes a two-pronged approach: a thorough analysis of current\nalgorithms and a novel, hierarchical taxonomy for precise categorization. The\ntaxonomy is based on a tri-level hierarchy, starting with the methodology\ncategory and narrowing down to specific techniques. Such a framework allows for\na structured and comprehensive classification of algorithms, assisting\nresearchers in understanding the interrelationships among diverse algorithms\nand techniques. Covering a wide range of algorithms, this taxonomy first\ncategorizes algorithms into four main analysis types: User and Item\nSimilarity-Based Methods, Hybrid and Combined Approaches, Deep Learning and\nAlgorithmic Methods, and Mathematical Modeling Methods, with further\nsubdivisions into sub-categories and techniques. The paper incorporates both\nempirical and experimental evaluations to differentiate between the techniques.\nThe empirical evaluation ranks the techniques based on four criteria. The\nexperimental assessments rank the algorithms that belong to the same category,\nsub-category, technique, and sub-technique. Also, the paper illuminates the\nfuture prospects of big data techniques in recommendation systems, underscoring\npotential advancements and opportunities for further research in this field\n","authors":["Kamal Taha","Paul D. Yoo","Aya Taha"],"pdf_url":"https://arxiv.org/pdf/2402.03368v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.18826v4","updated":"2024-02-01T18:59:44Z","published":"2023-11-30T18:59:05Z","title":"Geometry-Aware Normalizing Wasserstein Flows for Optimal Causal\n Inference","summary":" This paper presents a groundbreaking approach to causal inference by\nintegrating continuous normalizing flows (CNFs) with parametric submodels,\nenhancing their geometric sensitivity and improving upon traditional Targeted\nMaximum Likelihood Estimation (TMLE). Our method employs CNFs to refine TMLE,\noptimizing the Cram\\'er-Rao bound and transitioning from a predefined\ndistribution $p_0$ to a data-driven distribution $p_1$. We innovate further by\nembedding Wasserstein gradient flows within Fokker-Planck equations, thus\nimposing geometric structures that boost the robustness of CNFs, particularly\nin optimal transport theory.\n Our approach addresses the disparity between sample and population\ndistributions, a critical factor in parameter estimation bias. We leverage\noptimal transport and Wasserstein gradient flows to develop causal inference\nmethodologies with minimal variance in finite-sample settings, outperforming\ntraditional methods like TMLE and AIPW. 
This novel framework, centered on\nWasserstein gradient flows, minimizes variance in efficient influence functions\nunder distribution $p_t$. Preliminary experiments showcase our method's\nsuperiority, yielding lower mean-squared errors compared to standard flows,\nthereby demonstrating the potential of geometry-aware normalizing Wasserstein\nflows in advancing statistical modeling and inference.\n","authors":["Kaiwen Hou"],"pdf_url":"https://arxiv.org/pdf/2311.18826v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00865v1","updated":"2024-02-01T18:59:22Z","published":"2024-02-01T18:59:22Z","title":"Towards Optimal Feature-Shaping Methods for Out-of-Distribution\n Detection","summary":" Feature shaping refers to a family of methods that exhibit state-of-the-art\nperformance for out-of-distribution (OOD) detection. These approaches\nmanipulate the feature representation, typically from the penultimate layer of\na pre-trained deep learning model, so as to better differentiate between\nin-distribution (ID) and OOD samples. However, existing feature-shaping methods\nusually employ rules manually designed for specific model architectures and OOD\ndatasets, which consequently limit their generalization ability. To address\nthis gap, we first formulate an abstract optimization framework for studying\nfeature-shaping methods. We then propose a concrete reduction of the framework\nwith a simple piecewise constant shaping function and show that existing\nfeature-shaping methods approximate the optimal solution to the concrete\noptimization problem. Further, assuming that OOD data is inaccessible, we\npropose a formulation that yields a closed-form solution for the piecewise\nconstant shaping function, utilizing solely the ID data. Through extensive\nexperiments, we show that the feature-shaping function optimized by our method\nimproves the generalization ability of OOD detection across a large variety of\ndatasets and model architectures.\n","authors":["Qinyu Zhao","Ming Xu","Kartik Gupta","Akshay Asthana","Liang Zheng","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2402.00865v1.pdf","comment":"ICLR 2024. Project page: https://github.com/Qinyu-Allen-Zhao/OptFSOOD"},{"id":"http://arxiv.org/abs/2312.01057v3","updated":"2024-02-01T18:57:20Z","published":"2023-12-02T08:04:29Z","title":"RLHF and IIA: Perverse Incentives","summary":" Existing algorithms for reinforcement learning from human feedback (RLHF) can\nincentivize responses at odds with preferences because they are based on models\nthat assume independence of irrelevant alternatives (IIA). The perverse\nincentives induced by IIA hinder innovations on query formats and learning\nalgorithms.\n","authors":["Wanqiao Xu","Shi Dong","Xiuyuan Lu","Grace Lam","Zheng Wen","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2312.01057v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00857v1","updated":"2024-02-01T18:54:34Z","published":"2024-02-01T18:54:34Z","title":"Early Time Classification with Accumulated Accuracy Gap Control","summary":" Early time classification algorithms aim to label a stream of features\nwithout processing the full input stream, while maintaining accuracy comparable\nto that achieved by applying the classifier to the entire input. In this paper,\nwe introduce a statistical framework that can be applied to any sequential\nclassifier, formulating a calibrated stopping rule. This data-driven rule\nattains finite-sample, distribution-free control of the accuracy gap between\nfull and early-time classification. 
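The "simple piecewise constant shaping function" analyzed in the feature-shaping entry above has a direct reading: scale each activation by a constant that depends on which interval its value falls in. The interval edges and constants below are hypothetical placeholders; the paper derives the optimal constants in closed form from ID data.

```python
import numpy as np

# Piecewise constant feature shaping: bucket each activation value into an
# interval and multiply it by that interval's scaling constant.
def shape_features(z: np.ndarray, edges, thetas) -> np.ndarray:
    idx = np.clip(np.digitize(z, edges), 0, len(thetas) - 1)
    return z * np.asarray(thetas)[idx]

z = np.array([0.05, 0.4, 1.2, 3.5])       # penultimate-layer activations
edges  = [0.1, 1.0, 2.0]                  # hypothetical interval boundaries
thetas = [0.0, 1.0, 1.0, 0.5]             # hypothetical scale per interval
print(shape_features(z, edges, thetas))   # [0.   0.4  1.2  1.75]
```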
We start by presenting a novel method that\nbuilds on the Learn-then-Test calibration framework to control this gap\nmarginally, on average over i.i.d. instances. As this algorithm tends to yield\nan excessively high accuracy gap for early halt times, our main contribution is\nthe proposal of a framework that controls a stronger notion of error, where the\naccuracy gap is controlled conditionally on the accumulated halt times.\nNumerical experiments demonstrate the effectiveness, applicability, and\nusefulness of our method. We show that our proposed early stopping mechanism\nreduces up to 94% of timesteps used for classification while achieving rigorous\naccuracy gap control.\n","authors":["Liran Ringel","Regev Cohen","Daniel Freedman","Michael Elad","Yaniv Romano"],"pdf_url":"https://arxiv.org/pdf/2402.00857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08670v3","updated":"2024-02-01T18:54:21Z","published":"2023-06-14T17:59:15Z","title":"The Power of Populations in Decentralized Bandits","summary":" We study a cooperative multi-agent bandit setting in the distributed GOSSIP\nmodel: in every round, each of $n$ agents chooses an action from a common set,\nobserves the action's corresponding reward, and subsequently exchanges\ninformation with a single randomly chosen neighbor, which informs its policy in\nthe next round. We introduce and analyze several families of\nfully-decentralized local algorithms in this setting under the constraint that\neach agent has only constant memory. We highlight a connection between the\nglobal evolution of such decentralized algorithms and a new class of \"zero-sum\"\nmultiplicative weights update methods, and we develop a general framework for\nanalyzing the population-level regret of these natural protocols. Using this\nframework, we derive sublinear regret bounds for both stationary and\nadversarial reward settings. Moreover, we show that these simple local\nalgorithms can approximately optimize convex functions over the simplex,\nassuming that the reward distributions are generated from a stochastic gradient\noracle.\n","authors":["John Lazarsfeld","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2306.08670v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00854v1","updated":"2024-02-01T18:50:50Z","published":"2024-02-01T18:50:50Z","title":"SymbolicAI: A framework for logic-based approaches combining generative\n models and solvers","summary":" We introduce SymbolicAI, a versatile and modular framework employing a\nlogic-based approach to concept learning and flow management in generative\nprocesses. SymbolicAI enables the seamless integration of generative models\nwith a diverse range of solvers by treating large language models (LLMs) as\nsemantic parsers that execute tasks based on both natural and formal language\ninstructions, thus bridging the gap between symbolic reasoning and generative\nAI. We leverage probabilistic programming principles to tackle complex tasks,\nand utilize differentiable and classical programming paradigms with their\nrespective strengths. The framework introduces a set of polymorphic,\ncompositional, and self-referential operations for data stream manipulation,\naligning LLM outputs with user objectives. As a result, we can transition\nbetween the capabilities of various foundation models endowed with zero- and\nfew-shot learning capabilities and specialized, fine-tuned models or solvers\nproficient in addressing specific problems. 
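The early time classification entry above turns on a calibrated stopping rule. A simplified confidence-threshold sketch follows; in the paper the threshold is selected with a Learn-then-Test-style procedure so the accuracy gap stays controlled, whereas here it is a fixed assumption.

```python
import numpy as np

# Halt at the first timestep where the sequential classifier's confidence
# clears the threshold `lam`; otherwise classify using the full stream.
def classify_early(prob_stream: np.ndarray, lam: float = 0.9):
    """prob_stream: (T, n_classes) per-timestep class probabilities."""
    for t, p in enumerate(prob_stream):
        if p.max() >= lam:                  # confident enough: halt now
            return int(p.argmax()), t + 1
    return int(prob_stream[-1].argmax()), len(prob_stream)

probs = np.array([[0.6, 0.4], [0.8, 0.2], [0.95, 0.05], [0.99, 0.01]])
print(classify_early(probs))  # (0, 3): class 0, using 3 of 4 timesteps
```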
In turn, the framework facilitates\nthe creation and evaluation of explainable computational graphs. We conclude by\nintroducing a quality measure and its empirical score for evaluating these\ncomputational graphs, and propose a benchmark that compares various\nstate-of-the-art LLMs across a set of complex workflows. We refer to the\nempirical score as the \"Vector Embedding for Relational Trajectory Evaluation\nthrough Cross-similarity\", or VERTEX score for short. The framework codebase\nand benchmark are linked below.\n","authors":["Marius-Constantin Dinu","Claudiu Leoveanu-Condrei","Markus Holzleitner","Werner Zellinger","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2402.00854v1.pdf","comment":"38 pages, 12 figures, external resources: framework is available at\n https://github.com/ExtensityAI/symbolicai and benchmark at\n https://github.com/ExtensityAI/benchmark"},{"id":"http://arxiv.org/abs/2402.00853v1","updated":"2024-02-01T18:50:42Z","published":"2024-02-01T18:50:42Z","title":"LTAU-FF: Loss Trajectory Analysis for Uncertainty in Atomistic Force\n Fields","summary":" Model ensembles are simple and effective tools for estimating the prediction\nuncertainty of deep learning atomistic force fields. Despite this, widespread\nadoption of ensemble-based uncertainty quantification (UQ) techniques is\nlimited by the high computational costs incurred by ensembles during both\ntraining and inference. In this work we leverage the cumulative distribution\nfunctions (CDFs) of per-sample errors obtained over the course of training to\nefficiently represent the model ensemble, and couple them with a distance-based\nsimilarity search in the model latent space. Using these tools, we develop a\nsimple UQ metric (which we call LTAU) that leverages the strengths of\nensemble-based techniques without requiring the evaluation of multiple models\nduring either training or inference. As an initial test, we apply our method\ntowards estimating the epistemic uncertainty in atomistic force fields\n(LTAU-FF) and demonstrate that it can be easily calibrated to accurately\npredict test errors on multiple datasets from the literature. We then\nillustrate the utility of LTAU-FF in two practical applications: 1) tuning the\ntraining-validation gap for an example dataset, and 2) predicting errors in\nrelaxation trajectories on the OC20 IS2RS task. Though in this work we focus on\nthe use of LTAU with deep learning atomistic force fields, we emphasize that it\ncan be readily applied to any regression task, or any ensemble-generation\ntechnique, to provide a reliable and easy-to-implement UQ metric.\n","authors":["Joshua A. Vita","Amit Samanta","Fei Zhou","Vincenzo Lordi"],"pdf_url":"https://arxiv.org/pdf/2402.00853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00851v1","updated":"2024-02-01T18:46:28Z","published":"2024-02-01T18:46:28Z","title":"Data Augmentation Scheme for Raman Spectra with Highly Correlated\n Annotations","summary":" In biotechnology Raman Spectroscopy is rapidly gaining popularity as a\nprocess analytical technology (PAT) that measures cell densities, substrate-\nand product concentrations. As it records vibrational modes of molecules it\nprovides that information non-invasively in a single spectrum. Typically,\npartial least squares (PLS) is the model of choice to infer information about\nvariables of interest from the spectra. However, biological processes are known\nfor their complexity where convolutional neural networks (CNN) present a\npowerful alternative. 
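A rough sketch of the LTAU idea from the entry above: per-sample errors logged over the course of training form an empirical error distribution for each training point, and a test point inherits the pooled error distribution of its nearest neighbors in the model's latent space. All names and shapes here are illustrative assumptions, not the paper's implementation.

```python
import numpy as np

# Estimate a test point's uncertainty from the pooled training-time errors of
# its k nearest neighbors in latent space.
def ltau_uncertainty(z_test, z_train, train_errors, k=3, q=0.9):
    """train_errors: (n_train, n_epochs) per-sample errors during training."""
    d = np.linalg.norm(z_train - z_test, axis=1)
    nn = np.argsort(d)[:k]                 # latent-space neighbors
    pooled = train_errors[nn].ravel()      # merge their error samples
    return float(np.quantile(pooled, q))   # e.g. 90th-percentile error

rng = np.random.default_rng(1)
z_train = rng.normal(size=(100, 8))
train_errors = np.abs(rng.normal(scale=0.1, size=(100, 20)))
print(ltau_uncertainty(rng.normal(size=8), z_train, train_errors))
```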
They can handle non-Gaussian noise and account for beam\nmisalignment, pixel malfunctions or the presence of additional substances.\nHowever, they require a lot of data during model training, and they pick up\nnon-linear dependencies in the process variables. In this work, we exploit the\nadditive nature of spectra in order to generate additional data points from a\ngiven dataset that have statistically independent labels so that a network\ntrained on such data exhibits low correlations between the model predictions.\nWe show that training a CNN on these generated data points improves the\nperformance on datasets where the annotations do not bear the same correlation\nas the dataset that was used for model training. This data augmentation\ntechnique enables us to reuse spectra as training data for new contexts that\nexhibit different correlations. The additional data allows for building a\nbetter and more robust model. This is of interest in scenarios where large\namounts of historical data are available but are currently not used for model\ntraining. We demonstrate the capabilities of the proposed method using\nsynthetic spectra of Ralstonia eutropha batch cultivations to monitor\nsubstrate, biomass and polyhydroxyalkanoate (PHA) biopolymer concentrations\nduring the experiments.\n","authors":["Christoph Lange","Isabel Thiele","Lara Santolin","Sebastian L. Riedel","Maxim Borisyak","Peter Neubauer","M. Nicolas Cruz Bournazou"],"pdf_url":"https://arxiv.org/pdf/2402.00851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00849v1","updated":"2024-02-01T18:40:03Z","published":"2024-02-01T18:40:03Z","title":"Score-based Causal Representation Learning: Linear and General\n Transformations","summary":" This paper addresses intervention-based causal representation learning (CRL)\nunder a general nonparametric latent causal model and an unknown transformation\nthat maps the latent variables to the observed variables. Linear and general\ntransformations are investigated. The paper addresses both the\n\\emph{identifiability} and \\emph{achievability} aspects. Identifiability refers\nto determining algorithm-agnostic conditions that ensure recovering the true\nlatent causal variables and the latent causal graph underlying them.\nAchievability refers to the algorithmic aspects and addresses designing\nalgorithms that achieve identifiability guarantees. By drawing novel\nconnections between \\emph{score functions} (i.e., the gradients of the\nlogarithm of density functions) and CRL, this paper designs a \\emph{score-based\nclass of algorithms} that ensures both identifiability and achievability.\nFirst, the paper focuses on \\emph{linear} transformations and shows that one\nstochastic hard intervention per node suffices to guarantee identifiability. It\nalso provides partial identifiability guarantees for soft interventions,\nincluding identifiability up to ancestors for general causal models and perfect\nlatent graph recovery for sufficiently non-linear causal models. Secondly, it\nfocuses on \\emph{general} transformations and shows that two stochastic hard\ninterventions per node suffice for identifiability.
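The augmentation scheme in the Raman entry above exploits the additive nature of spectra: summing two measured spectra yields a plausible new spectrum whose labels are the sums of the original labels, and drawing the pairs independently breaks the label correlations of the original dataset. A minimal sketch under those assumptions:

```python
import numpy as np

# Additive augmentation: new spectrum = x_i + x_j, new label = y_i + y_j,
# with i and j drawn independently to decorrelate the resulting labels.
def augment_additive(X: np.ndarray, Y: np.ndarray, n_new: int, seed=0):
    """X: (n, n_wavenumbers) spectra; Y: (n, n_analytes) concentrations."""
    rng = np.random.default_rng(seed)
    i = rng.integers(0, len(X), size=n_new)
    j = rng.integers(0, len(X), size=n_new)   # independent draws
    return X[i] + X[j], Y[i] + Y[j]

X = np.random.rand(50, 300)  # toy spectra
Y = np.random.rand(50, 3)    # toy concentrations (e.g. substrate, biomass, PHA)
X_aug, Y_aug = augment_additive(X, Y, n_new=200)
print(X_aug.shape, Y_aug.shape)  # (200, 300) (200, 3)
```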
Notably, one does\n\emph{not} need to know which pair of interventional environments have the same\nnode intervened.\n","authors":["Burak Varıcı","Emre Acartürk","Karthikeyan Shanmugam","Ali Tajer"],"pdf_url":"https://arxiv.org/pdf/2402.00849v1.pdf","comment":"Linear transformations: stronger results for hard and soft\n interventions than our previous paper Score-based Causal Representation\n Learning with Interventions (https://arxiv.org/abs/2301.08230). General\n transformations: results also appear in our paper General Identifiability and\n Achievability for Causal Representation Learning (arXiv:2310.15450) accepted\n to AISTATS 2024 (oral)"},{"id":"http://arxiv.org/abs/2402.00839v1","updated":"2024-02-01T18:29:16Z","published":"2024-02-01T18:29:16Z","title":"X-CBA: Explainability Aided CatBoosted Anomal-E for Intrusion Detection\n System","summary":" The effectiveness of Intrusion Detection Systems (IDS) is critical in an era\nwhere cyber threats are becoming increasingly complex. Machine learning (ML)\nand deep learning (DL) models provide an efficient and accurate solution for\nidentifying attacks and anomalies in computer networks. However, using ML and\nDL models in IDS has led to a trust deficit due to their non-transparent\ndecision-making. This transparency gap in IDS research is significant,\naffecting confidence and accountability. To address this gap, this paper introduces a\nnovel Explainable IDS approach, called X-CBA, that leverages the structural\nadvantages of Graph Neural Networks (GNNs) to effectively process network\ntraffic data, while also adapting a new Explainable AI (XAI) methodology.\nUnlike most GNN-based IDS that depend on labeled network traffic and node\nfeatures, thereby overlooking critical packet-level information, our approach\nleverages a broader range of traffic data through network flows, including edge\nattributes, to improve detection capabilities and adapt to novel threats.\nThrough empirical testing, we establish that our approach not only achieves\nhigh accuracy with 99.47% in threat detection but also advances the field by\nproviding clear, actionable explanations of its analytical outcomes. This\nresearch also aims to bridge the current gap and facilitate the broader\nintegration of ML/DL technologies in cybersecurity defenses by offering a local\nand global explainability solution that is both precise and interpretable.\n","authors":["Kiymet Kaya","Elif Ak","Sumeyye Bas","Berk Canberk","Sule Gunduz Oguducu"],"pdf_url":"https://arxiv.org/pdf/2402.00839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13366v2","updated":"2024-02-01T18:26:39Z","published":"2024-01-24T10:51:15Z","title":"Mitigating System Bias in Resource Constrained Asynchronous Federated\n Learning Systems","summary":" Federated learning (FL) systems face performance challenges in dealing with\nheterogeneous devices and non-identically distributed data across clients. We\npropose a dynamic global model aggregation method within Asynchronous Federated\nLearning (AFL) deployments to address these issues. Our aggregation method\nscores and adjusts the weighting of client model updates based on their upload\nfrequency to accommodate differences in device capabilities. Additionally, we\nimmediately provide an updated global model to clients after they upload\ntheir local models to reduce idle time and improve training efficiency. We\nevaluate our approach within an AFL deployment consisting of 10 simulated\nclients with heterogeneous compute constraints and non-IID data.
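A hypothetical sketch of the frequency-aware asynchronous aggregation described in the AFL entry above: the server blends each arriving client model into the global model with a weight that shrinks as that client's upload count grows, then immediately returns the fresh global model to the uploader. The specific weighting rule below is an assumption for illustration, not the paper's scoring function.

```python
import numpy as np

# Async FL server: downweight frequent uploaders so fast devices do not
# dominate, and hand the updated global model straight back to the client.
class AsyncServer:
    def __init__(self, global_model: np.ndarray, base_lr: float = 0.5):
        self.w = global_model
        self.base_lr = base_lr
        self.uploads = {}                      # client_id -> upload count

    def on_upload(self, client_id, client_model: np.ndarray) -> np.ndarray:
        n = self.uploads[client_id] = self.uploads.get(client_id, 0) + 1
        alpha = self.base_lr / np.sqrt(n)      # hypothetical decay with frequency
        self.w = (1 - alpha) * self.w + alpha * client_model
        return self.w.copy()                   # immediate fresh global model

server = AsyncServer(np.zeros(4))
print(server.on_upload("fast_phone", np.ones(4)))  # blended with weight 0.5
print(server.on_upload("fast_phone", np.ones(4)))  # blended with weight ~0.35
```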
The simulation\nresults, using the FashionMNIST dataset, demonstrate over 10% and 19%\nimprovement in global model accuracy compared to state-of-the-art methods\nPAPAYA and FedAsync, respectively. Our dynamic aggregation method allows\nreliable global model training despite limited client resources and\nstatistical data heterogeneity. This improves robustness and scalability for\nreal-world FL deployments.\n","authors":["Jikun Gao","Ioannis Mavromatis","Peizheng Li","Pietro Carnelli","Aftab Khan"],"pdf_url":"https://arxiv.org/pdf/2401.13366v2.pdf","comment":"6 pages, 5 figures. This work has been accepted by PerCom PerconAI\n workshop 2024"},{"id":"http://arxiv.org/abs/2401.16736v2","updated":"2024-02-01T18:24:09Z","published":"2024-01-30T04:29:48Z","title":"Engineering A Large Language Model From Scratch","summary":" The proliferation of deep learning in natural language processing (NLP) has\nled to the development and release of innovative technologies capable of\nunderstanding and generating human language with remarkable proficiency.\nAtinuke, a Transformer-based neural network, optimises performance across\nvarious language tasks by utilising a unique configuration. The architecture\ninterweaves layers for processing sequential data with attention mechanisms to\ndraw meaningful affinities between inputs and outputs. Due to the configuration\nof its topology and hyperparameter tuning, it can emulate human-like language\nby extracting features and learning complex mappings. Atinuke is modular,\nextensible, and integrates seamlessly with existing machine learning pipelines.\nAdvanced matrix operations like softmax, embeddings, and multi-head attention\nenable nuanced handling of textual, acoustic, and visual signals. By unifying\nmodern deep learning techniques with software design principles and\nmathematical theory, the system achieves state-of-the-art results on natural\nlanguage tasks whilst remaining interpretable and robust.\n","authors":["Abiodun Finbarrs Oketunji"],"pdf_url":"https://arxiv.org/pdf/2401.16736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00835v1","updated":"2024-02-01T18:22:32Z","published":"2024-02-01T18:22:32Z","title":"ALISON: Fast and Effective Stylometric Authorship Obfuscation","summary":" Authorship Attribution (AA) and Authorship Obfuscation (AO) are two competing\ntasks of increasing importance in privacy research. Modern AA leverages an\nauthor's consistent writing style to match a text to its author using an AA\nclassifier. AO is the corresponding adversarial task, aiming to modify a text\nin such a way that its semantics are preserved, yet an AA model cannot\ncorrectly infer its authorship. To address privacy concerns raised by\nstate-of-the-art (SOTA) AA methods, new AO methods have been proposed but\nremain largely impractical to use due to their prohibitively slow training and\nobfuscation speed, often taking hours. To address this challenge, we propose a\npractical AO method, ALISON, that (1) dramatically reduces training/obfuscation\ntime, demonstrating more than 10x faster obfuscation than SOTA AO methods, (2)\nachieves better obfuscation success through attacking three transformer-based\nAA methods on two benchmark datasets, typically performing 15% better than\ncompeting methods, (3) does not require direct signals from a target AA\nclassifier during obfuscation, and (4) utilizes unique stylometric features,\nallowing sound model interpretation for explainable obfuscation.
We also\ndemonstrate that ALISON can effectively prevent four SOTA AA methods from\naccurately determining the authorship of ChatGPT-generated texts, all while\nminimally changing the original text semantics. To ensure the reproducibility\nof our findings, our code and data are available at:\nhttps://github.com/EricX003/ALISON.\n","authors":["Eric Xing","Saranya Venkatraman","Thai Le","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2402.00835v1.pdf","comment":"10 pages, 6 figures, 4 tables. To be published in the Proceedings of\n the 38th Annual AAAI Conference on Artificial Intelligence (AAAI-24)"},{"id":"http://arxiv.org/abs/2402.00831v1","updated":"2024-02-01T18:17:37Z","published":"2024-02-01T18:17:37Z","title":"A YANG-aided Unified Strategy for Black Hole Detection for Backbone\n Networks","summary":" Despite the crucial importance of addressing Black Hole failures in Internet\nbackbone networks, effective detection strategies in backbone networks are\nlacking. This is largely because previous research has been centered on Mobile\nAd-hoc Networks (MANETs), which operate under entirely different dynamics,\nprotocols, and topologies, making their findings not directly transferable to\nbackbone networks. Furthermore, detecting Black Hole failures in backbone\nnetworks is particularly challenging. It requires a comprehensive range of\nnetwork data due to the wide variety of conditions that need to be considered,\nmaking data collection and analysis far from straightforward. Addressing this\ngap, our study introduces a novel approach for Black Hole detection in backbone\nnetworks using specialized Yet Another Next Generation (YANG) data models with\nBlack Hole-sensitive Metric Matrix (BHMM) analysis. This paper details our\nmethod of selecting and analyzing four YANG models relevant to Black Hole\ndetection in ISP networks, focusing on routing protocols and ISP-specific\nconfigurations. Our BHMM approach derived from these models demonstrates a 10%\nimprovement in detection accuracy and a 13% increase in packet delivery rate,\nhighlighting the efficiency of our approach. Additionally, we evaluate the\nMachine Learning approach leveraged with BHMM analysis in two different network\nsettings, a commercial ISP network, and a scientific research-only network\ntopology. This evaluation also demonstrates the practical applicability of our\nmethod, yielding significantly improved prediction outcomes in both\nenvironments.\n","authors":["Elif Ak","Kiymet Kaya","Eren Ozaltun","Sule Gunduz Oguducu","Berk Canberk"],"pdf_url":"https://arxiv.org/pdf/2402.00831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13744v2","updated":"2024-02-01T18:14:20Z","published":"2024-01-24T19:01:22Z","title":"Conformal Prediction Sets Improve Human Decision Making","summary":" In response to everyday queries, humans explicitly signal uncertainty and\noffer alternative answers when they are unsure. Machine learning models that\noutput calibrated prediction sets through conformal prediction mimic this human\nbehaviour; larger sets signal greater uncertainty while providing alternatives.\nIn this work, we study the usefulness of conformal prediction sets as an aid\nfor human decision making by conducting a pre-registered randomized controlled\ntrial with conformal prediction sets provided to human subjects. With\nstatistical significance, we find that when humans are given conformal\nprediction sets their accuracy on tasks improves compared to fixed-size\nprediction sets with the same coverage guarantee. 
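The prediction sets shown to subjects in the conformal prediction entry above can be produced with standard split conformal classification: calibrate a nonconformity-score quantile on held-out data, then include every label whose score falls below it, which yields roughly (1 - alpha) marginal coverage. A minimal sketch of that standard construction (not the study's experimental code):

```python
import numpy as np

# Split conformal classification: calibrate the (1 - alpha) quantile of
# nonconformity scores, then form prediction sets for test points.
def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    n = len(cal_labels)
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]   # nonconformity
    level = np.ceil((n + 1) * (1 - alpha)) / n           # finite-sample correction
    q = np.quantile(scores, level, method="higher")
    return [np.where(1.0 - p <= q)[0].tolist() for p in test_probs]

rng = np.random.default_rng(0)
cal_probs = rng.dirichlet(np.ones(3), size=200)   # toy calibration softmax outputs
cal_labels = rng.integers(0, 3, size=200)
test_probs = rng.dirichlet(np.ones(3), size=2)
print(conformal_sets(cal_probs, cal_labels, test_probs))  # larger set = more uncertainty
```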
The results show that\nquantifying model uncertainty with conformal prediction is helpful for\nhuman-in-the-loop decision making and human-AI teams.\n","authors":["Jesse C. Cresswell","Yi Sui","Bhargava Kumar","Noël Vouitsis"],"pdf_url":"https://arxiv.org/pdf/2401.13744v2.pdf","comment":"Code available at\n https://github.com/layer6ai-labs/hitl-conformal-prediction"},{"id":"http://arxiv.org/abs/2402.00825v1","updated":"2024-02-01T18:11:22Z","published":"2024-02-01T18:11:22Z","title":"Resolution invariant deep operator network for PDEs with complex\n geometries","summary":" Neural operators (NO) are discretization invariant deep learning methods with\nfunctional output and can approximate any continuous operator. NO have\ndemonstrated the superiority of solving partial differential equations (PDEs)\nover other deep learning methods. However, the spatial domain of its input\nfunction needs to be identical to its output, which limits its applicability.\nFor instance, the widely used Fourier neural operator (FNO) fails to\napproximate the operator that maps the boundary condition to the PDE solution.\nTo address this issue, we propose a novel framework called resolution-invariant\ndeep operator (RDO) that decouples the spatial domain of the input and output.\nRDO is motivated by the Deep operator network (DeepONet) and it does not\nrequire retraining the network when the input/output is changed compared with\nDeepONet. RDO takes functional input and its output is also functional so that\nit keeps the resolution invariant property of NO. It can also resolve PDEs with\ncomplex geometries whereas NO fail. Various numerical experiments demonstrate\nthe advantage of our method over DeepONet and FNO.\n","authors":["Jianguo Huang","Yue Qiu"],"pdf_url":"https://arxiv.org/pdf/2402.00825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00823v1","updated":"2024-02-01T18:07:33Z","published":"2024-02-01T18:07:33Z","title":"SLIM: Skill Learning with Multiple Critics","summary":" Self-supervised skill learning aims to acquire useful behaviors that leverage\nthe underlying dynamics of the environment. Latent variable models, based on\nmutual information maximization, have been particularly successful in this task\nbut still struggle in the context of robotic manipulation. As it requires\nimpacting a possibly large set of degrees of freedom composing the environment,\nmutual information maximization fails alone in producing useful manipulation\nbehaviors. To address this limitation, we introduce SLIM, a multi-critic\nlearning approach for skill discovery with a particular focus on robotic\nmanipulation. Our main insight is that utilizing multiple critics in an\nactor-critic framework to gracefully combine multiple reward functions leads to\na significant improvement in latent-variable skill discovery for robotic\nmanipulation while overcoming possible interference occurring among rewards\nwhich hinders convergence to useful skills. 
Furthermore, in the context of\ntabletop manipulation, we demonstrate the applicability of our novel skill\ndiscovery approach to acquire safe and efficient motor primitives in a\nhierarchical reinforcement learning fashion and leverage them through planning,\nsurpassing the state-of-the-art approaches for skill discovery by a large\nmargin.\n","authors":["David Emukpere","Bingbing Wu","Julien Perez"],"pdf_url":"https://arxiv.org/pdf/2402.00823v1.pdf","comment":"IEEE ICRA 2024"},{"id":"http://arxiv.org/abs/2310.00113v3","updated":"2024-02-01T18:01:02Z","published":"2023-09-29T20:01:11Z","title":"HyperMask: Adaptive Hypernetwork-based Masks for Continual Learning","summary":" Artificial neural networks suffer from catastrophic forgetting when they are\nsequentially trained on multiple tasks. Many continual learning (CL) strategies\nare trying to overcome this problem. One of the most effective is the\nhypernetwork-based approach. The hypernetwork generates the weights of a target\nmodel based on the task's identity. The model's main limitation is that, in\npractice, the hypernetwork can produce completely different architectures for\nsubsequent tasks. To solve such a problem, we use the lottery ticket\nhypothesis, which postulates the existence of sparse subnetworks, named winning\ntickets, that preserve the performance of a whole network. In the paper, we\npropose a method called HyperMask, which trains a single network for all CL\ntasks. The hypernetwork produces semi-binary masks to obtain target subnetworks\ndedicated to consecutive tasks. Moreover, due to the lottery ticket hypothesis,\nwe can use a single network with weighted subnets. Depending on the task, the\nimportance of some weights may be dynamically enhanced while others may be\nweakened. HyperMask achieves competitive results in several CL datasets and, in\nsome scenarios, goes beyond the state-of-the-art scores, both with derived and\nunknown task identities.\n","authors":["Kamil Książek","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2310.00113v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00816v1","updated":"2024-02-01T17:55:08Z","published":"2024-02-01T17:55:08Z","title":"Leveraging Approximate Model-based Shielding for Probabilistic Safety\n Guarantees in Continuous Environments","summary":" Shielding is a popular technique for achieving safe reinforcement learning\n(RL). However, classical shielding approaches come with quite restrictive\nassumptions making them difficult to deploy in complex environments,\nparticularly those with continuous state or action spaces. In this paper we\nextend the more versatile approximate model-based shielding (AMBS) framework to\nthe continuous setting. In particular we use Safety Gym as our test-bed,\nallowing for a more direct comparison of AMBS with popular constrained RL\nalgorithms. We also provide strong probabilistic safety guarantees for the\ncontinuous setting. In addition, we propose two novel penalty techniques that\ndirectly modify the policy gradient, which empirically provide more stable\nconvergence in our experiments.\n","authors":["Alexander W. 
Goodall","Francesco Belardinelli"],"pdf_url":"https://arxiv.org/pdf/2402.00816v1.pdf","comment":"Accepted as an Extended Abstract at AAMAS 2024"},{"id":"http://arxiv.org/abs/2402.00811v1","updated":"2024-02-01T17:46:19Z","published":"2024-02-01T17:46:19Z","title":"An Analysis of the Variance of Diffusion-based Speech Enhancement","summary":" Diffusion models proved to be powerful models for generative speech\nenhancement. In recent SGMSE+ approaches, training involves a stochastic\ndifferential equation for the diffusion process, adding both Gaussian and\nenvironmental noise to the clean speech signal gradually. The speech\nenhancement performance varies depending on the choice of the stochastic\ndifferential equation that controls the evolution of the mean and the variance\nalong the diffusion processes when adding environmental and Gaussian noise. In\nthis work, we highlight that the scale of the variance is a dominant parameter\nfor speech enhancement performance and show that it controls the tradeoff\nbetween noise attenuation and speech distortions. More concretely, we show that\na larger variance increases the noise attenuation and allows for reducing the\ncomputational footprint, as fewer function evaluations for generating the\nestimate are required.\n","authors":["Bunlong Lay","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2402.00811v1.pdf","comment":"5 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2402.00809v1","updated":"2024-02-01T17:45:26Z","published":"2024-02-01T17:45:26Z","title":"Position Paper: Bayesian Deep Learning in the Age of Large-Scale AI","summary":" In the current landscape of deep learning research, there is a predominant\nemphasis on achieving high predictive accuracy in supervised tasks involving\nlarge image and language datasets. However, a broader perspective reveals a\nmultitude of overlooked metrics, tasks, and data types, such as uncertainty,\nactive and continual learning, and scientific data, that demand attention.\nBayesian deep learning (BDL) constitutes a promising avenue, offering\nadvantages across these diverse settings. This paper posits that BDL can\nelevate the capabilities of deep learning. It revisits the strengths of BDL,\nacknowledges existing challenges, and highlights some exciting research avenues\naimed at addressing these obstacles. Looking ahead, the discussion focuses on\npossible ways to combine large-scale foundation models with BDL to unlock their\nfull potential.\n","authors":["Theodore Papamarkou","Maria Skoularidou","Konstantina Palla","Laurence Aitchison","Julyan Arbel","David Dunson","Maurizio Filippone","Vincent Fortuin","Philipp Hennig","Aliaksandr Hubin","Alexander Immer","Theofanis Karaletsos","Mohammad Emtiyaz Khan","Agustinus Kristiadi","Yingzhen Li","Jose Miguel Hernandez Lobato","Stephan Mandt","Christopher Nemeth","Michael A. Osborne","Tim G. J. Rudner","David Rügamer","Yee Whye Teh","Max Welling","Andrew Gordon Wilson","Ruqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00807v1","updated":"2024-02-01T17:44:11Z","published":"2024-02-01T17:44:11Z","title":"Distilling Conditional Diffusion Models for Offline Reinforcement\n Learning through Trajectory Stitching","summary":" Deep generative models have recently emerged as an effective approach to\noffline reinforcement learning. However, their large model size poses\nchallenges in computation. We address this issue by proposing a knowledge\ndistillation method based on data augmentation. 
In particular, high-return\ntrajectories are generated from a conditional diffusion model, and they are\nblended with the original trajectories through a novel stitching algorithm that\nleverages a new reward generator. Applying the resulting dataset to behavioral\ncloning, the learned shallow policy whose size is much smaller outperforms or\nnearly matches deep generative planners on several D4RL benchmarks.\n","authors":["Shangzhe Li","Xinhua Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00803v1","updated":"2024-02-01T17:40:10Z","published":"2024-02-01T17:40:10Z","title":"Signal Quality Auditing for Time-series Data","summary":" Signal quality assessment (SQA) is required for monitoring the reliability of\ndata acquisition systems, especially in AI-driven Predictive Maintenance (PMx)\napplication contexts. SQA is vital for addressing \"silent failures\" of data\nacquisition hardware and software, which when unnoticed, misinform the users of\ndata, creating the risk for incorrect decisions with unintended or even\ncatastrophic consequences. We have developed an open-source software\nimplementation of signal quality indices (SQIs) for the analysis of time-series\ndata. We codify a range of SQIs, demonstrate them using established benchmark\ndata, and show that they can be effective for signal quality assessment. We\nalso study alternative approaches to denoising time-series data in an attempt\nto improve the quality of the already degraded signal, and evaluate them\nempirically on relevant real-world data. To our knowledge, our software toolkit\nis the first to provide an open source implementation of a broad range of\nsignal quality assessment and improvement techniques validated on publicly\navailable benchmark data for ease of reproducibility. The generality of our\nframework can be easily extended to assessing reliability of arbitrary\ntime-series measurements in complex systems, especially when morphological\npatterns of the waveform shapes and signal periodicity are of key interest in\ndownstream analyses.\n","authors":["Chufan Gao","Nicholas Gisolfi","Artur Dubrawski"],"pdf_url":"https://arxiv.org/pdf/2402.00803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12264v2","updated":"2024-02-01T17:35:09Z","published":"2023-08-23T17:32:06Z","title":"Enhancing Energy-Awareness in Deep Learning through Fine-Grained Energy\n Measurement","summary":" With the increasing usage, scale, and complexity of Deep Learning (DL)\nmodels, their rapidly growing energy consumption has become a critical concern.\nPromoting green development and energy awareness at different granularities is\nthe need of the hour to limit carbon emissions of DL systems. However, the lack\nof standard and repeatable tools to accurately measure and optimize energy\nconsumption at a fine granularity (e.g., at method level) hinders progress in\nthis area. This paper introduces FECoM (Fine-grained Energy Consumption Meter),\na framework for fine-grained DL energy consumption measurement. FECoM enables\nresearchers and developers to profile DL APIs from energy perspective. FECoM\naddresses the challenges of measuring energy consumption at fine-grained level\nby using static instrumentation and considering various factors, including\ncomputational load and temperature stability. We assess FECoM's capability to\nmeasure fine-grained energy consumption for one of the most popular open-source\nDL frameworks, namely TensorFlow. 
Using FECoM, we also investigate the impact\nof parameter size and execution time on energy consumption, enriching our\nunderstanding of TensorFlow APIs' energy profiles. Furthermore, we elaborate on\nthe considerations, issues, and challenges that one needs to consider while\ndesigning and implementing a fine-grained energy consumption measurement tool.\nThis work will facilitate further advances in DL energy measurement and the\ndevelopment of energy-aware practices for DL systems.\n","authors":["Saurabhsingh Rajput","Tim Widmayer","Ziyuan Shang","Maria Kechagia","Federica Sarro","Tushar Sharma"],"pdf_url":"https://arxiv.org/pdf/2308.12264v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14163v2","updated":"2024-02-01T17:34:22Z","published":"2023-05-23T15:27:35Z","title":"Leveraging Open Information Extraction for More Robust Domain Transfer\n of Event Trigger Detection","summary":" Event detection is a crucial information extraction task in many domains,\nsuch as Wikipedia or news. The task typically relies on trigger detection (TD)\n-- identifying token spans in the text that evoke specific events. While the\nnotion of triggers should ideally be universal across domains, domain transfer\nfor TD from high- to low-resource domains results in significant performance\ndrops. We address the problem of negative transfer in TD by coupling triggers\nbetween domains using subject-object relations obtained from a rule-based open\ninformation extraction (OIE) system. We demonstrate that OIE relations injected\nthrough multi-task training can act as mediators between triggers in different\ndomains, enhancing zero- and few-shot TD domain transfer and reducing\nperformance drops, in particular when transferring from a high-resource source\ndomain (Wikipedia) to a low(er)-resource target domain (news). Additionally, we\ncombine this improved transfer with masked language modeling on the target\ndomain, observing further TD transfer gains. Finally, we demonstrate that the\ngains are robust to the choice of the OIE system.\n","authors":["David Dukić","Kiril Gashteovski","Goran Glavaš","Jan Šnajder"],"pdf_url":"https://arxiv.org/pdf/2305.14163v2.pdf","comment":"Accepted at EACL 2024 Findings"},{"id":"http://arxiv.org/abs/2402.00798v1","updated":"2024-02-01T17:30:50Z","published":"2024-02-01T17:30:50Z","title":"Formal-LLM: Integrating Formal Language and Natural Language for\n Controllable LLM-based Agents","summary":" Recent advancements on Large Language Models (LLMs) enable AI Agents to\nautomatically generate and execute multi-step plans to solve complex tasks.\nHowever, since LLM's content generation process is hardly controllable, current\nLLM-based agents frequently generate invalid or non-executable plans, which\njeopardizes the performance of the generated plans and corrupts users' trust in\nLLM-based agents. In response, this paper proposes a novel ``Formal-LLM''\nframework for LLM-based agents by integrating the expressiveness of natural\nlanguage and the precision of formal language. Specifically, the framework\nallows human users to express their requirements or constraints for the\nplanning process as an automaton. A stack-based LLM plan generation process is\nthen conducted under the supervision of the automaton to ensure that the\ngenerated plan satisfies the constraints, making the planning process\ncontrollable. 
We conduct experiments on both benchmark tasks and practical\nreal-life tasks, and our framework achieves over 50% overall performance\nincrease, which validates the feasibility and effectiveness of employing\nFormal-LLM to guide the plan generation of agents, preventing the agents from\ngenerating invalid and unsuccessful plans. Further, more controllable LLM-based\nagents can facilitate the broader utilization of LLM in application scenarios\nwhere high validity of planning is essential. The work is open-sourced at\nhttps://github.com/agiresearch/Formal-LLM.\n","authors":["Zelong Li","Wenyue Hua","Hao Wang","He Zhu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00798v1.pdf","comment":"21 pages, 6 figures; working in process, suggestions are welcome"},{"id":"http://arxiv.org/abs/2402.00795v1","updated":"2024-02-01T17:28:10Z","published":"2024-02-01T17:28:10Z","title":"LLMs learn governing principles of dynamical systems, revealing an\n in-context neural scaling law","summary":" Pretrained large language models (LLMs) are surprisingly effective at\nperforming zero-shot tasks, including time-series forecasting. However,\nunderstanding the mechanisms behind such capabilities remains highly\nchallenging due to the complexity of the models. In this paper, we study LLMs'\nability to extrapolate the behavior of dynamical systems whose evolution is\ngoverned by principles of physical interest. Our results show that LLaMA 2, a\nlanguage model trained primarily on texts, achieves accurate predictions of\ndynamical system time series without fine-tuning or prompt engineering.\nMoreover, the accuracy of the learned physical rules increases with the length\nof the input context window, revealing an in-context version of neural scaling\nlaw. Along the way, we present a flexible and efficient algorithm for\nextracting probability density functions of multi-digit numbers directly from\nLLMs.\n","authors":["Toni J. B. Liu","Nicolas Boullé","Raphaël Sarfati","Christopher J. Earls"],"pdf_url":"https://arxiv.org/pdf/2402.00795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00794v1","updated":"2024-02-01T17:25:51Z","published":"2024-02-01T17:25:51Z","title":"ReAGent: Towards A Model-agnostic Feature Attribution Method for\n Generative Language Models","summary":" Feature attribution methods (FAs), such as gradients and attention, are\nwidely employed approaches to derive the importance of all input features to\nthe model predictions. Existing work in natural language processing has mostly\nfocused on developing and testing FAs for encoder-only language models (LMs) in\nclassification tasks. However, it is unknown if it is faithful to use these FAs\nfor decoder-only models on text generation, due to the inherent differences\nbetween model architectures and task settings respectively. Moreover, previous\nwork has demonstrated that there is no `one-wins-all' FA across models and\ntasks. This makes the selection of a FA computationally expensive for large LMs\nsince input importance derivation often requires multiple forward and backward\npasses including gradient computations that might be prohibitive even with\naccess to large compute. To address these issues, we present a model-agnostic\nFA for generative LMs called Recursive Attribution Generator (ReAGent). Our\nmethod updates the token importance distribution in a recursive manner. 
For\neach update, we compute the difference in the probability distribution over the\nvocabulary for predicting the next token between using the original input and\nusing a modified version where a part of the input is replaced with RoBERTa\npredictions. Our intuition is that replacing an important token in the context\nshould have resulted in a larger change in the model's confidence in predicting\nthe token than replacing an unimportant token. Our method can be universally\napplied to any generative LM without accessing internal model weights or\nadditional training and fine-tuning, as most other FAs require. We extensively\ncompare the faithfulness of ReAGent with seven popular FAs across six\ndecoder-only LMs of various sizes. The results show that our method\nconsistently provides more faithful token importance distributions.\n","authors":["Zhixue Zhao","Boxuan Shan"],"pdf_url":"https://arxiv.org/pdf/2402.00794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00793v1","updated":"2024-02-01T17:23:54Z","published":"2024-02-01T17:23:54Z","title":"Distinguishing the Indistinguishable: Human Expertise in Algorithmic\n Prediction","summary":" We introduce a novel framework for incorporating human expertise into\nalgorithmic predictions. Our approach focuses on the use of human judgment to\ndistinguish inputs which `look the same' to any feasible predictive algorithm.\nWe argue that this framing clarifies the problem of human/AI collaboration in\nprediction tasks, as experts often have access to information -- particularly\nsubjective information -- which is not encoded in the algorithm's training\ndata. We use this insight to develop a set of principled algorithms for\nselectively incorporating human feedback only when it improves the performance\nof any feasible predictor. We find empirically that although algorithms often\noutperform their human counterparts on average, human judgment can\nsignificantly improve algorithmic predictions on specific instances (which can\nbe identified ex-ante). In an X-ray classification task, we find that this\nsubset constitutes nearly 30% of the patient population. Our approach provides\na natural way of uncovering this heterogeneity and thus enabling effective\nhuman-AI collaboration.\n","authors":["Rohan Alur","Manish Raghavan","Devavrat Shah"],"pdf_url":"https://arxiv.org/pdf/2402.00793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00789v1","updated":"2024-02-01T17:21:53Z","published":"2024-02-01T17:21:53Z","title":"Graph-Mamba: Towards Long-Range Graph Sequence Modeling with Selective\n State Spaces","summary":" Attention mechanisms have been widely used to capture long-range dependencies\namong nodes in Graph Transformers. Bottlenecked by the quadratic computational\ncost, attention mechanisms fail to scale in large graphs. Recent improvements\nin computational efficiency are mainly achieved by attention sparsification\nwith random or heuristic-based graph subsampling, which falls short in\ndata-dependent context reasoning. State space models (SSMs), such as Mamba,\nhave gained prominence for their effectiveness and efficiency in modeling\nlong-range dependencies in sequential data. However, adapting SSMs to\nnon-sequential graph data presents a notable challenge. In this work, we\nintroduce Graph-Mamba, the first attempt to enhance long-range context modeling\nin graph networks by integrating a Mamba block with the input-dependent node\nselection mechanism. 
Specifically, we formulate graph-centric node\nprioritization and permutation strategies to enhance context-aware reasoning,\nleading to a substantial improvement in predictive performance. Extensive\nexperiments on ten benchmark datasets demonstrate that Graph-Mamba outperforms\nstate-of-the-art methods in long-range graph prediction tasks, with a fraction\nof the computational cost in both FLOPs and GPU memory consumption. The code\nand models are publicly available at https://github.com/bowang-lab/Graph-Mamba.\n","authors":["Chloe Wang","Oleksii Tsepa","Jun Ma","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2402.00789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00787v1","updated":"2024-02-01T17:21:45Z","published":"2024-02-01T17:21:45Z","title":"Learning and Calibrating Heterogeneous Bounded Rational Market Behaviour\n with Multi-Agent Reinforcement Learning","summary":" Agent-based models (ABMs) have shown promise for modelling various real world\nphenomena incompatible with traditional equilibrium analysis. However, a\ncritical concern is the manual definition of behavioural rules in ABMs. Recent\ndevelopments in multi-agent reinforcement learning (MARL) offer a way to\naddress this issue from an optimisation perspective, where agents strive to\nmaximise their utility, eliminating the need for manual rule specification.\nThis learning-focused approach aligns with established economic and financial\nmodels through the use of rational utility-maximising agents. However, this\nrepresentation departs from the fundamental motivation for ABMs: that realistic\ndynamics emerging from bounded rationality and agent heterogeneity can be\nmodelled. To resolve this apparent disparity between the two approaches, we\npropose a novel technique for representing heterogeneous processing-constrained\nagents within a MARL framework. The proposed approach treats agents as\nconstrained optimisers with varying degrees of strategic skills, permitting\ndeparture from strict utility maximisation. Behaviour is learnt through\nrepeated simulations with policy gradients to adjust action likelihoods. To\nallow efficient computation, we use parameterised shared policy learning with\ndistributions of agent skill levels. Shared policy learning avoids the need for\nagents to learn individual policies yet still enables a spectrum of bounded\nrational behaviours. We validate our model's effectiveness using real-world\ndata on a range of canonical $n$-agent settings, demonstrating significantly\nimproved predictive capability.\n","authors":["Benjamin Patrick Evans","Sumitra Ganesh"],"pdf_url":"https://arxiv.org/pdf/2402.00787v1.pdf","comment":"Accepted as a full paper at AAMAS 2024"},{"id":"http://arxiv.org/abs/2402.00786v1","updated":"2024-02-01T17:17:55Z","published":"2024-02-01T17:17:55Z","title":"CroissantLLM: A Truly Bilingual French-English Language Model","summary":" We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T\nEnglish and French tokens, to bring to the research and industrial community a\nhigh-performance, fully open-sourced bilingual model that runs swiftly on\nconsumer-grade local hardware. To that end, we pioneer the approach of training\nan intrinsically bilingual model with a 1:1 English-to-French pretraining data\nratio, a custom tokenizer, and bilingual finetuning datasets. We release the\ntraining dataset, notably containing a French split with manually curated,\nhigh-quality, and varied data sources. 
To assess performance outside of\nEnglish, we craft a novel benchmark, FrenchBench, consisting of an array of\nclassification and generation tasks, covering various orthogonal aspects of\nmodel performance in the French Language. Additionally, rooted in transparency\nand to foster further Large Language Model research, we release codebases, and\ndozens of checkpoints across various model sizes, training data distributions,\nand training steps, as well as fine-tuned Chat models, and strong translation\nmodels. We evaluate our model through the FMTI framework, and validate 81 % of\nthe transparency criteria, far beyond the scores of even most open initiatives.\nThis work enriches the NLP landscape, breaking away from previous\nEnglish-centric work in order to strengthen our understanding of\nmultilinguality in language models.\n","authors":["Manuel Faysse","Patrick Fernandes","Nuno Guerreiro","António Loison","Duarte Alves","Caio Corro","Nicolas Boizard","João Alves","Ricardo Rei","Pedro Martins","Antoni Bigata Casademunt","François Yvon","André Martins","Gautier Viaud","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2402.00786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00782v1","updated":"2024-02-01T17:10:35Z","published":"2024-02-01T17:10:35Z","title":"Dense Reward for Free in Reinforcement Learning from Human Feedback","summary":" Reinforcement Learning from Human Feedback (RLHF) has been credited as the\nkey advance that has allowed Large Language Models (LLMs) to effectively follow\ninstructions and produce useful assistance. Classically, this involves\ngenerating completions from the LLM in response to a query before using a\nseparate reward model to assign a score to the full completion. As an\nauto-regressive process, the LLM has to take many \"actions\" (selecting\nindividual tokens) and only receives a single, sparse reward at the end of an\nepisode, a setup that is known to be difficult to optimise in traditional\nreinforcement learning. In this work we leverage the fact that the reward model\ncontains more information than just its scalar output, in particular, it\ncalculates an attention map over tokens as part of the transformer\narchitecture. We use these attention weights to redistribute the reward along\nthe whole completion, effectively densifying the signal and highlighting the\nmost important tokens, all without incurring extra computational cost or\nrequiring any additional modelling. We demonstrate that, theoretically, this\napproach is equivalent to potential-based reward shaping, ensuring that the\noptimal policy remains unchanged. Empirically, we show that it stabilises\ntraining, accelerates the rate of learning, and, in practical cases, may lead\nto better local optima.\n","authors":["Alex J. Chan","Hao Sun","Samuel Holt","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2402.00782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00776v1","updated":"2024-02-01T17:05:37Z","published":"2024-02-01T17:05:37Z","title":"Hybrid Quantum Vision Transformers for Event Classification in High\n Energy Physics","summary":" Models based on vision transformer architectures are considered\nstate-of-the-art when it comes to image classification tasks. However, they\nrequire extensive computational resources both for training and deployment. 
The\nproblem is exacerbated as the amount and complexity of the data increases.\nQuantum-based vision transformer models could potentially alleviate this issue\nby reducing the training and operating time while maintaining the same\npredictive power. Although current quantum computers are not yet able to\nperform high-dimensional tasks, they do offer one of the most efficient\nsolutions for the future. In this work, we construct several variations of a\nquantum hybrid vision transformer for a classification problem in high energy\nphysics (distinguishing photons and electrons in the electromagnetic\ncalorimeter). We test them against classical vision transformer architectures.\nOur findings indicate that the hybrid models can achieve comparable performance\nto their classical analogues with a similar number of parameters.\n","authors":["Eyup B. Unlu","Marçal Comajoan Cara","Gopal Ramesh Dahale","Zhongtian Dong","Roy T. Forestano","Sergei Gleyzer","Daniel Justice","Kyoungchul Kong","Tom Magorsch","Konstantin T. Matchev","Katia Matcheva"],"pdf_url":"https://arxiv.org/pdf/2402.00776v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.00774v1","updated":"2024-02-01T17:04:04Z","published":"2024-02-01T17:04:04Z","title":"Mesh motion in fluid-structure interaction with deep operator networks","summary":" A mesh motion model based on deep operator networks is presented. The model\nis trained on and evaluated against a biharmonic mesh motion model on a\nfluid-structure interaction benchmark problem and further evaluated in a\nsetting where biharmonic mesh motion fails. The performance of the proposed\nmesh motion model is comparable to the biharmonic mesh motion on the test\nproblems.\n","authors":["Ottar Hellan"],"pdf_url":"https://arxiv.org/pdf/2402.00774v1.pdf","comment":"9 pages, 5 figures, submitted to proceedings of ENUMATH 2023"},{"id":"http://arxiv.org/abs/2402.00769v1","updated":"2024-02-01T16:58:11Z","published":"2024-02-01T16:58:11Z","title":"AnimateLCM: Accelerating the Animation of Personalized Diffusion Models\n and Adapters with Decoupled Consistency Learning","summary":" Video diffusion models have been gaining increasing attention for their\nability to produce videos that are both coherent and of high fidelity. However,\nthe iterative denoising process makes them computationally intensive and\ntime-consuming, thus limiting their applications. Inspired by the Consistency\nModel (CM) that distills pretrained image diffusion models to accelerate the\nsampling with minimal steps and its successful extension Latent Consistency\nModel (LCM) on conditional image generation, we propose AnimateLCM, allowing\nfor high-fidelity video generation within minimal steps. Instead of directly\nconducting consistency learning on the raw video dataset, we propose a\ndecoupled consistency learning strategy that decouples the distillation of\nimage generation priors and motion generation priors, which improves the\ntraining efficiency and enhances the visual quality of generation.\nAdditionally, to enable the combination of plug-and-play adapters from the\nstable diffusion community to achieve various functions (e.g., ControlNet for\ncontrollable generation), we propose an efficient strategy to adapt existing\nadapters to our distilled text-conditioned video consistency model or train\nadapters from scratch without harming the sampling speed. We validate the\nproposed strategy in image-conditioned video generation and layout-conditioned\nvideo generation, all achieving top-performing results. 
Experimental results validate the\neffectiveness of our proposed method. Code and weights will be made public.\nMore details are available at https://github.com/G-U-N/AnimateLCM.\n","authors":["Fu-Yun Wang","Zhaoyang Huang","Xiaoyu Shi","Weikang Bian","Guanglu Song","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2402.00769v1.pdf","comment":"Project Page: https://animatelcm.github.io/"},{"id":"http://arxiv.org/abs/2112.00365v2","updated":"2024-02-01T16:58:01Z","published":"2021-12-01T09:26:01Z","title":"Probability-Generating Function Kernels for Spherical Data","summary":" Probability-generating function (PGF) kernels are introduced, which\nconstitute a class of kernels supported on the unit hypersphere, for the\npurposes of spherical data analysis. PGF kernels generalize RBF kernels in the\ncontext of spherical data. The properties of PGF kernels are studied. A\nsemi-parametric learning algorithm is introduced to enable the use of PGF\nkernels with spherical data.\n","authors":["Theodore Papamarkou","Alexey Lindo"],"pdf_url":"https://arxiv.org/pdf/2112.00365v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05866v3","updated":"2024-02-01T16:52:14Z","published":"2023-10-09T17:03:08Z","title":"Generative quantum machine learning via denoising diffusion\n probabilistic models","summary":" Deep generative models are a key enabling technology for computer vision, text\ngeneration, and large language models. Denoising diffusion probabilistic models\n(DDPMs) have recently gained much attention due to their ability to generate\ndiverse and high-quality samples in many computer vision tasks, as well as to\nincorporate flexible model architectures and a relatively simple training scheme.\nQuantum generative models, empowered by entanglement and superposition, have\nbrought new insight into learning classical and quantum data. Inspired by the\nclassical counterpart, we propose the \\emph{quantum denoising diffusion\nprobabilistic model} (QuDDPM) to enable efficiently trainable generative\nlearning of quantum data. QuDDPM adopts sufficient layers of circuits to\nguarantee expressivity, while introducing multiple intermediate training tasks\nas interpolations between the target distribution and noise to avoid barren\nplateaus and guarantee efficient training. We provide bounds on the learning\nerror and demonstrate QuDDPM's capability in learning correlated quantum noise\nmodels, quantum many-body phases, and the topological structure of quantum data.\nThe results provide a paradigm for versatile and efficient quantum generative\nlearning.\n","authors":["Bingzhi Zhang","Peng Xu","Xiaohui Chen","Quntao Zhuang"],"pdf_url":"https://arxiv.org/pdf/2310.05866v3.pdf","comment":"5+10 pages, 16 figures. PRL accepted version. Code available at:\n https://github.com/francis-hsu/quantgenmdl"},{"id":"http://arxiv.org/abs/2402.00761v1","updated":"2024-02-01T16:51:11Z","published":"2024-02-01T16:51:11Z","title":"Control-Theoretic Techniques for Online Adaptation of Deep Neural\n Networks in Dynamical Systems","summary":" Deep neural networks (DNNs), trained with gradient-based optimization and\nbackpropagation, are currently the primary tool in modern artificial\nintelligence, machine learning, and data science. In many applications, DNNs\nare trained offline, through supervised learning or reinforcement learning, and\ndeployed online for inference. 
However, training DNNs with standard\nbackpropagation and gradient-based optimization gives no intrinsic performance\nguarantees or bounds on the DNN, which is essential for applications such as\ncontrols. Additionally, many offline-training and online-inference problems,\nsuch as sim2real transfer of reinforcement learning policies, experience domain\nshift from the training distribution to the real-world distribution. To address\nthese stability and transfer learning issues, we propose using techniques from\ncontrol theory to update DNN parameters online. We formulate the\nfully-connected feedforward DNN as a continuous-time dynamical system, and we\npropose novel last-layer update laws that guarantee desirable error convergence\nunder various conditions on the time derivative of the DNN input vector. We\nfurther show that training the DNN under spectral normalization controls the\nupper bound of the error trajectories of the online DNN predictions, which is\ndesirable when numerically differentiated quantities or noisy state\nmeasurements are input to the DNN. The proposed online DNN adaptation laws are\nvalidated in simulation to learn the dynamics of the Van der Pol system under\ndomain shift, where parameters are varied in inference from the training\ndataset. The simulations demonstrate the effectiveness of using\ncontrol-theoretic techniques to derive performance improvements and guarantees\nin DNN-based learning systems.\n","authors":["Jacob G. Elkins","Farbod Fahimi"],"pdf_url":"https://arxiv.org/pdf/2402.00761v1.pdf","comment":"Preprint version"},{"id":"http://arxiv.org/abs/2402.00760v1","updated":"2024-02-01T16:50:41Z","published":"2024-02-01T16:50:41Z","title":"EuroPED-NN: Uncertainty aware surrogate model","summary":" This work successfully generates uncertainty aware surrogate models, via the\nBayesian neural network with noise contrastive prior (BNN-NCP) technique, of\nthe EuroPED plasma pedestal model using data from the JET-ILW pedestal database\nand subsequent model evaluations. Together, these constitute EuroPED-NN. The\nBNN-NCP technique is proven to be a good fit for uncertainty aware surrogate\nmodels, matching the output results of a regular neural network, providing\nprediction confidence as uncertainties, and highlighting the out of\ndistribution (OOD) regions using surrogate model uncertainties. This provides\ncritical insights into model robustness and reliability. EuroPED-NN has been\nphysically validated, first, analyzing electron density\n$n_e\\!\\left(\\psi_{\\text{pol}}=0.94\\right)$ with respect to increasing plasma\ncurrent, $I_p$, and second, validating the $\\Delta-\\beta_{p,ped}$ relation\nassociated with the EuroPED model, affirming the robustness of the underlying\nphysics learned by the surrogate model.\n","authors":["A. Panera Alvarez","A. Ho","A. Jarvinen","S. Saarelma","S. Wiesen","JET Contributors"],"pdf_url":"https://arxiv.org/pdf/2402.00760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12258v2","updated":"2024-02-01T16:50:23Z","published":"2024-01-21T16:59:45Z","title":"Emergent Dominance Hierarchies in Reinforcement Learning Agents","summary":" Modern Reinforcement Learning (RL) algorithms are able to outperform humans\nin a wide variety of tasks. Multi-agent reinforcement learning (MARL) settings\npresent additional challenges, and successful cooperation in mixed-motive\ngroups of agents depends on a delicate balancing act between individual and\ngroup objectives. 
Social conventions and norms, often inspired by human\ninstitutions, are used as tools for striking this balance.\n In this paper, we examine a fundamental, well-studied social convention that\nunderlies cooperation in both animal and human societies: dominance\nhierarchies.\n We adapt the ethological theory of dominance hierarchies to artificial\nagents, borrowing the established terminology and definitions with as few\namendments as possible. We demonstrate that populations of RL agents, operating\nwithout explicit programming or intrinsic rewards, can invent, learn, enforce,\nand transmit a dominance hierarchy to new populations. The dominance\nhierarchies that emerge have a similar structure to those studied in chickens,\nmice, fish, and other species.\n","authors":["Ram Rachum","Yonatan Nakar","Bill Tomlinson","Nitay Alon","Reuth Mirsky"],"pdf_url":"https://arxiv.org/pdf/2401.12258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00759v1","updated":"2024-02-01T16:49:27Z","published":"2024-02-01T16:49:27Z","title":"Building Expressive and Tractable Probabilistic Generative Models: A\n Review","summary":" We present a comprehensive survey of the advancements and techniques in the\nfield of tractable probabilistic generative modeling, primarily focusing on\nProbabilistic Circuits (PCs). We provide a unified perspective on the inherent\ntrade-offs between expressivity and the tractability, highlighting the design\nprinciples and algorithmic extensions that have enabled building expressive and\nefficient PCs, and provide a taxonomy of the field. We also discuss recent\nefforts to build deep and hybrid PCs by fusing notions from deep neural models,\nand outline the challenges and open questions that can guide future research in\nthis evolving field.\n","authors":["Sahil Sidheekh","Sriraam Natarajan"],"pdf_url":"https://arxiv.org/pdf/2402.00759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06021v4","updated":"2024-02-01T16:45:42Z","published":"2023-03-10T16:22:38Z","title":"Machine learning for sports betting: should model selection be based on\n accuracy or calibration?","summary":" Sports betting's recent federal legalisation in the USA coincides with the\ngolden age of machine learning. If bettors can leverage data to reliably\npredict the probability of an outcome, they can recognise when the bookmaker's\nodds are in their favour. As sports betting is a multi-billion dollar industry\nin the USA alone, identifying such opportunities could be extremely lucrative.\nMany researchers have applied machine learning to the sports outcome prediction\nproblem, generally using accuracy to evaluate the performance of predictive\nmodels. We hypothesise that for the sports betting problem, model calibration\nis more important than accuracy. To test this hypothesis, we train models on\nNBA data over several seasons and run betting experiments on a single season,\nusing published odds. We show that using calibration, rather than accuracy, as\nthe basis for model selection leads to greater returns, on average (return on\ninvestment of $+34.69\\%$ versus $-35.17\\%$) and in the best case ($+36.93\\%$\nversus $+5.56\\%$). These findings suggest that for sports betting (or any\nprobabilistic decision-making problem), calibration is a more important metric\nthan accuracy. 
Sports bettors who wish to increase profits should therefore\nselect their predictive model based on calibration, rather than accuracy.\n","authors":["Conor Walsh","Alok Joshi"],"pdf_url":"https://arxiv.org/pdf/2303.06021v4.pdf","comment":"15 pages, 5 Figures. Paper submitted to Elsevier's Machine Learning\n with Applications"},{"id":"http://arxiv.org/abs/2402.00751v1","updated":"2024-02-01T16:43:04Z","published":"2024-02-01T16:43:04Z","title":"Unlearnable Algorithms for In-context Learning","summary":" Machine unlearning is a desirable operation as models get increasingly\ndeployed on data with unknown provenance. However, achieving exact unlearning\n-- obtaining a model that matches the model distribution when the data to be\nforgotten was never used -- is challenging or inefficient, often requiring\nsignificant retraining. In this paper, we focus on efficient unlearning methods\nfor the task adaptation phase of a pretrained large language model (LLM). We\nobserve that an LLM's ability to do in-context learning for task adaptation\nallows for efficient exact unlearning of task adaptation training data. We\nprovide an algorithm for selecting few-shot training examples to prepend to the\nprompt given to an LLM (for task adaptation), ERASE, whose unlearning operation\ncost is independent of model and dataset size, meaning it scales to large\nmodels and datasets. We additionally compare our approach to fine-tuning\napproaches and discuss the trade-offs between the two approaches. This leads us\nto propose a new holistic measure of unlearning cost which accounts for varying\ninference costs, and conclude that in-context learning can often be more\nfavourable than fine-tuning for deployments involving unlearning requests.\n","authors":["Andrei Muresanu","Anvith Thudi","Michael R. Zhang","Nicolas Papernot"],"pdf_url":"https://arxiv.org/pdf/2402.00751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02546v3","updated":"2024-02-01T16:42:42Z","published":"2023-11-05T02:33:30Z","title":"On the Second-Order Convergence of Biased Policy Gradient Algorithms","summary":" Since the objective functions of reinforcement learning problems are\ntypically highly nonconvex, it is desirable that policy gradient, the most\npopular algorithm, escapes saddle points and arrives at second-order stationary\npoints. Existing results only consider vanilla policy gradient algorithms with\nunbiased gradient estimators, but practical implementations under the\ninfinite-horizon discounted reward setting are biased due to finite-horizon\nsampling. Moreover, actor-critic methods, whose second-order convergence has\nnot yet been established, are also biased due to the critic approximation of\nthe value function. We provide a novel second-order analysis of biased policy\ngradient methods, including the vanilla gradient estimator computed from\nMonte-Carlo sampling of trajectories as well as the double-loop actor-critic\nalgorithm, where in the inner loop the critic improves the approximation of the\nvalue function via TD(0) learning. 
Separately, we also establish the\nconvergence of TD(0) on Markov chains irrespective of initial state\ndistribution.\n","authors":["Siqiao Mu","Diego Klabjan"],"pdf_url":"https://arxiv.org/pdf/2311.02546v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05578v3","updated":"2024-02-01T16:42:05Z","published":"2024-01-10T23:01:35Z","title":"Fast Cerebral Blood Flow Analysis via Extreme Learning Machine","summary":" We introduce a rapid and precise analytical approach for analyzing cerebral\nblood flow (CBF) using Diffuse Correlation Spectroscopy (DCS) with the\napplication of the Extreme Learning Machine (ELM). Our evaluation of ELM and\nexisting algorithms involves a comprehensive set of metrics. We assess these\nalgorithms using synthetic datasets for both semi-infinite and multi-layer\nmodels. The results demonstrate that ELM consistently achieves higher fidelity\nacross various noise levels and optical parameters, showcasing robust\ngeneralization ability and outperforming iterative fitting algorithms. Through\na comparison with a computationally efficient neural network, ELM attains\ncomparable accuracy with reduced training and inference times. Notably, the\nabsence of a back-propagation process in ELM during training results in\nsignificantly faster training speeds compared to existing neural network\napproaches. This proposed strategy holds promise for edge computing\napplications with online training capabilities.\n","authors":["Xi Chen","Zhenya Zang","Xingda Li"],"pdf_url":"https://arxiv.org/pdf/2401.05578v3.pdf","comment":"Not ready to submission. Need further correction"},{"id":"http://arxiv.org/abs/2402.00743v1","updated":"2024-02-01T16:39:45Z","published":"2024-02-01T16:39:45Z","title":"Benefits of Transformer: In-Context Learning in Linear Regression Tasks\n with Unstructured Data","summary":" In practice, it is observed that transformer-based models can learn concepts\nin context in the inference stage. While existing literature, e.g.,\n\\citet{zhang2023trained,huang2023context}, provide theoretical explanations on\nthis in-context learning ability, they assume the input $x_i$ and the output\n$y_i$ for each sample are embedded in the same token (i.e., structured data).\nHowever, in reality, they are presented in two tokens (i.e., unstructured data\n\\cite{wibisono2023role}). In this case, this paper conducts experiments in\nlinear regression tasks to study the benefits of the architecture of\ntransformers and provides some corresponding theoretical intuitions to explain\nwhy the transformer can learn from unstructured data. We study the exact\ncomponents in a transformer that facilitate the in-context learning. 
In\nparticular, we observe that (1) a transformer with two layers of softmax\n(self-)attentions with look-ahead attention mask can learn from the prompt if\n$y_i$ is in the token next to $x_i$ for each example; (2) positional encoding\ncan further improve the performance; and (3) multi-head attention with a high\ninput embedding dimension has a better prediction performance than single-head\nattention.\n","authors":["Yue Xing","Xiaofeng Lin","Namjoon Suh","Qifan Song","Guang Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.00743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05580v3","updated":"2024-02-01T16:36:43Z","published":"2024-01-10T23:10:51Z","title":"Enhancing Blood Flow Assessment in Diffuse Correlation Spectroscopy: A\n Transfer Learning Approach with Noise Robustness Analysis","summary":" Diffuse correlation spectroscopy (DCS) is an emerging noninvasive technique\nthat measures tissue blood flow by using near-infrared coherent\npoint-source illumination to detect spectral changes. While machine learning\nhas demonstrated significant potential for measuring blood flow index (BFi), an\nopen question concerning the success of this approach pertains to its\nrobustness in scenarios involving deviations between datasets with varying\nSignal-to-Noise Ratios (SNRs) originating from diverse clinical applications\nand various setups. This study proposes a transfer learning approach that aims\nto assess the influence of SNRs on the generalization ability of learned\nfeatures and to demonstrate the robustness of transfer learning. A synthetic\ndataset with varying levels of added noise is utilized to simulate different\nSNRs. The proposed network takes a 1x64 autocorrelation curve as input and generates BFi\nand the correlation parameter beta. The proposed model demonstrates excellent\nperformance across different SNRs, exhibiting enhanced fitting accuracy,\nparticularly for low SNR datasets when compared with other fitting methods.\nThis highlights its potential for clinical diagnosis and treatment across\nvarious scenarios under different clinical setups.\n","authors":["Xi Chen","Xingda Li"],"pdf_url":"https://arxiv.org/pdf/2401.05580v3.pdf","comment":"Not ready for submission. Need further changes"},{"id":"http://arxiv.org/abs/2310.18001v2","updated":"2024-02-01T16:36:01Z","published":"2023-10-27T09:17:15Z","title":"DP-SGD with weight clipping","summary":" Recently, due to the popularity of deep neural networks and other methods\nwhose training typically relies on the optimization of an objective function,\nand due to concerns for data privacy, there is a lot of interest in\ndifferentially private gradient descent methods. To achieve differential\nprivacy guarantees with a minimum amount of noise, it is important to be able\nto bound precisely the sensitivity of the information which the participants\nwill observe. In this study, we present a novel approach that mitigates the\nbias arising from traditional gradient clipping. By leveraging a public upper\nbound of the Lipschitz value of the current model and its current location\nwithin the search domain, we can achieve refined noise level adjustments. 
We\npresent a new algorithm with improved differential privacy guarantees and a\nsystematic empirical evaluation, showing that our new approach outperforms\nexisting approaches also in practice.\n","authors":["Antoine Barczewski","Jan Ramon"],"pdf_url":"https://arxiv.org/pdf/2310.18001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00732v1","updated":"2024-02-01T16:30:00Z","published":"2024-02-01T16:30:00Z","title":"MobilityDL: A Review of Deep Learning From Trajectory Data","summary":" Trajectory data combines the complexities of time series, spatial data, and\n(sometimes irrational) movement behavior. As data availability and computing\npower have increased, so has the popularity of deep learning from trajectory\ndata. This review paper provides the first comprehensive overview of deep\nlearning approaches for trajectory data. We have identified eight specific\nmobility use cases which we analyze with regards to the deep learning models\nand the training data used. Besides a comprehensive quantitative review of the\nliterature since 2018, the main contribution of our work is the data-centric\nanalysis of recent work in this field, placing it along the mobility data\ncontinuum which ranges from detailed dense trajectories of individual movers\n(quasi-continuous tracking data), to sparse trajectories (such as check-in\ndata), and aggregated trajectories (crowd information).\n","authors":["Anita Graser","Anahid Jalali","Jasmin Lampert","Axel Weißenfeld","Krzysztof Janowicz"],"pdf_url":"https://arxiv.org/pdf/2402.00732v1.pdf","comment":"Submitted to Geoinformatica"},{"id":"http://arxiv.org/abs/2402.00728v1","updated":"2024-02-01T16:25:00Z","published":"2024-02-01T16:25:00Z","title":"Dropout-Based Rashomon Set Exploration for Efficient Predictive\n Multiplicity Estimation","summary":" Predictive multiplicity refers to the phenomenon in which classification\ntasks may admit multiple competing models that achieve almost-equally-optimal\nperformance, yet generate conflicting outputs for individual samples. This\npresents significant concerns, as it can potentially result in systemic\nexclusion, inexplicable discrimination, and unfairness in practical\napplications. Measuring and mitigating predictive multiplicity, however, is\ncomputationally challenging due to the need to explore all such\nalmost-equally-optimal models, known as the Rashomon set, in potentially huge\nhypothesis spaces. To address this challenge, we propose a novel framework that\nutilizes dropout techniques for exploring models in the Rashomon set. We\nprovide rigorous theoretical derivations to connect the dropout parameters to\nproperties of the Rashomon set, and empirically evaluate our framework through\nextensive experimentation. Numerical results show that our technique\nconsistently outperforms baselines in terms of the effectiveness of predictive\nmultiplicity metric estimation, with runtime speedup up to $20\\times \\sim\n5000\\times$. 
With efficient Rashomon set exploration and metric estimation,\nmitigation of predictive multiplicity is then achieved through dropout ensemble\nand model selection.\n","authors":["Hsiang Hsu","Guihong Li","Shaohan Hu"," Chun-Fu"," Chen"],"pdf_url":"https://arxiv.org/pdf/2402.00728v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2402.00724v1","updated":"2024-02-01T16:14:54Z","published":"2024-02-01T16:14:54Z","title":"Automatic Segmentation of the Spinal Cord Nerve Rootlets","summary":" Precise identification of spinal nerve rootlets is relevant to delineate\nspinal levels for the study of functional activity in the spinal cord. The goal\nof this study was to develop an automatic method for the semantic segmentation\nof spinal nerve rootlets from T2-weighted magnetic resonance imaging (MRI)\nscans. Images from two open-access MRI datasets were used to train a 3D\nmulti-class convolutional neural network using an active learning approach to\nsegment C2-C8 dorsal nerve rootlets. Each output class corresponds to a spinal\nlevel. The method was tested on 3T T2-weighted images from datasets unseen\nduring training to assess inter-site, inter-session, and inter-resolution\nvariability. The test Dice score was 0.67 +- 0.16 (mean +- standard deviation\nacross rootlets levels), suggesting a good performance. The method also\ndemonstrated low inter-vendor and inter-site variability (coefficient of\nvariation <= 1.41 %), as well as low inter-session variability (coefficient of\nvariation <= 1.30 %) indicating stable predictions across different MRI\nvendors, sites, and sessions. The proposed methodology is open-source and\nreadily available in the Spinal Cord Toolbox (SCT) v6.2 and higher.\n","authors":["Jan Valosek","Theo Mathieu","Raphaelle Schlienger","Olivia S. Kowalczyk","Julien Cohen-Adad"],"pdf_url":"https://arxiv.org/pdf/2402.00724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00722v1","updated":"2024-02-01T16:14:32Z","published":"2024-02-01T16:14:32Z","title":"Neural Style Transfer with Twin-Delayed DDPG for Shared Control of\n Robotic Manipulators","summary":" Neural Style Transfer (NST) refers to a class of algorithms able to\nmanipulate an element, most often images, to adopt the appearance or style of\nanother one. Each element is defined as a combination of Content and Style: the\nContent can be conceptually defined as the what and the Style as the how of\nsaid element. In this context, we propose a custom NST framework for\ntransferring a set of styles to the motion of a robotic manipulator, e.g., the\nsame robotic task can be carried out in an angry, happy, calm, or sad way. An\nautoencoder architecture extracts and defines the Content and the Style of the\ntarget robot motions. A Twin Delayed Deep Deterministic Policy Gradient (TD3)\nnetwork generates the robot control policy using the loss defined by the\nautoencoder. The proposed Neural Policy Style Transfer TD3 (NPST3) alters the\nrobot motion by introducing the trained style. Such an approach can be\nimplemented either offline, for carrying out autonomous robot motions in\ndynamic environments, or online, for adapting at runtime the style of a\nteleoperated robot. The considered styles can be learned online from human\ndemonstrations. We carried out an evaluation with human subjects enrolling 73\nvolunteers, asking them to recognize the style behind some representative\nrobotic motions. 
Results show a good recognition rate, proving that it is\npossible to convey different styles to a robot using this approach.\n","authors":["Raul Fernandez-Fernandez","Marco Aggravi","Paolo Robuffo Giordano","Juan G. Victores","Claudio Pacchierotti"],"pdf_url":"https://arxiv.org/pdf/2402.00722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00711v1","updated":"2024-02-01T16:06:35Z","published":"2024-02-01T16:06:35Z","title":"Explaining Text Classifiers with Counterfactual Representations","summary":" One well-motivated explanation method for classifiers leverages\ncounterfactuals, which are hypothetical events identical to real observations in\nall aspects except for one categorical feature. Constructing such\ncounterfactuals poses specific challenges for texts, however, as some attribute\nvalues may not necessarily align with plausible real-world events. In this\npaper we propose a simple method for generating counterfactuals by intervening\nin the space of text representations, which bypasses this limitation. We argue\nthat our interventions are minimally disruptive and that they are theoretically\nsound as they align with counterfactuals as defined in Pearl's causal inference\nframework. To validate our method, we first conduct experiments on a synthetic\ndataset of counterfactuals, allowing for a direct comparison between classifier\npredictions based on ground truth counterfactuals (obtained through explicit\ntext interventions) and our counterfactuals, derived through interventions in\nthe representation space. Second, we study a real world scenario where our\ncounterfactuals can be leveraged both for explaining a classifier and for bias\nmitigation.\n","authors":["Pirmin Lemberger","Antoine Saillenfest"],"pdf_url":"https://arxiv.org/pdf/2402.00711v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2107.08020v3","updated":"2024-02-01T16:05:28Z","published":"2021-07-16T17:21:14Z","title":"Online Graph Topology Learning from Matrix-valued Time Series","summary":" This paper is concerned with the statistical analysis of matrix-valued time\nseries. These are data collected over a network of sensors (typically a set of\nspatial locations) along time, where a vector of features is observed per time\ninstant per sensor. Thus each sensor is characterized by a vectorial time\nseries. We would like to identify the dependency structure among these sensors\nand represent it by a graph. When there is only one feature per sensor,\nvector auto-regressive (VAR) models have been widely adopted to infer the structure\nof Granger causality. The resulting graph is referred to as a causal graph. Our\nfirst contribution is then to extend VAR models to matrix-variate models to\nserve the purpose of graph learning. Secondly, we propose two online procedures,\nin low and high dimensions respectively, which can quickly update the estimates\nof coefficients when new samples arrive. In particular, in the high-dimensional\nregime, a novel Lasso-type estimator is introduced and we develop its homotopy\nalgorithms for online learning. We also provide an adaptive tuning procedure for the\nregularization parameter. Lastly, we consider that the application of AR\nmodels to data usually requires detrending the raw data; however, this step\nis not possible in an online context. Therefore, we augment the proposed AR models by\nincorporating the trend as an extra parameter, and then adapt the online algorithms to\nthe augmented data models, which allows us to simultaneously learn the graph and\ntrend from streaming samples. 
In this work, we consider primarily the periodic\ntrend. Numerical experiments using both synthetic and real data are performed,\nwhose results support the effectiveness of the proposed methods.\n","authors":["Yiye Jiang","Jérémie Bigot","Sofian Maabout"],"pdf_url":"https://arxiv.org/pdf/2107.08020v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09624v3","updated":"2024-02-01T16:04:43Z","published":"2023-02-19T16:58:53Z","title":"Breaking the Communication-Privacy-Accuracy Tradeoff with\n $f$-Differential Privacy","summary":" We consider a federated data analytics problem in which a server coordinates\nthe collaborative data analysis of multiple users with privacy concerns and\nlimited communication capability. The commonly adopted compression schemes\nintroduce information loss into local data while improving communication\nefficiency, and it remains an open problem whether such discrete-valued\nmechanisms provide any privacy protection. In this paper, we study the local\ndifferential privacy guarantees of discrete-valued mechanisms with finite\noutput space through the lens of $f$-differential privacy (DP). More\nspecifically, we advance the existing literature by deriving tight $f$-DP\nguarantees for a variety of discrete-valued mechanisms, including the binomial\nnoise and the binomial mechanisms that are proposed for privacy preservation,\nand the sign-based methods that are proposed for data compression, in\nclosed-form expressions. We further investigate the amplification in privacy by\nsparsification and propose a ternary stochastic compressor. By leveraging\ncompression for privacy amplification, we improve the existing methods by\nremoving the dependency of accuracy (in terms of mean square error) on\ncommunication cost in the popular use case of distributed mean estimation,\ntherefore breaking the three-way tradeoff between privacy, communication, and\naccuracy. Finally, we discuss the Byzantine resilience of the proposed\nmechanism and its application in federated learning.\n","authors":["Richeng Jin","Zhonggen Su","Caijun Zhong","Zhaoyang Zhang","Tony Quek","Huaiyu Dai"],"pdf_url":"https://arxiv.org/pdf/2302.09624v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00707v1","updated":"2024-02-01T16:04:04Z","published":"2024-02-01T16:04:04Z","title":"Non-Exchangeable Conformal Language Generation with Nearest Neighbors","summary":" Quantifying uncertainty in automatically generated text is important for\nletting humans check potential hallucinations and making systems more reliable.\nConformal prediction is an attractive framework to provide predictions imbued\nwith statistical guarantees, however, its application to text generation is\nchallenging since any i.i.d. assumptions are not realistic. In this paper, we\nbridge this gap by leveraging recent results on non-exchangeable conformal\nprediction, which still ensures bounds on coverage. The result,\nnon-exchangeable conformal nucleus sampling, is a novel extension of the\nconformal prediction framework to generation based on nearest neighbors. Our\nmethod can be used post-hoc for an arbitrary model without extra training and\nsupplies token-level, calibrated prediction sets equipped with statistical\nguarantees. Experiments in machine translation and language modeling show\nencouraging results in generation quality. 
By also producing tighter prediction\nsets with good coverage, we thus give a more theoretically principled way to\nperform sampling with conformal guarantees.\n","authors":["Dennis Ulmer","Chrysoula Zerva","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2402.00707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00705v1","updated":"2024-02-01T16:00:21Z","published":"2024-02-01T16:00:21Z","title":"Combining the Strengths of Dutch Survey and Register Data in a Data\n Challenge to Predict Fertility (PreFer)","summary":" The social sciences have produced an impressive body of research on\ndeterminants of fertility outcomes, or whether and when people have children.\nHowever, the strength of these determinants and underlying theories are rarely\nevaluated on their predictive ability on new data. This prevents us from\nsystematically comparing studies, hindering the evaluation and accumulation of\nknowledge. In this paper, we present two datasets which can be used to study\nthe predictability of fertility outcomes in the Netherlands. One dataset is\nbased on the LISS panel, a longitudinal survey which includes thousands of\nvariables on a wide range of topics, including individual preferences and\nvalues. The other is based on the Dutch register data, which lacks attitudinal\ndata but includes detailed information about the life courses of millions of\nDutch residents. We provide information about the datasets and the samples, and\ndescribe the fertility outcome of interest. We also introduce the fertility\nprediction data challenge PreFer which is based on these datasets and will\nstart in Spring 2024. We outline the ways in which measuring the predictability\nof fertility outcomes using these datasets and combining their strengths in the\ndata challenge can advance our understanding of fertility behaviour and\ncomputational social science. We further provide details for participants on\nhow to take part in the data challenge.\n","authors":["Elizaveta Sivak","Paulina Pankowska","Adrienne Mendrik","Tom Emery","Javier Garcia-Bernardo","Seyit Hocuk","Kasia Karpinska","Angelica Maineri","Joris Mulder","Malvina Nissim","Gert Stulp"],"pdf_url":"https://arxiv.org/pdf/2402.00705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07303v2","updated":"2024-02-01T16:00:03Z","published":"2023-05-12T08:16:06Z","title":"Multi-Relational Hyperbolic Word Embeddings from Natural Language\n Definitions","summary":" Natural language definitions possess a recursive, self-explanatory semantic\nstructure that can support representation learning methods able to preserve\nexplicit conceptual relations and constraints in the latent space. This paper\npresents a multi-relational model that explicitly leverages such a structure to\nderive word embeddings from definitions. By automatically extracting the\nrelations linking defined and defining terms from dictionaries, we demonstrate\nhow the problem of learning word embeddings can be formalised via a\ntranslational framework in Hyperbolic space and used as a proxy to capture the\nglobal semantic structure of definitions. An extensive empirical analysis\ndemonstrates that the framework can help impose the desired structural\nconstraints while preserving the semantic mapping required for controllable and\ninterpretable traversal. 
Moreover, the experiments reveal the superiority of\nthe Hyperbolic word embeddings over the Euclidean counterparts and demonstrate\nthat the multi-relational approach can obtain competitive results when compared\nto state-of-the-art neural models, with the advantage of being intrinsically\nmore efficient and interpretable.\n","authors":["Marco Valentino","Danilo S. Carvalho","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2305.07303v2.pdf","comment":"Accepted at the 18th Conference of the European Chapter of the\n Association for Computational Linguistics (EACL 2024)"},{"id":"http://arxiv.org/abs/2402.00699v1","updated":"2024-02-01T15:55:50Z","published":"2024-02-01T15:55:50Z","title":"PeaTMOSS: A Dataset and Initial Analysis of Pre-Trained Models in\n Open-Source Software","summary":" The development and training of deep learning models have become increasingly\ncostly and complex. Consequently, software engineers are adopting pre-trained\nmodels (PTMs) for their downstream applications. The dynamics of the PTM supply\nchain remain largely unexplored, signaling a clear need for structured datasets\nthat document not only the metadata but also the subsequent applications of\nthese models. Without such data, the MSR community cannot comprehensively\nunderstand the impact of PTM adoption and reuse. This paper presents the\nPeaTMOSS dataset, which comprises metadata for 281,638 PTMs and detailed\nsnapshots for all PTMs with over 50 monthly downloads (14,296 PTMs), along with\n28,575 open-source software repositories from GitHub that utilize these models.\nAdditionally, the dataset includes 44,337 mappings from 15,129 downstream\nGitHub repositories to the 2,530 PTMs they use. To enhance the dataset's\ncomprehensiveness, we developed prompts for a large language model to\nautomatically extract model metadata, including the model's training datasets,\nparameters, and evaluation metrics. Our analysis of this dataset provides the\nfirst summary statistics for the PTM supply chain, showing the trend of PTM\ndevelopment and common shortcomings of PTM package documentation. Our example\napplication reveals inconsistencies in software licenses across PTMs and their\ndependent projects. PeaTMOSS lays the foundation for future research, offering\nrich opportunities to investigate the PTM supply chain. We outline mining\nopportunities on PTMs, their downstream usage, and cross-cutting questions.\n","authors":["Wenxin Jiang","Jerin Yasmin","Jason Jones","Nicholas Synovic","Jiashen Kuo","Nathaniel Bielanski","Yuan Tian","George K. Thiruvathukal","James C. Davis"],"pdf_url":"https://arxiv.org/pdf/2402.00699v1.pdf","comment":"Accepted at MSR'24"},{"id":"http://arxiv.org/abs/2402.00695v1","updated":"2024-02-01T15:51:46Z","published":"2024-02-01T15:51:46Z","title":"Approximating Optimal Morphing Attacks using Template Inversion","summary":" Recent works have demonstrated the feasibility of inverting face recognition\nsystems, enabling the recovery of convincing face images using only their\nembeddings. We leverage such template inversion models to develop a novel type\nof deep morphing attack based on inverting a theoretically optimal morph\nembedding, which is obtained as an average of the face embeddings of source\nimages. We experiment with two variants of this approach: the first one\nexploits a fully self-contained embedding-to-image inversion model, while the\nsecond leverages the synthesis network of a pretrained StyleGAN network for\nincreased morph realism. 
We generate morphing attacks from several source\ndatasets and study the effectiveness of those attacks against several face\nrecognition networks. We showcase that our method can compete with and\nregularly beat the previous state of the art for deep-learning-based morph\ngeneration in terms of effectiveness, both in white-box and black-box attack\nscenarios, and is additionally much faster to run. We hope this might\nfacilitate the development of large-scale deep morph datasets for training\ndetection models.\n","authors":["Laurent Colbois","Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2402.00695v1.pdf","comment":"Published at the IEEE International Joint Conference on Biometrics\n (IJCB) 2023"},{"id":"http://arxiv.org/abs/2402.00678v1","updated":"2024-02-01T15:38:21Z","published":"2024-02-01T15:38:21Z","title":"Real Evaluations Tractability using Continuous Goal-Directed Actions in\n Smart City Applications","summary":" One of the most important challenges of Smart City Applications is to adapt\nthe system to interact with non-expert users. Robot imitation frameworks aim to\nsimplify robot programming and reduce programming time by allowing users to program\ndirectly through demonstrations. In classical frameworks, actions are modeled\nusing joint or Cartesian space trajectories. Other features, such as visual\nones, are not always well represented with these pure geometrical approaches.\nContinuous Goal-Directed Actions (CGDA) is an alternative to these methods, as\nit encodes actions as changes of any feature that can be extracted from the\nenvironment. As a consequence of this, the robot joint trajectories for\nexecution must be fully computed to comply with this feature-agnostic encoding.\nThis is achieved using Evolutionary Algorithms (EA), which usually require too\nmany evaluations to perform this evolution step in the actual robot. Current\nstrategies involve performing evaluations in a simulation, transferring the\nfinal joint trajectory to the actual robot. Smart City applications involve\nworking in highly dynamic and complex environments, where having a precise\nmodel is not always achievable. Our goal is to study the tractability of\nperforming these evaluations directly in a real-world scenario. Two different\napproaches to reduce the number of evaluations using EA are proposed and\ncompared. In the first approach, Particle Swarm Optimization (PSO)-based\nmethods have been studied and compared within CGDA: naive PSO, Fitness\nInheritance PSO (FI-PSO), and Adaptive Fuzzy Fitness Granulation with PSO\n(AFFG-PSO). The second approach studied the introduction of geometrical and\nvelocity constraints within CGDA. The effects of both approaches were analyzed\nand compared in the wax and paint actions, two commonly studied CGDA use cases.\nResults from this paper show an important reduction in the number of\nevaluations.\n","authors":["Raul Fernandez-Fernandez","Juan G. Victores","David Estevez","Carlos Balaguer"],"pdf_url":"https://arxiv.org/pdf/2402.00678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00677v1","updated":"2024-02-01T15:37:42Z","published":"2024-02-01T15:37:42Z","title":"Neural Policy Style Transfer","summary":" Style Transfer has been proposed in a number of fields: fine arts, natural\nlanguage processing, and fixed trajectories. We scale this concept up to\ncontrol policies within a Deep Reinforcement Learning infrastructure. 
Each\nnetwork is trained to maximize the expected reward, which typically encodes the\ngoal of an action, and can be described as the content. The expressive power of\ndeep neural networks enables encoding a secondary task, which can be described\nas the style. The Neural Policy Style Transfer (NPST) algorithm is proposed to\ntransfer the style of one policy to another, while maintaining the content of\nthe latter. Different policies are defined via Deep Q-Network architectures.\nThese models are trained using demonstrations through Inverse Reinforcement\nLearning. Two different sets of user demonstrations are performed, one for\ncontent and the other for style. Different styles are encoded as defined by user\ndemonstrations. The generated policy is the result of feeding a content policy\nand a style policy to the NPST algorithm. Experiments are performed in a\ncatch-ball game inspired by the classical Deep Reinforcement Learning Atari\ngames, and in a real-world painting scenario with a full-sized humanoid robot,\nbased on previous works of the authors. The implementation of three different\nQ-Network architectures (Shallow, Deep and Deep Recurrent Q-Network) to encode\nthe policies within the NPST framework is proposed, and the results obtained in\nthe experiments with each of these architectures are compared.\n","authors":["Raul Fernandez-Fernandez","Juan G. Victores","Jennifer J. Gago","David Estevez","Carlos Balaguer"],"pdf_url":"https://arxiv.org/pdf/2402.00677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00676v1","updated":"2024-02-01T15:37:23Z","published":"2024-02-01T15:37:23Z","title":"Deep Robot Sketching: An application of Deep Q-Learning Networks for\n human-like sketching","summary":" The current success of Reinforcement Learning algorithms in complex\nenvironments has inspired many recent theoretical approaches to\ncognitive science. Artistic environments are studied within the cognitive\nscience community as rich, natural, multi-sensory, multi-cultural environments.\nIn this work, we propose the introduction of Reinforcement Learning for\nimproving the control of artistic robot applications. Deep Q-learning Neural\nNetworks (DQN) are among the most successful algorithms for the implementation\nof Reinforcement Learning in robotics. DQN methods generate complex control\npolicies for the execution of complex robot applications in a wide set of\nenvironments. Current art painting robot applications use simple control laws\nthat limit the adaptability of the frameworks to a set of simple environments.\nIn this work, the introduction of DQN within an art painting robot application\nis proposed. The goal is to study how the introduction of a complex control\npolicy impacts the performance of a basic art painting robot application. The\nmain expected contribution of this work is to serve as a first baseline for\nfuture works introducing DQN methods for complex art painting robot frameworks.\nExperiments consist of real-world executions of human-drawn sketches using the\nDQN-generated policy and TEO, the humanoid robot. Results are compared in terms\nof similarity and obtained reward with respect to the reference inputs.\n","authors":["Raul Fernandez-Fernandez","Juan G. 
Victores","Carlos Balaguer"],"pdf_url":"https://arxiv.org/pdf/2402.00676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13149v4","updated":"2024-02-01T15:24:08Z","published":"2023-07-24T22:22:32Z","title":"Discovering interpretable elastoplasticity models via the neural\n polynomial method enabled symbolic regressions","summary":" Conventional neural network elastoplasticity models are often perceived as\nlacking interpretability. This paper introduces a two-step machine learning\napproach that returns mathematical models interpretable by human experts. In\nparticular, we introduce a surrogate model where yield surfaces are expressed\nin terms of a set of single-variable feature mappings obtained from supervised\nlearning. A post-processing step is then used to re-interpret the set of\nsingle-variable neural network mapping functions into mathematical form through\nsymbolic regression. This divide-and-conquer approach provides several\nimportant advantages. First, it enables us to overcome the scaling issue of\nsymbolic regression algorithms. From a practical perspective, it enhances the\nportability of learned models for partial differential equation solvers written\nin different programming languages. Finally, it enables us to have a concrete\nunderstanding of the attributes of the materials, such as convexity and\nsymmetries of models, through automated derivations and reasoning. Numerical\nexamples have been provided, along with an open-source code to enable\nthird-party validation.\n","authors":["Bahador Bahmani","Hyoung Suk Suh","WaiChing Sun"],"pdf_url":"https://arxiv.org/pdf/2307.13149v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00659v1","updated":"2024-02-01T15:18:48Z","published":"2024-02-01T15:18:48Z","title":"Modeling Freight Mode Choice Using Machine Learning Classifiers: A\n Comparative Study Using the Commodity Flow Survey (CFS) Data","summary":" This study explores the usefulness of machine learning classifiers for\nmodeling freight mode choice. We investigate eight commonly used machine\nlearning classifiers, namely Naive Bayes, Support Vector Machine, Artificial\nNeural Network, K-Nearest Neighbors, Classification and Regression Tree, Random\nForest, Boosting and Bagging, along with the classical Multinomial Logit model.\nUS 2012 Commodity Flow Survey data are used as the primary data source; we\naugment it with spatial attributes from secondary data sources. The performance\nof the classifiers is compared based on prediction accuracy results. The\ncurrent research also examines the role of sample size and training-testing\ndata split ratios on the predictive ability of the various approaches. In\naddition, the importance of variables is estimated to determine how the\nvariables influence freight mode choice. The results show that the tree-based\nensemble classifiers perform the best. Specifically, Random Forest produces the\nmost accurate predictions, closely followed by Boosting and Bagging. 
With\nregard to variable importance, shipment characteristics, such as shipment\ndistance, industry classification of the shipper and shipment size, are the\nmost significant factors for freight mode choice decisions.\n","authors":["Majbah Uddin","Sabreena Anowar","Naveen Eluru"],"pdf_url":"https://arxiv.org/pdf/2402.00659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00654v1","updated":"2024-02-01T15:14:16Z","published":"2024-02-01T15:14:16Z","title":"Improving the accuracy of freight mode choice models: A case study using\n the 2017 CFS PUF data set and ensemble learning techniques","summary":" The US Census Bureau has collected two rounds of experimental data from the\nCommodity Flow Survey, providing shipment-level characteristics of nationwide\ncommodity movements, published in 2012 (i.e., Public Use Microdata) and in 2017\n(i.e., Public Use File). With this information, data-driven methods have become\nincreasingly valuable for understanding detailed patterns in freight logistics.\nIn this study, we used the 2017 Commodity Flow Survey Public Use File data set\nto explore building a high-performance freight mode choice model, considering\nthree main improvements: (1) constructing local models for each separate\ncommodity/industry category; (2) extracting useful geographical features,\nparticularly the derived distance of each freight mode between\norigin/destination zones; and (3) applying additional ensemble learning methods\nsuch as stacking or voting to combine results from local and unified models for\nimproved performance. The proposed method achieved over 92% accuracy without\nincorporating external information, an over 19% increase compared to directly\nfitting Random Forests models over 10,000 samples. Furthermore, SHAP (Shapely\nAdditive Explanations) values were computed to explain the outputs and major\npatterns obtained from the proposed model. The model framework could enhance\nthe performance and interpretability of existing freight mode choice models.\n","authors":["Diyi Liu","Hyeonsup Lim","Majbah Uddin","Yuandong Liu","Lee D. Han","Ho-ling Hwang","Shih-Miao Chin"],"pdf_url":"https://arxiv.org/pdf/2402.00654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00653v1","updated":"2024-02-01T15:13:26Z","published":"2024-02-01T15:13:26Z","title":"Coherent Feed Forward Quantum Neural Network","summary":" Quantum machine learning, focusing on quantum neural networks (QNNs), remains\na vastly uncharted field of study. Current QNN models primarily employ\nvariational circuits on an ansatz or a quantum feature map, often requiring\nmultiple entanglement layers. This methodology not only increases the\ncomputational cost of the circuit beyond what is practical on near-term quantum\ndevices but also misleadingly labels these models as neural networks, given\ntheir divergence from the structure of a typical feed-forward neural network\n(FFNN). Moreover, the circuit depth and qubit needs of these models scale\npoorly with the number of data features, resulting in an efficiency challenge\nfor real-world machine-learning tasks. We introduce a bona fide QNN model,\nwhich seamlessly aligns with the versatility of a traditional FFNN in terms of\nits adaptable intermediate layers and nodes, absent from intermediate\nmeasurements such that our entire model is coherent. This model stands out with\nits reduced circuit depth and number of requisite C-NOT gates to outperform\nprevailing QNN models. 
Furthermore, the qubit count in our model remains\nunaffected by the data's feature quantity. We test our proposed model on\nvarious benchmarking datasets such as the diagnostic breast cancer (Wisconsin)\nand credit card fraud detection datasets. We compare the outcomes of our model\nwith the existing QNN methods to showcase the advantageous efficacy of our\napproach, even with a reduced requirement on quantum resources. Our model paves\nthe way for application of quantum neural networks to real relevant machine\nlearning problems.\n","authors":["Utkarsh Singh","Aaron Z. Goldberg","Khabat Heshami"],"pdf_url":"https://arxiv.org/pdf/2402.00653v1.pdf","comment":"11 pages, 7 figures. Comments welcome!"},{"id":"http://arxiv.org/abs/2402.00645v1","updated":"2024-02-01T15:07:31Z","published":"2024-02-01T15:07:31Z","title":"Spectrally Transformed Kernel Regression","summary":" Unlabeled data is a key component of modern machine learning. In general, the\nrole of unlabeled data is to impose a form of smoothness, usually from the\nsimilarity information encoded in a base kernel, such as the\n$\\epsilon$-neighbor kernel or the adjacency matrix of a graph. This work\nrevisits the classical idea of spectrally transformed kernel regression (STKR),\nand provides a new class of general and scalable STKR estimators able to\nleverage unlabeled data. Intuitively, via spectral transformation, STKR\nexploits the data distribution for which unlabeled data can provide additional\ninformation. First, we show that STKR is a principled and general approach, by\ncharacterizing a universal type of \"target smoothness\", and proving that any\nsufficiently smooth function can be learned by STKR. Second, we provide\nscalable STKR implementations for the inductive setting and a general\ntransformation function, while prior work is mostly limited to the transductive\nsetting. Third, we derive statistical guarantees for two scenarios: STKR with a\nknown polynomial transformation, and STKR with kernel PCA when the\ntransformation is unknown. Overall, we believe that this work helps deepen our\nunderstanding of how to work with unlabeled data, and its generality makes it\neasier to inspire new methods.\n","authors":["Runtian Zhai","Rattana Pukdee","Roger Jin","Maria-Florina Balcan","Pradeep Ravikumar"],"pdf_url":"https://arxiv.org/pdf/2402.00645v1.pdf","comment":"ICLR 2024 spotlight. 36 pages"},{"id":"http://arxiv.org/abs/2401.10306v2","updated":"2024-02-01T15:05:43Z","published":"2024-01-18T13:51:48Z","title":"Physics-constrained convolutional neural networks for inverse problems\n in spatiotemporal partial differential equations","summary":" We propose a physics-constrained convolutional neural network (PC-CNN) to\nsolve two types of inverse problems in partial differential equations (PDEs),\nwhich are nonlinear and vary both in space and time. In the first inverse\nproblem, we are given data that is offset by spatially varying systematic error\n(i.e., the bias, also known as the epistemic uncertainty). The task is to\nuncover from the biased data the true state, which is the solution of the PDE.\nIn the second inverse problem, we are given sparse information on the solution\nof a PDE. The task is to reconstruct the solution in space with\nhigh-resolution. First, we present the PC-CNN, which constrains the PDE with a\nsimple time-windowing scheme to handle sequential data. Second, we analyse the\nperformance of the PC-CNN for uncovering solutions from biased data. 
We analyse\nboth linear and nonlinear convection-diffusion equations, and the Navier-Stokes\nequations, which govern the spatiotemporally chaotic dynamics of turbulent\nflows. We find that the PC-CNN correctly recovers the true solution for a\nvariety of biases, which are parameterised as non-convex functions. Third, we\nanalyse the performance of the PC-CNN for reconstructing solutions from biased\ndata for the turbulent flow. We reconstruct the spatiotemporal chaotic solution\non a high-resolution grid from only 2\% of the information contained in it. For\nboth tasks, we further analyse the Navier-Stokes solutions. We find that the\ninferred solutions have a physical spectral energy content, whereas traditional\nmethods, such as interpolation, do not. This work opens opportunities for\nsolving inverse problems with partial differential equations.\n","authors":["Daniel Kelshaw","Luca Magri"],"pdf_url":"https://arxiv.org/pdf/2401.10306v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.04600,\n arXiv:2306.10990"},{"id":"http://arxiv.org/abs/2401.08224v3","updated":"2024-02-01T15:02:55Z","published":"2024-01-16T09:22:12Z","title":"Privacy Preserving Adaptive Experiment Design","summary":" Adaptive experiments are widely adopted to estimate the conditional average\ntreatment effect (CATE) in clinical trials and many other scenarios. While the\nprimary goal of an experiment is to maximize estimation accuracy, due to the\nimperative of social welfare, it's also crucial to provide treatment with\nsuperior outcomes to patients, which is measured by regret in the contextual bandit\nframework. These two objectives often lead to contrasting optimal allocation\nmechanisms. Furthermore, privacy concerns arise in clinical scenarios containing\nsensitive data like patients' health records. Therefore, it's essential for the\ntreatment allocation mechanism to incorporate robust privacy protection\nmeasures. In this paper, we investigate the tradeoff between loss of social\nwelfare and statistical power in contextual bandit experiments. We propose\nmatched upper and lower bounds for the multi-objective optimization problem, and\nthen adopt the concept of Pareto optimality to mathematically characterize the\noptimality condition. Furthermore, we propose differentially private algorithms\nwhich still match the lower bound, showing that privacy is \"almost free\".\nAdditionally, we derive the asymptotic normality of the estimator, which is\nessential in statistical inference and hypothesis testing.\n","authors":["Jiachun Li","Kaining Shi","David Simchi-Levi"],"pdf_url":"https://arxiv.org/pdf/2401.08224v3.pdf","comment":"Update an algorithm and the title of our paper"},{"id":"http://arxiv.org/abs/2402.00638v1","updated":"2024-02-01T14:54:17Z","published":"2024-02-01T14:54:17Z","title":"Random Forest-Based Prediction of Stroke Outcome","summary":" We investigate the clinical, biochemical and neuroimaging factors\nassociated with the outcome of stroke patients in order to generate a predictive model\nusing machine learning techniques for the prediction of mortality and morbidity 3\nmonths after admission. The dataset consisted of prospectively registered patients with ischemic stroke\n(IS) and non-traumatic intracerebral hemorrhage (ICH) admitted to the Stroke Unit\nof a European Tertiary Hospital. We identified the\nmain variables for the Random Forest (RF) machine learning method, generating a predictive\nmodel that can estimate patient mortality/morbidity. 
In conclusion, the RF machine\nlearning algorithm can be effectively used in stroke patients for long-term\noutcome prediction of mortality and morbidity.\n","authors":["Carlos Fernandez-Lozano","Pablo Hervella","Virginia Mato-Abad","Manuel Rodriguez-Yanez","Sonia Suarez-Garaboa","Iria Lopez-Dequidt","Ana Estany-Gestal","Tomas Sobrino","Francisco Campos","Jose Castillo","Santiago Rodriguez-Yanez","Ramon Iglesias-Rey"],"pdf_url":"https://arxiv.org/pdf/2402.00638v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.04308v2","updated":"2024-02-01T14:54:00Z","published":"2023-07-10T02:27:38Z","title":"Towards Cross-Table Masked Pretraining for Web Data Mining","summary":" Tabular data pervades the landscape of the World Wide Web, playing a\nfoundational role in the digital architecture that underpins online\ninformation. Given the recent influence of large-scale pretrained models like\nChatGPT and SAM across various domains, exploring the application of\npretraining techniques for mining tabular data on the web has emerged as a\nhighly promising research direction. Indeed, there have been some recent works\naround this topic, most (if not all) of which are limited to the scope of a\nfixed-schema/single table. Due to the scale of the dataset and the parameter\nsize of the prior models, we believe that we have not reached the ''BERT\nmoment'' for the ubiquitous tabular data. Development along this line\nsignificantly lags behind counterpart research domains such as natural\nlanguage processing. In this work, we first identify the crucial challenges\nbehind tabular data pretraining, particularly overcoming the cross-table\nhurdle. As a pioneering endeavor, this work mainly (i) contributes a\nhigh-quality real-world tabular dataset, (ii) proposes an innovative, generic,\nand efficient cross-table pretraining framework, dubbed CM2, whose core\ncomprises a semantic-aware tabular neural network that uniformly encodes\nheterogeneous tables without much restriction, and (iii) introduces a novel\npretraining objective -- prompt Masked Table Modeling (pMTM) -- inspired by NLP\nbut intricately tailored to scalable pretraining on tables. Our extensive\nexperiments demonstrate CM2's state-of-the-art performance and validate that\ncross-table pretraining can enhance various downstream tasks.\n","authors":["Chao Ye","Guoshan Lu","Haobo Wang","Liyao Li","Sai Wu","Gang Chen","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.04308v2.pdf","comment":"Accepted to WWW 2024"},{"id":"http://arxiv.org/abs/2402.00626v1","updated":"2024-02-01T14:41:20Z","published":"2024-02-01T14:41:20Z","title":"Vision-LLMs Can Fool Themselves with Self-Generated Typographic Attacks","summary":" Recently, significant progress has been made on Large Vision-Language Models\n(LVLMs), a new class of VL models that make use of large pre-trained language\nmodels. Yet, their vulnerability to typographic attacks, which involve\nsuperimposing misleading text onto an image, remains unstudied. Furthermore,\ntypographic attacks in prior work rely on sampling a random misleading class from\na predefined set of classes. However, the randomly chosen class might not be the\nmost effective attack. To address these issues, we first introduce a novel\nbenchmark uniquely designed to test LVLMs' vulnerability to typographic attacks.\nFurthermore, we introduce a new and more effective typographic attack:\nSelf-Generated typographic attacks. 
Indeed, our method, given an image, makes\nuse of the strong language capabilities of models like GPT-4V by simply\nprompting them to recommend a typographic attack. Using our novel benchmark, we\nuncover that typographic attacks represent a significant threat against\nLVLMs. Furthermore, we uncover that typographic attacks recommended by GPT-4V\nusing our new method are not only more effective against GPT-4V itself compared\nto attacks from prior work, but also against a host of less capable yet popular\nopen-source models like LLaVA, InstructBLIP, and MiniGPT4.\n","authors":["Maan Qraitem","Nazia Tasnim","Kate Saenko","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2402.00626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00623v1","updated":"2024-02-01T14:39:59Z","published":"2024-02-01T14:39:59Z","title":"Bayesian Causal Inference with Gaussian Process Networks","summary":" Causal discovery and inference from observational data is an essential\nproblem in statistics, posing both modeling and computational challenges. These\nare typically addressed by imposing strict assumptions on the joint\ndistribution, such as linearity. We consider the problem of the Bayesian\nestimation of the effects of hypothetical interventions in the Gaussian Process\nNetwork (GPN) model, a flexible causal framework which allows describing the\ncausal relationships nonparametrically. We detail how to perform causal\ninference on GPNs by simulating the effect of an intervention across the whole\nnetwork and propagating the effect of the intervention on downstream variables.\nWe further derive a simpler computational approximation by estimating the\nintervention distribution as a function of local variables only, modeling the\nconditional distributions via additive Gaussian processes. We extend both\nframeworks beyond the case of a known causal graph, incorporating uncertainty\nabout the causal structure via Markov chain Monte Carlo methods. Simulation\nstudies show that our approach is able to identify the effects of hypothetical\ninterventions with non-Gaussian, non-linear observational data and accurately\nreflect the posterior uncertainty of the causal estimates. Finally, we compare\nthe results of our GPN-based causal inference approach to existing methods on a\ndataset of $A.~thaliana$ gene expressions.\n","authors":["Enrico Giudice","Jack Kuipers","Giusi Moffa"],"pdf_url":"https://arxiv.org/pdf/2402.00623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13831v4","updated":"2024-02-01T14:14:56Z","published":"2023-07-25T21:59:17Z","title":"Relationship between Batch Size and Number of Steps Needed for Nonconvex\n Optimization of Stochastic Gradient Descent using Armijo Line Search","summary":" While stochastic gradient descent (SGD) can use various learning rates, such\nas constant or diminishing rates, previous numerical results showed that\nSGD performs better than other deep learning optimizers when it uses\nlearning rates given by line search methods. In this paper, we perform a\nconvergence analysis on SGD with a learning rate given by an Armijo line search\nfor nonconvex optimization, indicating that the upper bound of the expectation\nof the squared norm of the full gradient becomes small when the number of steps\nand the batch size are large. 
Next, we show that, for SGD with the\nArmijo-line-search learning rate, the number of steps needed for nonconvex\noptimization is a monotone decreasing convex function of the batch size; that\nis, the number of steps needed for nonconvex optimization decreases as the\nbatch size increases. Furthermore, we show that the stochastic first-order\noracle (SFO) complexity, which is the stochastic gradient computation cost, is\na convex function of the batch size; that is, there exists a critical batch\nsize that minimizes the SFO complexity. Finally, we provide numerical results\nthat support our theoretical results. The numerical results indicate that the\nnumber of steps needed for training deep neural networks decreases as the batch\nsize increases and that there exist critical batch sizes that can be\nestimated from the theoretical results.\n","authors":["Yuki Tsukada","Hideaki Iiduka"],"pdf_url":"https://arxiv.org/pdf/2307.13831v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00608v1","updated":"2024-02-01T14:02:06Z","published":"2024-02-01T14:02:06Z","title":"Deep Clustering Using the Soft Silhouette Score: Towards Compact and\n Well-Separated Clusters","summary":" Unsupervised learning has gained prominence in the big data era, offering a\nmeans to extract valuable insights from unlabeled datasets. Deep clustering has\nemerged as an important unsupervised category, aiming to exploit the non-linear\nmapping capabilities of neural networks in order to enhance clustering\nperformance. The majority of deep clustering literature focuses on minimizing\nthe inner-cluster variability in some embedded space while keeping the learned\nrepresentation consistent with the original high-dimensional dataset. In this\nwork, we propose soft silhouette, a probabilistic formulation of the silhouette\ncoefficient. Soft silhouette rewards compact and distinctly separated\nclustering solutions like the conventional silhouette coefficient. When\noptimized within a deep clustering framework, soft silhouette guides the\nlearned representations towards forming compact and well-separated clusters. In\naddition, we introduce an autoencoder-based deep learning architecture that is\nsuitable for optimizing the soft silhouette objective function. The proposed\ndeep clustering method has been tested and compared with several well-studied\ndeep clustering methods on various benchmark datasets, yielding very\nsatisfactory clustering results.\n","authors":["Georgios Vardakas","Ioannis Papakostas","Aristidis Likas"],"pdf_url":"https://arxiv.org/pdf/2402.00608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00607v1","updated":"2024-02-01T13:59:04Z","published":"2024-02-01T13:59:04Z","title":"Are Synthetic Time-series Data Really not as Good as Real Data?","summary":" Time-series data presents limitations stemming from data quality issues, bias\nand vulnerabilities, and generalization problems. Integrating universal data\nsynthesis methods holds promise in improving generalization. However, current\nmethods cannot guarantee that the generator's output covers all unseen real\ndata. In this paper, we introduce InfoBoost -- a highly versatile cross-domain\ndata synthesizing framework with time series representation learning\ncapability. We have developed a method based on synthetic data that enables\nmodel training without the need for real data, surpassing the performance of\nmodels trained with real data. 
Additionally, we have trained a universal\nfeature extractor based on our synthetic data that is applicable to all\ntime-series data. Our approach overcomes interference from multiple sources:\nrhythmic signals, noise interference, and long-period features that exceed the\nsampling window capabilities. Through experiments, our non-deep-learning\nsynthetic data enables models to achieve superior reconstruction performance\nand universal explicit representation extraction without the need for real\ndata.\n","authors":["Fanzhe Fu","Junru Chen","Jing Zhang","Carl Yang","Lvbin Ma","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2402.00607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00592v1","updated":"2024-02-01T13:41:44Z","published":"2024-02-01T13:41:44Z","title":"Uncertainty-Aware Partial-Label Learning","summary":" In real-world applications, one often encounters ambiguously labeled data,\nwhere different annotators assign conflicting class labels. Partial-label\nlearning allows training classifiers in this weakly supervised setting. While\nstate-of-the-art methods already feature good predictive performance, they\noften suffer from miscalibrated uncertainty estimates. However, having\nwell-calibrated uncertainty estimates is important, especially in\nsafety-critical domains like medicine and autonomous driving. In this article,\nwe propose a novel nearest-neighbor-based partial-label-learning algorithm that\nleverages Dempster-Shafer theory. Extensive experiments on artificial and\nreal-world datasets show that the proposed method provides a well-calibrated\nuncertainty estimate and achieves competitive prediction performance.\nAdditionally, we prove that our algorithm is risk-consistent.\n","authors":["Tobias Fuchs","Florian Kalinke","Klemens Böhm"],"pdf_url":"https://arxiv.org/pdf/2402.00592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05821v2","updated":"2024-02-01T13:36:04Z","published":"2024-01-11T10:38:22Z","title":"Interpretable Concept Bottlenecks to Align Reinforcement Learning Agents","summary":" Goal misalignment, reward sparsity and difficult credit assignment are only a\nfew of the many issues that make it difficult for deep reinforcement learning\n(RL) agents to learn optimal policies. Unfortunately, the black-box nature of\ndeep neural networks impedes the inclusion of domain experts for inspecting the\nmodel and revising suboptimal policies. To this end, we introduce *Successive\nConcept Bottleneck Agents* (SCoBots), which integrate consecutive concept\nbottleneck (CB) layers. In contrast to current CB models, SCoBots do not just\nrepresent concepts as properties of individual objects, but also as relations\nbetween objects, which is crucial for many RL tasks. Our experimental results\nprovide evidence of SCoBots' competitive performances, but also of their\npotential for domain experts to understand and regularize their behavior. Among\nother things, SCoBots enabled us to identify a previously unknown misalignment\nproblem in the iconic video game, Pong, and resolve it. Overall, SCoBots thus\nresult in more human-aligned RL agents. 
Our code is available at\nhttps://github.com/k4ntz/SCoBots .\n","authors":["Quentin Delfosse","Sebastian Sztwiertnia","Mark Rothermel","Wolfgang Stammer","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2401.05821v2.pdf","comment":"20 pages, 8 of main text, 8 of appendix, 3 main figures"},{"id":"http://arxiv.org/abs/2402.00576v1","updated":"2024-02-01T13:14:38Z","published":"2024-02-01T13:14:38Z","title":"Tropical Decision Boundaries for Neural Networks Are Robust Against\n Adversarial Attacks","summary":" We introduce a simple, easy to implement, and computationally efficient\ntropical convolutional neural network architecture that is robust against\nadversarial attacks. We exploit the tropical nature of piece-wise linear neural\nnetworks by embedding the data in the tropical projective torus in a single\nhidden layer which can be added to any model. We study the geometry of its\ndecision boundary theoretically and show its robustness against adversarial\nattacks on image datasets using computational experiments.\n","authors":["Kurt Pasque","Christopher Teska","Ruriko Yoshida","Keiji Miura","Jefferson Huang"],"pdf_url":"https://arxiv.org/pdf/2402.00576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05915v2","updated":"2024-02-01T13:11:56Z","published":"2023-09-12T02:05:43Z","title":"ACT: Empowering Decision Transformer with Dynamic Programming via\n Advantage Conditioning","summary":" Decision Transformer (DT), which employs expressive sequence modeling\ntechniques to perform action generation, has emerged as a promising approach to\noffline policy optimization. However, DT generates actions conditioned on a\ndesired future return, which is known to bear some weaknesses such as the\nsusceptibility to environmental stochasticity. To overcome DT's weaknesses, we\npropose to empower DT with dynamic programming. Our method comprises three\nsteps. First, we employ in-sample value iteration to obtain approximated value\nfunctions, which involves dynamic programming over the MDP structure. Second,\nwe evaluate action quality in context with estimated advantages. We introduce\ntwo types of advantage estimators, IAE and GAE, which are suitable for\ndifferent tasks. Third, we train an Advantage-Conditioned Transformer (ACT) to\ngenerate actions conditioned on the estimated advantages. Finally, during\ntesting, ACT generates actions conditioned on a desired advantage. Our\nevaluation results validate that, by leveraging the power of dynamic\nprogramming, ACT demonstrates effective trajectory stitching and robust action\ngeneration in spite of the environmental stochasticity, outperforming baseline\nmethods across various benchmarks. Additionally, we conduct an in-depth\nanalysis of ACT's various design choices through ablation studies. Our code is\navailable at https://github.com/LAMDA-RL/ACT.\n","authors":["Chen-Xiao Gao","Chenyang Wu","Mingjun Cao","Rui Kong","Zongzhang Zhang","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2309.05915v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2402.00568v1","updated":"2024-02-01T13:01:47Z","published":"2024-02-01T13:01:47Z","title":"Secure Supervised Learning-Based Smart Home Authentication Framework","summary":" The Smart home possesses the capability of facilitating home services to\ntheir users with the systematic advance in The Internet of Things (IoT) and\ninformation and communication technologies (ICT) in recent decades. 
The home\nservices offered by smart devices help users enjoy a maximized level\nof comfort, with the objective of improving quality of life. As the user and smart\ndevices communicate through an insecure channel, the smart home environment is\nprone to security and privacy problems. A secure authentication protocol needs\nto be established between the smart devices and the user, such that device\nauthentication can be made feasible in smart home environments. Most\nof the existing smart home authentication protocols were identified to fail in\nfacilitating secure mutual authentication, and they increase the possibility of\nlaunching session key disclosure, impersonation and stolen smart\ndevice attacks. In this paper, the Secure Supervised Learning-based Smart Home\nAuthentication Framework (SSL-SHAF) is proposed as a reliable mutual\nauthentication framework that can be contextually imposed for better security. The formal\nanalysis of the proposed SSL-SHAF confirmed better resistance against session\nkey disclosure, impersonation and stolen smart device attacks. The results of\nSSL-SHAF confirmed minimized computational costs and improved security compared to the\nbaseline protocols considered for investigation.\n","authors":["K. Swapna Sudha","N. Jeyanthi","Celestine Iwendi"],"pdf_url":"https://arxiv.org/pdf/2402.00568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05739v4","updated":"2024-02-01T13:01:25Z","published":"2023-11-09T20:52:36Z","title":"Adaptive Compression-Aware Split Learning and Inference for Enhanced\n Network Efficiency","summary":" The growing number of AI-driven applications in mobile devices has led to\nsolutions that integrate deep learning models with the available edge-cloud\nresources. Due to multiple benefits such as reduction in on-device energy\nconsumption, improved latency, improved network usage, and certain privacy\nimprovements, split learning, where deep learning models are split away from\nthe mobile device and computed in a distributed manner, has become an\nextensively explored topic. Incorporating compression-aware methods (where\nlearning adapts to the compression level of the communicated data) has made split\nlearning even more advantageous. This method could even offer a viable\nalternative to traditional methods, such as federated learning techniques. In\nthis work, we develop an adaptive compression-aware split learning method\n('deprune') to improve and train deep learning models so that they are much\nmore network-efficient, which would make them ideal to deploy in weaker devices\nwith the help of edge-cloud resources. This method is also extended ('prune')\nto very quickly train deep learning models through a transfer learning\napproach, which trades off little accuracy for much more network-efficient\ninference abilities. We show that the 'deprune' method can reduce network usage\nby 4x when compared with a split-learning approach (that does not use our\nmethod) without loss of accuracy, while also improving accuracy over\ncompression-aware split-learning by 4 percent. 
Lastly, we show that the 'prune'\nmethod can reduce the training time for certain models by up to 6x without\naffecting the accuracy when compared against a compression-aware split-learning\napproach.\n","authors":["Akrit Mudvari","Antero Vainio","Iason Ofeidis","Sasu Tarkoma","Leandros Tassiulas"],"pdf_url":"https://arxiv.org/pdf/2311.05739v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00564v1","updated":"2024-02-01T12:50:48Z","published":"2024-02-01T12:50:48Z","title":"A Single Graph Convolution Is All You Need: Efficient Grayscale Image\n Classification","summary":" Image classifiers often rely on convolutional neural networks (CNN) for their\ntasks, which are inherently more heavyweight than multilayer perceptrons\n(MLPs), which can be problematic in real-time applications. Additionally, many\nimage classification models work on both RGB and grayscale datasets.\nClassifiers that operate solely on grayscale images are much less common.\nGrayscale image classification has diverse applications, including but not\nlimited to medical image classification and synthetic aperture radar (SAR)\nautomatic target recognition (ATR). Thus, we present a novel grayscale (single\nchannel) image classification approach using a vectorized view of images. We\nexploit the lightweightness of MLPs by viewing images as a vector and reducing\nour problem setting to the grayscale image classification setting. We find that\nusing a single graph convolutional layer batch-wise increases accuracy and\nreduces variance in the performance of our model. Moreover, we develop a\ncustomized accelerator on FPGA for the proposed model with several\noptimizations to improve its performance. Our experimental results on benchmark\ngrayscale image datasets demonstrate the effectiveness of the proposed model,\nachieving vastly lower latency (up to 16$\\times$ less) and competitive or\nleading performance compared to other state-of-the-art image classification\nmodels on various domain-specific grayscale image classification datasets.\n","authors":["Jacob Fein-Ashley","Tian Ye","Sachini Wickramasinghe","Bingyi Zhang","Rajgopal Kannan","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2402.00564v1.pdf","comment":"6 pages of content, 1 page of references"},{"id":"http://arxiv.org/abs/2305.17665v2","updated":"2024-02-01T12:25:16Z","published":"2023-05-28T08:49:24Z","title":"Acceleration of stochastic gradient descent with momentum by averaging:\n finite-sample rates and asymptotic normality","summary":" Stochastic gradient descent with momentum (SGDM) has been widely used in many\nmachine learning and statistical applications. Despite the observed empirical\nbenefits of SGDM over traditional SGD, the theoretical understanding of the\nrole of momentum for different learning rates in the optimization process\nremains widely open. We analyze the finite-sample convergence rate of SGDM\nunder the strongly convex settings and show that, with a large batch size, the\nmini-batch SGDM converges faster than the mini-batch SGD to a neighborhood of\nthe optimal value. Additionally, our findings, supported by theoretical\nanalysis and numerical experiments, indicate that SGDM permits broader choices\nof learning rates. Furthermore, we analyze the Polyak-averaging version of the\nSGDM estimator, establish its asymptotic normality, and justify its asymptotic\nequivalence to the averaged SGD. 
The asymptotic distribution of the averaged\nSGDM enables uncertainty quantification of the algorithm output and statistical\ninference of the model parameters.\n","authors":["Kejie Tang","Weidong Liu","Yichen Zhang","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2305.17665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00544v1","updated":"2024-02-01T12:13:35Z","published":"2024-02-01T12:13:35Z","title":"Quantum-Assisted Hilbert-Space Gaussian Process Regression","summary":" Gaussian processes are probabilistic models that are commonly used as\nfunctional priors in machine learning. Due to their probabilistic nature, they\ncan be used to capture the prior information on the statistics of noise,\nsmoothness of the functions, and training data uncertainty. However, their\ncomputational complexity quickly becomes intractable as the size of the data\nset grows. We propose a Hilbert space approximation-based quantum algorithm for\nGaussian process regression to overcome this limitation. Our method consists of\na combination of classical basis function expansion with quantum computing\ntechniques of quantum principal component analysis, conditional rotations, and\nHadamard and Swap tests. The quantum principal component analysis is used to\nestimate the eigenvalues while the conditional rotations and the Hadamard and\nSwap tests are employed to evaluate the posterior mean and variance of the\nGaussian process. Our method provides polynomial computational complexity\nreduction over the classical method.\n","authors":["Ahmad Farooq","Cristian A. Galvis-Florez","Simo Särkkä"],"pdf_url":"https://arxiv.org/pdf/2402.00544v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.09769v2","updated":"2024-02-01T12:12:21Z","published":"2024-01-18T07:36:38Z","title":"Learning from Graphs with Heterophily: Progress and Future","summary":" Graphs are structured data that model complex relations between real-world\nentities. Heterophilous graphs, where linked nodes are prone to have\ndifferent labels or dissimilar features, have recently attracted significant\nattention and found many applications. Meanwhile, increasing efforts have been\nmade to advance learning from heterophilous graphs. Although there exist\nsurveys on the relevant topic, they focus on heterophilous GNNs, which are only\nsub-topics of heterophilous graph learning. In this survey, we comprehensively\noverview existing works on learning from graphs with heterophily. First, we\ncollect over 180 publications and introduce the development of this field.\nThen, we systematically categorize existing methods based on a hierarchical\ntaxonomy including learning strategies, model architectures and practical\napplications. Finally, we discuss the primary challenges of existing studies\nand highlight promising avenues for future research. More publication details\nand corresponding open-source codes can be accessed and will be continuously\nupdated at our\nrepository: https://github.com/gongchenghua/Awesome-Survey-Graphs-with-Heterophily.\n","authors":["Chenghua Gong","Yao Cheng","Xiang Li","Caihua Shan","Siqiang Luo"],"pdf_url":"https://arxiv.org/pdf/2401.09769v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2305.02930v2","updated":"2024-02-01T12:06:30Z","published":"2023-05-04T15:30:10Z","title":"Piecewise Normalizing Flows","summary":" Normalizing flows are an established approach for modelling complex\nprobability densities through invertible transformations from a base\ndistribution. 
However, the accuracy with which the target distribution can be\ncaptured by the normalizing flow is strongly influenced by the topology of the\nbase distribution. A mismatch between the topology of the target and the base\ncan result in a poor performance, as is typically the case for multi-modal\nproblems. A number of different works have attempted to modify the topology of\nthe base distribution to better match the target, either through the use of\nGaussian Mixture Models (Izmailov et al., 2020; Ardizzone et al., 2020;\nHagemann & Neumayer, 2021) or learned accept/reject sampling (Stimper et al.,\n2022). We introduce piecewise normalizing flows which divide the target\ndistribution into clusters, with topologies that better match the standard\nnormal base distribution, and train a series of flows to model complex\nmulti-modal targets. We demonstrate the performance of the piecewise flows\nusing some standard benchmarks and compare the accuracy of the flows to the\napproach taken in Stimper et al. (2022) for modelling multi-modal\ndistributions. We find that our approach consistently outperforms the approach\nin Stimper et al. (2022) with a higher emulation accuracy on the standard\nbenchmarks.\n","authors":["Harry Bevins","Will Handley","Thomas Gessey-Jones"],"pdf_url":"https://arxiv.org/pdf/2305.02930v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.00534v1","updated":"2024-02-01T12:01:43Z","published":"2024-02-01T12:01:43Z","title":"A Manifold Representation of the Key in Vision Transformers","summary":" Vision Transformers implement multi-head self-attention (MSA) via stacking\nmultiple attention blocks. The query, key, and value are often intertwined and\ngenerated within those blocks via a single, shared linear transformation. This\npaper explores the concept of disentangling the key from the query and value,\nand adopting a manifold representation for the key. Our experiments reveal that\ndecoupling and endowing the key with a manifold structure can enhance the model\nperformance. Specifically, ViT-B exhibits a 0.87% increase in top-1 accuracy,\nwhile Swin-T sees a boost of 0.52% in top-1 accuracy on the ImageNet-1K\ndataset, with eight charts in the manifold key. Our approach also yields\npositive results in object detection and instance segmentation tasks on the\nCOCO dataset. Through detailed ablation studies, we establish that these\nperformance gains are not merely due to the simplicity of adding more\nparameters and computations. Future research may investigate strategies for\ncutting the budget of such representations and aim for further performance\nimprovements based on our findings.\n","authors":["Li Meng","Morten Goodwin","Anis Yazidi","Paal Engelstad"],"pdf_url":"https://arxiv.org/pdf/2402.00534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00531v1","updated":"2024-02-01T11:58:28Z","published":"2024-02-01T11:58:28Z","title":"Preconditioning for Physics-Informed Neural Networks","summary":" Physics-informed neural networks (PINNs) have shown promise in solving\nvarious partial differential equations (PDEs). However, training pathologies\nhave negatively affected the convergence and prediction accuracy of PINNs,\nwhich further limits their practical applications. In this paper, we propose to\nuse condition number as a metric to diagnose and mitigate the pathologies in\nPINNs. 
Inspired by classical numerical analysis, where the condition number\nmeasures sensitivity and stability, we highlight its pivotal role in the\ntraining dynamics of PINNs. We prove theorems to reveal how the condition number is\nrelated to both the error control and convergence of PINNs. Subsequently, we\npresent an algorithm that leverages preconditioning to improve the condition\nnumber. Evaluations on 18 PDE problems showcase the superior performance of our\nmethod. Significantly, in 7 of these problems, our method reduces errors by an\norder of magnitude. These empirical findings verify the critical role of the\ncondition number in PINNs' training.\n","authors":["Songming Liu","Chang Su","Jiachen Yao","Zhongkai Hao","Hang Su","Youjia Wu","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.00531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04916v2","updated":"2024-02-01T11:58:27Z","published":"2023-12-08T09:31:50Z","title":"EE-LLM: Large-Scale Training and Inference of Early-Exit Large Language\n Models with 3D Parallelism","summary":" We present EE-LLM, a framework for large-scale training and inference of\nearly-exit large language models (LLMs). While recent works have shown\npreliminary evidence for the efficacy of early exiting in accelerating LLM\ninference, EE-LLM takes a foundational step towards scaling up early-exit LLMs\nby supporting their training and inference with massive 3D parallelism. Built\nupon Megatron-LM, EE-LLM implements a variety of algorithmic innovations and\nperformance optimizations tailored to early exiting, including a lightweight\nmethod that facilitates backpropagation for the early-exit training objective\nwith pipeline parallelism, techniques of leveraging idle resources in the\noriginal pipeline schedule for computation related to early-exit layers, and\ntwo approaches of early-exit inference that are compatible with KV caching for\nautoregressive generation. Our analytical and empirical study shows that EE-LLM\nachieves great training efficiency with negligible computational overhead\ncompared to standard LLM training, as well as outstanding inference speedup\nwithout compromising output quality. To facilitate further research and\nadoption, we release EE-LLM at https://github.com/pan-x-c/EE-LLM.\n","authors":["Yanxi Chen","Xuchen Pan","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.04916v2.pdf","comment":"arXiv v2 update: extended related works and formal analysis of\n training efficiency. We will continuously update the codebase and arXiv\n version"},{"id":"http://arxiv.org/abs/2401.15098v2","updated":"2024-02-01T11:58:07Z","published":"2024-01-25T03:06:51Z","title":"Hierarchical Continual Reinforcement Learning via Large Language Model","summary":" The ability to learn continuously in dynamic environments is a crucial\nrequirement for reinforcement learning (RL) agents applied in the real world.\nDespite the progress in continual reinforcement learning (CRL), existing\nmethods often suffer from insufficient knowledge transfer, particularly when\nthe tasks are diverse. To address this challenge, we propose a new framework,\nHierarchical Continual reinforcement learning via large language model\n(Hi-Core), designed to facilitate the transfer of high-level knowledge.
Hi-Core\norchestrates a two-layer structure: high-level policy formulation by a large\nlanguage model (LLM), which generates a sequence of goals, and\nlow-level policy learning that closely aligns with goal-oriented RL practices,\nproducing the agent's actions in response to the goals set forth. The framework\nemploys feedback to iteratively adjust and verify high-level policies, storing\nthem along with low-level policies within a skill library. When encountering a\nnew task, Hi-Core retrieves relevant experience from this library to aid\nlearning. Through experiments on Minigrid, Hi-Core has demonstrated its\neffectiveness in handling diverse CRL tasks, outperforming popular\nbaselines.\n","authors":["Chaofan Pan","Xin Yang","Hao Wang","Wei Wei","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2401.15098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13514v2","updated":"2024-02-01T11:47:57Z","published":"2023-05-22T22:07:50Z","title":"Small Language Models Improve Giants by Rewriting Their Outputs","summary":" Despite the impressive performance of large language models (LLMs), they\noften lag behind specialized models in various tasks. LLMs only use a fraction\nof the existing training data for in-context learning, while task-specific\nmodels harness the full dataset for fine-tuning. In this work, we tackle the\nproblem of leveraging training data to improve the performance of LLMs without\nfine-tuning. Our approach directly targets LLM predictions without requiring\naccess to their weights. We create a pool of candidates from the LLM through\nfew-shot prompting and we employ a compact model, the LM-corrector (LMCor),\nspecifically trained to merge these candidates to produce an enhanced output.\nOur experiments on four natural language generation tasks demonstrate that even\na small LMCor model (250M) substantially improves the few-shot performance of\nLLMs (62B), matching and even outperforming standard fine-tuning. Furthermore,\nwe illustrate the robustness of LMCor against different prompts, thereby\nminimizing the need for extensive prompt engineering. Finally, we show that\nLMCor can be seamlessly integrated with different LLMs at inference, serving as\na plug-and-play module to improve their performance.\n","authors":["Giorgos Vernikos","Arthur Bražinskas","Jakub Adamek","Jonathan Mallinson","Aliaksei Severyn","Eric Malmi"],"pdf_url":"https://arxiv.org/pdf/2305.13514v2.pdf","comment":"Accepted at EACL 2024"},{"id":"http://arxiv.org/abs/2402.00522v1","updated":"2024-02-01T11:43:13Z","published":"2024-02-01T11:43:13Z","title":"Understanding the Expressive Power and Mechanisms of Transformer for\n Sequence Modeling","summary":" We conduct a systematic study of the approximation properties of Transformer\nfor sequence modeling with long, sparse and complicated memory. We investigate\nthe mechanisms through which different components of Transformer, such as the\ndot-product self-attention, positional encoding and feed-forward layer, affect\nits expressive power, and we study their combined effects by establishing\nexplicit approximation rates.
Our study reveals the roles of critical\nparameters in the Transformer, such as the number of layers and the number of\nattention heads, and these insights also provide natural suggestions for\nalternative architectures.\n","authors":["Mingze Wang","Weinan E"],"pdf_url":"https://arxiv.org/pdf/2402.00522v1.pdf","comment":"65 pages"},{"id":"http://arxiv.org/abs/2402.00518v1","updated":"2024-02-01T11:39:04Z","published":"2024-02-01T11:39:04Z","title":"EE-Tuning: An Economical yet Scalable Solution for Tuning Early-Exit\n Large Language Models","summary":" This work introduces EE-Tuning, a lightweight and economical solution to\ntraining/tuning early-exit large language models (LLMs). In contrast to the\ncommon approach of full-parameter pre-training, EE-Tuning augments any\npre-trained (and possibly fine-tuned) standard LLM with additional early-exit\nlayers that are tuned in a parameter-efficient manner, which requires\nsignificantly fewer computational resources and less training data. Our\nimplementation of EE-Tuning achieves outstanding training efficiency via\nextensive performance optimizations, as well as scalability due to its full\ncompatibility with 3D parallelism. Results of systematic experiments validate\nthe efficacy of EE-Tuning, confirming that effective early-exit LLM inference\ncan be achieved with a limited training budget. In the hope of making early-exit\nLLMs accessible to the community, we release the source code of our\nimplementation of EE-Tuning at https://github.com/pan-x-c/EE-LLM.\n","authors":["Xuchen Pan","Yanxi Chen","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.00518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00515v1","updated":"2024-02-01T11:31:26Z","published":"2024-02-01T11:31:26Z","title":"Developing A Multi-Agent and Self-Adaptive Framework with Deep\n Reinforcement Learning for Dynamic Portfolio Risk Management","summary":" Deep or reinforcement learning (RL) approaches have been adapted as reactive\nagents to quickly learn and respond with new investment strategies for\nportfolio management under the highly turbulent financial market environments\nin recent years. In many cases, due to the very complex correlations among\nvarious financial sectors, and the fluctuating trends in different financial\nmarkets, a deep or reinforcement learning based agent can be biased in\nmaximising the total returns of the newly formulated investment portfolio while\nneglecting its potential risks under the turmoil of various market conditions\nin the global or regional sectors. Accordingly, a multi-agent and self-adaptive\nframework, namely MASA, is proposed in which a sophisticated multi-agent\nreinforcement learning (RL) approach is adopted through two cooperating and\nreactive agents to carefully and dynamically balance the trade-off between the\noverall portfolio returns and their potential risks. In addition, a very flexible\nand proactive agent serving as the market observer is integrated into the MASA\nframework to provide some additional information on the estimated market trends\nas valuable feedback for the multi-agent RL approach to quickly adapt to the\never-changing market conditions. The obtained empirical results clearly reveal\nthe potential strengths of our proposed MASA framework based on the multi-agent\nRL approach against many well-known RL-based approaches on the challenging data\nsets of the CSI 300, Dow Jones Industrial Average and S&P 500 indexes over the\npast 10 years.
More importantly, our proposed MASA framework sheds light on\nmany possible directions for future investigation.\n","authors":["Zhenglong Li","Vincent Tam","Kwan L. Yeung"],"pdf_url":"https://arxiv.org/pdf/2402.00515v1.pdf","comment":"Accepted by The 23rd International Conference on Autonomous Agents\n and Multi-Agent Systems"},{"id":"http://arxiv.org/abs/2310.00692v3","updated":"2024-02-01T11:15:37Z","published":"2023-10-01T14:58:20Z","title":"A Theoretical Analysis of Noise Geometry in Stochastic Gradient Descent","summary":" In this paper, we provide a theoretical study of noise geometry for minibatch\nstochastic gradient descent (SGD), a phenomenon where noise aligns favorably\nwith the geometry of the local landscape. We propose two metrics, derived from\nanalyzing how noise influences the loss and subspace projection dynamics, to\nquantify the alignment strength. We show that for (over-parameterized) linear\nmodels and two-layer nonlinear networks, when measured by these metrics, the\nalignment can be provably guaranteed under conditions independent of the degree\nof over-parameterization. To showcase the utility of our noise geometry\ncharacterizations, we present a refined analysis of the mechanism by which SGD\nescapes from sharp minima. We reveal that unlike gradient descent (GD), which\nescapes along the sharpest directions, SGD tends to escape from flatter\ndirections, and cyclical learning rates can exploit this SGD characteristic to\nnavigate more effectively towards flatter regions. Lastly, extensive\nexperiments are provided to support our theoretical findings.\n","authors":["Mingze Wang","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2310.00692v3.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2402.00501v1","updated":"2024-02-01T11:12:00Z","published":"2024-02-01T11:12:00Z","title":"Equivalence of the Empirical Risk Minimization to Regularization on the\n Family of f-Divergences","summary":" The solution to empirical risk minimization with $f$-divergence\nregularization (ERM-$f$DR) is presented under mild conditions on $f$. Under\nsuch conditions, the optimal measure is shown to be unique. Examples of the\nsolution for particular choices of the function $f$ are presented. Previously\nknown solutions to common regularization choices are obtained by leveraging the\nflexibility of the family of $f$-divergences. These include the unique\nsolutions to empirical risk minimization with relative entropy regularization\n(Type-I and Type-II). The analysis of the solution unveils the following\nproperties of $f$-divergences when used in the ERM-$f$DR problem: $i\\bigl)$\n$f$-divergence regularization forces the support of the solution to coincide\nwith the support of the reference measure, which introduces a strong inductive\nbias that dominates the evidence provided by the training data; and $ii\\bigl)$\nany $f$-divergence regularization is equivalent to a different $f$-divergence\nregularization with an appropriate transformation of the empirical risk\nfunction.\n","authors":["Francisco Daunas","Iñaki Esnaola","Samir M. Perlaza","H. Vincent Poor"],"pdf_url":"https://arxiv.org/pdf/2402.00501v1.pdf","comment":"Submitted to the IEEE Symposium in Information Theory 2024.
arXiv\n admin note: text overlap with arXiv:2306.07123"},{"id":"http://arxiv.org/abs/2401.11261v2","updated":"2024-02-01T10:44:08Z","published":"2024-01-20T16:01:18Z","title":"Diffusion Model Conditioning on Gaussian Mixture Model and Negative\n Gaussian Mixture Gradient","summary":" Diffusion models (DMs) are a type of generative model that has had a huge impact\non image synthesis and beyond. They achieve state-of-the-art generation results\nin various generative tasks. A great diversity of conditioning inputs, such as\ntext or bounding boxes, is available to control the generation. In this work,\nwe propose a conditioning mechanism utilizing Gaussian mixture models (GMMs) as\nfeature conditioning to guide the denoising process. Based on set theory, we\nprovide a comprehensive theoretical analysis showing that the conditional latent\ndistributions based on features and on classes differ significantly, so that\nconditioning on features produces fewer defective generations\nthan conditioning on classes. Two diffusion models conditioned on the Gaussian\nmixture model are trained separately for comparison. Experiments support our\nfindings. A novel gradient function called the negative Gaussian mixture\ngradient (NGMG) is proposed and applied in diffusion model training with an\nadditional classifier, improving training stability. We also theoretically\nprove that NGMG shares the same benefit as the Earth Mover distance\n(Wasserstein) as a more sensible cost function when learning distributions\nsupported by low-dimensional manifolds.\n","authors":["Weiguo Lu","Xuan Wu","Deng Ding","Jinqiao Duan","Jirong Zhuang","Gangnan Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.11261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00759v3","updated":"2024-02-01T10:40:09Z","published":"2023-04-03T07:20:43Z","title":"FedIN: Federated Intermediate Layers Learning for Model Heterogeneity","summary":" Federated learning (FL) enables edge devices to cooperatively train a\nglobal shared model while maintaining the training data locally and privately.\nHowever, a common assumption in FL requires the participating edge devices to\nhave similar computation resources and train on an identical global model\narchitecture. In this study, we propose an FL method called Federated\nIntermediate Layers Learning (FedIN), supporting heterogeneous models without\nrelying on any public dataset. Instead, FedIN leverages the inherent knowledge\nembedded in client model features to facilitate knowledge exchange. The\ntraining models in FedIN are partitioned into three distinct components: an\nextractor, intermediate layers, and a classifier. We capture client features by\nextracting the outputs of the extractor and the inputs of the classifier. To\nharness the knowledge from client features, we propose IN training for aligning\nthe intermediate layers based on features obtained from other clients. IN\ntraining requires only minimal memory and communication overhead by utilizing a\nsingle batch of client features. Additionally, we formulate and address a\nconvex optimization problem to mitigate the challenge of gradient divergence\ncaused by conflicts between IN training and local training. The experimental\nresults demonstrate the superior performance of FedIN in heterogeneous model\nenvironments compared to state-of-the-art algorithms.
Furthermore, our ablation\nstudy demonstrates the effectiveness of IN training and the proposed solution\nfor alleviating gradient divergence.\n","authors":["Yun-Hin Chan","Zhihan Jiang","Jing Deng","Edith C. -H. Ngai"],"pdf_url":"https://arxiv.org/pdf/2304.00759v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00450v1","updated":"2024-02-01T09:36:56Z","published":"2024-02-01T09:36:56Z","title":"CPT: Competence-progressive Training Strategy for Few-shot Node\n Classification","summary":" Graph Neural Networks (GNNs) have made significant advancements in node\nclassification, but their success relies on sufficient labeled nodes per class\nin the training data. Real-world graph data often exhibits a long-tail\ndistribution with sparse labels, emphasizing the importance of GNNs' ability in\nfew-shot node classification, which entails categorizing nodes with limited\ndata. Traditional episodic meta-learning approaches have shown promise in this\ndomain, but they face an inherent limitation: random and uniform task\nassignment, which ignores task difficulty levels, might lead the model to\nconverge to suboptimal solutions. This could lead the meta-learner to face\ncomplex tasks too soon, hindering proper learning. Ideally, the meta-learner\nshould start with simple concepts and advance to more complex ones, like human\nlearning. We therefore introduce CPT, a novel two-stage curriculum learning method\nthat aligns task difficulty with the meta-learner's progressive competence,\nenhancing overall performance. Specifically, in CPT's initial stage, the focus\nis on simpler tasks, fostering foundational skills for engaging with complex\ntasks later. Importantly, the second stage dynamically adjusts task difficulty\nbased on the meta-learner's growing competence, aiming for optimal knowledge\nacquisition. Extensive experiments on popular node classification datasets\ndemonstrate significant improvements of our strategy over existing methods.\n","authors":["Qilong Yan","Yufeng Zhang","Jinghao Zhang","Jingpu Duan","Jian Yin"],"pdf_url":"https://arxiv.org/pdf/2402.00450v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2206.11972 by\n other authors"},{"id":"http://arxiv.org/abs/2310.19390v2","updated":"2024-02-01T09:35:33Z","published":"2023-10-30T09:52:48Z","title":"Implicit Manifold Gaussian Process Regression","summary":" Gaussian process regression is widely used because of its ability to provide\nwell-calibrated uncertainty estimates and handle small or sparse datasets.\nHowever, it struggles with high-dimensional data. One possible way to scale\nthis technique to higher dimensions is to leverage the implicit low-dimensional\nmanifold upon which the data actually lies, as postulated by the manifold\nhypothesis. Prior work, however, ordinarily requires the manifold structure to be\nexplicitly provided, i.e., given by a mesh or known to be one of the\nwell-known manifolds like the sphere. In contrast, in this paper we propose a\nGaussian process regression technique capable of inferring implicit structure\ndirectly from data (labeled and unlabeled) in a fully differentiable way. For\nthe resulting model, we discuss its convergence to the Mat\\'ern Gaussian\nprocess on the assumed manifold.
Our technique scales up to hundreds of\nthousands of data points, and may improve the predictive performance and\ncalibration of the standard Gaussian process regression in high-dimensional\nsettings.\n","authors":["Bernardo Fichera","Viacheslav Borovitskiy","Andreas Krause","Aude Billard"],"pdf_url":"https://arxiv.org/pdf/2310.19390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00447v1","updated":"2024-02-01T09:28:48Z","published":"2024-02-01T09:28:48Z","title":"A Survey of Data-Efficient Graph Learning","summary":" Graph-structured data, prevalent in domains ranging from social networks to\nbiochemical analysis, serve as the foundation for diverse real-world systems.\nWhile graph neural networks demonstrate proficiency in modeling this type of\ndata, their success is often reliant on significant amounts of labeled data,\nposing a challenge in practical scenarios with limited annotation resources. To\ntackle this problem, tremendous efforts have been devoted to enhancing graph\nmachine learning performance under low-resource settings by exploring various\napproaches to minimal supervision. In this paper, we introduce a novel concept\nof Data-Efficient Graph Learning (DEGL) as a research frontier, and present the\nfirst survey that summarizes the current progress of DEGL. We begin by\nhighlighting the challenges inherent in training models with large labeled\ndata, paving the way for our exploration of DEGL. Next, we systematically\nreview recent advances on this topic from several key aspects, including\nself-supervised graph learning, semi-supervised graph learning, and few-shot\ngraph learning. Finally, we outline promising directions for future research,\ncontributing to the evolution of graph machine learning.\n","authors":["Wei Ju","Siyu Yi","Yifan Wang","Qingqing Long","Junyu Luo","Zhiping Xiao","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00435v1","updated":"2024-02-01T09:01:58Z","published":"2024-02-01T09:01:58Z","title":"A practical existence theorem for reduced order models based on\n convolutional autoencoders","summary":" In recent years, deep learning has gained increasing popularity in the fields\nof Partial Differential Equations (PDEs) and Reduced Order Modeling (ROM),\nproviding domain practitioners with new powerful data-driven techniques such as\nPhysics-Informed Neural Networks (PINNs), Neural Operators, Deep Operator\nNetworks (DeepONets) and Deep-Learning based ROMs (DL-ROMs). In this context,\ndeep autoencoders based on Convolutional Neural Networks (CNNs) have proven\nextremely effective, outperforming established techniques, such as the reduced\nbasis method, when dealing with complex nonlinear problems. However, despite\nthe empirical success of CNN-based autoencoders, there are only a few\ntheoretical results supporting these architectures, usually stated in the form\nof universal approximation theorems. In particular, although the existing\nliterature provides users with guidelines for designing convolutional\nautoencoders, the subsequent challenge of learning the latent features has been\nbarely investigated. Furthermore, many practical questions remain unanswered,\ne.g., the number of snapshots needed for convergence or the neural network\ntraining strategy.
In this work, using recent techniques from sparse\nhigh-dimensional function approximation, we fill some of these gaps by\nproviding a new practical existence theorem for CNN-based autoencoders when the\nparameter-to-solution map is holomorphic. This regularity assumption arises in\nmany relevant classes of parametric PDEs, such as the parametric diffusion\nequation, for which we discuss an explicit application of our general theory.\n","authors":["Nicola Rares Franco","Simone Brugiapaglia"],"pdf_url":"https://arxiv.org/pdf/2402.00435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00433v1","updated":"2024-02-01T08:58:57Z","published":"2024-02-01T08:58:57Z","title":"Merging Multi-Task Models via Weight-Ensembling Mixture of Experts","summary":" Merging various task-specific Transformer-based models trained on different\ntasks into a single unified model yields a model that can execute all the tasks concurrently.\nPrevious methods, exemplified by task arithmetic, have been proven to be both\neffective and scalable. Existing methods have primarily focused on seeking a\nstatic optimal solution within the original model parameter space. A notable\nchallenge is mitigating the interference between parameters of different\nmodels, which can substantially deteriorate performance. In this paper, we\npropose to merge most of the parameters while upscaling the MLP of the\nTransformer layers to a weight-ensembling mixture of experts (MoE) module,\nwhich can dynamically integrate shared and task-specific knowledge based on the\ninput, thereby providing a more flexible solution that can adapt to the\nspecific needs of each instance. Our key insight is that by identifying and\nseparating shared knowledge and task-specific knowledge, and then dynamically\nintegrating them, we can mitigate the parameter interference problem to a great\nextent. We conduct the conventional multi-task model merging experiments and\nevaluate the generalization and robustness of our method. The results\ndemonstrate the effectiveness of our method and provide a comprehensive\nunderstanding of its behavior. The code is available at\nhttps://anonymous.4open.science/r/weight-ensembling_MoE-67C9/\n","authors":["Anke Tang","Li Shen","Yong Luo","Nan Yin","Lefei Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.00433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00421v1","updated":"2024-02-01T08:37:13Z","published":"2024-02-01T08:37:13Z","title":"From PARIS to LE-PARIS: Toward Patent Response Automation with\n Recommender Systems and Collaborative Large Language Models","summary":" In patent prosecution, timely and effective responses to Office Actions (OAs)\nare crucial for acquiring patents, yet past automation and AI research have\nscarcely addressed this aspect. To address this gap, our study introduces the\nPatent Office Action Response Intelligence System (PARIS) and its advanced\nversion, the Large Language Model Enhanced PARIS (LE-PARIS). These systems are\ndesigned to improve the efficiency of patent attorneys in collaboratively\nhandling OA responses. The systems' key features include the construction of an\nOA Topics Database, development of Response Templates, and implementation of\nRecommender Systems and LLM-based Response Generation.
Our validation involves\na multi-paradigmatic analysis using the USPTO Office Action database and\nlongitudinal data of attorney interactions with our systems over six years.\nThrough five studies, we examine the constructiveness of OA topics (studies 1\nand 2) using topic modeling and the proposed Delphi process, the efficacy of\nour proposed hybrid recommender system tailored for OA (both LLM-based and\nnon-LLM-based) (study 3), the quality of response generation (study 4), and the\npractical value of the systems in real-world scenarios via user studies (study\n5). Results demonstrate that both PARIS and LE-PARIS significantly meet key\nmetrics and positively impact attorney performance.\n","authors":["Jung-Mei Chu","Hao-Cheng Lo","Jieh Hsiang","Chun-Chieh Cho"],"pdf_url":"https://arxiv.org/pdf/2402.00421v1.pdf","comment":"14 pages, 4 figures, submitted to a journal"},{"id":"http://arxiv.org/abs/2211.04125v4","updated":"2024-02-01T08:37:00Z","published":"2022-11-08T09:45:39Z","title":"Efficacy of MRI data harmonization in the age of machine learning. A\n multicenter study across 36 datasets","summary":" Pooling publicly-available MRI data from multiple sites makes it possible to assemble\nextensive groups of subjects, increase statistical power, and promote data\nreuse with machine learning techniques. The harmonization of multicenter data\nis necessary to reduce the confounding effect associated with non-biological\nsources of variability in the data. However, when applied to the entire dataset\nbefore machine learning, the harmonization leads to data leakage, because\ninformation outside the training set may affect model building and potentially\nlead to falsely overestimated performance. We propose 1) a measurement of the efficacy\nof data harmonization; and 2) a harmonizer transformer, i.e., an implementation of\nthe ComBat harmonization allowing its encapsulation among the preprocessing\nsteps of a machine learning pipeline, avoiding data leakage. We tested these\ntools using brain T1-weighted MRI data from 1740 healthy subjects acquired at\n36 sites. After harmonization, the site effect was removed or reduced, and we\nshowed the data leakage effect in predicting individual age from MRI data,\nhighlighting that introducing the harmonizer transformer into a machine\nlearning pipeline allows for avoiding data leakage.\n","authors":["Chiara Marzi","Marco Giannelli","Andrea Barucci","Carlo Tessa","Mario Mascalchi","Stefano Diciotti"],"pdf_url":"https://arxiv.org/pdf/2211.04125v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00418v1","updated":"2024-02-01T08:36:16Z","published":"2024-02-01T08:36:16Z","title":"Short: Benchmarking transferable adversarial attacks","summary":" The robustness of deep learning models against adversarial attacks remains a\npivotal concern. This study presents, for the first time, an exhaustive review\nof the transferability aspect of adversarial attacks. It systematically\ncategorizes and critically evaluates various methodologies developed to augment\nthe transferability of adversarial attacks. This study encompasses a spectrum\nof techniques, including Generative Structure, Semantic Similarity, Gradient\nEditing, Target Modification, and Ensemble Approach. Concurrently, this paper\nintroduces a benchmark framework \textit{TAA-Bench}, integrating ten leading\nmethodologies for adversarial attack transferability, thereby providing a\nstandardized and systematic platform for comparative analysis across diverse\nmodel architectures.
Through comprehensive scrutiny, we delineate the efficacy\nand constraints of each method, shedding light on their underlying operational\nprinciples and practical utility. This review endeavors to be a quintessential\nresource for both scholars and practitioners in the field, charting the complex\nterrain of adversarial transferability and setting a foundation for future\nexplorations in this vital sector. The associated codebase is accessible at:\nhttps://github.com/KxPlaug/TAA-Bench\n","authors":["Zhibo Jin","Jiayu Zhang","Zhiyu Zhu","Huaming Chen"],"pdf_url":"https://arxiv.org/pdf/2402.00418v1.pdf","comment":"Accepted by NDSS 2024 Workshop"},{"id":"http://arxiv.org/abs/2211.01345v2","updated":"2024-02-01T08:29:55Z","published":"2022-09-26T09:02:30Z","title":"Generative machine learning methods for multivariate ensemble\n post-processing","summary":" Ensemble weather forecasts based on multiple runs of numerical weather\nprediction models typically show systematic errors and require post-processing\nto obtain reliable forecasts. Accurately modeling multivariate dependencies is\ncrucial in many practical applications, and various approaches to multivariate\npost-processing have been proposed where ensemble predictions are first\npost-processed separately in each margin and multivariate dependencies are then\nrestored via copulas. These two-step methods share common key limitations, in\nparticular the difficulty to include additional predictors in modeling the\ndependencies. We propose a novel multivariate post-processing method based on\ngenerative machine learning to address these challenges. In this new class of\nnonparametric data-driven distributional regression models, samples from the\nmultivariate forecast distribution are directly obtained as output of a\ngenerative neural network. The generative model is trained by optimizing a\nproper scoring rule which measures the discrepancy between the generated and\nobserved data, conditional on exogenous input variables. Our method does not\nrequire parametric assumptions on univariate distributions or multivariate\ndependencies and allows for incorporating arbitrary predictors. In two case\nstudies on multivariate temperature and wind speed forecasting at weather\nstations over Germany, our generative model shows significant improvements over\nstate-of-the-art methods and particularly improves the representation of\nspatial dependencies.\n","authors":["Jieyu Chen","Tim Janke","Florian Steinke","Sebastian Lerch"],"pdf_url":"https://arxiv.org/pdf/2211.01345v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13903v4","updated":"2024-02-01T08:11:55Z","published":"2023-07-26T02:02:19Z","title":"Corruption-Robust Lipschitz Contextual Search","summary":" I study the problem of learning a Lipschitz function with corrupted binary\nsignals. The learner tries to learn an $L$-Lipschitz function $f: [0,1]^d\n\rightarrow [0, L]$ that the adversary chooses. There is a total of $T$ rounds.\nIn each round $t$, the adversary selects a context vector $x_t$ in the input\nspace, and the learner makes a guess of the true function value $f(x_t)$ and\nreceives a binary signal indicating whether the guess is high or low. In a\ntotal of $C$ rounds, the signal may be corrupted, though the value of $C$ is\n\emph{unknown} to the learner. The learner's goal is to incur a small\ncumulative loss. This work introduces the new algorithmic technique\n\emph{agnostic checking} as well as new analysis techniques.
I design\nalgorithms with the following guarantees: for the symmetric loss, the learner achieves regret $L\cdot\nO(C\log T)$ when $d = 1$ and $L\cdot O_d(C\log T + T^{(d-1)/d})$ when $d > 1$;\nfor the pricing loss, the learner achieves regret $L\cdot \widetilde{O}(T^{d/(d+1)} + C\cdot T^{1/(d+1)})$.\n","authors":["Shiliang Zuo"],"pdf_url":"https://arxiv.org/pdf/2307.13903v4.pdf","comment":"Accepted at ALT 2024"},{"id":"http://arxiv.org/abs/2308.05061v4","updated":"2024-02-01T07:49:50Z","published":"2023-08-09T16:44:25Z","title":"Fine-Tune Language Models as Multi-Modal Differential Equation Solvers","summary":" In the growing domain of scientific machine learning, in-context operator\nlearning has shown notable potential in building foundation models, as in this\nframework the model is trained to learn operators and solve differential\nequations using prompted data, during the inference stage without weight\nupdates. However, the current model's overdependence on function data overlooks\nthe invaluable human insight into the operator. To address this, we present a\ntransformation of in-context operator learning into a multi-modal paradigm. In\nparticular, we take inspiration from the recent success of large language\nmodels, and propose using \"captions\" to integrate human knowledge about the\noperator, expressed through natural language descriptions and equations. Also,\nwe introduce a novel approach to train a language-model-like architecture, or\ndirectly fine-tune existing language models, for in-context operator learning.\nWe beat the baseline on single-modal learning tasks, and also demonstrate the\neffectiveness of multi-modal learning in enhancing performance and reducing\nfunction data requirements. The proposed method not only significantly advances\nthe development of the in-context operator learning paradigm, but also creates\na new path for the application of language models.\n","authors":["Liu Yang","Siting Liu","Stanley J. Osher"],"pdf_url":"https://arxiv.org/pdf/2308.05061v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.05149v2","updated":"2024-02-01T07:38:07Z","published":"2022-01-13T18:57:30Z","title":"The curse of overparametrization in adversarial training: Precise\n analysis of robust generalization for random features regression","summary":" Successful deep learning models often involve training neural network\narchitectures that contain more parameters than the number of training samples.\nSuch overparametrized models have been extensively studied in recent years, and\nthe virtues of overparametrization have been established from both the\nstatistical perspective, via the double-descent phenomenon, and the\ncomputational perspective via the structural properties of the optimization\nlandscape.\n Despite the remarkable success of deep learning architectures in the\noverparametrized regime, it is also well known that these models are highly\nvulnerable to small adversarial perturbations in their inputs. Even when\nadversarially trained, their performance on perturbed inputs (robust\ngeneralization) is considerably worse than their best attainable performance on\nbenign inputs (standard generalization). It is thus imperative to understand\nhow overparametrization fundamentally affects robustness.\n In this paper, we will provide a precise characterization of the role of\noverparametrization on robustness by focusing on random features regression\nmodels (two-layer neural networks with random first layer weights).
We consider\na regime where the sample size, the input dimension and the number of\nparameters grow in proportion to each other, and derive an asymptotically exact\nformula for the robust generalization error when the model is adversarially\ntrained. Our developed theory reveals the nontrivial effect of\noverparametrization on robustness and indicates that for adversarially trained\nrandom features models, high overparametrization can hurt robust\ngeneralization.\n","authors":["Hamed Hassani","Adel Javanmard"],"pdf_url":"https://arxiv.org/pdf/2201.05149v2.pdf","comment":"86 pages (main file: 25 pages and supplementary: 61 pages). To appear\n in the Annals of Statistics"},{"id":"http://arxiv.org/abs/2401.15122v2","updated":"2024-02-01T07:34:53Z","published":"2024-01-26T09:35:17Z","title":"A Multi-Grained Symmetric Differential Equation Model for Learning\n Protein-Ligand Binding Dynamics","summary":" In drug discovery, molecular dynamics (MD) simulation for protein-ligand\nbinding provides a powerful tool for predicting binding affinities, estimating\ntransport properties, and exploring pocket sites. There has been a long history\nof improving the efficiency of MD simulations through better numerical methods\nand, more recently, by utilizing machine learning (ML) methods. Yet, challenges\nremain, such as accurate modeling of extended-timescale simulations. To address\nthis issue, we propose NeuralMD, the first ML surrogate that can facilitate\nnumerical MD and provide accurate simulations in protein-ligand binding. We\npropose a principled approach that incorporates a novel physics-informed\nmulti-grained group symmetric framework. Specifically, we propose (1) a\nBindingNet model that satisfies group symmetry using vector frames and captures\nthe multi-level protein-ligand interactions, and (2) an augmented neural\ndifferential equation solver that learns the trajectory under Newtonian\nmechanics. For the experiment, we design ten single-trajectory and three\nmulti-trajectory binding simulation tasks. We show the efficiency and\neffectiveness of NeuralMD, with a 2000$\\times$ speedup over standard numerical\nMD simulation and outperforming all other ML approaches by up to 80% under the\nstability metric. We further qualitatively show that NeuralMD reaches more\nstable binding predictions compared to other machine learning methods.\n","authors":["Shengchao Liu","Weitao Du","Yanjing Li","Zhuoxinran Li","Vignesh Bhethanabotla","Nakul Rampal","Omar Yaghi","Christian Borgs","Anima Anandkumar","Hongyu Guo","Jennifer Chayes"],"pdf_url":"https://arxiv.org/pdf/2401.15122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00397v1","updated":"2024-02-01T07:33:31Z","published":"2024-02-01T07:33:31Z","title":"Multi-scale Traffic Pattern Bank for Cross-city Few-shot Traffic\n Forecasting","summary":" Traffic forecasting is crucial for intelligent transportation systems (ITS),\naiding in efficient resource allocation and effective traffic control. However,\nits effectiveness often relies heavily on abundant traffic data, while many\ncities lack sufficient data due to limited device support, posing a significant\nchallenge for traffic forecasting. Recognizing this challenge, we have made a\nnoteworthy observation: traffic patterns exhibit similarities across diverse\ncities. Building on this key insight, we propose a solution for the cross-city\nfew-shot traffic forecasting problem called Multi-scale Traffic Pattern Bank\n(MTPB). 
First, MTPB initiates its learning process by leveraging data-rich\nsource cities, effectively acquiring comprehensive traffic knowledge through a\nspatial-temporal-aware pre-training process. Subsequently, the framework\nemploys advanced clustering techniques to systematically generate a multi-scale\ntraffic pattern bank derived from the learned knowledge. Next, the traffic data\nof the data-scarce target city can query the traffic pattern bank,\nfacilitating the aggregation of meta-knowledge. This meta-knowledge, in turn,\nassumes a pivotal role as a robust guide in subsequent processes involving\ngraph reconstruction and forecasting. Empirical assessments conducted on\nreal-world traffic datasets affirm the superior performance of MTPB, surpassing\nexisting methods across various categories and exhibiting numerous attributes\nconducive to the advancement of cross-city few-shot forecasting methodologies.\nThe code is available at https://github.com/zhyliu00/MTPB.\n","authors":["Zhanyu Liu","Guanjie Zheng","Yanwei Yu"],"pdf_url":"https://arxiv.org/pdf/2402.00397v1.pdf","comment":"Under review. Text overlap with arXiv:2308.09727"},{"id":"http://arxiv.org/abs/2402.00396v1","updated":"2024-02-01T07:32:24Z","published":"2024-02-01T07:32:24Z","title":"Efficient Exploration for LLMs","summary":" We present evidence of substantial benefit from efficient exploration in\ngathering human feedback to improve large language models. In our experiments,\nan agent sequentially generates queries while fitting a reward model to the\nfeedback received. Our best-performing agent generates queries using double\nThompson sampling, with uncertainty represented by an epistemic neural network.\nOur results demonstrate that efficient exploration enables high levels of\nperformance with far fewer queries. Further, both uncertainty estimation and\nthe choice of exploration scheme play critical roles.\n","authors":["Vikranth Dwaracherla","Seyed Mohammad Asghari","Botao Hao","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2402.00396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00393v1","updated":"2024-02-01T07:28:55Z","published":"2024-02-01T07:28:55Z","title":"Loss Function Considering Dead Zone for Neural Networks","summary":" It is important to identify the inverse dynamics of manipulators to improve the\ncontrol performance of model-based control. Neural networks (NNs) are promising\ntechniques to represent complicated inverse dynamics, although they require a large\namount of motion data. However, motion data in dead zones of actuators is not\nsuitable for training models, which decreases the amount of useful training data. In\nthis study, based on the fact that a manipulator joint does not move in dead zones\nirrespective of the input torque, we propose a new loss function that\nconsiders only errors of joints not in dead zones. The proposed method increases\nthe amount of motion data available for training and the\naccuracy of the inverse dynamics computation. Experiments on actual equipment\nusing a three-degree-of-freedom (DOF) manipulator showed higher accuracy than\nconventional methods.
We also confirmed and discussed the behavior of the model\nof the proposed method in dead zones.\n","authors":["Koki Inami","Koki Yamane","Sho Sakaino"],"pdf_url":"https://arxiv.org/pdf/2402.00393v1.pdf","comment":"6 pages, 6 figures, Accepted at AMC2024"},{"id":"http://arxiv.org/abs/2402.00388v1","updated":"2024-02-01T07:21:30Z","published":"2024-02-01T07:21:30Z","title":"Cumulative Distribution Function based General Temporal Point Processes","summary":" Temporal Point Processes (TPPs) hold a pivotal role in modeling event\nsequences across diverse domains, including social networking and e-commerce,\nand have significantly contributed to the advancement of recommendation systems\nand information retrieval strategies. Through the analysis of events such as\nuser interactions and transactions, TPPs offer valuable insights into\nbehavioral patterns, facilitating the prediction of future trends. However,\naccurately forecasting future events remains a formidable challenge due to the\nintricate nature of these patterns. The integration of Neural Networks with\nTPPs has ushered in the development of advanced deep TPP models. While these\nmodels excel at processing complex and nonlinear temporal data, they encounter\nlimitations in modeling intensity functions, grapple with computational\ncomplexities in integral computations, and struggle to capture long-range\ntemporal dependencies effectively. In this study, we introduce the CuFun model,\nrepresenting a novel approach to TPPs that revolves around the Cumulative\nDistribution Function (CDF). CuFun stands out by uniquely employing a monotonic\nneural network for CDF representation, utilizing past events as a scaling\nfactor. This innovation significantly bolsters the model's adaptability and\nprecision across a wide range of data scenarios. Our approach addresses several\ncritical issues inherent in traditional TPP modeling: it simplifies\nlog-likelihood calculations, extends applicability beyond predefined density\nfunction forms, and adeptly captures long-range temporal patterns. Our\ncontributions encompass the introduction of a pioneering CDF-based TPP model,\nthe development of a methodology for incorporating past event information into\nfuture event prediction, and empirical validation of CuFun's effectiveness\nthrough extensive experimentation on synthetic and real-world datasets.\n","authors":["Maolin Wang","Yu Pan","Zenglin Xu","Ruocheng Guo","Xiangyu Zhao","Wanyu Wang","Yiqi Wang","Zitao Liu","Langming Liu"],"pdf_url":"https://arxiv.org/pdf/2402.00388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04915v2","updated":"2024-02-01T07:03:32Z","published":"2023-10-07T21:00:14Z","title":"On Accelerating Diffusion-based Molecular Conformation Generation in\n SE(3)-invariant Space","summary":" Diffusion-based generative models in SE(3)-invariant space have demonstrated\npromising performance in molecular conformation generation, but typically\nrequire solving stochastic differential equations (SDEs) with thousands of\nupdate steps. Until now, it has remained unclear how to effectively accelerate this\nprocedure explicitly in SE(3)-invariant space, which greatly hinders its wide\napplication in the real world. In this paper, we systematically study the\ndiffusion mechanism in SE(3)-invariant space through the lens of approximation errors\ninduced by existing methods. We thereby develop more precise approximations in\nSE(3) in the context of projected differential equations.
Theoretical analysis\nis further provided, as well as empirical evidence relating hyper-parameters to\nsuch errors. Altogether, we propose a novel acceleration scheme for generating\nmolecular conformations in SE(3)-invariant space. Experimentally, our scheme\ncan generate high-quality conformations with 50x--100x speedup compared to\nexisting methods.\n","authors":["Zihan Zhou","Ruiying Liu","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2310.04915v2.pdf","comment":"We are currently developing a new manuscript that significantly\n expands upon and integrates the research presented here. The forthcoming\n paper includes broader analyses and more comprehensive findings, rendering\n the current version obsolete. We believe this decision will contribute to a\n clearer and more consolidated presentation of our research findings"},{"id":"http://arxiv.org/abs/2402.00376v1","updated":"2024-02-01T06:47:56Z","published":"2024-02-01T06:47:56Z","title":"Image2Points: A 3D Point-based Context Clusters GAN for High-Quality PET\n Image Reconstruction","summary":" To obtain high-quality positron emission tomography (PET) images while\nminimizing radiation exposure, numerous methods have been proposed to\nreconstruct standard-dose PET (SPET) images from the corresponding low-dose PET\n(LPET) images. However, these methods heavily rely on voxel-based\nrepresentations, which fall short of adequately accounting for the precise\nstructure and fine-grained context, leading to compromised reconstruction. In\nthis paper, we propose a 3D point-based context clusters GAN, namely PCC-GAN,\nto reconstruct high-quality SPET images from LPET. Specifically, inspired by\nthe geometric representation power of points, we resort to a point-based\nrepresentation to enhance the explicit expression of the image structure, thus\nfacilitating the reconstruction with finer details. Moreover, a context\nclustering strategy is applied to explore the contextual relationships among\npoints, which mitigates the ambiguities of small structures in the\nreconstructed images. Experiments on both clinical and phantom datasets\ndemonstrate that our PCC-GAN outperforms the state-of-the-art reconstruction\nmethods qualitatively and quantitatively. Code is available at\nhttps://github.com/gluucose/PCCGAN.\n","authors":["Jiaqi Cui","Yan Wang","Lu Wen","Pinxian Zeng","Xi Wu","Jiliu Zhou","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2402.00376v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.00533v4","updated":"2024-02-01T06:10:00Z","published":"2023-10-01T00:52:24Z","title":"SELF: Self-Evolution with Language Feedback","summary":" Large Language Models (LLMs) have demonstrated remarkable versatility across\nvarious domains. To further advance LLMs, we propose 'SELF' (Self-Evolution\nwith Language Feedback), a novel approach that enables LLMs to self-improve\nthrough self-reflection, akin to human learning processes. SELF initiates with\na meta-skill learning process that equips the LLMs with capabilities for\nself-feedback and self-refinement. Subsequently, the model undergoes an\niterative process of self-evolution. In each iteration, it utilizes an\nunlabeled dataset of instructions to generate initial responses. These\nresponses are enhanced through self-feedback and self-refinement. The model is\nthen fine-tuned using this enhanced data. The model undergoes progressive\nimprovement through this iterative self-evolution process.
Moreover, the SELF\nframework enables the model to apply self-refinement during inference, which\nfurther improves response quality. Our experiments in mathematics and general\ntasks demonstrate that SELF can enhance the capabilities of LLMs without human\nintervention. The SELF framework indicates a promising direction for the\nautonomous evolution of LLMs, transitioning them from passive information\nreceivers to active participants in their development.\n","authors":["Jianqiao Lu","Wanjun Zhong","Wenyong Huang","Yufei Wang","Qi Zhu","Fei Mi","Baojun Wang","Weichao Wang","Xingshan Zeng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00533v4.pdf","comment":"20 pages, 4 figures, 11 tables"},{"id":"http://arxiv.org/abs/2402.00355v1","updated":"2024-02-01T05:53:44Z","published":"2024-02-01T05:53:44Z","title":"Adaptive Primal-Dual Method for Safe Reinforcement Learning","summary":" Primal-dual methods have a natural application in Safe Reinforcement Learning\n(SRL), posed as a constrained policy optimization problem. In practice, however,\napplying primal-dual methods to SRL is challenging, due to the inter-dependency\nof the learning rate (LR) and Lagrangian multipliers (dual variables) each time\nan embedded unconstrained RL problem is solved. In this paper, we propose,\nanalyze and evaluate adaptive primal-dual (APD) methods for SRL, where two\nadaptive LRs are adjusted to the Lagrangian multipliers so as to optimize the\npolicy in each iteration. We theoretically establish the convergence,\noptimality and feasibility of the APD algorithm. Finally, we conduct numerical\nevaluation of the practical APD algorithm with four well-known environments in\nBullet-Safety-Gym employing two state-of-the-art SRL algorithms: PPO-Lagrangian\nand DDPG-Lagrangian. All experiments show that the practical APD algorithm\noutperforms the constant-LR baselines (or achieves comparable performance) and attains more stable\ntraining. Additionally, we substantiate the\nrobustness of selecting the two adaptive LRs by empirical evidence.\n","authors":["Weiqin Chen","James Onyejizu","Long Vu","Lan Hoang","Dharmashankar Subramanian","Koushik Kar","Sandipan Mishra","Santiago Paternain"],"pdf_url":"https://arxiv.org/pdf/2402.00355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00351v1","updated":"2024-02-01T05:35:25Z","published":"2024-02-01T05:35:25Z","title":"Machine Unlearning for Image-to-Image Generative Models","summary":" Machine unlearning has emerged as a new paradigm to deliberately forget data\nsamples from a given model in order to adhere to stringent regulations.\nHowever, existing machine unlearning methods have been primarily focused on\nclassification models, leaving the landscape of unlearning for generative\nmodels relatively unexplored. This paper serves as a bridge, addressing the gap\nby providing a unifying framework of machine unlearning for image-to-image\ngenerative models. Within this framework, we propose a\ncomputationally-efficient algorithm, underpinned by rigorous theoretical\nanalysis, that demonstrates negligible performance degradation on the retain\nsamples, while effectively removing the information from the forget samples.\nEmpirical studies on two large-scale datasets, ImageNet-1K and Places-365,\nfurther show that our algorithm does not rely on the availability of the retain\nsamples, which further complies with data retention policies.
To the best of our\nknowledge, this work is the first systematic, theoretical, and\nempirical exploration of machine unlearning specifically tailored for\nimage-to-image generative models. Our code is available at\nhttps://github.com/jpmorganchase/l2l-generator-unlearning.\n","authors":["Guihong Li","Hsiang Hsu","Chun-Fu Chen","Radu Marculescu"],"pdf_url":"https://arxiv.org/pdf/2402.00351v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2402.00348v1","updated":"2024-02-01T05:30:51Z","published":"2024-02-01T05:30:51Z","title":"ODICE: Revealing the Mystery of Distribution Correction Estimation via\n Orthogonal-gradient Update","summary":" In this study, we investigate the DIstribution Correction Estimation (DICE)\nmethods, an important line of work in offline reinforcement learning (RL) and\nimitation learning (IL). DICE-based methods impose a state-action-level behavior\nconstraint, which is an ideal choice for offline learning. However, they\ntypically perform much worse than current state-of-the-art (SOTA) methods that\nsolely use an action-level behavior constraint. After revisiting DICE-based\nmethods, we find there exist two gradient terms when learning the value\nfunction using a true-gradient update: the forward gradient (taken on the current\nstate) and the backward gradient (taken on the next state). Using the forward gradient\nbears a large similarity to many offline RL methods, and thus can be regarded\nas applying an action-level constraint. However, directly adding the backward\ngradient may degenerate or cancel out its effect if these two gradients have\nconflicting directions. To resolve this issue, we propose a simple yet\neffective modification that projects the backward gradient onto the normal\nplane of the forward gradient, resulting in an orthogonal-gradient update, a\nnew learning rule for DICE-based methods. We conduct thorough theoretical\nanalyses and find that the projected backward gradient brings state-level\nbehavior regularization, which reveals the mystery of DICE-based methods: the\nvalue learning objective does try to impose a state-action-level constraint, but\nneeds to be used in a corrected way. Through toy examples and extensive\nexperiments on complex offline RL and IL tasks, we demonstrate that DICE-based\nmethods using orthogonal-gradient updates (O-DICE) achieve SOTA performance and\ngreat robustness.\n","authors":["Liyuan Mao","Haoran Xu","Weinan Zhang","Xianyuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2402.00348v1.pdf","comment":"Spotlight @ ICLR 2024, first two authors contribute equally"},{"id":"http://arxiv.org/abs/2402.00347v1","updated":"2024-02-01T05:28:28Z","published":"2024-02-01T05:28:28Z","title":"Diverse Explanations from Data-driven and Domain-driven Perspectives for\n Machine Learning Models","summary":" Explanations of machine learning models are important, especially in\nscientific areas such as chemistry, biology, and physics, where they guide\nfuture laboratory experiments and resource requirements. These explanations can\nbe derived from well-trained machine learning models (data-driven perspective)\nor specific domain knowledge (domain-driven perspective). However, there exist\ninconsistencies between these perspectives due to accurate yet misleading\nmachine learning models and various stakeholders with specific needs, wants, or\naims.
This paper calls attention to these inconsistencies and suggests a way to\nfind an accurate model with expected explanations that reinforce physical laws\nand meet stakeholders' requirements from a set of equally-good models, also\nknown as Rashomon sets. Our goal is to foster a comprehensive understanding of\nthese inconsistencies and ultimately contribute to the integration of\neXplainable Artificial Intelligence (XAI) into scientific domains.\n","authors":["Sichao Li","Amanda Barnard"],"pdf_url":"https://arxiv.org/pdf/2402.00347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00342v1","updated":"2024-02-01T05:13:14Z","published":"2024-02-01T05:13:14Z","title":"Survey of Privacy Threats and Countermeasures in Federated Learning","summary":" Federated learning is widely considered to be a privacy-aware learning\nmethod because no training data is exchanged directly between clients.\nNevertheless, there are threats to privacy in federated learning, and privacy\ncountermeasures have been studied. However, we note that common and unique\nprivacy threats among typical types of federated learning have not been\ncategorized and described in a comprehensive and specific way. In this paper,\nwe describe privacy threats and countermeasures for the typical types of\nfederated learning: horizontal federated learning, vertical federated learning,\nand transfer federated learning.\n","authors":["Masahiro Hayashitani","Junki Mori","Isamu Teranishi"],"pdf_url":"https://arxiv.org/pdf/2402.00342v1.pdf","comment":"Scheduled for renewal by March 2024"},{"id":"http://arxiv.org/abs/2206.14397v3","updated":"2024-02-01T05:03:56Z","published":"2022-06-29T04:32:10Z","title":"Fair Machine Learning in Healthcare: A Review","summary":" The digitization of healthcare data coupled with advances in computational\ncapabilities has propelled the adoption of machine learning (ML) in healthcare.\nHowever, these methods can perpetuate or even exacerbate existing disparities,\nleading to fairness concerns such as the unequal distribution of resources and\ndiagnostic inaccuracies among different demographic groups. Addressing these\nfairness problems is paramount to prevent further entrenchment of social\ninjustices. In this survey, we analyze the intersection of fairness in machine\nlearning and healthcare disparities. We adopt a framework based on the\nprinciples of distributive justice to categorize fairness concerns into two\ndistinct classes: equal allocation and equal performance. We provide a critical\nreview of the associated fairness metrics from a machine learning standpoint\nand examine biases and mitigation strategies across the stages of the ML\nlifecycle, discussing the relationship between biases and their\ncountermeasures. 
The paper concludes with a discussion on the pressing\nchallenges that remain unaddressed in ensuring fairness in healthcare ML, and\nproposes several new research directions that hold promise for developing\nethical and equitable ML applications in healthcare.\n","authors":["Qizhang Feng","Mengnan Du","Na Zou","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2206.14397v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00206v3","updated":"2024-02-01T04:56:50Z","published":"2023-08-01T00:05:02Z","title":"Synthetic Skull CT Generation with Generative Adversarial Networks to\n Train Deep Learning Models for Clinical Transcranial Ultrasound","summary":" Deep learning offers potential for various healthcare applications, yet\nrequires extensive datasets of curated medical images where data privacy, cost,\nand distribution mismatch across various acquisition centers could become major\nproblems. To overcome these challenges, we propose a generative adversarial\nnetwork (SkullGAN) to create large datasets of synthetic skull CT slices,\ngeared towards training models for transcranial ultrasound. With wide ranging\napplications in treatment of essential tremor, Parkinson's, and Alzheimer's\ndisease, transcranial ultrasound clinical pipelines can be significantly\noptimized via integration of deep learning. The main roadblock is the lack of\nsufficient skull CT slices for the purposes of training, which SkullGAN aims to\naddress. Actual CT slices of 38 healthy subjects were used for training. The\ngenerated synthetic skull images were then evaluated based on skull density\nratio, mean thickness, and mean intensity. Their fidelity was further analyzed\nusing t-distributed stochastic neighbor embedding (t-SNE), Fr\\'echet inception\ndistance (FID) score, and visual Turing test (VTT) taken by four staff clinical\nradiologists. SkullGAN-generated images demonstrated similar quantitative\nradiological features to real skulls. t-SNE failed to separate real and\nsynthetic samples from one another, and the FID score was 49. Expert\nradiologists achieved a 60\\% mean accuracy on the VTT. SkullGAN makes it\npossible for researchers to generate large numbers of synthetic skull CT\nsegments, necessary for training neural networks for medical applications\ninvolving the human skull, such as transcranial focused ultrasound, mitigating\nchallenges with access, privacy, capital, time, and the need for domain\nexpertise.\n","authors":["Kasra Naftchi-Ardebili","Karanpartap Singh","Reza Pourabolghasem","Pejman Ghanouni","Gerald R. Popelka","Kim Butts Pauly"],"pdf_url":"https://arxiv.org/pdf/2308.00206v3.pdf","comment":"The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2402.00332v1","updated":"2024-02-01T04:35:37Z","published":"2024-02-01T04:35:37Z","title":"Comparing Spectral Bias and Robustness For Two-Layer Neural Networks:\n SGD vs Adaptive Random Fourier Features","summary":" We present experimental results highlighting two key differences resulting\nfrom the choice of training algorithm for two-layer neural networks. The\nspectral bias of neural networks is well known, while the spectral bias\ndependence on the choice of training algorithm is less studied. Our experiments\ndemonstrate that an adaptive random Fourier features algorithm (ARFF) can yield\na spectral bias closer to zero compared to the stochastic gradient descent\noptimizer (SGD). 
Additionally, we train two identically structured classifiers,\nemploying SGD and ARFF, to the same accuracy levels and empirically assess\ntheir robustness against adversarial noise attacks.\n","authors":["Aku Kammonen","Lisi Liang","Anamika Pandey","Raúl Tempone"],"pdf_url":"https://arxiv.org/pdf/2402.00332v1.pdf","comment":"6 Pages, 4 Figures; Accepted in the International Conference on\n Scientific Computing and Machine Learning"},{"id":"http://arxiv.org/abs/2402.00326v1","updated":"2024-02-01T04:17:56Z","published":"2024-02-01T04:17:56Z","title":"PirateNets: Physics-informed Deep Learning with Residual Adaptive\n Networks","summary":" While physics-informed neural networks (PINNs) have become a popular deep\nlearning framework for tackling forward and inverse problems governed by\npartial differential equations (PDEs), their performance is known to degrade\nwhen larger and deeper neural network architectures are employed. Our study\nidentifies that the root of this counter-intuitive behavior lies in the use of\nmulti-layer perceptron (MLP) architectures with unsuitable initialization\nschemes, which result in poor trainability for the network derivatives, and\nultimately lead to an unstable minimization of the PDE residual loss. To\naddress this, we introduce Physics-informed Residual Adaptive Networks\n(PirateNets), a novel architecture that is designed to facilitate stable and\nefficient training of deep PINN models. PirateNets leverage a novel adaptive\nresidual connection, which allows the networks to be initialized as shallow\nnetworks that progressively deepen during training. We also show that the\nproposed initialization scheme allows us to encode appropriate inductive biases\ncorresponding to a given PDE system into the network architecture. We provide\ncomprehensive empirical evidence showing that PirateNets are easier to optimize\nand can gain accuracy from considerably increased depth, ultimately achieving\nstate-of-the-art results across various benchmarks. All code and data\naccompanying this manuscript will be made publicly available at\n\\url{https://github.com/PredictiveIntelligenceLab/jaxpi}.\n","authors":["Sifan Wang","Bowen Li","Yuhan Chen","Paris Perdikaris"],"pdf_url":"https://arxiv.org/pdf/2402.00326v1.pdf","comment":"29 Pages, 15 Figures, 8 Tables"},{"id":"http://arxiv.org/abs/2402.00324v1","updated":"2024-02-01T04:17:15Z","published":"2024-02-01T04:17:15Z","title":"A Consistent Lebesgue Measure for Multi-label Learning","summary":" Multi-label loss functions are usually non-differentiable, requiring\nsurrogate loss functions for gradient-based optimisation. The consistency of\nsurrogate loss functions is not proven, an issue exacerbated by the conflicting\nnature of multi-label loss functions. To directly learn from multiple related,\nyet potentially conflicting multi-label loss functions, we propose a Consistent\nLebesgue Measure-based Multi-label Learner (CLML) and prove that CLML can\nachieve theoretical consistency under a Bayes risk framework. 
Empirical\nevidence supports our theory by demonstrating that: (1) CLML can consistently\nachieve state-of-the-art results; (2) the primary performance factor is the\nLebesgue measure design, as CLML optimises a simpler feedforward model without\nadditional label graph, perturbation-based conditioning, or semantic\nembeddings; and (3) an analysis of the results not only distinguishes CLML's\neffectiveness but also highlights inconsistencies between the surrogate and the\ndesired loss functions.\n","authors":["Kaan Demir","Bach Nguyen","Bing Xue","Mengjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00318v1","updated":"2024-02-01T04:05:24Z","published":"2024-02-01T04:05:24Z","title":"Analog-digital Scheduling for Federated Learning: A\n Communication-Efficient Approach","summary":" Over-the-air (OTA) computation has recently emerged as a\ncommunication-efficient Federated Learning (FL) paradigm to train machine\nlearning models over wireless networks. However, its performance is limited by\nthe device with the worst SNR, resulting in fast yet noisy updates. On the\nother hand, allocating orthogonal resource blocks (RB) to individual devices\nvia digital channels mitigates the noise problem, at the cost of increased\ncommunication latency. In this paper, we address this discrepancy and present\nADFL, a novel Analog-Digital FL scheme: in each round, the parameter server\n(PS) schedules each device to either upload its gradient via the analog OTA\nscheme or transmit its quantized gradient over an orthogonal RB using the\n``digital\" scheme. Focusing on a single FL round, we cast the optimal\nscheduling problem as the minimization of the mean squared error (MSE) on the\nestimated global gradient at the PS, subject to a delay constraint, yielding\nthe optimal device scheduling configuration and quantization bits for the\ndigital devices. Our simulation results show that ADFL, by scheduling most of\nthe devices in the OTA scheme while also occasionally employing the digital\nscheme for a few devices, consistently outperforms OTA-only and digital-only\nschemes, in both i.i.d. and non-i.i.d. settings.\n","authors":["Muhammad Faraz Ul Abrar","Nicolò Michelusi"],"pdf_url":"https://arxiv.org/pdf/2402.00318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00315v1","updated":"2024-02-01T03:56:48Z","published":"2024-02-01T03:56:48Z","title":"Online Distribution Learning with Local Private Constraints","summary":" We study the problem of online conditional distribution estimation with\n\\emph{unbounded} label sets under local differential privacy. Let $\\mathcal{F}$\nbe a distribution-valued function class with unbounded label set. We aim at\nestimating an \\emph{unknown} function $f\\in \\mathcal{F}$ in an online fashion\nso that at time $t$ when the context $\\boldsymbol{x}_t$ is provided we can\ngenerate an estimate of $f(\\boldsymbol{x}_t)$ under KL-divergence knowing only\na privatized version of the true labels sampled from $f(\\boldsymbol{x}_t)$.\nThe ultimate objective is to minimize the cumulative KL-risk of a finite\nhorizon $T$. We show that under $(\\epsilon,0)$-local differential privacy of\nthe privatized labels, the KL-risk grows as\n$\\tilde{\\Theta}(\\frac{1}{\\epsilon}\\sqrt{KT})$ up to poly-logarithmic factors\nwhere $K=|\\mathcal{F}|$. This is in stark contrast to the\n$\\tilde{\\Theta}(\\sqrt{T\\log K})$ bound demonstrated by Wu et al. (2023a) for\nbounded label sets. 
As a byproduct, our results recover a nearly tight upper\nbound for the hypothesis selection problem of Gopi et al. (2020), established\nonly for the batch setting.\n","authors":["Jin Sima","Changlong Wu","Olgica Milenkovic","Wojciech Szpankowski"],"pdf_url":"https://arxiv.org/pdf/2402.00315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00313v1","updated":"2024-02-01T03:53:56Z","published":"2024-02-01T03:53:56Z","title":"Control in Stochastic Environment with Delays: A Model-based\n Reinforcement Learning Approach","summary":" In this paper we introduce a new reinforcement learning method for\ncontrol problems in environments with delayed feedback. Specifically, our\nmethod employs stochastic planning, versus previous methods that used\ndeterministic planning. This allows us to embed risk preference in the policy\noptimization problem. We show that this formulation can recover the optimal\npolicy for problems with deterministic transitions. We contrast our policy with\ntwo prior methods from the literature. We apply the methodology to simple tasks to\nunderstand its features. Then, we compare the performance of the methods in\ncontrolling multiple Atari games.\n","authors":["Zhiyuan Yao","Ionut Florescu","Chihoon Lee"],"pdf_url":"https://arxiv.org/pdf/2402.00313v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2402.00310v1","updated":"2024-02-01T03:49:10Z","published":"2024-02-01T03:49:10Z","title":"Seismic Traveltime Tomography with Label-free Learning","summary":" Deep learning techniques have been used to build velocity models (VMs) for\nseismic traveltime tomography and have shown encouraging performance in recent\nyears. However, they need to generate labeled samples (i.e., pairs of input and\nlabel) to train the deep neural network (NN) with end-to-end learning, and the\nreal labels for field data inversion are usually missing or very expensive.\nSome traditional tomographic methods can be implemented quickly, but their\neffectiveness is often limited by prior assumptions. To avoid generating\nlabeled samples, we propose a novel method by integrating deep learning and\ndictionary learning to enhance the VMs with low resolution by using the\ntraditional tomography-least square method (LSQR). We first design a type of\nshallow and simple NN to reduce computational cost, and then propose a\ntwo-step strategy to enhance the VMs with low resolution: (1) Warming up. An\ninitial dictionary is trained from the estimation by LSQR through a dictionary\nlearning method; (2) Dictionary optimization. The initial dictionary obtained\nin the warming-up step will be optimized by the NN, and then it will be used to\nreconstruct high-resolution VMs with the reference slowness and the estimation\nby LSQR. Furthermore, we design a loss function to minimize traveltime misfit\nto ensure that NN training is label-free, and the optimized dictionary can be\nobtained after each epoch of NN training. We demonstrate the effectiveness of\nthe proposed method through numerical tests.\n","authors":["Feng Wang","Bo Yang","Renfang Wang","Hong Qiu"],"pdf_url":"https://arxiv.org/pdf/2402.00310v1.pdf","comment":"15 pages, 19 figures. 
Submitted to IEEE Transactions on Geoscience\n and Remote Sensing"},{"id":"http://arxiv.org/abs/2402.00306v1","updated":"2024-02-01T03:39:15Z","published":"2024-02-01T03:39:15Z","title":"An Accurate and Low-Parameter Machine Learning Architecture for Next\n Location Prediction","summary":" Next location prediction is a discipline that involves predicting a user's\nnext location. Its applications include resource allocation, quality of\nservice, energy efficiency, and traffic management. This paper proposes an\nenergy-efficient, small, and low-parameter machine learning (ML) architecture\nfor accurate next location prediction, deployable on modest base stations and\nedge devices. To accomplish this, we ran a hundred hyperparameter experiments on\nthe full human mobility patterns of an entire city, to determine an exact ML\narchitecture that reached a plateau of accuracy with the fewest model\nparameters. We successfully achieved a reduction in the number of model\nparameters within published ML architectures from 202 million down to 2\nmillion. This reduced the total size of the model parameters from 791 MB down\nto 8 MB. Additionally, this decreased the training time by a factor of four,\nthe amount of graphics processing unit (GPU) memory needed for training by a\nfactor of twenty, and the overall accuracy was increased from 80.16% to 82.54%.\nThis improvement allows modest base stations and edge devices, which do not\nhave a large amount of memory or storage, to deploy and utilize the proposed ML\narchitecture for next location prediction.\n","authors":["Calvin Jary","Nafiseh Kahani"],"pdf_url":"https://arxiv.org/pdf/2402.00306v1.pdf","comment":"7 page conference paper. Paper was accepted and presented in person\n at the 2023 IEEE Future Networks World Forum, in Baltimore, Maryland, USA"},{"id":"http://arxiv.org/abs/2305.18460v3","updated":"2024-02-01T03:36:36Z","published":"2023-05-29T06:51:16Z","title":"Minimum Width of Leaky-ReLU Neural Networks for Uniform Universal\n Approximation","summary":" The study of universal approximation properties (UAP) for neural networks\n(NN) has a long history. When the network width is unlimited, only a single\nhidden layer is sufficient for UAP. In contrast, when the depth is unlimited,\nthe width for UAP needs to be not less than the critical width\n$w^*_{\\min}=\\max(d_x,d_y)$, where $d_x$ and $d_y$ are the dimensions of the\ninput and output, respectively. Recently, \\cite{cai2022achieve} shows that a\nleaky-ReLU NN with this critical width can achieve UAP for $L^p$ functions on a\ncompact domain ${K}$, \\emph{i.e.,} the UAP for $L^p({K},\\mathbb{R}^{d_y})$.\nThis paper examines a uniform UAP for the function class\n$C({K},\\mathbb{R}^{d_y})$ and gives the exact minimum width of the leaky-ReLU\nNN as $w_{\\min}=\\max(d_x,d_y)+\\Delta (d_x, d_y)$, where $\\Delta (d_x, d_y)$ is\nthe additional dimensions for approximating continuous functions with\ndiffeomorphisms via embedding. 
To obtain this result, we propose a novel\nlift-flow-discretization approach that shows that the uniform UAP has a deep\nconnection with topological theory.\n","authors":["Li'ang Li","Yifei Duan","Guanghua Ji","Yongqiang Cai"],"pdf_url":"https://arxiv.org/pdf/2305.18460v3.pdf","comment":"Include errata of the previous versions"},{"id":"http://arxiv.org/abs/2402.00300v1","updated":"2024-02-01T03:27:26Z","published":"2024-02-01T03:27:26Z","title":"Self-supervised learning of video representations from a child's\n perspective","summary":" Children learn powerful internal models of the world around them from a few\nyears of egocentric visual experience. Can such internal models be learned from\na child's visual experience with highly generic learning algorithms or do they\nrequire strong inductive biases? Recent advances in collecting large-scale,\nlongitudinal, developmentally realistic video datasets and generic\nself-supervised learning (SSL) algorithms are allowing us to begin to tackle\nthis nature vs. nurture question. However, existing work typically focuses on\nimage-based SSL algorithms and visual capabilities that can be learned from\nstatic images (e.g. object recognition), thus ignoring temporal aspects of the\nworld. To close this gap, here we train self-supervised video models on\nlongitudinal, egocentric headcam recordings collected from a child over a two\nyear period in their early development (6-31 months). The resulting models are\nhighly effective at facilitating the learning of action concepts from a small\nnumber of labeled examples; they have favorable data size scaling properties;\nand they display emergent video interpolation capabilities. Video models also\nlearn more robust object representations than image-based models trained with\nthe exact same data. These results suggest that important temporal aspects of a\nchild's internal model of the world may be learnable from their visual\nexperience using highly generic learning algorithms and without strong\ninductive biases.\n","authors":["A. Emin Orhan","Wentao Wang","Alex N. Wang","Mengye Ren","Brenden M. Lake"],"pdf_url":"https://arxiv.org/pdf/2402.00300v1.pdf","comment":"7 pages, 6 figures; code & models available from\n https://github.com/eminorhan/video-models"},{"id":"http://arxiv.org/abs/2402.00299v1","updated":"2024-02-01T03:20:53Z","published":"2024-02-01T03:20:53Z","title":"Attention-based Dynamic Multilayer Graph Neural Networks for Loan\n Default Prediction","summary":" Whereas traditional credit scoring tends to employ only individual borrower-\nor loan-level predictors, it has been acknowledged for some time that\nconnections between borrowers may result in default risk propagating over a\nnetwork. In this paper, we present a model for credit risk assessment\nleveraging a dynamic multilayer network built from a Graph Neural Network and a\nRecurrent Neural Network, each layer reflecting a different source of network\nconnection. We test our methodology in a behavioural credit scoring context\nusing a dataset provided by U.S. mortgage financier Freddie Mac, in which\ndifferent types of connections arise from the geographical location of the\nborrower and their choice of mortgage provider. The proposed model considers\nboth types of connections and the evolution of these connections over time. We\nenhance the model by using a custom attention mechanism that weights the\ndifferent time snapshots according to their importance. 
After testing multiple\nconfigurations, a model with GAT, LSTM, and the attention mechanism provides\nthe best results. Empirical results demonstrate that, when it comes to\npredicting the probability of default for the borrowers, our proposed model brings\nboth better results and novel insights for the analysis of the importance of\nconnections and timestamps, compared to traditional methods.\n","authors":["Sahab Zandi","Kamesh Korangi","María Óskarsdóttir","Christophe Mues","Cristián Bravo"],"pdf_url":"https://arxiv.org/pdf/2402.00299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04072v4","updated":"2024-02-01T02:31:33Z","published":"2023-06-07T00:13:21Z","title":"Exploring Simple, High Quality Out-of-Distribution Detection with L2\n Normalization","summary":" We demonstrate that L2 normalization over feature space can produce capable\nperformance for Out-of-Distribution (OoD) detection for some models and\ndatasets. Although it does not demonstrate outright state-of-the-art\nperformance, this method is notable for its extreme simplicity: it requires\nonly two additional lines of code, and does not need specialized loss functions,\nimage augmentations, outlier exposure or extra parameter tuning. We also\nobserve that training may be more efficient for some datasets and\narchitectures. Notably, only 60 epochs with ResNet18 on CIFAR10 (or 100 epochs\nwith ResNet50) can produce performance within two percentage points (AUROC) of\nseveral state-of-the-art methods for some near and far OoD datasets. We provide\ntheoretical and empirical support for this method, and demonstrate viability\nacross five architectures and three In-Distribution (ID) datasets.\n","authors":["Jarrod Haas","William Yolland","Bernhard Rabus"],"pdf_url":"https://arxiv.org/pdf/2306.04072v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00284v1","updated":"2024-02-01T02:29:16Z","published":"2024-02-01T02:29:16Z","title":"PAP-REC: Personalized Automatic Prompt for Recommendation Language Model","summary":" Recently emerged prompt-based Recommendation Language Models (RLM) can solve\nmultiple recommendation tasks uniformly. The RLMs make full use of the\ninherited knowledge learned from the abundant pre-training data to solve the\ndownstream recommendation tasks by prompts, without introducing additional\nparameters or network training. However, handcrafted prompts require\nsignificant expertise and human effort since slightly rewriting prompts may\ncause massive performance changes. In this paper, we propose PAP-REC, a\nframework to generate the Personalized Automatic Prompt for RECommendation\nlanguage models to mitigate the inefficiency and ineffectiveness problems\nderived from manually designed prompts. Specifically, personalized automatic\nprompts allow different users to have different prompt tokens for the same\ntask, automatically generated using a gradient-based method. One challenge for\npersonalized automatic prompt generation for recommendation language models is\nthe extremely large search space, leading to a long convergence time. To\neffectively and efficiently address the problem, we develop surrogate metrics\nand leverage an alternative updating schedule for prompting recommendation\nlanguage models. Experimental results show that our PAP-REC framework manages\nto generate personalized prompts, and the automatically generated prompts\noutperform manually constructed prompts and also outperform various baseline\nrecommendation models. 
The source code of the work is available at\nhttps://github.com/rutgerswiselab/PAP-REC.\n","authors":["Zelong Li","Jianchao Ji","Yingqiang Ge","Wenyue Hua","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10371v3","updated":"2024-02-01T02:20:36Z","published":"2024-01-18T20:35:47Z","title":"Langevin Unlearning: A New Perspective of Noisy Gradient Descent for\n Machine Unlearning","summary":" Machine unlearning has raised significant interest with the adoption of laws\nensuring the ``right to be forgotten''. Researchers have provided a\nprobabilistic notion of approximate unlearning under a similar definition of\nDifferential Privacy (DP), where privacy is defined as statistical\nindistinguishability to retraining from scratch. We propose Langevin\nunlearning, an unlearning framework based on noisy gradient descent with\nprivacy guarantees for approximate unlearning problems. Langevin unlearning\nunifies the DP learning process and the privacy-certified unlearning process\nwith many algorithmic benefits. These include approximate certified unlearning\nfor non-convex problems, complexity savings compared to retraining, and sequential\nand batch unlearning for multiple unlearning requests. We verify the\npracticality of Langevin unlearning by studying its privacy-utility-complexity\ntrade-off via experiments on benchmark datasets, and also demonstrate its\nsuperiority against gradient-descent-plus-output-perturbation based approximate\nunlearning.\n","authors":["Eli Chien","Haoyu Wang","Ziang Chen","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10371v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.12644v2","updated":"2024-02-01T02:14:20Z","published":"2022-09-26T12:47:08Z","title":"FORESEE: Prediction with Expansion-Compression Unscented Transform for\n Online Policy Optimization","summary":" Propagating state distributions through a generic, uncertain nonlinear\ndynamical model is known to be intractable and usually begets numerical or\nanalytical approximations. We introduce a method for state prediction, called\nthe Expansion-Compression Unscented Transform, and use it to solve a class of\nonline policy optimization problems. Our proposed algorithm propagates a finite\nnumber of sigma points through a state-dependent distribution, which dictates\nan increase in the number of sigma points at each time step to represent the\nresulting distribution; this is what we call the expansion operation. To keep\nthe algorithm scalable, we augment the expansion operation with a compression\noperation based on moment matching, thereby keeping the number of sigma points\nconstant across predictions over multiple time steps. Its performance is\nempirically shown to be comparable to Monte Carlo but at a much lower\ncomputational cost. Under state and control input constraints, the state\nprediction is subsequently used in tandem with a proposed variant of\nconstrained gradient-descent for online update of policy parameters in a\nreceding horizon fashion. The framework is implemented as a differentiable\ncomputational graph for policy training. 
We showcase our framework for a\nquadrotor stabilization task as part of a benchmark comparison in\nsafe-control-gym and for optimizing the parameters of a Control Barrier\nFunction based controller in a leader-follower problem.\n","authors":["Hardik Parwana","Dimitra Panagou"],"pdf_url":"https://arxiv.org/pdf/2209.12644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03140v3","updated":"2024-02-01T02:08:55Z","published":"2024-01-06T06:55:26Z","title":"Fair Sampling in Diffusion Models through Switching Mechanism","summary":" Diffusion models have shown their effectiveness in generation tasks by\nwell-approximating the underlying probability distribution. However, diffusion\nmodels are known to suffer from an amplified inherent bias from the training\ndata in terms of fairness. While the sampling process of diffusion models can\nbe controlled by conditional guidance, previous works have attempted to find\nempirical guidance to achieve quantitative fairness. To address this\nlimitation, we propose a fairness-aware sampling method called\n\\textit{attribute switching} mechanism for diffusion models. Without additional\ntraining, the proposed sampling can obfuscate sensitive attributes in generated\ndata without relying on classifiers. We mathematically prove and experimentally\ndemonstrate the effectiveness of the proposed method on two key aspects: (i)\nthe generation of fair data and (ii) the preservation of the utility of the\ngenerated data.\n","authors":["Yujin Choi","Jinseong Park","Hoki Kim","Jaewook Lee","Saeroom Park"],"pdf_url":"https://arxiv.org/pdf/2401.03140v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2401.16661v2","updated":"2024-02-01T02:08:47Z","published":"2024-01-30T01:24:43Z","title":"Generalization of LiNGAM that allows confounding","summary":" LiNGAM determines the variable order from cause to effect using additive\nnoise models, but it faces challenges with confounding. Previous methods\nmaintained LiNGAM's fundamental structure while trying to identify and address\nvariables affected by confounding. As a result, these methods required\nsignificant computational resources regardless of the presence of confounding,\nand they did not ensure the detection of all confounding types. In contrast,\nthis paper enhances LiNGAM by introducing LiNGAM-MMI, a method that quantifies\nthe magnitude of confounding using KL divergence and arranges the variables to\nminimize its impact. This method efficiently achieves a globally optimal\nvariable order through the shortest path problem formulation. LiNGAM-MMI\nprocesses data as efficiently as traditional LiNGAM in scenarios without\nconfounding while effectively addressing confounding situations. Our\nexperimental results suggest that LiNGAM-MMI more accurately determines the\ncorrect variable order, both in the presence and absence of confounding.\n","authors":["Joe Suzuki","Tian-Le Yang"],"pdf_url":"https://arxiv.org/pdf/2401.16661v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17496v3","updated":"2024-02-01T01:43:58Z","published":"2023-10-26T15:52:34Z","title":"Tackling Interference Induced by Data Training Loops in A/B Tests: A\n Weighted Training Approach","summary":" In modern recommendation systems, the standard pipeline involves training\nmachine learning models on historical data to predict user behaviors and\nimprove recommendations continuously. 
However, these data training loops can\nintroduce interference in A/B tests, where data generated by control and\ntreatment algorithms, potentially with different distributions, are combined.\nTo address these challenges, we introduce a novel approach called weighted\ntraining. This approach entails training a model to predict the probability of\neach data point appearing in either the treatment or control data and\nsubsequently applying weighted losses during model training. We demonstrate\nthat this approach achieves the least variance among all estimators without\ncausing shifts in the training distributions. Through simulation studies, we\ndemonstrate the lower bias and variance of our approach compared to other\nmethods.\n","authors":["Nian Si"],"pdf_url":"https://arxiv.org/pdf/2310.17496v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17429v2","updated":"2024-02-01T01:39:39Z","published":"2023-12-29T01:42:43Z","title":"Commonsense for Zero-Shot Natural Language Video Localization","summary":" Zero-shot Natural Language-Video Localization (NLVL) methods have exhibited\npromising results in training NLVL models exclusively with raw video data by\ndynamically generating video segments and pseudo-query annotations. However,\nexisting pseudo-queries often lack grounding in the source video, resulting in\nunstructured and disjointed content. In this paper, we investigate the\neffectiveness of commonsense reasoning in zero-shot NLVL. Specifically, we\npresent CORONET, a zero-shot NLVL framework that leverages commonsense to\nbridge the gap between videos and generated pseudo-queries via a commonsense\nenhancement module. CORONET employs Graph Convolution Networks (GCN) to encode\ncommonsense information extracted from a knowledge graph, conditioned on the\nvideo, and cross-attention mechanisms to enhance the encoded video and\npseudo-query representations prior to localization. Through empirical\nevaluations on two benchmark datasets, we demonstrate that CORONET surpasses\nboth zero-shot and weakly supervised baselines, achieving improvements of up to\n32.13% across various recall thresholds and up to 6.33% in mIoU. These results\nunderscore the significance of leveraging commonsense reasoning for zero-shot\nNLVL.\n","authors":["Meghana Holla","Ismini Lourentzou"],"pdf_url":"https://arxiv.org/pdf/2312.17429v2.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.01472v2","updated":"2024-02-01T01:32:48Z","published":"2024-01-03T00:13:52Z","title":"A First Look at Information Highlighting in Stack Overflow Answers","summary":" Context: Navigating the knowledge of Stack Overflow (SO) remains challenging.\nTo make the posts vivid to users, SO allows users to write and edit posts with\nMarkdown or HTML so that users can leverage various formatting styles (e.g.,\nbold, italic, and code) to highlight the important information. Nonetheless,\nthere have been limited studies on the highlighted information. Objective: In our recent study, we\ncarried out the first large-scale exploratory study on the information\nhighlighted in SO answers. To extend that study, we\ndevelop approaches to automatically recommend highlighted content with\nformatting styles using neural network architectures initially designed for the\nNamed Entity Recognition task. Method: In this paper, we studied 31,169,429\nanswers from Stack Overflow. 
For training recommendation models, we choose CNN\nand BERT models for each type of formatting (i.e., Bold, Italic, Code, and\nHeading) using the information highlighting dataset we collected from SO\nanswers. Results: Our models based on CNN architecture achieve precision\nranging from 0.71 to 0.82. The trained model for automatic code content\nhighlighting achieves a recall of 0.73 and an F1 score of 0.71, outperforming\nthe trained models for other formatting styles. The BERT models have even lower\nrecalls and F1 scores than the CNN models. Our analysis of failure cases\nindicates that the majority of the failure cases are missing identification\n(i.e., the model misses the content that is supposed to be highlighted) because\nthe models tend to learn the frequently highlighted words while struggling to\nlearn less frequent words. Conclusion: Our findings suggest that it is possible\nto develop recommendation models for highlighting information for answers with\ndifferent formatting styles on Stack Overflow.\n","authors":["Shahla Shaan Ahmed","Shaowei Wang","Yuan Tian","Tse-Hsun Chen","Haoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.01472v2.pdf","comment":"This work is submitted to Information and Software Technology Journal"},{"id":"http://arxiv.org/abs/2401.08426v2","updated":"2024-02-01T01:17:45Z","published":"2024-01-16T15:11:29Z","title":"GD doesn't make the cut: Three ways that non-differentiability affects\n neural network training","summary":" This paper investigates the distinctions between gradient methods applied to\nnon-differentiable functions (NGDMs) and classical gradient descents (GDs)\ndesigned for differentiable functions. First, we demonstrate significant\ndifferences in the convergence properties of NGDMs compared to GDs, challenging\nthe applicability of the extensive neural network convergence literature based\non $L$-smoothness to non-smooth neural networks. Next, we demonstrate the\nparadoxical nature of NGDM solutions for $L_{1}$-regularized problems, showing\nthat increasing the regularization penalty leads to an increase in the $L_{1}$\nnorm of optimal solutions in NGDMs. Consequently, we show that widely adopted\n$L_{1}$ penalization-based techniques for network pruning do not yield the expected\nresults. Finally, we explore the Edge of Stability phenomenon, indicating its\ninapplicability even to Lipschitz continuous convex differentiable functions,\nleaving its relevance to non-convex non-differentiable neural networks\ninconclusive. Our analysis exposes misguided interpretations of NGDMs in widely\nreferenced papers and texts due to an overreliance on strong smoothness\nassumptions, emphasizing the necessity for a nuanced understanding of\nfoundational assumptions in the analysis of these systems.\n","authors":["Siddharth Krishna Kumar"],"pdf_url":"https://arxiv.org/pdf/2401.08426v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00261v1","updated":"2024-02-01T01:11:15Z","published":"2024-02-01T01:11:15Z","title":"Understanding Neural Network Systems for Image Analysis using Vector\n Spaces and Inverse Maps","summary":" There is strong interest in developing mathematical methods that can be used\nto understand complex neural networks used in image analysis. In this paper, we\nintroduce techniques from Linear Algebra to model neural network layers as maps\nbetween signal spaces. First, we demonstrate how signal spaces can be used to\nvisualize weight spaces and convolutional layer kernels. 
We also demonstrate\nhow residual vector spaces can be used to further visualize information lost at\neach layer. Second, we introduce the concept of invertible networks and an\nalgorithm for computing input images that yield specific outputs. We\ndemonstrate our approach on two invertible networks and ResNet18.\n","authors":["Rebecca Pattichis","Marios S. Pattichis"],"pdf_url":"https://arxiv.org/pdf/2402.00261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00849v1","updated":"2024-02-01T18:40:03Z","published":"2024-02-01T18:40:03Z","title":"Score-based Causal Representation Learning: Linear and General\n Transformations","summary":" This paper addresses intervention-based causal representation learning (CRL)\nunder a general nonparametric latent causal model and an unknown transformation\nthat maps the latent variables to the observed variables. Linear and general\ntransformations are investigated. The paper addresses both the\n\\emph{identifiability} and \\emph{achievability} aspects. Identifiability refers\nto determining algorithm-agnostic conditions that ensure recovering the true\nlatent causal variables and the latent causal graph underlying them.\nAchievability refers to the algorithmic aspects and addresses designing\nalgorithms that achieve identifiability guarantees. By drawing novel\nconnections between \\emph{score functions} (i.e., the gradients of the\nlogarithm of density functions) and CRL, this paper designs a \\emph{score-based\nclass of algorithms} that ensures both identifiability and achievability.\nFirst, the paper focuses on \\emph{linear} transformations and shows that one\nstochastic hard intervention per node suffices to guarantee identifiability. It\nalso provides partial identifiability guarantees for soft interventions,\nincluding identifiability up to ancestors for general causal models and perfect\nlatent graph recovery for sufficiently non-linear causal models. Secondly, it\nfocuses on \\emph{general} transformations and shows that two stochastic hard\ninterventions per node suffice for identifiability. Notably, one does\n\\emph{not} need to know which pair of interventional environments have the same\nnode intervened.\n","authors":["Burak Varıcı","Emre Acartürk","Karthikeyan Shanmugam","Abhishek Kumar","Ali Tajer"],"pdf_url":"https://arxiv.org/pdf/2402.00849v1.pdf","comment":"Linear transformations: stronger results for hard and soft\n interventions than our previous paper Score-based Causal Representation\n Learning with Interventions (https://arxiv.org/abs/2301.08230). General\n transformations: results also appear in our paper General Identifiability and\n Achievability for Causal Representation Learning (arXiv:2310.15450) accepted\n to AISTATS 2024 (oral)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2302.05087v3","updated":"2024-02-01T17:32:03Z","published":"2023-02-10T07:11:37Z","title":"Generalized Video Anomaly Event Detection: Systematic Taxonomy and\n Comparison of Deep Models","summary":" Video Anomaly Detection (VAD) serves as a pivotal technology in the\nintelligent surveillance systems, enabling the temporal or spatial\nidentification of anomalous events within videos. While existing reviews\npredominantly concentrate on conventional unsupervised methods, they often\noverlook the emergence of weakly-supervised and fully-unsupervised approaches.\nTo address this gap, this survey extends the conventional scope of VAD beyond\nunsupervised methods, encompassing a broader spectrum termed Generalized Video\nAnomaly Event Detection (GVAED). 
By skillfully incorporating recent\nadvancements rooted in diverse assumptions and learning frameworks, this survey\nintroduces an intuitive taxonomy that seamlessly navigates through\nunsupervised, weakly-supervised, supervised and fully-unsupervised VAD\nmethodologies, elucidating the distinctions and interconnections within these\nresearch trajectories. In addition, this survey facilitates prospective\nresearchers by assembling a compilation of research resources, including public\ndatasets, available codebases, programming tools, and pertinent literature.\nFurthermore, this survey quantitatively assesses model performance, delves into\nresearch challenges and directions, and outlines potential avenues for future\nexploration.\n","authors":["Yang Liu","Dingkang Yang","Yan Wang","Jing Liu","Jun Liu","Azzedine Boukerche","Peng Sun","Liang Song"],"pdf_url":"https://arxiv.org/pdf/2302.05087v3.pdf","comment":"Accepted by ACM Computing Surveys. For more information, please see\n our project page: https://github.com/fudanyliu/GVAED"},{"id":"http://arxiv.org/abs/2402.00622v1","updated":"2024-02-01T14:39:39Z","published":"2024-02-01T14:39:39Z","title":"Gain of Grain: A Film Grain Handling Toolchain for VVC-based Open\n Implementations","summary":" Film grain is a distinctive visual characteristic cherished by filmmakers and\ncinephiles for its ability to evoke nostalgia and artistic aesthetics. However,\nfaithful preservation of film grain during encoding poses unique challenges.\nFilm grain introduces random noise, complicating traditional compression\ntechniques. Consequently, specialized algorithms and encoding strategies have\nemerged, aiming to strike a harmonious equilibrium. This paper delves into the\nnuanced realm of film grain handling in Versatile Video Coding (VVC) encoding.\nWe explore the delicate balance between retaining the cinematic charm of film\ngrain and achieving efficient compression. Moreover, we discuss the importance\nof perceptual quality assessment and adaptive encoding techniques in preserving\nfilm grain fidelity. Additionally, we delve into the impact of film grain\nhandling on bitrate control and compression efficiency using VVenC, an open and\noptimized VVC encoder. Understanding the role of film grain and its nuanced\ntreatment within encoders becomes increasingly pivotal for delivering\nhigh-quality, grain-inclusive content in the digital age.\n","authors":["Vignesh V Menon","Adam Wieckowski","Jens Brandenburg","Benjamin Bross","Thomas Schierl","Detlev Marpe"],"pdf_url":"https://arxiv.org/pdf/2402.00622v1.pdf","comment":"2024 Mile High Video (MHV)"},{"id":"http://arxiv.org/abs/2312.08984v2","updated":"2024-02-01T04:53:30Z","published":"2023-12-14T14:29:53Z","title":"CL2CM: Improving Cross-Lingual Cross-Modal Retrieval via Cross-Lingual\n Knowledge Transfer","summary":" Cross-lingual cross-modal retrieval has garnered increasing attention\nrecently, which aims to achieve the alignment between vision and target\nlanguage (V-T) without using any annotated V-T data pairs. Current methods\nemploy machine translation (MT) to construct pseudo-parallel data pairs, which\nare then used to learn a multi-lingual and multi-modal embedding space that\naligns visual and target-language representations. However, the large\nheterogeneous gap between vision and text, along with the noise present in\ntarget language translations, poses significant challenges in effectively\naligning their representations. 
To address these challenges, we propose a\ngeneral framework, Cross-Lingual to Cross-Modal (CL2CM), which improves the\nalignment between vision and target language using cross-lingual transfer. This\napproach allows us to fully leverage the merits of multi-lingual pre-trained\nmodels (e.g., mBERT) and the benefits of the same modality structure, i.e.,\nsmaller gap, to provide reliable and comprehensive semantic correspondence\n(knowledge) for the cross-modal network. We evaluate our proposed approach on\ntwo multilingual image-text datasets, Multi30K and MSCOCO, and one video-text\ndataset, VATEX. The results clearly demonstrate the effectiveness of our\nproposed method and its high potential for large-scale retrieval.\n","authors":["Yabing Wang","Fan Wang","Jianfeng Dong","Hao Luo"],"pdf_url":"https://arxiv.org/pdf/2312.08984v2.pdf","comment":"Accepted by AAAI2024"}]},"2024-02-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2402.01629v1","updated":"2024-02-02T18:44:37Z","published":"2024-02-02T18:44:37Z","title":"Position Paper: Generalized grammar rules and structure-based\n generalization beyond classical equivariance for lexical tasks and\n transduction","summary":" Compositional generalization is one of the main properties which\ndifferentiates lexical learning in humans from state-of-the-art neural networks. We\npropose a general framework for building models that can generalize\ncompositionally using the concept of Generalized Grammar Rules (GGRs), a class\nof symmetry-based compositional constraints for transduction tasks, which we\nview as a transduction analogue of equivariance constraints in physics-inspired\ntasks. Besides formalizing generalized notions of symmetry for language\ntransduction, our framework is general enough to contain many existing works as\nspecial cases. We present ideas on how GGRs might be implemented, and in the\nprocess draw connections to reinforcement learning and other areas of research.\n","authors":["Mircea Petrache","Shubhendu Trivedi"],"pdf_url":"https://arxiv.org/pdf/2402.01629v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2402.01622v1","updated":"2024-02-02T18:39:51Z","published":"2024-02-02T18:39:51Z","title":"TravelPlanner: A Benchmark for Real-World Planning with Language Agents","summary":" Planning has been part of the core pursuit for artificial intelligence since\nits conception, but earlier AI agents mostly focused on constrained settings\nbecause many of the cognitive substrates necessary for human-level planning\nhave been lacking. Recently, language agents powered by large language models\n(LLMs) have shown interesting capabilities such as tool use and reasoning. Are\nthese language agents capable of planning in more complex settings that are out\nof the reach of prior AI agents? To advance this investigation, we propose\nTravelPlanner, a new planning benchmark that focuses on travel planning, a\ncommon real-world planning scenario. It provides a rich sandbox environment,\nvarious tools for accessing nearly four million data records, and 1,225\nmeticulously curated planning intents and reference plans. Comprehensive\nevaluations show that the current language agents are not yet capable of\nhandling such complex planning tasks -- even GPT-4 only achieves a success rate of\n0.6%. Language agents struggle to stay on task, use the right tools to collect\ninformation, or keep track of multiple constraints. 
However, we note that the\nmere possibility for language agents to tackle such a complex problem is in\nitself non-trivial progress. TravelPlanner provides a challenging yet\nmeaningful testbed for future language agents.\n","authors":["Jian Xie","Kai Zhang","Jiangjie Chen","Tinghui Zhu","Renze Lou","Yuandong Tian","Yanghua Xiao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2402.01622v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2305.14279v4","updated":"2024-02-02T18:37:07Z","published":"2023-05-23T17:25:59Z","title":"Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs","summary":" Large language models (LLMs) have achieved widespread success on a variety of\nin-context few-shot tasks, but this success is typically evaluated via\ncorrectness rather than consistency. We argue that self-consistency is an\nimportant criterion for valid multi-step reasoning in tasks where the solution\nis composed of the answers to multiple sub-steps. We propose two types of\nself-consistency that are particularly important for multi-step reasoning --\nhypothetical consistency (a model's ability to predict what its output would be\nin a hypothetical other context) and compositional consistency (consistency of\na model's final outputs when intermediate sub-steps are replaced with the\nmodel's outputs for those steps). We demonstrate that multiple variants of the\nGPT-3/-4 models exhibit poor consistency rates across both types of consistency\non a variety of tasks.\n","authors":["Angelica Chen","Jason Phang","Alicia Parrish","Vishakh Padmakumar","Chen Zhao","Samuel R. Bowman","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2305.14279v4.pdf","comment":"Accepted to TMLR: https://openreview.net/forum?id=5nBqY1y96B"},{"id":"http://arxiv.org/abs/2402.01620v1","updated":"2024-02-02T18:35:14Z","published":"2024-02-02T18:35:14Z","title":"MAGDi: Structured Distillation of Multi-Agent Interaction Graphs\n Improves Reasoning in Smaller Language Models","summary":" Multi-agent interactions between Large Language Model (LLM) agents have shown\nmajor improvements on diverse reasoning tasks. However, these involve long\ngenerations from multiple models across several rounds, making them expensive.\nMoreover, these multi-agent approaches fail to provide a final, single model\nfor efficient inference. To address this, we introduce MAGDi, a new method for\nstructured distillation of the reasoning interactions between multiple LLMs\ninto smaller LMs. MAGDi teaches smaller models by representing multi-agent\ninteractions as graphs, augmenting a base student model with a graph encoder,\nand distilling knowledge using three objective functions: next-token\nprediction, a contrastive loss between correct and incorrect reasoning, and a\ngraph-based objective to model the interaction structure. Experiments on seven\nwidely-used commonsense and math reasoning benchmarks show that MAGDi improves\nthe reasoning capabilities of smaller models, outperforming several methods\nthat distill from a single teacher and multiple teachers. Moreover, MAGDi also\ndemonstrates an order of magnitude higher efficiency over its teachers. 
We\nconduct extensive analyses to show that MAGDi (1) enhances the generalizability\nto out-of-domain tasks, (2) scales positively with the size and strength of the\nbase student model, and (3) obtains larger improvements (via our multi-teacher\ntraining) when applying self-consistency - an inference technique that relies\non model diversity.\n","authors":["Justin Chih-Yao Chen","Swarnadeep Saha","Elias Stengel-Eskin","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2402.01620v1.pdf","comment":"15 pages; First two authors contributed equally; GitHub:\n https://github.com/dinobby/MAGDi"},{"id":"http://arxiv.org/abs/2402.01619v1","updated":"2024-02-02T18:32:24Z","published":"2024-02-02T18:32:24Z","title":"KB-Plugin: A Plug-and-play Framework for Large Language Models to Induce\n Programs over Low-resourced Knowledge Bases","summary":" Program induction (PI) has become a promising paradigm for using knowledge\nbases (KBs) to help large language models (LLMs) answer complex\nknowledge-intensive questions. Nonetheless, PI typically relies on a large\nnumber of parallel question-program pairs to make the LLM aware of the schema\nof the given KB, and is thus challenging for many low-resourced KBs that lack\nannotated data. To this end, we propose KB-Plugin, a plug-and-play framework\nthat enables LLMs to induce programs over any low-resourced KB. Firstly,\nKB-Plugin adopts self-supervised learning to encode the detailed schema\ninformation of a given KB into a pluggable module, namely schema plugin.\nSecondly, KB-Plugin utilizes abundant annotated data from a rich-resourced KB\nto train another pluggable module, namely PI plugin, which can help the LLM\nextract question-relevant schema information from the schema plugin of any KB\nand utilize this information to induce programs over this KB. Experiments on\nfive heterogeneous KBQA datasets show that KB-Plugin achieves better or\ncomparable performance with 25$\\times$ smaller backbone LLM compared to SoTA PI\nmethods for low-resourced KBs, and even approaches the performance of\nsupervised methods. Our code and data are available at\nhttps://github.com/THU-KEG/KB-Plugin.\n","authors":["Jiajie Zhang","Shulin Cao","Linmei Hu","Ling Feng","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2402.01619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09896v5","updated":"2024-02-02T18:31:34Z","published":"2023-06-16T15:13:17Z","title":"Is Self-Repair a Silver Bullet for Code Generation?","summary":" Large language models have shown remarkable aptitude in code generation, but\nstill struggle to perform complex tasks. Self-repair -- in which the model\ndebugs and repairs its own code -- has recently become a popular way to boost\nperformance in these settings. However, despite its increasing popularity,\nexisting studies of self-repair have been limited in scope; in many settings,\nits efficacy thus remains poorly understood. In this paper, we analyze Code\nLlama, GPT-3.5 and GPT-4's ability to perform self-repair on problems taken\nfrom HumanEval and APPS. We find that when the cost of carrying out repair is\ntaken into account, performance gains are often modest, vary a lot between\nsubsets of the data, and are sometimes not present at all. 
We hypothesize that\nthis is because self-repair is bottlenecked by the model's ability to provide\nfeedback on its own code; using a stronger model to artificially boost the\nquality of the feedback, we observe substantially larger performance gains.\nSimilarly, a small-scale study in which we provide GPT-4 with feedback from\nhuman participants suggests that even for the strongest models, self-repair\nstill lags far behind what can be achieved with human-level debugging.\n","authors":["Theo X. Olausson","Jeevana Priya Inala","Chenglong Wang","Jianfeng Gao","Armando Solar-Lezama"],"pdf_url":"https://arxiv.org/pdf/2306.09896v5.pdf","comment":"Accepted to ICLR 2024. Added additional Code Llama experiments and\n fixed a data processing error harming Code Llama's reported self-repair\n performance on HumanEval"},{"id":"http://arxiv.org/abs/2402.01618v1","updated":"2024-02-02T18:31:15Z","published":"2024-02-02T18:31:15Z","title":"Style Vectors for Steering Generative Large Language Model","summary":" This research explores strategies for steering the output of large language\nmodels (LLMs) towards specific styles, such as sentiment, emotion, or writing\nstyle, by adding style vectors to the activations of hidden layers during text\ngeneration. We show that style vectors can be simply computed from recorded\nlayer activations for input texts in a specific style in contrast to more\ncomplex training-based approaches. Through a series of experiments, we\ndemonstrate the effectiveness of activation engineering using such style\nvectors to influence the style of generated text in a nuanced and\nparameterisable way, distinguishing it from prompt engineering. The presented\nresearch constitutes a significant step towards developing more adaptive and\neffective AI-empowered interactive systems.\n","authors":["Kai Konen","Sophie Jentzsch","Diaoulé Diallo","Peer Schütt","Oliver Bensch","Roxanne El Baff","Dominik Opitz","Tobias Hecking"],"pdf_url":"https://arxiv.org/pdf/2402.01618v1.pdf","comment":"Will be published as findings paper at EACL2024 - 18th Conference of\n the European Chapter of the Association for Computational Linguistics"},{"id":"http://arxiv.org/abs/2402.01613v1","updated":"2024-02-02T18:23:18Z","published":"2024-02-02T18:23:18Z","title":"Nomic Embed: Training a Reproducible Long Context Text Embedder","summary":" This technical report describes the training of nomic-embed-text-v1, the\nfirst fully reproducible, open-source, open-weights, open-data, 8192 context\nlength English text embedding model that outperforms both OpenAI Ada-002 and\nOpenAI text-embedding-3-small on short and long-context tasks. We release the\ntraining code and model weights under an Apache 2 license. In contrast with\nother open-source models, we release a training data loader with 235 million\ncurated text pairs that allows for the full replication of nomic-embed-text-v1.\nYou can find code and data to replicate the model at\nhttps://github.com/nomic-ai/contrastors\n","authors":["Zach Nussbaum","John X. 
Morris","Brandon Duderstadt","Andriy Mulyar"],"pdf_url":"https://arxiv.org/pdf/2402.01613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13382v2","updated":"2024-02-02T18:20:03Z","published":"2023-12-20T19:13:26Z","title":"DSPy Assertions: Computational Constraints for Self-Refining Language\n Model Pipelines","summary":" Chaining language model (LM) calls as composable modules is fueling a new way\nof programming, but ensuring LMs adhere to important constraints requires\nheuristic \"prompt engineering\". We introduce LM Assertions, a programming\nconstruct for expressing computational constraints that LMs should satisfy. We\nintegrate our constructs into the recent DSPy programming model for LMs, and\npresent new strategies that allow DSPy to compile programs with LM Assertions\ninto more reliable and accurate systems. We also propose strategies to use\nassertions at inference time for automatic self-refinement with LMs. We report\non four diverse case studies for text generation and find that LM Assertions\nimprove not only compliance with imposed rules but also downstream task\nperformance, passing constraints up to 164% more often and generating up to 37%\nmore higher-quality responses. Our reference implementation of LM Assertions is\nintegrated into DSPy at https://github.com/stanfordnlp/dspy\n","authors":["Arnav Singhvi","Manish Shetty","Shangyin Tan","Christopher Potts","Koushik Sen","Matei Zaharia","Omar Khattab"],"pdf_url":"https://arxiv.org/pdf/2312.13382v2.pdf","comment":"Arnav*, Manish*, Shangyin* contributed equally to this work"},{"id":"http://arxiv.org/abs/2401.15963v2","updated":"2024-02-02T18:11:27Z","published":"2024-01-29T08:47:31Z","title":"NoFunEval: Funny How Code LMs Falter on Requirements Beyond Functional\n Correctness","summary":" Existing evaluation benchmarks of language models of code (code LMs) focus\nalmost exclusively on whether the LMs can generate functionally-correct code.\nIn real-world software engineering, developers think beyond functional\ncorrectness. They have requirements on \"how\" a functionality should be\nimplemented to meet overall system design objectives like efficiency, security,\nand maintainability. They would also trust the code LMs more if the LMs\ndemonstrate robust understanding of requirements and code semantics.\n We propose a new benchmark NoFunEval to evaluate code LMs on non-functional\nrequirements and simple classification instances for both functional and\nnon-functional requirements. We propose a prompting method, Coding Concepts\n(CoCo), as a way for a developer to communicate the domain knowledge to the\nLMs. We conduct an extensive evaluation of twenty-two code LMs. Our finding is\nthat they generally falter when tested on our benchmark, hinting at fundamental\nblindspots in their training setups. Surprisingly, even the classification\naccuracy on functional-correctness instances derived from the popular HumanEval\nbenchmark is low, calling in question the depth of their comprehension and the\nsource of their success in generating functionally-correct code in the first\nplace. 
We will release our benchmark and evaluation scripts publicly at\nhttps://aka.ms/NoFunEval.\n","authors":["Manav Singhal","Tushar Aggarwal","Abhijeet Awasthi","Nagarajan Natarajan","Aditya Kanade"],"pdf_url":"https://arxiv.org/pdf/2401.15963v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.13836v2","updated":"2024-02-02T18:07:37Z","published":"2023-10-20T22:20:50Z","title":"Foundation Model's Embedded Representations May Detect Distribution\n Shift","summary":" Sampling biases can cause distribution shifts between train and test datasets\nfor supervised learning tasks, obscuring our ability to understand the\ngeneralization capacity of a model. This is especially important considering\nthe wide adoption of pre-trained foundational neural networks -- whose behavior\nremains poorly understood -- for transfer learning (TL) tasks. We present a\ncase study for TL on the Sentiment140 dataset and show that many pre-trained\nfoundation models encode representations of Sentiment140's manually curated\ntest set $M$ that differ from those of the automatically labeled training set\n$P$, confirming that a distribution shift has occurred. We argue that training\non $P$ and measuring performance on $M$ is a biased measure of generalization.\nExperiments on pre-trained GPT-2 show that the features learnable from $P$ do\nnot improve (and in fact hamper) performance on $M$. Linear probes on\npre-trained GPT-2's representations are robust and may even outperform overall\nfine-tuning, implying that discerning distribution shift in train/test splits\nis of fundamental importance for model interpretation.\n","authors":["Max Vargas","Adam Tsou","Andrew Engel","Tony Chiang"],"pdf_url":"https://arxiv.org/pdf/2310.13836v2.pdf","comment":"17 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.17026v3","updated":"2024-02-02T18:04:58Z","published":"2023-05-26T15:35:43Z","title":"How Powerful are Decoder-Only Transformer Neural Models?","summary":" In this article we prove that the general transformer neural model\nundergirding modern large language models (LLMs) is Turing complete under\nreasonable assumptions. This is the first work to directly address the Turing\ncompleteness of the underlying technology employed in GPT-x, as past work has\nfocused on the more expressive, full auto-encoder transformer architecture.\nFrom this theoretical analysis, we show that the sparsity/compressibility of\nthe word embedding is an important consideration for Turing completeness to\nhold. We also show that Transformers are a variant of B machines studied by\nHao Wang.\n","authors":["Jesse Roberts"],"pdf_url":"https://arxiv.org/pdf/2305.17026v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00786v2","updated":"2024-02-02T17:43:41Z","published":"2024-02-01T17:17:55Z","title":"CroissantLLM: A Truly Bilingual French-English Language Model","summary":" We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T\nEnglish and French tokens, to bring to the research and industrial community a\nhigh-performance, fully open-sourced bilingual model that runs swiftly on\nconsumer-grade local hardware. To that end, we pioneer the approach of training\nan intrinsically bilingual model with a 1:1 English-to-French pretraining data\nratio, a custom tokenizer, and bilingual finetuning datasets. We release the\ntraining dataset, notably containing a French split with manually curated,\nhigh-quality, and varied data sources. 
To assess performance outside of\nEnglish, we craft a novel benchmark, FrenchBench, consisting of an array of\nclassification and generation tasks, covering various orthogonal aspects of\nmodel performance in the French Language. Additionally, rooted in transparency\nand to foster further Large Language Model research, we release codebases, and\ndozens of checkpoints across various model sizes, training data distributions,\nand training steps, as well as fine-tuned Chat models, and strong translation\nmodels. We evaluate our model through the FMTI framework, and validate 81 % of\nthe transparency criteria, far beyond the scores of even most open initiatives.\nThis work enriches the NLP landscape, breaking away from previous\nEnglish-centric work in order to strengthen our understanding of\nmultilinguality in language models.\n","authors":["Manuel Faysse","Patrick Fernandes","Nuno M. Guerreiro","António Loison","Duarte M. Alves","Caio Corro","Nicolas Boizard","João Alves","Ricardo Rei","Pedro H. Martins","Antoni Bigata Casademunt","François Yvon","André F. T. Martins","Gautier Viaud","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2402.00786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09043v3","updated":"2024-02-02T17:40:52Z","published":"2023-12-14T15:40:27Z","title":"Topic Bias in Emotion Classification","summary":" Emotion corpora are typically sampled based on keyword/hashtag search or by\nasking study participants to generate textual instances. In any case, these\ncorpora are not uniform samples representing the entirety of a domain. We\nhypothesize that this practice of data acquisition leads to unrealistic\ncorrelations between overrepresented topics in these corpora that harm the\ngeneralizability of models. Such topic bias could lead to wrong predictions for\ninstances like \"I organized the service for my aunt's funeral.\" when funeral\nevents are over-represented for instances labeled with sadness, despite the\nemotion of pride being more appropriate here. In this paper, we study this\ntopic bias both from the data and the modeling perspective. We first label a\nset of emotion corpora automatically via topic modeling and show that emotions\nin fact correlate with specific topics. Further, we see that emotion\nclassifiers are confounded by such topics. Finally, we show that the\nestablished debiasing method of adversarial correction via gradient reversal\nmitigates the issue. Our work points out issues with existing emotion corpora\nand that more representative resources are required for fair evaluation of\nmodels predicting affective concepts from text.\n","authors":["Maximilian Wegge","Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2312.09043v3.pdf","comment":"accepted to W-NUT at EACL 2024"},{"id":"http://arxiv.org/abs/2402.01592v1","updated":"2024-02-02T17:35:49Z","published":"2024-02-02T17:35:49Z","title":"Towards Sustainable Workplace Mental Health: A Novel Approach to Early\n Intervention and Support","summary":" Employee well-being is a critical concern in the contemporary workplace, as\nhighlighted by the American Psychological Association's 2021 report, indicating\nthat 71% of employees experience stress or tension. This stress contributes\nsignificantly to workplace attrition and absenteeism, with 61% of attrition and\n16% of sick days attributed to poor mental health. 
A major challenge for\nemployers is that employees often remain unaware of their mental health issues\nuntil they reach a crisis point, resulting in limited utilization of corporate\nwell-being benefits. This research addresses this challenge by presenting a\ngroundbreaking stress detection algorithm that provides real-time support\npreemptively. Leveraging automated chatbot technology, the algorithm\nobjectively measures mental health levels by analyzing chat conversations,\noffering personalized treatment suggestions in real-time based on linguistic\nbiomarkers. The study explores the feasibility of integrating these innovations\ninto practical learning applications within real-world contexts and introduces\na chatbot-style system integrated into the broader employee experience\nplatform. This platform, encompassing various features, aims to enhance overall\nemployee well-being, detect stress in real time, and proactively engage with\nindividuals to improve support effectiveness, demonstrating a 22% increase when\nassistance is provided early. Overall, the study emphasizes the importance of\nfostering a supportive workplace environment for employees' mental health.\n","authors":["David W. Vinson","Mihael Arcan","David-Paul Niland","Fionn Delahunty"],"pdf_url":"https://arxiv.org/pdf/2402.01592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01591v1","updated":"2024-02-02T17:34:53Z","published":"2024-02-02T17:34:53Z","title":"BAT: Learning to Reason about Spatial Sounds with Large Language Models","summary":" Spatial sound reasoning is a fundamental human skill, enabling us to navigate\nand interpret our surroundings based on sound. In this paper we present BAT,\nwhich combines the spatial sound perception ability of a binaural acoustic\nscene analysis model with the natural language reasoning capabilities of a\nlarge language model (LLM) to replicate this innate ability. To address the\nlack of existing datasets of in-the-wild spatial sounds, we synthesized a\nbinaural audio dataset using AudioSet and SoundSpaces 2.0. Next, we developed\nSpatialSoundQA, a spatial sound-based question-answering dataset, offering a\nrange of QA tasks that train BAT in various aspects of spatial sound perception\nand reasoning. The acoustic front end encoder of BAT is a novel spatial audio\nencoder named Spatial Audio Spectrogram Transformer, or Spatial-AST, which by\nitself achieves strong performance across sound event detection, spatial\nlocalization, and distance estimation. By integrating Spatial-AST with LLaMA-2\n7B model, BAT transcends standard Sound Event Localization and Detection (SELD)\ntasks, enabling the model to reason about the relationships between the sounds\nin its environment. Our experiments demonstrate BAT's superior performance on\nboth spatial sound perception and reasoning, showcasing the immense potential\nof LLMs in navigating and interpreting complex spatial audio environments.\n","authors":["Zhisheng Zheng","Puyuan Peng","Ziyang Ma","Xie Chen","Eunsol Choi","David Harwath"],"pdf_url":"https://arxiv.org/pdf/2402.01591v1.pdf","comment":"Preprint, work in progress"},{"id":"http://arxiv.org/abs/2401.16578v2","updated":"2024-02-02T17:28:22Z","published":"2024-01-29T21:24:43Z","title":"Leveraging Professional Radiologists' Expertise to Enhance LLMs'\n Evaluation for Radiology Reports","summary":" In radiology, Artificial Intelligence (AI) has significantly advanced report\ngeneration, but automatic evaluation of these AI-produced reports remains\nchallenging. 
Current metrics, such as Conventional Natural Language Generation\n(NLG) and Clinical Efficacy (CE), often fall short in capturing the semantic\nintricacies of clinical contexts or overemphasize clinical details, undermining\nreport clarity. To overcome these issues, our proposed method synergizes the\nexpertise of professional radiologists with Large Language Models (LLMs), like\nGPT-3.5 and GPT-4. Utilizing In-Context Instruction Learning (ICIL) and Chain\nof Thought (CoT) reasoning, our approach aligns LLM evaluations with\nradiologist standards, enabling detailed comparisons between human- and\nAI-generated reports. This is further enhanced by a Regression model that\naggregates sentence evaluation scores. Experimental results show that our\n\"Detailed GPT-4 (5-shot)\" model achieves a 0.48 score, outperforming the METEOR\nmetric by 0.19, while our \"Regressed GPT-4\" model shows even greater alignment\nwith expert evaluations, exceeding the best existing metric by a 0.35 margin.\nMoreover, the robustness of our explanations has been validated through a\nthorough iterative strategy. We plan to publicly release annotations from\nradiology experts, setting a new standard for accuracy in future assessments.\nThis underscores the potential of our approach in enhancing the quality\nassessment of AI-driven medical reports.\n","authors":["Qingqing Zhu","Xiuying Chen","Qiao Jin","Benjamin Hou","Tejas Sudharshan Mathai","Pritam Mukherjee","Xin Gao","Ronald M Summers","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.16578v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01586v1","updated":"2024-02-02T17:26:23Z","published":"2024-02-02T17:26:23Z","title":"TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent\n Constitution","summary":" The emergence of LLM-based agents has garnered considerable attention, yet\ntheir trustworthiness remains an under-explored area. As agents can directly\ninteract with the physical environment, their reliability and safety are\ncritical. This paper presents an Agent-Constitution-based agent framework,\nTrustAgent, an initial investigation into improving the safety dimension of\ntrustworthiness in LLM-based agents. This framework consists of three\nstrategies: a pre-planning strategy, which injects safety knowledge into the\nmodel prior to plan generation; an in-planning strategy, which bolsters safety\nduring plan generation; and a post-planning strategy, which ensures safety\nthrough post-planning inspection. Through experimental analysis, we demonstrate\nhow these approaches can effectively elevate an LLM agent's safety by\nidentifying and preventing potential dangers. Furthermore, we explore the\nintricate relationships between safety and helpfulness, and between the model's\nreasoning ability and its efficacy as a safe agent. This paper underscores the\nimperative of integrating safety awareness and trustworthiness into the design\nand deployment of LLM-based agents, not only to enhance their performance but\nalso to ensure their responsible integration into human-centric environments. 
Data\nand code are available at https://github.com/agiresearch/TrustAgent.\n","authors":["Wenyue Hua","Xianjun Yang","Zelong Li","Cheng Wei","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01586v1.pdf","comment":"16 pages, 3 figures, 5 tables, comments and suggestions are welcome"},{"id":"http://arxiv.org/abs/2402.01582v1","updated":"2024-02-02T17:20:16Z","published":"2024-02-02T17:20:16Z","title":"Automating Sound Change Prediction for Phylogenetic Inference: A\n Tukanoan Case Study","summary":" We describe a set of new methods to partially automate linguistic\nphylogenetic inference given (1) cognate sets with their respective protoforms\nand sound laws, (2) a mapping from phones to their articulatory features, and\n(3) a typological database of sound changes. We train a neural network on these\nsound change data to weight articulatory distances between phones and predict\nintermediate sound change steps between historical protoforms and their modern\ndescendants, replacing a linguistic expert in part of a parsimony-based\nphylogenetic inference algorithm. In our best experiments on Tukanoan\nlanguages, this method produces trees with a Generalized Quartet Distance of\n0.12 from a tree that used expert annotations, a significant improvement over\nother semi-automated baselines. We discuss potential benefits and drawbacks to\nour neural approach and parsimony-based tree prediction. We also experiment\nwith a minimal generalization learner for automatic sound law induction,\nfinding it comparably effective to sound laws from expert annotation. Our code\nis publicly available at https://github.com/cmu-llab/aiscp.\n","authors":["Kalvin Chang","Nathaniel R. Robinson","Anna Cai","Ting Chen","Annie Zhang","David R. Mortensen"],"pdf_url":"https://arxiv.org/pdf/2402.01582v1.pdf","comment":"Accepted to LChange 2023"},{"id":"http://arxiv.org/abs/2402.01579v1","updated":"2024-02-02T17:17:42Z","published":"2024-02-02T17:17:42Z","title":"How Paralingual are Paralinguistic Representations? A Case Study in\n Speech Emotion Recognition","summary":" Pre-trained Models (PTMs) have facilitated substantial progress in the field\nof Speech Emotion Recognition (SER). SER is an area with applications ranging\nfrom Human-Computer Interaction to Healthcare. Recent studies have leveraged\nvarious PTM representations as input features for downstream models for SER.\nPTMs specifically pre-trained for paralinguistic tasks have obtained\nstate-of-the-art (SOTA) performance for SER. However, such PTMs haven't been\nevaluated for SER in multilingual settings and have been tested only on\nEnglish. We fill this gap by performing a comprehensive comparative study of\nfive PTMs (TRILLsson, wav2vec2, XLS-R, x-vector, Whisper) to assess the\neffectiveness of the paralingual PTM (TRILLsson) for SER across multiple\nlanguages. Representations from TRILLsson achieved the best performance among\nall the PTMs. This demonstrates that TRILLsson is able to effectively capture\nthe various paralinguistic features from speech data for better SER. 
We also show\nthat downstream models using TRILLsson representations achieve SOTA performance\nin terms of accuracy across various multi-lingual datasets.\n","authors":["Orchid Chetia Phukan","Gautam Siddharth Kashyap","Arun Balaji Buduru","Rajesh Sharma"],"pdf_url":"https://arxiv.org/pdf/2402.01579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01577v1","updated":"2024-02-02T17:16:23Z","published":"2024-02-02T17:16:23Z","title":"Deep Active Learning for Data Mining from Conflict Text Corpora","summary":" High-resolution event data on armed conflict and related processes have\nrevolutionized the study of political contention with datasets like UCDP GED,\nACLED, etc. However, most of these datasets limit themselves to collecting\nspatio-temporal (high-resolution) and intensity data. Information on dynamics,\nsuch as targets, tactics, and purposes, is rarely collected owing to the\nextreme workload of data collection. At the same time, most datasets rely on a\nrich corpus of textual data that allows further information connected to each\nevent to be mined. This paper proposes one such approach that is inexpensive\nand high-performing, leveraging active learning - an iterative process of\nimproving a machine learning model based on sequential (guided) human input.\nActive learning is then employed to step-wise fine-tune a large, encoder-only\nlanguage model adapted for extracting sub-classes of events relating to\nconflict dynamics. The approach shows performance similar to human\n(gold-standard) coding while reducing the amount of required human annotation\nby as much as 99%.\n","authors":["Mihai Croicu"],"pdf_url":"https://arxiv.org/pdf/2402.01577v1.pdf","comment":"40 pages, 6 figures. Paper presented at the Using LLMs and\n Text-as-Data in Political Science Research Workshop at the University of\n Barcelona, 29 January 2024"},{"id":"http://arxiv.org/abs/2401.16587v2","updated":"2024-02-02T16:47:16Z","published":"2024-01-29T21:43:27Z","title":"A Linguistic Comparison between Human and ChatGPT-Generated\n Conversations","summary":" This study explores linguistic differences between human and LLM-generated\ndialogues, using 19.5K dialogues generated by ChatGPT-3.5 as a companion to the\nEmpathicDialogues dataset. The research employs Linguistic Inquiry and Word\nCount (LIWC) analysis, comparing ChatGPT-generated conversations with human\nconversations across 118 linguistic categories. Results show greater\nvariability and authenticity in human dialogues, but ChatGPT excels in\ncategories such as social processes, analytical style, cognition, attentional\nfocus, and positive emotional tone, reinforcing recent findings of LLMs being\n\"more human than human.\" However, no significant difference was found in\npositive or negative affect between ChatGPT and human dialogues. Classifier\nanalysis of dialogue embeddings indicates implicit coding of the valence of\naffect despite no explicit mention of affect in the conversations. The research\nalso contributes a novel, companion ChatGPT-generated dataset of conversations\nbetween two independent chatbots, which were designed to replicate a corpus of\nhuman conversations available for open access and used widely in AI research on\nlanguage modeling. 
Our findings increase understanding of ChatGPT's linguistic\ncapabilities and inform ongoing efforts to distinguish between human and\nLLM-generated text, which is critical in detecting AI-generated fakes,\nmisinformation, and disinformation.\n","authors":["Morgan Sandler","Hyesun Choung","Arun Ross","Prabu David"],"pdf_url":"https://arxiv.org/pdf/2401.16587v2.pdf","comment":"Preprint. Pending review and feedback from ICPRAI2024"},{"id":"http://arxiv.org/abs/2402.01535v1","updated":"2024-02-02T16:26:52Z","published":"2024-02-02T16:26:52Z","title":"An Empirical Analysis of Diversity in Argument Summarization","summary":" Presenting high-level arguments is a crucial task for fostering participation\nin online societal discussions. Current argument summarization approaches miss\nan important facet of this task -- capturing diversity -- which is important\nfor accommodating multiple perspectives. We introduce three aspects of\ndiversity: those of opinions, annotators, and sources. We evaluate approaches\nto a popular argument summarization task called Key Point Analysis (KPA),\nshowing how these approaches struggle to (1) represent arguments shared by few\npeople, (2) deal with data from various sources, and (3) align with\nsubjectivity in human-provided annotations. We find that both general-purpose\nLLMs and dedicated KPA models exhibit this behavior, but have complementary\nstrengths. Further, we observe that diversification of training data may\nameliorate generalization. Addressing diversity in argument summarization\nrequires a mix of strategies to deal with subjectivity.\n","authors":["Michiel van der Meer","Piek Vossen","Catholijn M. Jonker","Pradeep K. Murukannaiah"],"pdf_url":"https://arxiv.org/pdf/2402.01535v1.pdf","comment":"Accepted at EACL2024 (main proceedings)"},{"id":"http://arxiv.org/abs/2402.01528v1","updated":"2024-02-02T16:15:24Z","published":"2024-02-02T16:15:24Z","title":"Decoding Speculative Decoding","summary":" Speculative Decoding is a widely used technique to speed up inference for\nLarge Language Models (LLMs) without modifying the outcome. When performing\ninference on an LLM, speculative decoding uses a smaller draft model which\ngenerates speculative tokens and then uses the target LLM to verify those draft\ntokens. The speedup provided by speculative decoding heavily depends on the\nchoice of the draft model. It has been widely suggested to select a draft model\nthat provides a high probability of the generated token being accepted by the\nLLM to achieve the highest throughput. However, our experiments indicate the\ncontrary, with throughput diminishing as the probability of generated tokens\nbeing accepted by the target model increases. To understand this phenomenon, we\nperform extensive experiments to characterize the different factors that affect\nspeculative decoding and how those factors interact and affect the speedups.\nBased on our experiments, we describe an analytical model which can be used to\ndecide the right draft model for a given workload. 
Further, using our insights\nwe design a new draft model for LLaMA-65B which can provide 30% higher\nthroughput than existing draft models.\n","authors":["Minghao Yan","Saurabh Agarwal","Shivaram Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2402.01528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01521v1","updated":"2024-02-02T16:07:05Z","published":"2024-02-02T16:07:05Z","title":"K-Level Reasoning with Large Language Models","summary":" While Large Language Models (LLMs) have demonstrated their proficiency in\ncomplex reasoning tasks, their performance in dynamic, interactive, and\ncompetitive scenarios - such as business strategy and stock market analysis -\nremains underexplored. To bridge this gap, we formally explore the dynamic\nreasoning capabilities of LLMs for decision-making in rapidly evolving\nenvironments. We introduce two game theory-based pilot challenges that mirror\nthe complexities of real-world dynamic decision-making. These challenges are\nwell-defined, enabling clear, controllable, and precise evaluation of LLMs'\ndynamic reasoning abilities. Through extensive experiments, we find that\nexisting reasoning methods tend to falter in dynamic settings that require\nk-level thinking - a key concept not tackled by previous works. To address\nthis, we propose a novel reasoning approach for LLMs, named \"K-Level\nReasoning\". This approach adopts the perspective of rivals to recursively\nemploy k-level thinking based on available historical information, which\nsignificantly improves the prediction accuracy of rivals' subsequent moves and\ninforms more strategic decision-making. This research not only sets a robust\nquantitative benchmark for the assessment of dynamic reasoning but also\nmarkedly enhances the proficiency of LLMs in dynamic contexts.\n","authors":["Yadong Zhang","Shaoguang Mao","Tao Ge","Xun Wang","Yan Xia","Man Lan","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2402.01521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01513v1","updated":"2024-02-02T15:54:19Z","published":"2024-02-02T15:54:19Z","title":"Multilingual Gradient Word-Order Typology from Universal Dependencies","summary":" While information from the field of linguistic typology has the potential to\nimprove performance on NLP tasks, reliable typological data is a prerequisite.\nExisting typological databases, including WALS and Grambank, suffer from\ninconsistencies primarily caused by their categorical format. Furthermore,\ntypological categorisations by definition differ significantly from the\ncontinuous nature of phenomena, as found in natural language corpora. In this\npaper, we introduce a new seed dataset made up of continuous-valued data,\nrather than categorical data, that can better reflect the variability of\nlanguage. While this initial dataset focuses on word-order typology, we also\npresent the methodology used to create the dataset, which can be easily adapted\nto generate data for a broader set of features and languages.\n","authors":["Emi Baylor","Esther Ploeger","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2402.01513v1.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2402.01512v1","updated":"2024-02-02T15:53:31Z","published":"2024-02-02T15:53:31Z","title":"Distractor Generation for Multiple-Choice Questions: A Survey of\n Methods, Datasets, and Evaluation","summary":" Distractors are important in learning evaluation. This paper surveys\ndistractor generation tasks using English multiple-choice question datasets for\ntextual and multimodal contexts. 
In particular, this paper presents a thorough\nliterature review of the recent studies on distractor generation tasks,\ndiscusses multiple choice components and their characteristics, analyzes the\nrelated datasets, and summarizes the evaluation metrics of distractor\ngeneration. Our investigation reveals that more than half of the datasets are\nhuman-generated from educational sources in specific domains such as Science\nand English, which are largely text-based, with a lack of open domain and\nmultimodal datasets.\n","authors":["Elaf Alhazmi","Quan Z. Sheng","Wei Emma Zhang","Munazza Zaib","Ahoud Alhazmi"],"pdf_url":"https://arxiv.org/pdf/2402.01512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00856v2","updated":"2024-02-02T15:50:10Z","published":"2024-02-01T18:51:54Z","title":"Towards Efficient and Exact Optimization of Language Model Alignment","summary":" The alignment of language models with human preferences is vital for their\napplication in real-world tasks. The problem is formulated as optimizing the\nmodel's policy to maximize the expected reward that reflects human preferences\nwith minimal deviation from the initial policy. While considered a\nstraightforward solution, reinforcement learning (RL) suffers from high\nvariance in policy updates, which impedes efficient policy improvement.\nRecently, direct preference optimization (DPO) was proposed to directly\noptimize the policy from preference data. Though simple to implement, DPO is\nderived based on the optimal policy that is not assured to be achieved in\npractice, which undermines its convergence to the intended solution.\n In this paper, we propose efficient exact optimization (EXO) of the alignment\nobjective. We prove that EXO is guaranteed to optimize in the same direction as\nthe RL algorithms asymptotically for arbitrary parametrizations of the policy,\nwhile enabling efficient optimization by circumventing the complexities\nassociated with RL algorithms. We compare our method to DPO with both\ntheoretical and empirical analyses, and further demonstrate the advantages of\nour method over existing approaches on realistic human preference data.\n","authors":["Haozhe Ji","Cheng Lu","Yilin Niu","Pei Ke","Hongning Wang","Jun Zhu","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2402.00856v2.pdf","comment":"24 pages, 9 figures"},{"id":"http://arxiv.org/abs/2305.03610v2","updated":"2024-02-02T15:46:42Z","published":"2023-05-05T15:16:07Z","title":"The Role of Data Curation in Image Captioning","summary":" Image captioning models are typically trained by treating all samples\nequally, neglecting to account for mismatched or otherwise difficult data\npoints. In contrast, recent work has shown the effectiveness of training models\nby scheduling the data using curriculum learning strategies. This paper\ncontributes to this direction by actively curating difficult samples in\ndatasets without increasing the total number of samples. We explore the effect\nof using three data curation methods within the training process: complete\nremoval of a sample, caption replacement, or image replacement via a\ntext-to-image generation model. Experiments on the Flickr30K and COCO datasets\nwith the BLIP and BEiT-3 models demonstrate that these curation methods do\nindeed yield improved image captioning models, underscoring their efficacy.\n","authors":["Wenyan Li","Jonas F. 
Lotz","Chen Qiu","Desmond Elliott"],"pdf_url":"https://arxiv.org/pdf/2305.03610v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01510v1","updated":"2024-02-02T15:44:28Z","published":"2024-02-02T15:44:28Z","title":"A Hybrid Strategy for Chat Transcript Summarization","summary":" Text summarization is the process of condensing a piece of text to fewer\nsentences, while still preserving its content. Chat transcript, in this\ncontext, is a textual copy of a digital or online conversation between a\ncustomer (caller) and agent(s). This paper presents an indigenously (locally)\ndeveloped hybrid method that first combines extractive and abstractive\nsummarization techniques in compressing ill-punctuated or un-punctuated chat\ntranscripts to produce more readable punctuated summaries and then optimizes\nthe overall quality of summarization through reinforcement learning. Extensive\ntesting, evaluations, comparisons, and validation have demonstrated the\nefficacy of this approach for large-scale deployment of chat transcript\nsummarization, in the absence of manually generated reference (annotated)\nsummaries.\n","authors":["Pratik K. Biswas"],"pdf_url":"https://arxiv.org/pdf/2402.01510v1.pdf","comment":"Journal Paper (13 Pages, 7 Figures, 4 Tables). arXiv admin note: text\n overlap with arXiv:2103.10599"},{"id":"http://arxiv.org/abs/2402.01505v1","updated":"2024-02-02T15:38:47Z","published":"2024-02-02T15:38:47Z","title":"Code-Switched Language Identification is Harder Than You Think","summary":" Code switching (CS) is a very common phenomenon in written and spoken\ncommunication but one that is handled poorly by many natural language\nprocessing applications. Looking to the application of building CS corpora, we\nexplore CS language identification (LID) for corpus building. We make the task\nmore realistic by scaling it to more languages and considering models with\nsimpler architectures for faster inference. We also reformulate the task as a\nsentence-level multi-label tagging problem to make it more tractable. Having\ndefined the task, we investigate three reasonable models for this task and\ndefine metrics which better reflect desired performance. We present empirical\nevidence that no current approach is adequate and finally provide\nrecommendations for future work in this area.\n","authors":["Laurie Burchell","Alexandra Birch","Robert P. Thompson","Kenneth Heafield"],"pdf_url":"https://arxiv.org/pdf/2402.01505v1.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2402.01495v1","updated":"2024-02-02T15:26:39Z","published":"2024-02-02T15:26:39Z","title":"A Comparative Analysis of Conversational Large Language Models in\n Knowledge-Based Text Generation","summary":" Generating natural language text from graph-structured data is essential for\nconversational information seeking. Semantic triples derived from knowledge\ngraphs can serve as a valuable source for grounding responses from\nconversational agents by providing a factual basis for the information they\ncommunicate. This is especially relevant in the context of large language\nmodels, which offer great potential for conversational interaction but are\nprone to hallucinating, omitting, or producing conflicting information. In this\nstudy, we conduct an empirical analysis of conversational large language models\nin generating natural language text from semantic triples. 
We compare four\nlarge language models of varying sizes with different prompting techniques.\nThrough a series of benchmark experiments on the WebNLG dataset, we analyze the\nmodels' performance and identify the most common issues in the generated\npredictions. Our findings show that the capabilities of large language models\nin triple verbalization can be significantly improved through few-shot\nprompting, post-processing, and efficient fine-tuning techniques, particularly\nfor smaller models that exhibit lower zero-shot performance.\n","authors":["Phillip Schneider","Manuel Klettner","Elena Simperl","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2402.01495v1.pdf","comment":"Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2402.01469v1","updated":"2024-02-02T14:56:48Z","published":"2024-02-02T14:56:48Z","title":"AMOR: A Recipe for Building Adaptable Modular Knowledge Agents Through\n Process Feedback","summary":" The notable success of large language models (LLMs) has sparked an upsurge in\nbuilding language agents to complete various complex tasks. We present AMOR, an\nagent framework based on open-source LLMs, which reasons with external\nknowledge bases and adapts to specific domains through human supervision of the\nreasoning process. AMOR builds reasoning logic over a finite state machine\n(FSM) that solves problems through autonomous executions and transitions over\ndisentangled modules. This allows humans to provide direct feedback to the\nindividual modules, and thus naturally forms process supervision. Based on this\nreasoning and feedback framework, we develop AMOR through two-stage\nfine-tuning: warm-up and adaptation. The former fine-tunes the LLM with\nexamples automatically constructed from various public datasets and enables\nAMOR to generalize across different knowledge environments, while the latter\ntailors AMOR to specific domains using process feedback. Extensive experiments\nacross multiple domains demonstrate the advantage of AMOR over strong\nbaselines, thanks to its FSM-based reasoning and process feedback mechanism.\n","authors":["Jian Guan","Wei Wu","Zujie Wen","Peng Xu","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2402.01469v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.16405v2","updated":"2024-02-02T14:53:14Z","published":"2024-01-29T18:43:49Z","title":"Scaling Sparse Fine-Tuning to Large Language Models","summary":" Large Language Models (LLMs) are difficult to fully fine-tune (e.g., with\ninstructions or human feedback) due to their sheer number of parameters. A\nfamily of parameter-efficient sparse fine-tuning methods has proven promising\nin terms of performance, but their memory requirements increase proportionally\nto the size of the LLMs. In this work, we scale sparse fine-tuning to\nstate-of-the-art LLMs like LLaMA 2 7B and 13B. We propose SpIEL, a novel sparse\nfine-tuning method which, for a desired density level, maintains an array of\nparameter indices and the deltas of these parameters relative to their\npretrained values. It iterates over: (a) updating the active deltas, (b)\npruning indices (based on the change in magnitude of their deltas), and (c)\nregrowing indices. For regrowth, we explore two criteria based on either the\naccumulated gradients of a few candidate parameters or their approximate\nmomenta estimated using the efficient SM3 optimizer. 
We experiment with\ninstruction-tuning of LLMs on standard dataset mixtures, finding that SpIEL is\noften superior to popular parameter-efficient fine-tuning methods like LoRA\n(low-rank adaptation) in terms of performance and comparable in terms of run\ntime. We additionally show that SpIEL is compatible with both quantization and\nefficient optimizers, to facilitate scaling to ever-larger model sizes. We\nrelease the code for SpIEL at https://github.com/AlanAnsell/peft and for the\ninstruction-tuning experiments at https://github.com/ducdauge/sft-llm.\n","authors":["Alan Ansell","Ivan Vulić","Hannah Sterz","Anna Korhonen","Edoardo M. Ponti"],"pdf_url":"https://arxiv.org/pdf/2401.16405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01453v1","updated":"2024-02-02T14:42:09Z","published":"2024-02-02T14:42:09Z","title":"The Queen of England is not England's Queen: On the Lack of Factual\n Coherency in PLMs","summary":" Factual knowledge encoded in Pre-trained Language Models (PLMs) enriches\ntheir representations and justifies their use as knowledge bases. Previous work\nhas focused on probing PLMs for factual knowledge by measuring how often they\ncan correctly predict an object entity given a subject and a relation, and\nimproving fact retrieval by optimizing the prompts used for querying PLMs. In\nthis work, we consider a complementary aspect, namely the coherency of factual\nknowledge in PLMs, i.e., how often PLMs can predict the subject entity given\ntheir initial prediction of the object entity. This goes beyond evaluating how\nmuch PLMs know, and focuses on the internal state of knowledge inside them. Our\nresults indicate that PLMs have low coherency with manually written, optimized,\nand paraphrased prompts, but that including an evidence paragraph leads to\nsubstantial improvement. This shows that PLMs fail to model inverse relations\nand need further enhancements to retrieve facts from their parameters in a\ncoherent manner before they can be considered knowledge bases.\n","authors":["Paul Youssef","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2402.01453v1.pdf","comment":"Accepted to EACL Findings 2024"},{"id":"http://arxiv.org/abs/2309.01606v2","updated":"2024-02-02T14:15:32Z","published":"2023-09-04T13:44:50Z","title":"Geo-Encoder: A Chunk-Argument Bi-Encoder Framework for Chinese\n Geographic Re-Ranking","summary":" The Chinese geographic re-ranking task aims to find the most relevant\naddresses among retrieved candidates, which is crucial for location-related\nservices such as navigation maps. Unlike general sentences, geographic contexts\nare closely intertwined with geographical concepts, from general spans (e.g.,\nprovince) to specific spans (e.g., road). Given this feature, we propose an\ninnovative framework, namely Geo-Encoder, to more effectively integrate Chinese\ngeographical semantics into re-ranking pipelines. Our methodology begins by\nemploying off-the-shelf tools to associate text with geographical spans,\ntreating them as chunking units. Then, we present a multi-task learning module\nto simultaneously acquire an effective attention matrix that determines chunk\ncontributions to extra semantic representations. Furthermore, we put forth an\nasynchronous update mechanism for the proposed addition task, aiming to guide\nthe model to focus effectively on specific chunks. 
Experiments on\ntwo distinct Chinese geographic re-ranking datasets show that the Geo-Encoder\nachieves significant improvements when compared to state-of-the-art baselines.\nNotably, it leads to a substantial improvement in the Hit@1 score of MGEO-BERT,\nincreasing it by 6.22% from 62.76 to 68.98 on the GeoTES dataset.\n","authors":["Yong Cao","Ruixue Ding","Boli Chen","Xianzhi Li","Min Chen","Daniel Hershcovich","Pengjun Xie","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2309.01606v2.pdf","comment":"15 pages, 5 figures, EACL 2024 main"},{"id":"http://arxiv.org/abs/2402.01427v1","updated":"2024-02-02T14:15:01Z","published":"2024-02-02T14:15:01Z","title":"The effect of diversity on group decision-making","summary":" We explore different aspects of cognitive diversity and its effect on the\nsuccess of group deliberation. To evaluate this, we use 500 dialogues from\nsmall, online groups discussing the Wason Card Selection task - the DeliData\ncorpus. Leveraging the corpus, we perform quantitative analysis evaluating\nthree different measures of cognitive diversity. First, we analyse the effect\nof group size as a proxy measure for diversity. Second, we evaluate the effect\nof the size of the initial idea pool. Finally, we look into the content of the\ndiscussion by analysing discussed solutions, discussion patterns, and how\nconversational probing can improve those characteristics.\n Despite the reputation of groups for compounding bias, we show that small\ngroups can, through dialogue, overcome intuitive biases and improve individual\ndecision-making. Across a large sample and different operationalisations, we\nconsistently find that greater cognitive diversity is associated with more\nsuccessful group deliberation. Code and data used for the analysis are\navailable in the anonymised repository:\nhttps://anonymous.4open.science/r/cogsci24-FD6D\n","authors":["Georgi Karadzhov","Andreas Vlachos","Tom Stafford"],"pdf_url":"https://arxiv.org/pdf/2402.01427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01423v1","updated":"2024-02-02T14:08:34Z","published":"2024-02-02T14:08:34Z","title":"Different Tastes of Entities: Investigating Human Label Variation in\n Named Entity Annotations","summary":" Named Entity Recognition (NER) is a key information extraction task with a\nlong-standing tradition. While recent studies address and aim to correct\nannotation errors via re-labeling efforts, little is known about the sources of\nhuman label variation, such as text ambiguity, annotation error, or guideline\ndivergence. This is especially the case for high-quality datasets and for\ndatasets beyond English CoNLL03. This paper studies disagreements in\nexpert-annotated named entity datasets for three languages: English, Danish,\nand Bavarian. We show that text ambiguity and artificial guideline changes are\ndominant factors for diverse annotations among high-quality revisions. 
We survey student annotations\non a subset of difficult entities and substantiate the feasibility and\nnecessity of manifold annotations for understanding named entity ambiguities\nfrom a distributional perspective.\n","authors":["Siyao Peng","Zihang Sun","Sebastian Loftus","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2402.01423v1.pdf","comment":"9 pages; Accepted at UnImplicit workshop at EACL 2024"},{"id":"http://arxiv.org/abs/2401.15316v3","updated":"2024-02-02T14:06:31Z","published":"2024-01-27T06:29:07Z","title":"UNSEE: Unsupervised Non-contrastive Sentence Embeddings","summary":" We present UNSEE: Unsupervised Non-Contrastive Sentence Embeddings, a novel\napproach that outperforms SimCSE on the Massive Text Embedding Benchmark. Our\nexploration begins by addressing the challenge of representation collapse, a\nphenomenon observed when contrastive objectives in SimCSE are replaced with\nnon-contrastive objectives. To counter this issue, we propose a straightforward\nsolution known as the target network, effectively mitigating representation\ncollapse. The introduction of the target network allows us to leverage\nnon-contrastive objectives, maintaining training stability while achieving\nperformance improvements comparable to contrastive objectives. Our method has\nachieved peak performance in non-contrastive sentence embeddings through\nmeticulous fine-tuning and optimization. This comprehensive effort has yielded\nsuperior sentence representation models, showcasing the effectiveness of our\napproach.\n","authors":["Ömer Veysel Çağatan"],"pdf_url":"https://arxiv.org/pdf/2401.15316v3.pdf","comment":"Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2212.10923v2","updated":"2024-02-02T14:06:28Z","published":"2022-12-21T11:12:14Z","title":"Language Models as Inductive Reasoners","summary":" Inductive reasoning is a core component of human intelligence. In past\nresearch on inductive reasoning within computer science, formal language has\nbeen used to represent knowledge (more specifically, facts and rules).\nHowever, formal language can cause systematic problems for inductive reasoning,\nsuch as the inability to handle raw input like natural language, sensitivity\nto mislabeled data, and incapacity to handle ambiguous input. To this end, we\npropose a new paradigm (task) for inductive reasoning, which is to induce\nnatural language rules from natural language facts, and create a dataset termed\nDEER containing 1.2k rule-fact pairs for the task, where rules and facts are\nwritten in natural language. New automatic metrics are also proposed and\nanalysed for the evaluation of this task. With DEER, we investigate a modern\napproach for inductive reasoning where we use natural language as the\nrepresentation for knowledge instead of formal language and use pretrained\nlanguage models as \"reasoners\". Moreover, we provide the first comprehensive\nanalysis of how well pretrained language models can induce natural language\nrules from natural language facts. We also propose a new framework for this\ntask, drawing insights from the philosophy literature, which we show in the\nexperiment section surpasses baselines in both automatic and human evaluations.\nWe discuss our future perspectives for inductive reasoning in Section 7. 
Dataset and code are available at\nhttps://github.com/ZonglinY/Inductive_Reasoning.\n","authors":["Zonglin Yang","Li Dong","Xinya Du","Hao Cheng","Erik Cambria","Xiaodong Liu","Jianfeng Gao","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2212.10923v2.pdf","comment":"Accepted by EACL 2024 (main)"},{"id":"http://arxiv.org/abs/2402.01416v1","updated":"2024-02-02T13:55:37Z","published":"2024-02-02T13:55:37Z","title":"Sequence Shortening for Context-Aware Machine Translation","summary":" Context-aware Machine Translation aims to improve translations of sentences\nby incorporating surrounding sentences as context. Towards this task, two main\narchitectures have been applied, namely single-encoder (based on concatenation)\nand multi-encoder models. In this study, we show that a special case of\nmulti-encoder architecture, where the latent representation of the source\nsentence is cached and reused as the context in the next step, achieves higher\naccuracy on the contrastive datasets (where the models have to rank the correct\ntranslation among the provided sentences) and BLEU and COMET scores comparable\nto the single- and multi-encoder approaches. Furthermore, we investigate the\napplication of Sequence Shortening to the cached representations. We test three\npooling-based shortening techniques and introduce two novel methods - Latent\nGrouping and Latent Selecting, where the network learns to group tokens or to\nselect the tokens to be cached as context. Our experiments show that the two\nmethods achieve BLEU and COMET scores and contrastive-dataset accuracies\ncompetitive with the other tested methods, while potentially allowing for\nhigher interpretability and reducing the growth of memory requirements with\nincreased context size.\n","authors":["Paweł Mąka","Yusuf Can Semerci","Jan Scholtes","Gerasimos Spanakis"],"pdf_url":"https://arxiv.org/pdf/2402.01416v1.pdf","comment":"Findings of the ACL: EACL 2024"},{"id":"http://arxiv.org/abs/2310.11085v2","updated":"2024-02-02T13:50:42Z","published":"2023-10-17T09:10:27Z","title":"Document-Level In-Context Few-Shot Relation Extraction via Pre-Trained\n Language Models","summary":" Relation extraction aims at inferring structured human knowledge from textual\ndocuments. State-of-the-art methods based on language models commonly have two\nlimitations: (1) they require named entities either to be given as input or to\nbe inferred, which introduces additional noise, and (2) they require human\nannotations of documents. As a remedy, we present a novel framework for\ndocument-level in-context few-shot relation extraction via pre-trained language\nmodels. We achieve crucial benefits in that we eliminate the need for both\nnamed entity recognition and human annotation of documents. Unlike existing\nmethods based on fine-tuning, our framework is flexible in that it can be\neasily updated for a new set of relations without re-training. We evaluate our\nframework using DocRED, the largest publicly available dataset for\ndocument-level relation extraction, and demonstrate that our framework achieves\nstate-of-the-art performance. Finally, we show that our framework actually\nperforms much better than the original labels from the development set of\nDocRED. 
To the best of our knowledge, we are the first to reformulate the\ndocument-level relation extraction task as a tailored in-context few-shot\nlearning paradigm.\n","authors":["Yilmazcan Ozyurt","Stefan Feuerriegel","Ce Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.11085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01404v1","updated":"2024-02-02T13:37:07Z","published":"2024-02-02T13:37:07Z","title":"On Measuring Context Utilization in Document-Level MT Systems","summary":" Document-level translation models are usually evaluated using general metrics\nsuch as BLEU, which are not informative about the benefits of context. Current\nwork on context-aware evaluation, such as contrastive methods, only measures\ntranslation accuracy on words that need context for disambiguation. Such\nmeasures cannot reveal whether the translation model uses the correct\nsupporting context. We propose to complement accuracy-based evaluation with\nmeasures of context utilization. We find that perturbation-based analysis\n(comparing models' performance when provided with correct versus random\ncontext) is an effective measure of overall context utilization. For a\nfiner-grained phenomenon-specific evaluation, we propose to measure how much\nthe supporting context contributes to handling context-dependent discourse\nphenomena. We show that automatically-annotated supporting context gives\nsimilar conclusions to human-annotated context and can be used as an\nalternative in cases where human annotations are not available. Finally, we\nhighlight the importance of using discourse-rich datasets when assessing\ncontext utilization.\n","authors":["Wafaa Mohammed","Vlad Niculae"],"pdf_url":"https://arxiv.org/pdf/2402.01404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01391v1","updated":"2024-02-02T13:14:31Z","published":"2024-02-02T13:14:31Z","title":"StepCoder: Improve Code Generation with Reinforcement Learning from\n Compiler Feedback","summary":" The advancement of large language models (LLMs) has significantly propelled\nthe field of code generation. Previous work integrated reinforcement learning\n(RL) with compiler feedback for exploring the output space of LLMs to enhance\ncode generation quality. However, the lengthy code generated by LLMs in\nresponse to complex human requirements makes RL exploration a challenge. Also,\nsince the unit tests may not cover the complicated code, optimizing LLMs by\nusing these unexecuted code snippets is ineffective. To tackle these\nchallenges, we introduce StepCoder, a novel RL framework for code generation,\nconsisting of two main components: CCCS addresses the exploration challenge by\nbreaking the long-sequence code generation task into a Curriculum of Code\nCompletion Subtasks, while FGO only optimizes the model by masking the\nunexecuted code segments to provide Fine-Grained Optimization. In addition, we\nconstruct the APPS+ dataset for RL training, which is manually verified to\nensure the correctness of its unit tests. 
Experimental results show\nthat our method improves the ability to explore the output space and\noutperforms state-of-the-art approaches in corresponding benchmarks.\n","authors":["Shihan Dou","Yan Liu","Haoxiang Jia","Limao Xiong","Enyu Zhou","Junjie Shan","Caishuang Huang","Wei Shen","Xiaoran Fan","Zhiheng Xi","Yuhao Zhou","Tao Ji","Rui Zheng","Qi Zhang","Xuanjing Huang","Tao Gui"],"pdf_url":"https://arxiv.org/pdf/2402.01391v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.01383v1","updated":"2024-02-02T13:06:35Z","published":"2024-02-02T13:06:35Z","title":"LLM-based NLG Evaluation: Current Status and Challenges","summary":" Evaluating natural language generation (NLG) is a vital but challenging\nproblem in artificial intelligence. Traditional evaluation metrics, which\nmainly capture content overlap (e.g., n-gram overlap) between system outputs\nand references, are far from satisfactory, and large language models (LLMs)\nsuch as ChatGPT have demonstrated great potential in NLG evaluation in recent\nyears. Various automatic evaluation methods based on LLMs have been proposed,\nincluding metrics derived from LLMs, prompting LLMs, and fine-tuning LLMs with\nlabeled evaluation data. In this survey, we first give a taxonomy of LLM-based\nNLG evaluation methods and discuss their pros and cons. We also discuss\nhuman-LLM collaboration for NLG evaluation. Lastly, we discuss several open\nproblems in this area and point out future research directions.\n","authors":["Mingqi Gao","Xinyu Hu","Jie Ruan","Xiao Pu","Xiaojun Wan"],"pdf_url":"https://arxiv.org/pdf/2402.01383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01376v1","updated":"2024-02-02T13:00:38Z","published":"2024-02-02T13:00:38Z","title":"LoTR: Low Tensor Rank Weight Adaptation","summary":" In this paper we generalize and extend the idea of low-rank adaptation (LoRA)\nof large language models (LLMs) based on the Transformer architecture. Widely\nused LoRA-like methods of fine-tuning LLMs are based on matrix factorization of\nthe gradient update. We introduce LoTR, a novel approach for parameter-efficient\nfine-tuning of LLMs which represents a gradient update to parameters in the\nform of a tensor decomposition. The low-rank adapter for each layer is\nconstructed as a product of three matrices, and the tensor structure arises\nfrom sharing the left and right multipliers of this product among layers.\nSimultaneous compression of a sequence of layers with a low-rank tensor\nrepresentation allows LoTR to achieve even better parameter efficiency than\nLoRA, especially for deep models. Moreover, the core tensor does not depend on\nthe original weight dimension and can be made arbitrarily small, which allows\nfor extremely cheap and fast downstream fine-tuning.\n","authors":["Daniel Bershatsky","Daria Cherniuk","Talgat Daulbaev","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2402.01376v1.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2402.01375v1","updated":"2024-02-02T12:59:27Z","published":"2024-02-02T12:59:27Z","title":"Dive into the Chasm: Probing the Gap between In- and Cross-Topic\n Generalization","summary":" Pre-trained language models (LMs) perform well in In-Topic setups, where\ntraining and testing data come from the same topics. However, they face\nchallenges in Cross-Topic scenarios where testing data is derived from distinct\ntopics -- such as Gun Control. This study analyzes various LMs with three\nprobing-based experiments to shed light on the reasons behind the In- vs.\nCross-Topic generalization gap. 
Thereby, we demonstrate, for the first time,\nthat generalization gaps and the robustness of the embedding space vary\nsignificantly across LMs. Additionally, we assess larger LMs and underscore the\nrelevance of our analysis for recent models. Overall, diverse pre-training\nobjectives, architectural regularization, or data deduplication contribute to\nmore robust LMs and diminish generalization gaps. Our research contributes to a\ndeeper understanding and comparison of language models across different\ngeneralization scenarios.\n","authors":["Andreas Waldis","Yufang Hou","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2402.01375v1.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2401.10352v2","updated":"2024-02-02T12:35:15Z","published":"2024-01-18T19:42:04Z","title":"Bridging Cultural Nuances in Dialogue Agents through Cultural Value\n Surveys","summary":" The cultural landscape of interactions with dialogue agents is a compelling\nyet relatively unexplored territory. It's clear that various sociocultural\naspects -- from communication styles and beliefs to shared metaphors and\nknowledge -- profoundly impact these interactions. To delve deeper into this\ndynamic, we introduce cuDialog, a first-of-its-kind benchmark for dialogue\ngeneration with a cultural lens. We also develop baseline models capable of\nextracting cultural attributes from dialogue exchanges, with the goal of\nenhancing the predictive accuracy and quality of dialogue agents. To\neffectively co-learn cultural understanding and multi-turn dialogue\npredictions, we propose to incorporate cultural dimensions with dialogue\nencoding features. Our experimental findings highlight that incorporating\ncultural value surveys boosts alignment with references and cultural markers,\ndemonstrating its considerable influence on personalization and dialogue\nquality. To facilitate further exploration in this exciting domain, we make\nour benchmark publicly available at https://github.com/yongcaoplus/cuDialog.\n","authors":["Yong Cao","Min Chen","Daniel Hershcovich"],"pdf_url":"https://arxiv.org/pdf/2401.10352v2.pdf","comment":"17 pages, 7 figures, EACL 2024 findings"},{"id":"http://arxiv.org/abs/2401.03462v2","updated":"2024-02-02T12:34:25Z","published":"2024-01-07T11:57:40Z","title":"Soaring from 4K to 400K: Extending LLM's Context with Activation Beacon","summary":" The utilization of long contexts poses a big challenge for LLMs due to their\nlimited context window size. Although the context window can be extended\nthrough fine-tuning, it will result in a considerable cost at both training and\ninference time, and exert an unfavorable impact on the LLM's original\ncapabilities. In this work, we propose a new method called Activation Beacon,\nwhich condenses the LLM's raw activations into compact forms such that the LLM can\nperceive a longer context with a limited context window. Activation Beacon is\nintroduced as a plug-in module, which fully preserves the LLM's original\ncapability in short contexts. It works with a sliding window to\nprocess the long context in a streaming fashion, which leads to competitive memory and time\nefficiency in both training and inference. Activation Beacon is trained with\nshort-sequence data of diversified condensing ratios. Thanks to such a\ntreatment, it can be effectively learned to support different context lengths\nwith a small training cost. 
Our experiment verifies Activation Beacon's\neffectiveness on context extension: it can remarkably accomplish high-quality\nextension of Llama-2-7B's context by $\\times 100$ (from 4K to 400K);\nmeanwhile, it can also achieve superior performance across a variety of\nlong-context language modeling and understanding tasks. The source code and\nmodel checkpoint are available at\n\\url{https://github.com/FlagOpen/FlagEmbedding}.\n","authors":["Peitian Zhang","Zheng Liu","Shitao Xiao","Ninglu Shao","Qiwei Ye","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2401.03462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01364v1","updated":"2024-02-02T12:34:09Z","published":"2024-02-02T12:34:09Z","title":"Continual Learning for Large Language Models: A Survey","summary":" Large language models (LLMs) are not amenable to frequent re-training, due to\nhigh training costs arising from their massive scale. However, updates are\nnecessary to endow LLMs with new skills and keep them up-to-date with rapidly\nevolving human knowledge. This paper surveys recent works on continual learning\nfor LLMs. Due to the unique nature of LLMs, we catalog continual learning\ntechniques in a novel multi-staged categorization scheme, involving continual\npretraining, instruction tuning, and alignment. We contrast continual learning\nfor LLMs with simpler adaptation methods used in smaller models, as well as\nwith other enhancement strategies like retrieval-augmented generation and model\nediting. Moreover, informed by a discussion of benchmarks and evaluation, we\nidentify several challenges and future work directions for this crucial task.\n","authors":["Tongtong Wu","Linhao Luo","Yuan-Fang Li","Shirui Pan","Thuy-Trang Vu","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2402.01364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01360v1","updated":"2024-02-02T12:27:58Z","published":"2024-02-02T12:27:58Z","title":"What Makes Medical Claims (Un)Verifiable? Analyzing Entity and Relation\n Properties for Fact Verification","summary":" Biomedical claim verification fails if no evidence can be discovered. In\nthese cases, the fact-checking verdict remains unknown and the claim is\nunverifiable. To improve upon this, we have to understand if there are any\nclaim properties that impact its verifiability. In this work, we assume that\nentities and relations define the core variables in a biomedical claim's\nanatomy and analyze if their properties help us to differentiate verifiable\nfrom unverifiable claims. In a study with trained annotation experts, we prompt\nthem to find evidence for biomedical claims, and observe how they refine search\nqueries for their evidence search. This leads to the first corpus for\nscientific fact verification annotated with subject-relation-object triplets,\nevidence documents, and fact-checking verdicts (the BEAR-Fact corpus). We find\n(1) that discovering evidence for negated claims (e.g., X-does-not-cause-Y) is\nparticularly challenging. Further, we see that annotators process queries\nmostly by adding constraints to the search and by normalizing entities to\ncanonical names. (2) We compare our in-house annotations with a small\ncrowdsourcing setting where we employ medical experts and laypeople. We find\nthat domain expertise does not have a substantial effect on the reliability of\nannotations. 
Finally, (3) we demonstrate that it is possible to reliably\nestimate the success of evidence retrieval purely from the claim text (.82 F1),\nwhereas identifying unverifiable claims proves more challenging (.27 F1). The\ndataset is available at http://www.ims.uni-stuttgart.de/data/bioclaim.\n","authors":["Amelie Wührl","Yarik Menchaca Resendiz","Lara Grimminger","Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2402.01360v1.pdf","comment":"Accepted at EACL 2024"},{"id":"http://arxiv.org/abs/2307.16513v2","updated":"2024-02-02T12:16:12Z","published":"2023-07-31T09:27:01Z","title":"Deception Abilities Emerged in Large Language Models","summary":" Large language models (LLMs) are currently at the forefront of intertwining\nartificial intelligence (AI) systems with human communication and everyday\nlife. Thus, aligning them with human values is of great importance. However,\ngiven the steady increase in reasoning abilities, future LLMs are under\nsuspicion of becoming able to deceive human operators and of utilizing this\nability to bypass monitoring efforts. As a prerequisite to this, LLMs need to\npossess a conceptual understanding of deception strategies. This study reveals\nthat such strategies emerged in state-of-the-art LLMs, such as GPT-4, but were\nnon-existent in earlier LLMs. We conduct a series of experiments showing that\nstate-of-the-art LLMs are able to understand and induce false beliefs in other\nagents, that their performance in complex deception scenarios can be amplified\nutilizing chain-of-thought reasoning, and that eliciting Machiavellianism in\nLLMs can alter their propensity to deceive. In sum, revealing hitherto unknown\nmachine behavior in LLMs, our study contributes to the nascent field of machine\npsychology.\n","authors":["Thilo Hagendorff"],"pdf_url":"https://arxiv.org/pdf/2307.16513v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01352v1","updated":"2024-02-02T12:11:16Z","published":"2024-02-02T12:11:16Z","title":"Describing Images $\\textit{Fast and Slow}$: Quantifying and Predicting\n the Variation in Human Signals during Visuo-Linguistic Processes","summary":" There is an intricate relation between the properties of an image and how\nhumans behave while describing the image. This behavior shows ample variation,\nas manifested in human signals such as eye movements and when humans start to\ndescribe the image. Despite the value of such signals of visuo-linguistic\nvariation, they are virtually disregarded in the training of current pretrained\nmodels, which motivates further investigation. Using a corpus of Dutch image\ndescriptions with concurrently collected eye-tracking data, we explore the\nnature of the variation in visuo-linguistic signals, and find that they\ncorrelate with each other. Given this result, we hypothesize that variation\nstems partly from the properties of the images, and explore whether image\nrepresentations encoded by pretrained vision encoders can capture such\nvariation. 
Our results indicate that pretrained models do so to a\nweak-to-moderate degree, suggesting that the models lack biases about what\nmakes a stimulus complex for humans and what leads to variations in human\noutputs.\n","authors":["Ece Takmaz","Sandro Pezzelle","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2402.01352v1.pdf","comment":"To appear in EACL 2024"},{"id":"http://arxiv.org/abs/2402.01349v1","updated":"2024-02-02T12:07:00Z","published":"2024-02-02T12:07:00Z","title":"Beyond the Answers: Reviewing the Rationality of Multiple Choice\n Question Answering for the Evaluation of Large Language Models","summary":" In the field of natural language processing (NLP), Large Language Models\n(LLMs) have precipitated a paradigm shift, markedly enhancing performance in\nnatural language generation tasks. Despite these advancements, the\ncomprehensive evaluation of LLMs remains an inevitable challenge for the\ncommunity. Recently, the utilization of Multiple Choice Question Answering\n(MCQA) as a benchmark for LLMs has gained considerable traction. This study\ninvestigates the rationality of MCQA as an evaluation method for LLMs. If LLMs\ngenuinely understand the semantics of questions, their performance should\nexhibit consistency across the varied configurations derived from the same\nquestions. Contrary to this expectation, our empirical findings suggest a\nnotable disparity in the consistency of LLM responses, which we define as the\nREsponse VAriability Syndrome (REVAS) of the LLMs. This indicates that current\nMCQA-based benchmarks may not adequately capture the true capabilities of LLMs\nand underscores the need for more robust evaluation mechanisms in assessing\nthe performance of LLMs.\n","authors":["Haochun Wang","Sendong Zhao","Zewen Qiang","Bing Qin","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01349v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.01345v1","updated":"2024-02-02T12:02:46Z","published":"2024-02-02T12:02:46Z","title":"Skip $\\textbackslash n$: A simple method to reduce hallucination in\n Large Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nfor multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks ('$\\textbackslash n\\textbackslash n$'),\nwhere the content before and after '$\\textbackslash n\\textbackslash n$' in the\ntraining data frequently exhibits significant semantic changes. This pattern\nleads the model to infer that the contents following '$\\textbackslash\nn\\textbackslash n$' should be obviously different from the preceding contents\nwith less hallucinatory descriptions, thereby increasing the probability of\nhallucinatory descriptions subsequent to the '$\\textbackslash n\\textbackslash\nn$'. We have validated this hypothesis on multiple publicly available LVLMs.\nBesides, we find that deliberately inserting '$\\textbackslash n\\textbackslash\nn$' into the generated description can induce more hallucinations. 
A simple\nmethod is proposed to effectively mitigate hallucination in LVLMs by\nskipping the output of '$\\textbackslash n$'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2402.01300v1","updated":"2024-02-02T10:42:06Z","published":"2024-02-02T10:42:06Z","title":"Two Approaches to Diachronic Normalization of Polish Texts","summary":" This paper discusses two approaches to the diachronic normalization of Polish\ntexts: a rule-based solution that relies on a set of handcrafted patterns, and\na neural normalization model based on the text-to-text transfer transformer\narchitecture. The training and evaluation data prepared for the task are\ndiscussed in detail, along with experiments conducted to compare the proposed\nnormalization solutions. A quantitative and qualitative analysis is made. It is\nshown that at the current stage of inquiry into the problem, the rule-based\nsolution outperforms the neural one on 3 out of 4 variants of the prepared\ndataset, although in practice both approaches have distinct advantages and\ndisadvantages.\n","authors":["Kacper Dudzic","Filip Graliński","Krzysztof Jassem","Marek Kubis","Piotr Wierzchoń"],"pdf_url":"https://arxiv.org/pdf/2402.01300v1.pdf","comment":"Accepted to the LaTeCH-CLfL 2024 workshop"},{"id":"http://arxiv.org/abs/2312.16903v2","updated":"2024-02-02T10:37:53Z","published":"2023-12-28T08:53:27Z","title":"Spike No More: Stabilizing the Pre-training of Large Language Models","summary":" Loss spikes often occur during pre-training of large language models. The\nspikes degrade the performance of large language models and sometimes ruin the\npre-training. Since the pre-training needs a vast computational budget, we\nshould avoid such spikes. To investigate the cause of loss spikes, we focus on\ngradients of internal layers. Through theoretical analyses, we reveal two\ncauses of the exploding gradients, and provide requirements to prevent the\nexplosion. In addition, we propose a method to satisfy the requirements by\ncombining the initialization method and a simple modification to embeddings. We\nconduct various experiments to verify our theoretical analyses empirically.\nExperimental results indicate that the combination is effective in preventing\nspikes during pre-training.\n","authors":["Sho Takase","Shun Kiyono","Sosuke Kobayashi","Jun Suzuki"],"pdf_url":"https://arxiv.org/pdf/2312.16903v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2402.01293v1","updated":"2024-02-02T10:30:05Z","published":"2024-02-02T10:30:05Z","title":"Can MLLMs Perform Text-to-Image In-Context Learning?","summary":" The evolution from Large Language Models (LLMs) to Multimodal Large Language\nModels (MLLMs) has spurred research into extending In-Context Learning (ICL) to\nits multimodal counterpart. Existing studies have primarily concentrated\non image-to-text ICL. However, the Text-to-Image ICL (T2I-ICL), with its unique\ncharacteristics and potential applications, remains underexplored. To address\nthis gap, we formally define the task of T2I-ICL and present CoBSAT, the first\nT2I-ICL benchmark dataset, encompassing ten tasks. Utilizing our dataset to\nbenchmark six state-of-the-art MLLMs, we uncover considerable difficulties\nMLLMs encounter in solving T2I-ICL. We identify the primary challenges as the\ninherent complexity of multimodality and image generation. 
To overcome these\nchallenges, we explore strategies like fine-tuning and Chain-of-Thought\nprompting, demonstrating notable improvements. Our code and dataset are\navailable at \\url{https://github.com/UW-Madison-Lee-Lab/CoBSAT}.\n","authors":["Yuchen Zeng","Wonjun Kang","Yicong Chen","Hyung Il Koo","Kangwook Lee"],"pdf_url":"https://arxiv.org/pdf/2402.01293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07303v3","updated":"2024-02-02T10:10:40Z","published":"2023-05-12T08:16:06Z","title":"Multi-Relational Hyperbolic Word Embeddings from Natural Language\n Definitions","summary":" Natural language definitions possess a recursive, self-explanatory semantic\nstructure that can support representation learning methods able to preserve\nexplicit conceptual relations and constraints in the latent space. This paper\npresents a multi-relational model that explicitly leverages such a structure to\nderive word embeddings from definitions. By automatically extracting the\nrelations linking defined and defining terms from dictionaries, we demonstrate\nhow the problem of learning word embeddings can be formalised via a\ntranslational framework in Hyperbolic space and used as a proxy to capture the\nglobal semantic structure of definitions. An extensive empirical analysis\ndemonstrates that the framework can help impose the desired structural\nconstraints while preserving the semantic mapping required for controllable and\ninterpretable traversal. Moreover, the experiments reveal the superiority of\nthe Hyperbolic word embeddings over the Euclidean counterparts and demonstrate\nthat the multi-relational approach can obtain competitive results when compared\nto state-of-the-art neural models, with the advantage of being intrinsically\nmore efficient and interpretable.\n","authors":["Marco Valentino","Danilo S. Carvalho","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2305.07303v3.pdf","comment":"Accepted at the 18th Conference of the European Chapter of the\n Association for Computational Linguistics (EACL 2024), camera-ready"},{"id":"http://arxiv.org/abs/2401.06683v2","updated":"2024-02-02T09:54:18Z","published":"2024-01-12T16:43:28Z","title":"DQNC2S: DQN-based Cross-stream Crisis event Summarizer","summary":" Summarizing multiple disaster-relevant data streams simultaneously is\nparticularly challenging as existing Retrieve&Re-ranking strategies suffer from\nthe inherent redundancy of multi-stream data and limited scalability in a\nmulti-query setting. This work proposes an online approach to crisis timeline\ngeneration based on weak annotation with Deep Q-Networks. It selects on-the-fly\nthe relevant pieces of text without requiring either human annotations or\ncontent re-ranking. This makes the inference time independent of the number of\ninput queries. The proposed approach also incorporates a redundancy filter into\nthe reward function to effectively handle cross-stream content overlaps. 
The\nachieved ROUGE and BERTScore results are superior to those of the best-performing\nmodels on the CrisisFACTS 2022 benchmark.\n","authors":["Daniele Rege Cambrin","Luca Cagliero","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2401.06683v2.pdf","comment":"accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2402.01267v1","updated":"2024-02-02T09:41:51Z","published":"2024-02-02T09:41:51Z","title":"The Human and the Mechanical: logos, truthfulness, and ChatGPT","summary":" The paper addresses the question of whether it is appropriate to talk about\n`mechanical minds' at all, and whether ChatGPT models can indeed be thought of\nas realizations of that. Our paper adds a semantic argument to the current\ndebate. The act of human assertion requires the formation of a veridicality\njudgment. Modification of assertions with modals (John must be at home) and the\nuse of subjective elements (John is obviously at home) indicate that the\nspeaker is manipulating her judgments and, in a cooperative context, intends\nher epistemic state to be transparent to the addressee. Veridicality judgments\nare formed on the basis of two components: (i) evidence that relates to reality\n(exogenous evidence) and (ii) endogenous evidence, such as preferences and\nprivate beliefs. `Mechanical minds' lack these two components: (i) they do not\nrelate to reality and (ii) do not have endogenous evidence. Therefore, they lack\nthe ability to form a belief about the world and veridicality judgments\naltogether. They can only mimic that judgment, but the output is not grounded in\nthe very foundations for it.\n","authors":["Anastasia Giannakidou","Alda Mari"],"pdf_url":"https://arxiv.org/pdf/2402.01267v1.pdf","comment":"Under submission"},{"id":"http://arxiv.org/abs/2401.04592v2","updated":"2024-02-02T09:36:58Z","published":"2024-01-09T14:50:04Z","title":"An Assessment on Comprehending Mental Health through Large Language\n Models","summary":" Mental health challenges pose considerable global burdens on individuals and\ncommunities. Recent data indicates that more than 20% of adults may encounter\nat least one mental disorder in their lifetime. On the one hand, the\nadvancements in large language models have facilitated diverse applications,\nyet a significant research gap persists in understanding and enhancing the\npotential of large language models within the domain of mental health. On the\nother hand, across various applications, an outstanding question involves the\ncapacity of large language models to comprehend expressions of human mental\nhealth conditions in natural language. This study presents an initial\nevaluation of large language models in addressing this gap. To this end, we\ncompare the performance of Llama-2 and ChatGPT with classical machine learning as well\nas deep learning models. Our results on the DAIC-WOZ dataset show that\ntransformer-based models, like BERT or XLNet, outperform the large language\nmodels.\n","authors":["Mihael Arcan","David-Paul Niland","Fionn Delahunty"],"pdf_url":"https://arxiv.org/pdf/2401.04592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08417v3","updated":"2024-02-02T09:10:11Z","published":"2024-01-16T15:04:51Z","title":"Contrastive Preference Optimization: Pushing the Boundaries of LLM\n Performance in Machine Translation","summary":" Moderate-sized large language models (LLMs) -- those with 7B or 13B\nparameters -- exhibit promising machine translation (MT) performance. 
However,\neven the top-performing 13B LLM-based translation models, like ALMA, do not\nmatch the performance of state-of-the-art conventional encoder-decoder\ntranslation models or larger-scale LLMs such as GPT-4. In this study, we bridge\nthis performance gap. We first assess the shortcomings of supervised\nfine-tuning for LLMs in the MT task, emphasizing the quality issues present in\nthe reference data, despite being human-generated. Then, in contrast to SFT\nwhich mimics reference translations, we introduce Contrastive Preference\nOptimization (CPO), a novel approach that trains models to avoid generating\nadequate but not perfect translations. Applying CPO to ALMA models with only\n22K parallel sentences and 12M parameters yields significant improvements. The\nresulting model, called ALMA-R, can match or exceed the performance of the WMT\ncompetition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets.\n","authors":["Haoran Xu","Amr Sharaf","Yunmo Chen","Weiting Tan","Lingfeng Shen","Benjamin Van Durme","Kenton Murray","Young Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08417v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11855v2","updated":"2024-02-02T08:28:01Z","published":"2023-11-20T15:50:09Z","title":"Evil Geniuses: Delving into the Safety of LLM-based Agents","summary":" Rapid advancements in large language models (LLMs) have revitalized interest in\nLLM-based agents, exhibiting impressive human-like behaviors and cooperative\ncapabilities in various scenarios. However, these agents also bring some\nexclusive risks, stemming from the complexity of interaction environments and\nthe usability of tools. This paper delves into the safety of LLM-based agents\nfrom three perspectives: agent quantity, role definition, and attack level.\nSpecifically, we initially propose to employ a template-based attack strategy\non LLM-based agents to study the influence of agent quantity. In addition, to\naddress interaction environment and role specificity issues, we introduce Evil\nGeniuses (EG), an effective attack method that autonomously generates prompts\nrelated to the original role to examine the impact across various role\ndefinitions and attack levels. EG leverages Red-Blue exercises, significantly\nimproving the generated prompt aggressiveness and similarity to original roles.\nOur evaluations on CAMEL, Metagpt and ChatDev based on GPT-3.5 and GPT-4\ndemonstrate high success rates. Extensive evaluation and discussion reveal that\nthese agents are less robust, prone to more harmful behaviors, and capable of\ngenerating stealthier content than LLMs, highlighting significant safety\nchallenges and guiding future research. Our code is available at\nhttps://github.com/T1aNS1R/Evil-Geniuses.\n","authors":["Yu Tian","Xiao Yang","Jingyuan Zhang","Yinpeng Dong","Hang Su"],"pdf_url":"https://arxiv.org/pdf/2311.11855v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2305.12641v3","updated":"2024-02-02T07:57:05Z","published":"2023-05-22T02:31:15Z","title":"A Comprehensive Survey of Sentence Representations: From the BERT Epoch\n to the ChatGPT Era and Beyond","summary":" Sentence representations are a critical component in NLP applications such as\nretrieval, question answering, and text classification. They capture the\nmeaning of a sentence, enabling machines to understand and reason over human\nlanguage. 
In recent years, significant progress has been made in developing\nmethods for learning sentence representations, including unsupervised,\nsupervised, and transfer learning approaches. However, there has been no literature\nreview of sentence representations to date. In this paper, we provide an\noverview of the different methods for sentence representation learning,\nfocusing mostly on deep learning models. We provide a systematic organization\nof the literature, highlighting the key contributions and challenges in this\narea. Overall, our review highlights the importance of this area in natural\nlanguage processing, the progress made in sentence representation learning, and\nthe challenges that remain. We conclude with directions for future research,\nsuggesting potential avenues for improving the quality and efficiency of\nsentence representations.\n","authors":["Abhinav Ramesh Kashyap","Thanh-Tung Nguyen","Viktor Schlegel","Stefan Winkler","See-Kiong Ng","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2305.12641v3.pdf","comment":"Accepted to EACL'24"},{"id":"http://arxiv.org/abs/2402.00559v2","updated":"2024-02-02T07:32:41Z","published":"2024-02-01T12:46:45Z","title":"A Chain-of-Thought Is as Strong as Its Weakest Link: A Benchmark for\n Verifiers of Reasoning Chains","summary":" Prompting language models to provide step-by-step answers (e.g.,\n\"Chain-of-Thought\") is the prominent approach for complex reasoning tasks,\nwhere more accurate reasoning chains typically improve downstream task\nperformance. Recent literature discusses automatic methods to verify reasoning\nsteps to evaluate and improve their correctness. However, no fine-grained\nstep-level datasets are available to enable thorough evaluation of such\nverification methods, hindering progress in this direction. We introduce\nReveal: Reasoning Verification Evaluation, a new dataset to benchmark automatic\nverifiers of complex Chain-of-Thought reasoning in open-domain question\nanswering settings. Reveal includes comprehensive labels for the relevance,\nattribution to evidence passages, and logical correctness of each reasoning\nstep in a language model's answer, across a wide variety of datasets and\nstate-of-the-art language models.\n","authors":["Alon Jacovi","Yonatan Bitton","Bernd Bohnet","Jonathan Herzig","Or Honovich","Michael Tseng","Michael Collins","Roee Aharoni","Mor Geva"],"pdf_url":"https://arxiv.org/pdf/2402.00559v2.pdf","comment":"Dataset at https://huggingface.co/datasets/google/reveal"},{"id":"http://arxiv.org/abs/2402.01182v1","updated":"2024-02-02T06:57:53Z","published":"2024-02-02T06:57:53Z","title":"In-Context Learning for Few-Shot Nested Named Entity Recognition","summary":" In nested named entity recognition (NER), entities are nested within each\nother, thus requiring more data annotation. This leads to the\ndevelopment of few-shot nested NER, where the prevalence of pretrained language\nmodels with in-context learning (ICL) offers promising solutions. In this work,\nwe introduce an effective and innovative ICL framework for the setting of\nfew-shot nested NER. We improve the ICL prompt by devising a novel example\ndemonstration selection mechanism, EnDe retriever. In EnDe retriever, we employ\ncontrastive learning to perform three types of representation learning, in\nterms of semantic similarity, boundary similarity, and label similarity, to\ngenerate high-quality demonstration examples. 
Extensive experiments over three\nnested NER and four flat NER datasets demonstrate the efficacy of our system.\n","authors":["Meishan Zhang","Bin Wang","Hao Fei","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01182v1.pdf","comment":"5 figures"},{"id":"http://arxiv.org/abs/2312.07950v2","updated":"2024-02-02T06:55:52Z","published":"2023-12-13T07:56:27Z","title":"CBQ: Cross-Block Quantization for Large Language Models","summary":" Post-training quantization (PTQ) has played a key role in compressing large\nlanguage models (LLMs) with ultra-low costs. However, existing PTQ methods only\nfocus on handling the outliers within one layer or one block, which ignores the\ndependency of blocks and leads to severe performance degradation in low-bit\nsettings. In this paper, we propose CBQ, a cross-block reconstruction-based PTQ\nmethod for LLMs. CBQ employs a cross-block dependency using a homologous\nreconstruction scheme, establishing long-range dependencies across multiple\nblocks to minimize error accumulation. Furthermore, CBQ incorporates a\ncoarse-to-fine preprocessing (CFP) strategy for suppressing weight and\nactivation outliers, coupled with an adaptive LoRA-Rounding technique for\nprecise weight quantization. These innovations enable CBQ to not only handle\nextreme outliers effectively but also improve overall quantization accuracy.\nExtensive experiments show that CBQ achieves superior low-bit quantization\n(W4A4, W4A8, W2A16) and outperforms existing state-of-the-art methods across\nvarious LLMs and datasets. Notably, CBQ quantizes the 4-bit LLAMA1-65B model\nwithin only 4.3 hours on a single GPU, achieving a commendable tradeoff between\nperformance and quantization efficiency.\n","authors":["Xin Ding","Xiaoyu Liu","Zhijun Tu","Yun Zhang","Wei Li","Jie Hu","Hanting Chen","Yehui Tang","Zhiwei Xiong","Baoqun Yin","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2312.07950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05112v3","updated":"2024-02-02T06:48:24Z","published":"2023-11-09T02:55:58Z","title":"A Survey of Large Language Models in Medicine: Principles, Applications,\n and Challenges","summary":" Large language models (LLMs), such as ChatGPT, have received substantial\nattention due to their capabilities for understanding and generating human\nlanguage. The use of LLMs in medicine to assist physicians with patient care is emerging\nas a promising research direction in both artificial intelligence and clinical\nmedicine. This review provides a comprehensive overview of the principles,\napplications, and challenges faced by LLMs in medicine. We address the\nfollowing specific questions: 1) How should medical LLMs be built? 2) What are\nthe measures for the downstream performance of medical LLMs? 3) How should\nmedical LLMs be utilized in real-world clinical practice? 4) What challenges\narise from the use of medical LLMs? and 5) How should we better construct and\nutilize medical LLMs? This review aims to provide insights into the\nopportunities and challenges of LLMs in medicine, and serve as a practical\nresource for constructing effective medical LLMs. We also maintain a\nregularly updated list of practical guides on medical LLMs at\nhttps://github.com/AI-in-Health/MedLLMsPracticalGuide.\n","authors":["Hongjian Zhou","Fenglin Liu","Boyang Gu","Xinyu Zou","Jinfa Huang","Jinge Wu","Yiru Li","Sam S. Chen","Peilin Zhou","Junling Liu","Yining Hua","Chengfeng Mao","Chenyu You","Xian Wu","Yefeng Zheng","Lei Clifton","Zheng Li","Jiebo Luo","David A. 
Clifton"],"pdf_url":"https://arxiv.org/pdf/2311.05112v3.pdf","comment":"Preprint. Version 3. 54 pages"},{"id":"http://arxiv.org/abs/2402.01176v1","updated":"2024-02-02T06:44:22Z","published":"2024-02-02T06:44:22Z","title":"Towards a Unified Language Model for Knowledge-Intensive Tasks Utilizing\n External Corpus","summary":" The advent of large language models (LLMs) has showcased their efficacy\nacross various domains, yet they often hallucinate, especially in\nknowledge-intensive tasks that require external knowledge sources. To improve\nthe factual accuracy of language models, retrieval-augmented generation (RAG) has\nemerged as a popular solution. However, traditional retrieval modules often\nrely on large-scale document indexes, which can be disconnected from generative\ntasks. Through the generative retrieval (GR) approach, language models can achieve\nsuperior retrieval performance by directly generating relevant document\nidentifiers (DocIDs). However, the relationship between GR and downstream\ntasks, as well as the potential of LLMs in GR, remains unexplored. In this\npaper, we present a unified language model that utilizes an external corpus to\nhandle various knowledge-intensive tasks by seamlessly integrating generative\nretrieval, closed-book generation, and RAG. In order to achieve effective\nretrieval and generation through a unified continuous decoding process, we\nintroduce the following mechanisms: (1) a ranking-oriented DocID decoding\nstrategy, which improves ranking ability by directly learning from a DocID\nranking list; (2) a continuous generation strategy to facilitate effective and\nefficient RAG; (3) well-designed auxiliary DocID understanding tasks to enhance\nthe model's comprehension of DocIDs and their relevance to downstream tasks.\nOur approach is evaluated on the widely used KILT benchmark using two variants\nof backbone models: an encoder-decoder T5 model and a decoder-only LLM, Llama2.\nExperimental results showcase the superior performance of our models in both\nretrieval and downstream knowledge-intensive tasks.\n","authors":["Xiaoxi Li","Zhicheng Dou","Yujia Zhou","Fangchao Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01173v1","updated":"2024-02-02T06:34:11Z","published":"2024-02-02T06:34:11Z","title":"Efficient Prompt Caching via Embedding Similarity","summary":" Large language models (LLMs) have achieved huge success in numerous natural\nlanguage processing (NLP) tasks. However, they face the challenge of significant\nresource consumption during inference. In this paper, we aim to improve the\ninference efficiency of LLMs by prompt caching, i.e., if the current prompt can\nbe answered by the same response of a previous prompt, one can directly utilize\nthat previous response without calling the LLM. Specifically, we focus on the\nprediction accuracy of prompt caching for single-round question-answering tasks\nvia embedding similarity. The existing embeddings of prompts mostly focus on\nwhether two prompts are semantically similar, which is not necessarily\nequivalent to whether the same response can answer them. Therefore, we propose\na distillation-based method to fine-tune the existing embeddings for better\ncaching prediction. Theoretically, we provide finite-sample guarantees for the\nconvergence of our method under different types of loss functions. Empirically,\nwe carefully construct a hard dataset based on Kwiatkowski et al. 
(2019) where\nthe existing embedding model (Wang et al., 2022) only achieves an AUC of 0.51.\nWe then fine-tune the above embedding model, which significantly improves the\nAUC of caching prediction from 0.51 to 0.81. We also conduct simulations\ndemonstrating that our trained models achieve better caching efficiency than\nthe previous embedding model.\n","authors":["Hanlin Zhu","Banghua Zhu","Jiantao Jiao"],"pdf_url":"https://arxiv.org/pdf/2402.01173v1.pdf","comment":"21 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.01172v1","updated":"2024-02-02T06:31:50Z","published":"2024-02-02T06:31:50Z","title":"Streaming Sequence Transduction through Dynamic Compression","summary":" We introduce STAR (Stream Transduction with Anchor Representations), a novel\nTransformer-based model designed for efficient sequence-to-sequence\ntransduction over streams. STAR dynamically segments input streams to create\ncompressed anchor representations, achieving nearly lossless compression (12x)\nin Automatic Speech Recognition (ASR) and outperforming existing methods.\nMoreover, STAR demonstrates superior segmentation and latency-quality\ntrade-offs in simultaneous speech-to-text tasks, optimizing latency, memory\nfootprint, and quality.\n","authors":["Weiting Tan","Yunmo Chen","Tongfei Chen","Guanghui Qin","Haoran Xu","Heidi C. Zhang","Benjamin Van Durme","Philipp Koehn"],"pdf_url":"https://arxiv.org/pdf/2402.01172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01158v1","updated":"2024-02-02T05:54:12Z","published":"2024-02-02T05:54:12Z","title":"LLM-Detector: Improving AI-Generated Chinese Text Detection with\n Open-Source LLM Instruction Tuning","summary":" ChatGPT and other general large language models (LLMs) have achieved\nremarkable success, but they have also raised concerns about the misuse of\nAI-generated texts. Existing AI-generated text detection models, such as those based\non BERT and RoBERTa, are prone to in-domain over-fitting, leading to poor\nout-of-domain (OOD) detection performance. In this paper, we first collected\nChinese text responses generated by human experts and 9 types of LLMs in\nresponse to questions from multiple domains, and further created a dataset that mixed\nhuman-written sentences and sentences polished by LLMs. We then proposed\nLLM-Detector, a novel method for both document-level and sentence-level text\ndetection through Instruction Tuning of LLMs. Our method leverages the wealth\nof knowledge LLMs acquire during pre-training, enabling them to detect the text\nthey generate. Instruction tuning aligns the model's responses with the user's\nexpected text detection tasks. Experimental results show that previous methods\nstruggle with sentence-level AI-generated text detection and OOD detection. In\ncontrast, our proposed method not only significantly outperforms baseline\nmethods in both sentence-level and document-level text detection but also\ndemonstrates strong generalization capabilities. 
Furthermore, since\nLLM-Detector is trained based on open-source LLMs, it is easy to customize for\ndeployment.\n","authors":["Rongsheng Wang","Haoming Chen","Ruizhe Zhou","Han Ma","Yaofei Duan","Yanlan Kang","Songhua Yang","Baoyu Fan","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2402.01158v1.pdf","comment":"17 pages, 13 tables, 7 figures"},{"id":"http://arxiv.org/abs/2402.01155v1","updated":"2024-02-02T05:48:39Z","published":"2024-02-02T05:48:39Z","title":"CABINET: Content Relevance based Noise Reduction for Table Question\n Answering","summary":" The table understanding capability of Large Language Models (LLMs) has been\nextensively studied through the task of question-answering (QA) over tables.\nTypically, only a small part of the whole table is relevant to derive the\nanswer for a given question. The irrelevant parts act as noise and are\ndistracting information, resulting in sub-optimal performance due to the\nvulnerability of LLMs to noise. To mitigate this, we propose CABINET (Content\nRelevAnce-Based NoIse ReductioN for TablE QuesTion-Answering) - a framework to\nenable LLMs to focus on relevant tabular data by suppressing extraneous\ninformation. CABINET comprises an Unsupervised Relevance Scorer (URS), trained\ndifferentially with the QA LLM, that weighs the table content based on its\nrelevance to the input question before feeding it to the question-answering LLM\n(QA LLM). To further aid the relevance scorer, CABINET employs a weakly\nsupervised module that generates a parsing statement describing the criteria of\nrows and columns relevant to the question and highlights the content of\ncorresponding table cells. CABINET significantly outperforms various tabular\nLLM baselines, as well as GPT3-based in-context learning methods, is more\nrobust to noise, maintains outperformance on tables of varying sizes, and\nestablishes new SoTA performance on WikiTQ, FeTaQA, and WikiSQL datasets. We\nrelease our code and datasets at https://github.com/Sohanpatnaik106/CABINET_QA.\n","authors":["Sohan Patnaik","Heril Changwal","Milan Aggarwal","Sumita Bhatia","Yaman Kumar","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2402.01155v1.pdf","comment":"Accepted at ICLR 2024 (spotlight)"},{"id":"http://arxiv.org/abs/2402.01152v1","updated":"2024-02-02T05:38:59Z","published":"2024-02-02T05:38:59Z","title":"AccentFold: A Journey through African Accents for Zero-Shot ASR\n Adaptation to Target Accents","summary":" Despite advancements in speech recognition, accented speech remains\nchallenging. While previous approaches have focused on modeling techniques or\ncreating accented speech datasets, gathering sufficient data for the multitude\nof accents, particularly in the African context, remains impractical due to\ntheir sheer diversity and associated budget constraints. To address these\nchallenges, we propose \\textit{AccentFold}, a method that exploits spatial\nrelationships between learned accent embeddings to improve downstream Automatic\nSpeech Recognition (ASR). Our exploratory analysis of speech embeddings\nrepresenting 100+ African accents reveals interesting spatial accent\nrelationships, highlighting geographic and genealogical similarities and capturing\nconsistent phonological and morphological regularities, all learned\nempirically from speech. Furthermore, we discover accent relationships\npreviously uncharacterized by the Ethnologue. 
Through empirical evaluation, we\ndemonstrate the effectiveness of AccentFold by showing that, for\nout-of-distribution (OOD) accents, sampling accent subsets for training based\non AccentFold information outperforms strong baselines with a relative WER\nimprovement of 4.6%. AccentFold presents a promising approach for improving ASR\nperformance on accented speech, particularly in the context of African accents,\nwhere data scarcity and budget constraints pose significant challenges. Our\nfindings emphasize the potential of leveraging linguistic relationships to\nimprove zero-shot ASR adaptation to target accents.\n","authors":["Abraham Toluwase Owodunni","Aditya Yadavalli","Chris Chinenye Emezue","Tobi Olatunji","Clinton C Mbataku"],"pdf_url":"https://arxiv.org/pdf/2402.01152v1.pdf","comment":"Accepted to EACL Findings 2024"},{"id":"http://arxiv.org/abs/2312.05356v2","updated":"2024-02-02T04:31:00Z","published":"2023-12-08T20:28:08Z","title":"Neuron Patching: Neuron-level Model Editing on Code Generation and LLMs","summary":" Large Language Models are successfully adopted in software engineering,\nespecially in code generation. Updating these models with new knowledge is very\nexpensive, and is often required to fully realize their value. In this paper,\nwe propose a novel and effective model editing approach, \\textsc{MENT}, to\npatch LLMs in coding tasks. Based on the mechanism of generative LLMs,\n\\textsc{MENT} enables model editing in next-token predictions, and further\nsupports common coding tasks. \\textsc{MENT} is effective, efficient, and\nreliable. It can correct a neural model by patching 1 or 2 neurons. As the\npioneering work on neuron-level model editing of generative models, we formalize\nthe editing process and introduce the involved concepts. Besides, we also\nintroduce new measures to evaluate its generalization ability, and build a\nbenchmark for further study. Our approach is evaluated on three coding tasks,\nincluding API-seq recommendation, line-level code generation, and\npseudocode-to-code translation. It outperforms the state-of-the-art by a\nsignificant margin on both effectiveness and efficiency measures. In addition,\nwe demonstrate the usage of \\textsc{MENT} for LLM reasoning in software\nengineering. By editing the LLM knowledge with \\textsc{MENT}, the directly or\nindirectly dependent behaviors in the chain-of-thought change accordingly and\nautomatically.\n","authors":["Jian Gu","Chunyang Chen","Aldeida Aleti"],"pdf_url":"https://arxiv.org/pdf/2312.05356v2.pdf","comment":"12 pages, 5 figures, 6 tables, under peer review"},{"id":"http://arxiv.org/abs/2402.01135v1","updated":"2024-02-02T04:20:13Z","published":"2024-02-02T04:20:13Z","title":"A Multi-Agent Conversational Recommender System","summary":" Due to strong capabilities in conducting fluent, multi-turn conversations\nwith users, Large Language Models (LLMs) have the potential to further improve\nthe performance of Conversational Recommender System (CRS). Unlike the aimless\nchit-chat that LLMs excel at, CRS has a clear target. So it is imperative to\ncontrol the dialogue flow in the LLM to successfully recommend appropriate\nitems to the users. 
Furthermore, user feedback in CRS can assist the system in\nbetter modeling user preferences, which has been ignored by existing studies.\nHowever, simply prompting an LLM to conduct conversational recommendation cannot\naddress the above two key challenges.\n In this paper, we propose the Multi-Agent Conversational Recommender System\n(MACRS), which contains two essential modules. First, we design a multi-agent\nact planning framework, which can control the dialogue flow based on four\nLLM-based agents. This cooperative multi-agent framework will generate various\ncandidate responses based on different dialogue acts and then choose the most\nappropriate response as the system response, which can help MACRS plan suitable\ndialogue acts. Second, we propose a user feedback-aware reflection mechanism\nwhich leverages user feedback to reason about errors made in previous turns, adjust\nthe dialogue act planning, and infer higher-level user information from implicit\nsemantics. We conduct extensive experiments based on a user simulator to\ndemonstrate the effectiveness of MACRS in recommendation and user preference\ncollection. Experimental results illustrate that MACRS demonstrates an\nimprovement in user interaction experience compared to directly using LLMs.\n","authors":["Jiabao Fang","Shen Gao","Pengjie Ren","Xiuying Chen","Suzan Verberne","Zhaochun Ren"],"pdf_url":"https://arxiv.org/pdf/2402.01135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16184v2","updated":"2024-02-02T04:14:26Z","published":"2024-01-29T14:29:48Z","title":"On the Semantics of LM Latent Space: A Vocabulary-defined Approach","summary":" Understanding the latent space of language models (LM) is crucial to refining\ntheir performance and interpretability. Existing analyses often fall short in\nproviding disentangled (model-centric) insights into LM semantics, and neglect\nessential aspects of LM adaptation. In response, we introduce a pioneering method\ncalled vocabulary-defined semantics, which establishes a reference frame within\nthe LM latent space, ensuring disentangled semantic analysis grounded in LM\nvocabulary. Our approach transcends prior entangled analysis, leveraging LM\nvocabulary for model-centric insights. Furthermore, we propose a novel\ntechnique to compute logits, emphasising differentiability and local isotropy,\nand introduce a neural clustering module for semantically calibrating data\nrepresentations during LM adaptation. Through extensive experiments across\ndiverse text understanding datasets, our approach outperforms state-of-the-art\nmethods of retrieval-augmented generation and parameter-efficient finetuning,\nshowcasing its efficacy and broad applicability. Our findings not only shed\nlight on LM mechanics, but also offer practical solutions to enhance LM\nperformance and interpretability.\n","authors":["Jian Gu","Chunyang Chen","Aldeida Aleti"],"pdf_url":"https://arxiv.org/pdf/2401.16184v2.pdf","comment":"under peer review"},{"id":"http://arxiv.org/abs/2401.16727v2","updated":"2024-02-02T04:07:25Z","published":"2024-01-30T03:51:44Z","title":"Recent Advances in Hate Speech Moderation: Multimodality and the Role of\n Large Models","summary":" In the evolving landscape of online communication, moderating hate speech\n(HS) presents an intricate challenge, compounded by the multimodal nature of\ndigital content. This comprehensive survey delves into the recent strides in HS\nmoderation, spotlighting the burgeoning role of large language models (LLMs)\nand large multimodal models (LMMs). 
Our exploration begins with a thorough\nanalysis of current literature, revealing the nuanced interplay between\ntextual, visual, and auditory elements in propagating HS. We uncover a notable\ntrend towards integrating these modalities, primarily due to the complexity and\nsubtlety with which HS is disseminated. A significant emphasis is placed on the\nadvances facilitated by LLMs and LMMs, which have begun to redefine the\nboundaries of detection and moderation capabilities. We identify existing gaps\nin research, particularly in the context of underrepresented languages and\ncultures, and the need for solutions to handle low-resource settings. The\nsurvey concludes with a forward-looking perspective, outlining potential\navenues for future research, including the exploration of novel AI\nmethodologies, the ethical governance of AI in moderation, and the development\nof more nuanced, context-aware systems. This comprehensive overview aims to\ncatalyze further research and foster a collaborative effort towards more\nsophisticated, responsible, and human-centric approaches to HS moderation in\nthe digital era. WARNING: This paper contains offensive examples.\n","authors":["Ming Shan Hee","Shivam Sharma","Rui Cao","Palash Nandi","Tanmoy Chakraborty","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2401.16727v2.pdf","comment":"Preprint; Under-Review"},{"id":"http://arxiv.org/abs/2402.01118v1","updated":"2024-02-02T03:22:12Z","published":"2024-02-02T03:22:12Z","title":"PokéLLMon: A Human-Parity Agent for Pokémon Battles with Large\n Language Models","summary":" We introduce \\textsc{Pok\\'eLLMon}, the first LLM-embodied agent that achieves\nhuman-parity performance in tactical battle games, as demonstrated in Pok\\'emon\nbattles. The design of \\textsc{Pok\\'eLLMon} incorporates three key strategies:\n(i) In-context reinforcement learning that instantly consumes text-based\nfeedback derived from battles to iteratively refine the policy; (ii)\nKnowledge-augmented generation that retrieves external knowledge to counteract\nhallucination and enables the agent to act timely and properly; (iii)\nConsistent action generation to mitigate the \\textit{panic switching}\nphenomenon when the agent faces a powerful opponent and wants to elude the\nbattle. We show that online battles against humans demonstrate\n\\textsc{Pok\\'eLLMon}'s human-like battle strategies and just-in-time decision\nmaking, achieving a 49\\% win rate in the Ladder competitions and a 56\\% win\nrate in the invited battles. Our implementation and playable battle logs are\navailable at: \\url{https://github.com/git-disl/PokeLLMon}.\n","authors":["Sihao Hu","Tiansheng Huang","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01118v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2402.01117v1","updated":"2024-02-02T03:21:00Z","published":"2024-02-02T03:21:00Z","title":"DTS-SQL: Decomposed Text-to-SQL with Small Large Language Models","summary":" Leading models for the text-to-SQL task heavily rely on proprietary Large\nLanguage Models (LLMs), posing concerns over data privacy. Closing the\nperformance gap between small open-source models and large proprietary models\nis crucial to mitigate this reliance. 
To this end, we introduce a novel\ntwo-stage fine-tuning approach that decomposes the task into two simpler tasks.\nThrough comprehensive evaluation on two large cross-domain datasets and two\nsmall LLMs, we show that this approach improves execution accuracy by 3 to 7\npercent, effectively aligning the performance of open-source models with their\nproprietary counterparts.\n","authors":["Mohammadreza Pourreza","Davood Rafiei"],"pdf_url":"https://arxiv.org/pdf/2402.01117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01115v1","updated":"2024-02-02T03:15:13Z","published":"2024-02-02T03:15:13Z","title":"Interpretation of Intracardiac Electrograms Through Textual\n Representations","summary":" Understanding the irregular electrical activity of atrial fibrillation (AFib)\nhas been a key challenge in electrocardiography. For serious cases of AFib,\ncatheter ablations are performed to collect intracardiac electrograms (EGMs).\nEGMs offer intricately detailed and localized electrical activity of the heart\nand are an ideal modality for interpretable cardiac studies. Recent\nadvancements in artificial intelligence (AI) have allowed some works to utilize\ndeep learning frameworks to interpret EGMs during AFib. Additionally, language\nmodels (LMs) have shown exceptional performance in being able to generalize to\nunseen domains, especially in healthcare. In this study, we are the first to\nleverage pretrained LMs for finetuning of EGM interpolation and AFib\nclassification via masked language modeling. We formulate the EGM as a textual\nsequence and present competitive performance on AFib classification compared\nagainst other representations. Lastly, we provide a comprehensive\ninterpretability study to provide a multi-perspective intuition of the model's\nbehavior, which could greatly benefit clinical use.\n","authors":["William Jongwon Han","Diana Gomez","Avi Alok","Chaojing Duan","Michael A. Rosenberg","Douglas Weber","Emerson Liu","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01115v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.01109v1","updated":"2024-02-02T02:56:50Z","published":"2024-02-02T02:56:50Z","title":"Vaccine: Perturbation-aware Alignment for Large Language Model","summary":" The new paradigm of finetuning-as-a-service introduces a new attack surface\nfor Large Language Models (LLMs): a few harmful data points uploaded by users can\neasily trick the finetuning to produce an alignment-broken model. We conduct an\nempirical analysis and uncover a \\textit{harmful embedding drift} phenomenon,\nshowing a probable cause of the alignment-broken effect. Inspired by our\nfindings, we propose Vaccine, a perturbation-aware alignment technique to\nmitigate the security risk of user finetuning. The core idea of Vaccine is to\nproduce invariant hidden embeddings by progressively adding crafted\nperturbation to them in the alignment phase. This enables the embeddings to\nwithstand harmful perturbation from un-sanitized user data in the finetuning\nphase. Our results on open source mainstream LLMs (e.g., Llama2, Opt, Vicuna)\ndemonstrate that Vaccine can boost the robustness of alignment against the\nembedding drift induced by harmful prompts while preserving reasoning ability on\nbenign prompts. 
Our code is available at\n\\url{https://github.com/git-disl/Vaccine}.\n","authors":["Tiansheng Huang","Sihao Hu","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01108v1","updated":"2024-02-02T02:53:11Z","published":"2024-02-02T02:53:11Z","title":"Reasoning Capacity in Multi-Agent Systems: Limitations, Challenges and\n Human-Centered Solutions","summary":" The remarkable performance of large language models (LLMs) in a variety of tasks\nbrings forth many opportunities as well as challenges of utilizing them in\nproduction settings. Towards practical adoption of LLMs, multi-agent systems\nhold great promise to augment, integrate, and orchestrate LLMs in the larger\ncontext of enterprise platforms that use existing proprietary data and models\nto tackle complex real-world tasks. Despite the tremendous success of these\nsystems, current approaches rely on narrow, single-focus objectives for\noptimization and evaluation, often overlooking potential constraints in\nreal-world scenarios, including restricted budgets, resources and time.\nFurthermore, interpreting, analyzing, and debugging these systems requires\ndifferent components to be evaluated in relation to one another. This demand cannot\ncurrently be met with existing methodologies. In this position paper, we\nintroduce the concept of reasoning capacity as a unifying criterion to enable\nintegration of constraints during optimization and establish connections among\ndifferent components within the system, which also enables a more holistic and\ncomprehensive approach to evaluation. We present a formal definition of\nreasoning capacity and illustrate its utility in identifying limitations within\neach component of the system. We then argue how these limitations can be\naddressed with a self-reflective process wherein human feedback is used to\nalleviate shortcomings in reasoning and enhance the overall consistency of the\nsystem.\n","authors":["Pouya Pezeshkpour","Eser Kandogan","Nikita Bhutani","Sajjadur Rahman","Tom Mitchell","Estevam Hruschka"],"pdf_url":"https://arxiv.org/pdf/2402.01108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11819v2","updated":"2024-02-02T02:35:13Z","published":"2024-01-22T10:30:11Z","title":"SuperCLUE-Math6: Graded Multi-Step Math Reasoning Benchmark for LLMs in\n Chinese","summary":" We introduce SuperCLUE-Math6 (SC-Math6), a new benchmark dataset to evaluate\nthe mathematical reasoning abilities of Chinese language models. SC-Math6 is\ndesigned as an upgraded Chinese version of the GSM8K dataset with enhanced\ndifficulty, diversity, and application scope. It consists of over 2000\nmathematical word problems requiring multi-step reasoning and providing natural\nlanguage solutions. We propose an innovative scheme to quantify the reasoning\ncapability of large models based on performance over problems with different\nreasoning steps. Experiments on 13 representative Chinese models demonstrate a\nclear stratification of reasoning levels, with top models like GPT-4 showing\nsuperior performance. 
SC-Math6 fills the gap in Chinese mathematical reasoning\nbenchmarks and provides a comprehensive testbed to advance the intelligence of\nChinese language models.\n","authors":["Liang Xu","Hang Xue","Lei Zhu","Kangkang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.11819v2.pdf","comment":"Dataset revised and finalized, results updated with new model; 8\n pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.01097v1","updated":"2024-02-02T02:12:46Z","published":"2024-02-02T02:12:46Z","title":"Let's Negotiate! A Survey of Negotiation Dialogue Systems","summary":" Negotiation is a crucial ability in human communication. Recently, there has\nbeen a resurgent research interest in negotiation dialogue systems, whose goal\nis to create intelligent agents that can assist people in resolving conflicts\nor reaching agreements. Although there have been many explorations into\nnegotiation dialogue systems, a systematic review of this task has not been\nperformed to date. We aim to fill this gap by investigating recent studies in\nthe field of negotiation dialogue systems, and covering benchmarks, evaluations\nand methodologies within the literature. We also discuss potential future\ndirections, including multi-modal, multi-party and cross-cultural negotiation\nscenarios. Our goal is to provide the community with a systematic overview of\nnegotiation dialogue systems and to inspire future research.\n","authors":["Haolan Zhan","Yufei Wang","Tao Feng","Yuncheng Hua","Suraj Sharma","Zhuang Li","Lizhen Qu","Zhaleh Semnani Azad","Ingrid Zukerman","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2402.01097v1.pdf","comment":"Accepted by EACL 2024 (findings). arXiv admin note: substantial text\n overlap with arXiv:2212.09072"},{"id":"http://arxiv.org/abs/2401.12425v2","updated":"2024-02-02T02:06:20Z","published":"2024-01-23T01:25:00Z","title":"The Neglected Tails of Vision-Language Models","summary":" Vision-language models (VLMs) excel in zero-shot recognition but their\nperformance varies greatly across different visual concepts. For example,\nalthough CLIP achieves impressive accuracy on ImageNet (60-80%), its\nperformance drops below 10% for more than ten concepts like night snake,\npresumably due to their limited presence in the pretraining data. However,\nmeasuring the frequency of concepts in VLMs' large-scale datasets is\nchallenging. We address this by using large language models (LLMs) to count the\nnumber of pretraining texts that contain synonyms of these concepts. Our\nanalysis confirms that popular datasets, such as LAION, exhibit a long-tailed\nconcept distribution, yielding biased performance in VLMs. We also find that\ndownstream applications of VLMs, including visual chatbots (e.g., GPT-4V) and\ntext-to-image models (e.g., Stable Diffusion), often fail to recognize or\ngenerate images of rare concepts identified by our method. To mitigate the\nimbalanced performance of zero-shot VLMs, we propose REtrieval-Augmented\nLearning (REAL). First, instead of prompting VLMs using the original class\nnames, REAL uses their most frequent synonyms found in pretraining texts. This\nsimple change already outperforms costly human-engineered and LLM-enriched\nprompts over nine benchmark datasets. Second, REAL trains a linear classifier\non a small yet balanced set of pretraining data retrieved using concept\nsynonyms. 
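The first REAL step described above (querying with the most frequent pretraining synonym instead of the rare class name) is easy to picture concretely. Below is a hypothetical Python sketch; the toy corpus and synonym list are invented, and real usage would count synonyms over a large pretraining text collection such as LAION captions.

```python
# Toy sketch of synonym-frequency prompting; not the authors' implementation.
from collections import Counter

corpus = [
    "a cougar resting on a rock",
    "a cougar stalking deer at dusk",
    "puma sighted near the ridge",
    "hikers on a mountain trail",
]
synonyms = ["mountain lion", "cougar", "puma"]  # candidate names for one concept

freq = Counter({s: sum(s in doc for doc in corpus) for s in synonyms})
best, _ = freq.most_common(1)[0]
prompt = f"a photo of a {best}"  # used in place of "a photo of a mountain lion"
print(prompt)                    # -> "a photo of a cougar"
```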
REAL surpasses the previous zero-shot SOTA, using 400x less storage\nand 10,000x less training time!\n","authors":["Shubham Parashar","Zhiqiu Lin","Tian Liu","Xiangjue Dong","Yanan Li","Deva Ramanan","James Caverlee","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2401.12425v2.pdf","comment":"Project Page:\n https://shubhamprshr27.github.io/neglected-tails-of-vlms/"},{"id":"http://arxiv.org/abs/2402.01093v1","updated":"2024-02-02T01:45:18Z","published":"2024-02-02T01:45:18Z","title":"Specialized Language Models with Cheap Inference from Limited Domain\n Data","summary":" Large language models have emerged as a versatile tool but are challenging to\napply to tasks lacking large inference budgets and large in-domain training\nsets. This work formalizes these constraints and distinguishes four important\nvariables: the pretraining budget (for training before the target domain is\nknown), the specialization budget (for training after the target domain is\nknown), the inference budget, and the in-domain training set size. Across these\nsettings, we compare different approaches from the machine learning literature.\nLimited by inference cost, we find better alternatives to the standard practice\nof training very large vanilla transformer models. In particular, we show that\nhyper-networks and mixture of experts have better perplexity for large\npretraining budgets, while small models trained on importance sampled datasets\nare attractive for large specialization budgets.\n","authors":["David Grangier","Angelos Katharopoulos","Pierre Ablin","Awni Hannun"],"pdf_url":"https://arxiv.org/pdf/2402.01093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01091v1","updated":"2024-02-02T01:39:00Z","published":"2024-02-02T01:39:00Z","title":"Reading Between the Tweets: Deciphering Ideological Stances of\n Interconnected Mixed-Ideology Communities","summary":" Recent advances in NLP have improved our ability to understand the nuanced\nworldviews of online communities. Existing research focused on probing\nideological stances treats liberals and conservatives as separate groups.\nHowever, this fails to account for the nuanced views of the organically formed\nonline communities and the connections between them. In this paper, we study\ndiscussions of the 2020 U.S. election on Twitter to identify complex\ninteracting communities. Capitalizing on this interconnectedness, we introduce\na novel approach that harnesses message passing when finetuning language models\n(LMs) to probe the nuanced ideologies of these communities. By comparing the\nresponses generated by LMs and real-world survey results, our method shows\nhigher alignment than existing baselines, highlighting the potential of using\nLMs in revealing complex ideologies within and across interconnected\nmixed-ideology communities.\n","authors":["Zihao He","Ashwin Rao","Siyi Guo","Negar Mokhberian","Kristina Lerman"],"pdf_url":"https://arxiv.org/pdf/2402.01091v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2303.11435v5","updated":"2024-02-02T18:52:51Z","published":"2023-03-20T20:28:17Z","title":"Inversion by Direct Iteration: An Alternative to Denoising Diffusion for\n Image Restoration","summary":" Inversion by Direct Iteration (InDI) is a new formulation for supervised\nimage restoration that avoids the so-called \"regression to the mean\" effect and\nproduces more realistic and detailed images than existing regression-based\nmethods. 
It does this by gradually improving image quality in small steps,\nsimilar to generative denoising diffusion models. Image restoration is an\nill-posed problem where multiple high-quality images are plausible\nreconstructions of a given low-quality input. Therefore, the outcome of a\nsingle-step regression model is typically an aggregate of all possible\nexplanations, thus lacking details and realism. The main advantage of InDI\nis that it does not try to predict the clean target image in a single step but\ninstead gradually improves the image in small steps, resulting in better\nperceptual quality. While generative denoising diffusion models also work in\nsmall steps, our formulation is distinct in that it does not require knowledge\nof any analytic form of the degradation process. Instead, we directly learn an\niterative restoration process from low-quality and high-quality paired\nexamples. InDI can be applied to virtually any image degradation, given paired\ntraining data. In conditional denoising diffusion image restoration, the\ndenoising network generates the restored image by repeatedly denoising an\ninitial image of pure noise, conditioned on the degraded input. Contrary to\nconditional denoising formulations, InDI directly proceeds by iteratively\nrestoring the input low-quality image, producing high-quality results on a\nvariety of image restoration tasks, including motion and out-of-focus\ndeblurring, super-resolution, compression artifact removal, and denoising.\n","authors":["Mauricio Delbracio","Peyman Milanfar"],"pdf_url":"https://arxiv.org/pdf/2303.11435v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08204v2","updated":"2024-02-02T18:31:52Z","published":"2023-10-12T10:50:21Z","title":"STELLA: Continual Audio-Video Pre-training with Spatio-Temporal\n Localized Alignment","summary":" Continuously learning a variety of audio-video semantics over time is crucial\nfor audio-related reasoning tasks in our ever-evolving world. However, this is\na nontrivial problem and poses two critical challenges: sparse spatio-temporal\ncorrelation between audio-video pairs and multimodal correlation overwriting\nthat forgets audio-video relations. To tackle this problem, we propose a new\ncontinual audio-video pre-training method with two novel ideas: (1) Localized\nPatch Importance Scoring: we introduce a multimodal encoder to determine the\nimportance score for each patch, emphasizing semantically intertwined\naudio-video patches. (2) Replay-guided Correlation Assessment: to reduce the\ncorruption of previously learned audiovisual knowledge due to drift, we propose\nto assess the correlation of the current patches with the past steps to identify\nthe patches exhibiting high correlations. Based on the\nresults from the two ideas, we perform probabilistic patch selection for\neffective continual audio-video pre-training. 
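As a rough picture of the probabilistic patch selection just described, the hypothetical sketch below scores patches with a small module and samples a subset in proportion to importance; the scorer, shapes, and sampling scheme are illustrative assumptions, not the paper's exact design.

```python
# Toy sketch: importance-weighted patch sampling for continual pre-training.
import torch
import torch.nn as nn

def select_patches(patches, scorer, k):
    # patches: (N, D) patch embeddings; scorer: any module mapping D -> 1
    logits = scorer(patches).squeeze(-1)               # per-patch importance
    probs = torch.softmax(logits, dim=0)
    idx = torch.multinomial(probs, k, replacement=False)
    return patches[idx]                                # patches kept for training

patches = torch.randn(196, 64)                         # made-up embeddings
kept = select_patches(patches, nn.Linear(64, 1), k=32)
```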
Experimental validation on\nmultiple benchmarks shows that our method achieves a 3.69%p relative\nperformance gain in zero-shot retrieval tasks compared to strong continual\nlearning baselines, while reducing memory consumption by ~45%.\n","authors":["Jaewoo Lee","Jaehong Yoon","Wonjae Kim","Yunji Kim","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2310.08204v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2402.01596v1","updated":"2024-02-02T17:49:31Z","published":"2024-02-02T17:49:31Z","title":"Immersive Video Compression using Implicit Neural Representations","summary":" Recent work on implicit neural representations (INRs) has evidenced their\npotential for efficiently representing and encoding conventional video content.\nIn this paper we, for the first time, extend their application to immersive\n(multi-view) videos, by proposing MV-HiNeRV, a new INR-based immersive video\ncodec. MV-HiNeRV is an enhanced version of a state-of-the-art INR-based video\ncodec, HiNeRV, which was developed for single-view video compression. We have\nmodified the model to learn a different group of feature grids for each view,\nand share the learnt network parameters among all views. This enables the model\nto effectively exploit the spatio-temporal and the inter-view redundancy that\nexists within multi-view videos. The proposed codec was used to compress\nmulti-view texture and depth video sequences in the MPEG Immersive Video (MIV)\nCommon Test Conditions, and tested against the MIV Test model (TMIV) that uses\nthe VVenC video codec. The results demonstrate the superior performance of\nMV-HiNeRV, with significant coding gains (up to 72.33%) over TMIV. The\nimplementation of MV-HiNeRV will be published for further development and\nevaluation.\n","authors":["Ho Man Kwan","Fan Zhang","Andrew Gower","David Bull"],"pdf_url":"https://arxiv.org/pdf/2402.01596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01590v1","updated":"2024-02-02T17:34:25Z","published":"2024-02-02T17:34:25Z","title":"NeuroCine: Decoding Vivid Video Sequences from Human Brain Activities","summary":" In the pursuit of understanding the intricacies of the human brain's visual\nprocessing, reconstructing dynamic visual experiences from brain activities\nemerges as a challenging yet fascinating endeavor. While recent advancements\nhave achieved success in reconstructing static images from non-invasive brain\nrecordings, the domain of translating continuous brain activities into video\nformat remains underexplored. In this work, we introduce NeuroCine, a novel\ndual-phase framework targeting the inherent challenges of decoding fMRI\ndata, such as noise, spatial redundancy and temporal lags. This framework\nproposes spatial masking and temporal interpolation-based augmentation for\ncontrastive learning of fMRI representations and a diffusion model enhanced by\ndependent prior noise for video generation. Tested on a publicly available fMRI\ndataset, our method shows promising results, outperforming the previous\nstate-of-the-art models by a notable margin of ${20.97\\%}$, ${31.00\\%}$ and\n${12.30\\%}$ respectively on decoding the brain activities of three subjects in\nthe fMRI dataset, as measured by SSIM. 
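The two augmentations named above for contrastive learning of fMRI representations (spatial masking and temporal interpolation) admit a simple toy rendering; the tensor layout and the 50/50 interpolation below are assumptions made purely for illustration.

```python
# Toy sketch: build an augmented view of an fMRI sequence for contrastive pairs.
import torch

def augment_fmri(x, mask_ratio=0.2):
    # x: (T, V) time-by-voxel signals (made-up layout)
    x = x.clone()
    x[:, torch.rand(x.shape[1]) < mask_ratio] = 0.0  # spatial masking of voxels
    interp = 0.5 * (x[:-1] + x[1:])                  # temporal interpolation
    return torch.cat([x[:1], interp], dim=0)         # jittered, same-length view

view = augment_fmri(torch.randn(30, 4096))
```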
Additionally, our attention analysis\nsuggests that the model aligns with existing brain structures and functions,\nindicating its biological plausibility and interpretability.\n","authors":["Jingyuan Sun","Mingxiao Li","Zijiao Chen","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2402.01590v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2312.15101v2","updated":"2024-02-02T17:26:07Z","published":"2023-12-22T22:46:48Z","title":"Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model\n Conversions","summary":" Converting deep learning models between frameworks is a common step to\nmaximize model compatibility across devices and leverage optimization features\nthat may be exclusively provided in one deep learning framework. However, this\nconversion process may be riddled with bugs, making the converted models either\nundeployable or problematic, considerably degrading their prediction\ncorrectness.\n We propose Fix-Con, an automated approach for fault localization and repair\nduring model conversion between deep learning frameworks. Fix-Con is capable of\ndetecting and fixing faults introduced in model input, parameters,\nhyperparameters, and the model graph during conversion.\n Fix-Con uses a set of fault types mined from surveying conversion issues\nraised to localize potential conversion faults in the converted target model,\nand then repairs them appropriately, e.g., replacing the parameters of the\ntarget model with those from the source model. This is done iteratively for\nevery image in the dataset with output label differences between the source\nmodel and the converted target model until all differences are resolved. We\nevaluate the effectiveness of Fix-Con in fixing model conversion bugs of three\nwidely used image recognition models converted across four different deep\nlearning frameworks. Overall, Fix-Con was able to either completely repair or\nsignificantly improve the performance of 14 out of the 15 erroneous conversion\ncases.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2312.15101v2.pdf","comment":"12 pages, 3 figures, 4 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2206.02997v2","updated":"2024-02-02T17:11:10Z","published":"2022-06-07T04:07:48Z","title":"TadML: A fast temporal action detection with Mechanics-MLP","summary":" Temporal Action Detection (TAD) is a crucial but challenging task in video\nunderstanding. It is aimed at detecting both the type and start-end frame for\neach action instance in a long, untrimmed video. Most current models adopt both\nRGB and Optical-Flow streams for the TAD task. Thus, original RGB frames must\nbe converted manually into Optical-Flow frames with additional computation and\ntime cost, which is an obstacle to achieving real-time processing. At present,\nmany models adopt two-stage strategies, which slow down inference and\ncomplicate the tuning of proposal generation. By comparison, we propose\na one-stage anchor-free temporal localization method with RGB stream only, in\nwhich a novel Newtonian Mechanics-MLP architecture is established. It has\ncomparable accuracy with all existing state-of-the-art models, while surpassing\nthe inference speed of these methods by a large margin. The typical inference\nspeed in this paper is an astounding 4.44 videos per second on THUMOS14. 
In\npractice, because there is no need to convert optical flow, the inference\nspeed will be faster. It also proves that MLP has great potential in downstream\ntasks such as TAD. The source code is available at\nhttps://github.com/BonedDeng/TadML\n","authors":["Bowen Deng","Dongchang Liu"],"pdf_url":"https://arxiv.org/pdf/2206.02997v2.pdf","comment":"8 pages,3 figures"},{"id":"http://arxiv.org/abs/2402.01566v1","updated":"2024-02-02T16:59:48Z","published":"2024-02-02T16:59:48Z","title":"Boximator: Generating Rich and Controllable Motions for Video Synthesis","summary":" Generating rich and controllable motion is a pivotal challenge in video\nsynthesis. We propose Boximator, a new approach for fine-grained motion\ncontrol. Boximator introduces two constraint types: hard box and soft box.\nUsers select objects in the conditional frame using hard boxes and then use\neither type of box to roughly or rigorously define the object's position,\nshape, or motion path in future frames. Boximator functions as a plug-in for\nexisting video diffusion models. Its training process preserves the base\nmodel's knowledge by freezing the original weights and training only the\ncontrol module. To address training challenges, we introduce a novel\nself-tracking technique that greatly simplifies the learning of box-object\ncorrelations. Empirically, Boximator achieves state-of-the-art video quality\n(FVD) scores, improving on two base models, and is further enhanced after\nincorporating box constraints. Its robust motion controllability is validated\nby drastic increases in the bounding box alignment metric. Human evaluation\nalso shows that users favor Boximator generation results over the base model.\n","authors":["Jiawei Wang","Yuchen Zhang","Jiaxin Zou","Yan Zeng","Guoqiang Wei","Liping Yuan","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2402.01566v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.06895v2","updated":"2024-02-02T16:55:00Z","published":"2023-09-13T11:37:04Z","title":"MagiCapture: High-Resolution Multi-Concept Portrait Customization","summary":" Large-scale text-to-image models including Stable Diffusion are capable of\ngenerating high-fidelity photorealistic portrait images. There is an active\nresearch area dedicated to personalizing these models, aiming to synthesize\nspecific subjects or styles using provided sets of reference images. However,\ndespite the plausible results from these personalization methods, they tend to\nproduce images that often fall short of realism and are not yet on a\ncommercially viable level. This is particularly noticeable in portrait image\ngeneration, where any unnatural artifact in human faces is easily discernible\ndue to our inherent human bias. To address this, we introduce MagiCapture, a\npersonalization method for integrating subject and style concepts to generate\nhigh-resolution portrait images using just a few subject and style references.\nFor instance, given a handful of random selfies, our fine-tuned model can\ngenerate high-quality portrait images in specific styles, such as passport or\nprofile photos. The main challenge with this task is the absence of ground\ntruth for the composed concepts, leading to a reduction in the quality of the\nfinal output and an identity shift of the source subject. To address these\nissues, we present a novel Attention Refocusing loss coupled with auxiliary\npriors, both of which facilitate robust learning within this weakly supervised\nlearning setting. 
Our pipeline also includes additional post-processing steps\nto ensure the creation of highly realistic outputs. MagiCapture outperforms\nother baselines in both quantitative and qualitative evaluations and can also\nbe generalized to other non-human objects.\n","authors":["Junha Hyung","Jaeyo Shin","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2309.06895v2.pdf","comment":"18 pages, 17 figures"},{"id":"http://arxiv.org/abs/2402.01557v1","updated":"2024-02-02T16:50:18Z","published":"2024-02-02T16:50:18Z","title":"Deep Continuous Networks","summary":" CNNs and computational models of biological vision share some fundamental\nprinciples, which opened new avenues of research. However, fruitful cross-field\nresearch is hampered by conventional CNN architectures being based on spatially\nand depthwise discrete representations, which cannot accommodate certain\naspects of biological complexity such as continuously varying receptive field\nsizes and dynamics of neuronal responses. Here we propose deep continuous\nnetworks (DCNs), which combine spatially continuous filters with the\ncontinuous depth framework of neural ODEs. This allows us to learn the spatial\nsupport of the filters during training, as well as model the continuous\nevolution of feature maps, linking DCNs closely to biological models. We show\nthat DCNs are versatile and highly applicable to standard image classification\nand reconstruction problems, where they improve parameter and data efficiency,\nand allow for meta-parametrization. We illustrate the biological plausibility\nof the scale distributions learned by DCNs and explore their performance in a\nneuroscientifically inspired pattern completion task. Finally, we investigate\nan efficient implementation of DCNs by changing input contrast.\n","authors":["Nergis Tomen","Silvia L. Pintea","Jan C. van Gemert"],"pdf_url":"https://arxiv.org/pdf/2402.01557v1.pdf","comment":"Presented at ICML 2021"},{"id":"http://arxiv.org/abs/2309.04836v2","updated":"2024-02-02T16:50:15Z","published":"2023-09-09T16:21:56Z","title":"Neural Semantic Surface Maps","summary":" We present an automated technique for computing a map between two genus-zero\nshapes, which matches semantically corresponding regions to one another. Lack\nof annotated data prohibits direct inference of 3D semantic priors; instead,\ncurrent state-of-the-art methods predominantly optimize geometric properties or\nrequire varying amounts of manual annotation. To overcome the lack of annotated\ntraining data, we distill semantic matches from pre-trained vision models: our\nmethod renders the pair of 3D shapes from multiple viewpoints; the resulting\nrenders are then fed into an off-the-shelf image-matching method which\nleverages a pretrained visual model to produce feature points. This yields\nsemantic correspondences, which can be projected back to the 3D shapes,\nproducing a raw matching that is inaccurate and inconsistent between different\nviewpoints. These correspondences are refined and distilled into an\ninter-surface map by a dedicated optimization scheme, which promotes\nbijectivity and continuity of the output map. We illustrate that our approach\ncan generate semantic surface-to-surface maps, eliminating manual annotations\nor any 3D training data requirement. Furthermore, it proves effective in\nscenarios with high semantic complexity, where objects are non-isometrically\nrelated, as well as in situations where they are nearly isometric.\n","authors":["Luca Morreale","Noam Aigerman","Vladimir G. Kim","Niloy J. 
Mitra"],"pdf_url":"https://arxiv.org/pdf/2309.04836v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01555v1","updated":"2024-02-02T16:47:18Z","published":"2024-02-02T16:47:18Z","title":"SLYKLatent, a Learning Framework for Facial Features Estimation","summary":" In this research, we present SLYKLatent, a novel approach for enhancing gaze\nestimation by addressing appearance instability challenges in datasets due to\naleatoric uncertainties, covariant shifts, and test domain generalization.\nSLYKLatent utilizes Self-Supervised Learning for initial training with facial\nexpression datasets, followed by refinement with a patch-based tri-branch\nnetwork and an inverse explained variance-weighted training loss function. Our\nevaluation on benchmark datasets achieves an 8.7% improvement on Gaze360,\nrivals top MPIIFaceGaze results, and leads on a subset of ETH-XGaze by 13%,\nsurpassing existing methods by significant margins. Adaptability tests on\nRAF-DB and Affectnet show 86.4% and 60.9% accuracies, respectively. Ablation\nstudies confirm the effectiveness of SLYKLatent's novel components. This\napproach has strong potential in human-robot interaction.\n","authors":["Samuel Adebayo","Joost C. Dessing","Seán McLoone"],"pdf_url":"https://arxiv.org/pdf/2402.01555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01537v1","updated":"2024-02-02T16:27:45Z","published":"2024-02-02T16:27:45Z","title":"Closing the Gap in Human Behavior Analysis: A Pipeline for Synthesizing\n Trimodal Data","summary":" In pervasive machine learning, especially in Human Behavior Analysis (HBA),\nRGB has been the primary modality due to its accessibility and richness of\ninformation. However, linked with its benefits are challenges, including\nsensitivity to lighting conditions and privacy concerns. One possibility to\novercome these vulnerabilities is to resort to different modalities. For\ninstance, thermal is particularly adept at accentuating human forms, while\ndepth adds crucial contextual layers. Despite their known benefits, only a few\nHBA-specific datasets that integrate these modalities exist. To address this\nshortage, our research introduces a novel generative technique for creating\ntrimodal, i.e., RGB, thermal, and depth, human-focused datasets. This technique\ncapitalizes on human segmentation masks derived from RGB images, combined with\nthermal and depth backgrounds that are sourced automatically. With these two\ningredients, we synthesize depth and thermal counterparts from existing RGB\ndata utilizing conditional image-to-image translation. By employing this\napproach, we generate trimodal data that can be leveraged to train models for\nsettings with limited data, bad lightning conditions, or privacy-sensitive\nareas.\n","authors":["Christian Stippel","Thomas Heitzinger","Rafael Sterzinger","Martin Kampel"],"pdf_url":"https://arxiv.org/pdf/2402.01537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07519v2","updated":"2024-02-02T16:15:22Z","published":"2024-01-15T07:50:18Z","title":"InstantID: Zero-shot Identity-Preserving Generation in Seconds","summary":" There has been significant progress in personalized image synthesis with\nmethods such as Textual Inversion, DreamBooth, and LoRA. Yet, their real-world\napplicability is hindered by high storage demands, lengthy fine-tuning\nprocesses, and the need for multiple reference images. 
Conversely, existing ID\nembedding-based methods, while requiring only a single forward inference, face\nchallenges: they either necessitate extensive fine-tuning across numerous model\nparameters, lack compatibility with community pre-trained models, or fail to\nmaintain high face fidelity. Addressing these limitations, we introduce\nInstantID, a powerful diffusion model-based solution. Our plug-and-play module\nadeptly handles image personalization in various styles using just a single\nfacial image, while ensuring high fidelity. To achieve this, we design a novel\nIdentityNet by imposing strong semantic and weak spatial conditions,\nintegrating facial and landmark images with textual prompts to steer the image\ngeneration. InstantID demonstrates exceptional performance and efficiency,\nproving highly beneficial in real-world applications where identity\npreservation is paramount. Moreover, our work seamlessly integrates with\npopular pre-trained text-to-image diffusion models like SD1.5 and SDXL, serving\nas an adaptable plugin. Our codes and pre-trained checkpoints will be available\nat https://github.com/InstantID/InstantID.\n","authors":["Qixun Wang","Xu Bai","Haofan Wang","Zekui Qin","Anthony Chen","Huaxia Li","Xu Tang","Yao Hu"],"pdf_url":"https://arxiv.org/pdf/2401.07519v2.pdf","comment":"Technical Report, project page available at\n https://instantid.github.io/"},{"id":"http://arxiv.org/abs/2402.01524v1","updated":"2024-02-02T16:10:29Z","published":"2024-02-02T16:10:29Z","title":"HyperPlanes: Hypernetwork Approach to Rapid NeRF Adaptation","summary":" Neural radiance fields (NeRFs) are a widely accepted standard for\nsynthesizing new 3D object views from a small number of base images. However,\nNeRFs have limited generalization properties, which means that we need to use\nsignificant computational resources to train individual architectures for each\nitem we want to represent. To address this issue, we propose a few-shot\nlearning approach based on the hypernetwork paradigm that does not require\ngradient optimization during inference. The hypernetwork gathers information\nfrom the training data and generates an update for universal weights. As a\nresult, we have developed an efficient method for generating a high-quality 3D\nobject representation from a small number of images in a single step. This has\nbeen confirmed by direct comparison with the state-of-the-art solutions and a\ncomprehensive ablation study.\n","authors":["Paweł Batorski","Dawid Malarz","Marcin Przewięźlikowski","Marcin Mazur","Sławomir Tadeja","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2402.01524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01516v1","updated":"2024-02-02T15:57:13Z","published":"2024-02-02T15:57:13Z","title":"Cross-view Masked Diffusion Transformers for Person Image Synthesis","summary":" We present X-MDPT (Cross-view Masked Diffusion Prediction Transformers), a\nnovel diffusion model designed for pose-guided human image generation. X-MDPT\ndistinguishes itself by employing masked diffusion transformers that operate on\nlatent patches, a departure from the commonly-used Unet structures in existing\nworks. The model comprises three key modules: 1) a denoising diffusion\nTransformer, 2) an aggregation network that consolidates conditions into a\nsingle vector for the diffusion process, and 3) a mask cross-prediction module\nthat enhances representation learning with semantic information from the\nreference image. 
X-MDPT demonstrates scalability, improving FID, SSIM, and\nLPIPS with larger models. Despite its simple design, our model outperforms\nstate-of-the-art approaches on the DeepFashion dataset while exhibiting\nefficiency in terms of training parameters, training time, and inference speed.\nOur compact 33MB model achieves an FID of 7.42, surpassing a prior Unet latent\ndiffusion approach (FID 8.07) with $11\\times$ fewer parameters. Our best\nmodel surpasses the pixel-based diffusion with $\\frac{2}{3}$ of the parameters\nand achieves $5.43 \\times$ faster inference.\n","authors":["Trung X. Pham","Zhang Kang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.01516v1.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.03610v2","updated":"2024-02-02T15:46:42Z","published":"2023-05-05T15:16:07Z","title":"The Role of Data Curation in Image Captioning","summary":" Image captioning models are typically trained by treating all samples\nequally, neglecting to account for mismatched or otherwise difficult data\npoints. In contrast, recent work has shown the effectiveness of training models\nby scheduling the data using curriculum learning strategies. This paper\ncontributes to this direction by actively curating difficult samples in\ndatasets without increasing the total number of samples. We explore the effect\nof using three data curation methods within the training process: complete\nremoval of a sample, caption replacement, or image replacement via a\ntext-to-image generation model. Experiments on the Flickr30K and COCO datasets\nwith the BLIP and BEiT-3 models demonstrate that these curation methods do\nindeed yield improved image captioning models, underscoring their efficacy.\n","authors":["Wenyan Li","Jonas F. Lotz","Chen Qiu","Desmond Elliott"],"pdf_url":"https://arxiv.org/pdf/2305.03610v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01509v1","updated":"2024-02-02T15:43:51Z","published":"2024-02-02T15:43:51Z","title":"Advancing Brain Tumor Inpainting with Generative Models","summary":" Synthesizing healthy brain scans from diseased brain scans offers a potential\nsolution to address the limitations of general-purpose algorithms, such as\ntissue segmentation and brain extraction algorithms, which may not effectively\nhandle diseased images. We consider this a 3D inpainting task and investigate\nthe adaptation of 2D inpainting methods to meet the requirements of 3D magnetic\nresonance imaging (MRI) data. Our contributions encompass potential\nmodifications tailored to MRI-specific needs, and we conducted evaluations of\nmultiple inpainting techniques using the BraTS2023 Inpainting datasets to\nassess their efficacy and limitations.\n","authors":["Ruizhi Zhu","Xinru Zhang","Haowen Pang","Chundan Xu","Chuyang Ye"],"pdf_url":"https://arxiv.org/pdf/2402.01509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04584v2","updated":"2024-02-02T15:40:51Z","published":"2023-10-06T20:55:05Z","title":"An Algorithm to Train Unrestricted Sequential Discrete Morphological\n Neural Networks","summary":" There have been attempts to insert mathematical morphology (MM) operators\ninto convolutional neural networks (CNN), and the most successful endeavor to\ndate has been the morphological neural networks (MNN). Although MNN have\nperformed better than CNN in solving some problems, they inherit their\nblack-box nature. 
Furthermore, in the case of binary images, they are\napproximations that lose the Boolean lattice structure of MM operators and,\nthus, it is not possible to represent a specific class of W-operators with\ndesired properties. In a recent work, we proposed the Discrete Morphological\nNeural Networks (DMNN) for binary image transformation to represent specific\nclasses of W-operators and estimate them via machine learning. We also proposed\na stochastic lattice descent algorithm (SLDA) to learn the parameters of\nCanonical Discrete Morphological Neural Networks (CDMNN), whose architecture is\ncomposed only of operators that can be decomposed as the supremum, infimum, and\ncomplement of erosions and dilations. In this paper, we propose an algorithm to\nlearn unrestricted sequential DMNN, whose architecture is given by the\ncomposition of general W-operators. We illustrate the algorithm in a practical\nexample.\n","authors":["Diego Marcondes","Mariana Feldman","Junior Barrera"],"pdf_url":"https://arxiv.org/pdf/2310.04584v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01476v1","updated":"2024-02-02T15:05:13Z","published":"2024-02-02T15:05:13Z","title":"Self-Attention through Kernel-Eigen Pair Sparse Variational Gaussian\n Processes","summary":" While the great capability of Transformers significantly boosts prediction\naccuracy, it could also yield overconfident predictions and require calibrated\nuncertainty estimation, which can be commonly tackled by Gaussian processes\n(GPs). Existing works apply GPs with symmetric kernels under variational\ninference to the attention kernel; however, they omit the fact that attention\nkernels are in essence asymmetric. Moreover, the complexity of deriving the GP\nposteriors remains high for large-scale data. In this work, we propose\nKernel-Eigen Pair Sparse Variational Gaussian Processes (KEP-SVGP) for building\nuncertainty-aware self-attention where the asymmetry of attention kernels is\ntackled by Kernel SVD (KSVD) and a reduced complexity is acquired. Through\nKEP-SVGP, i) the SVGP pair induced by the two sets of singular vectors from\nKSVD w.r.t. the attention kernel fully characterizes the asymmetry; ii) using\nonly a small set of adjoint eigenfunctions from KSVD, the derivation of SVGP\nposteriors can be based on the inversion of a diagonal matrix containing\nsingular values, contributing to a reduction in time complexity; iii) an\nevidence lower bound is derived so that variational parameters can be optimized\ntowards this objective. Experiments verify the excellent performance and\nefficiency of our method on in-distribution, distribution-shift and\nout-of-distribution benchmarks.\n","authors":["Yingyi Chen","Qinghua Tao","Francesco Tonin","Johan A. K. 
Suykens"],"pdf_url":"https://arxiv.org/pdf/2402.01476v1.pdf","comment":"We propose Kernel-Eigen Pair Sparse Variational Gaussian Processes\n (KEP-SVGP) for building uncertainty-aware self-attention where the asymmetry\n of attention kernel is tackled by KSVD and a reduced time complexity is\n acquired"},{"id":"http://arxiv.org/abs/2402.01472v1","updated":"2024-02-02T14:57:42Z","published":"2024-02-02T14:57:42Z","title":"Synthetic Data for the Mitigation of Demographic Biases in Face\n Recognition","summary":" This study investigates the possibility of mitigating the demographic biases\nthat affect face recognition technologies through the use of synthetic data.\nDemographic biases have the potential to impact individuals from specific\ndemographic groups, and can be identified by observing disparate performance of\nface recognition systems across demographic groups. They primarily arise from\nthe unequal representations of demographic groups in the training data. In\nrecent times, synthetic data have emerged as a solution to some problems that\naffect face recognition systems. In particular, during the generation process\nit is possible to specify the desired demographic and facial attributes of\nimages, in order to control the demographic distribution of the synthesized\ndataset, and fairly represent the different demographic groups. We propose to\nfine-tune with synthetic data existing face recognition systems that present\nsome demographic biases. We use synthetic datasets generated with GANDiffFace,\na novel framework able to synthesize datasets for face recognition with\ncontrollable demographic distribution and realistic intra-class variations. We\nconsider multiple datasets representing different demographic groups for\ntraining and evaluation. Also, we fine-tune different face recognition systems,\nand evaluate their demographic fairness with different metrics. Our results\nsupport the proposed approach and the use of synthetic data to mitigate\ndemographic biases in face recognition.\n","authors":["Pietro Melzi","Christian Rathgeb","Ruben Tolosana","Ruben Vera-Rodriguez","Aythami Morales","Dominik Lawatsch","Florian Domin","Maxim Schaubert"],"pdf_url":"https://arxiv.org/pdf/2402.01472v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.01466v1","updated":"2024-02-02T14:55:36Z","published":"2024-02-02T14:55:36Z","title":"Scaled 360 layouts: Revisiting non-central panoramas","summary":" From a non-central panorama, 3D lines can be recovered by geometric\nreasoning. However, their sensitivity to noise and the complex geometric\nmodeling required has led these panoramas being very little investigated. In\nthis work we present a novel approach for 3D layout recovery of indoor\nenvironments using single non-central panoramas. We obtain the boundaries of\nthe structural lines of the room from a non-central panorama using deep\nlearning and exploit the properties of non-central projection systems in a new\ngeometrical processing to recover the scaled layout. We solve the problem for\nManhattan environments, handling occlusions, and also for Atlanta environments\nin an unified method. The experiments performed improve the state-of-the-art\nmethods for 3D layout recovery from a single panorama. Our approach is the\nfirst work using deep learning with non-central panoramas and recovering the\nscale of single panorama layouts.\n","authors":["Bruno Berenguel-Baeta","Jesus Bermudez-Cameo","Jose J. 
Guerrero"],"pdf_url":"https://arxiv.org/pdf/2402.01466v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2401.17058"},{"id":"http://arxiv.org/abs/2402.01462v1","updated":"2024-02-02T14:52:41Z","published":"2024-02-02T14:52:41Z","title":"3D Vertebrae Measurements: Assessing Vertebral Dimensions in Human Spine\n Mesh Models Using Local Anatomical Vertebral Axes","summary":" Vertebral morphological measurements are important across various\ndisciplines, including spinal biomechanics and clinical applications, pre- and\npost-operatively. These measurements also play a crucial role in\nanthropological longitudinal studies, where spinal metrics are repeatedly\ndocumented over extended periods. Traditionally, such measurements have been\nmanually conducted, a process that is time-consuming. In this study, we\nintroduce a novel, fully automated method for measuring vertebral morphology\nusing 3D meshes of lumbar and thoracic spine models.Our experimental results\ndemonstrate the method's capability to accurately measure low-resolution\npatient-specific vertebral meshes with mean absolute error (MAE) of 1.09 mm and\nthose derived from artificially created lumbar spines, where the average MAE\nvalue was 0.7 mm. Our qualitative analysis indicates that measurements obtained\nusing our method on 3D spine models can be accurately reprojected back onto the\noriginal medical images if these images are available.\n","authors":["Ivanna Kramer","Vinzent Rittel","Lara Blomenkamp","Sabine Bauer","Dietrich Paulus"],"pdf_url":"https://arxiv.org/pdf/2402.01462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01461v1","updated":"2024-02-02T14:52:24Z","published":"2024-02-02T14:52:24Z","title":"Visual Gyroscope: Combination of Deep Learning Features and Direct\n Alignment for Panoramic Stabilization","summary":" In this article we present a visual gyroscope based on equirectangular\npanoramas. We propose a new pipeline where we take advantage of combining three\ndifferent methods to obtain a robust and accurate estimation of the attitude of\nthe camera. We quantitatively and qualitatively validate our method on two\nimage sequences taken with a $360^\\circ$ dual-fisheye camera mounted on\ndifferent aerial vehicles.\n","authors":["Bruno Berenguel-Baeta","Antoine N. Andre","Guillaume Caron","Jesus Bermudez-Cameo","Jose J. Guerrero"],"pdf_url":"https://arxiv.org/pdf/2402.01461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01459v1","updated":"2024-02-02T14:50:23Z","published":"2024-02-02T14:50:23Z","title":"GaMeS: Mesh-Based Adapting and Modification of Gaussian Splatting","summary":" In recent years, a range of neural network-based methods for image rendering\nhave been introduced. For instance, widely-researched neural radiance fields\n(NeRF) rely on a neural network to represent 3D scenes, allowing for realistic\nview synthesis from a small number of 2D images. However, most NeRF models are\nconstrained by long training and inference times. In comparison, Gaussian\nSplatting (GS) is a novel, state-of-theart technique for rendering points in a\n3D scene by approximating their contribution to image pixels through Gaussian\ndistributions, warranting fast training and swift, real-time rendering. 
A\ndrawback of GS is the absence of a well-defined approach for its conditioning\ndue to the necessity to condition several hundred thousand Gaussian components.\nTo solve this, we introduce the Gaussian Mesh Splatting (GaMeS) model, a hybrid\nof a mesh and Gaussian distributions that pins all Gaussian splats on the object\nsurface (mesh). The unique contribution of our method is defining Gaussian\nsplats solely based on their location on the mesh, allowing for automatic\nadjustments in position, scale, and rotation during animation. As a result, we\nobtain high-quality renders in real-time view generation.\nFurthermore, we demonstrate that in the absence of a predefined mesh, it is\npossible to fine-tune the initial mesh during the learning process.\n","authors":["Joanna Waczyńska","Piotr Borycki","Sławomir Tadeja","Jacek Tabor","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2402.01459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01580v4","updated":"2024-02-02T14:47:15Z","published":"2023-04-04T07:17:31Z","title":"Untargeted Near-collision Attacks on Biometrics: Real-world Bounds and\n Theoretical Limits","summary":" A biometric recognition system can operate in two distinct modes:\nidentification or verification. In the first mode, the system recognizes an\nindividual by searching the enrolled templates of all the users for a match. In\nthe second mode, the system validates a user's identity claim by comparing the\nfresh provided template with the enrolled template. The biometric\ntransformation schemes usually produce binary templates that are better handled\nby cryptographic schemes, and the comparison is based on a distance that leaks\ninformation about the similarities between two biometric templates. Both the\nexperimentally determined false match rate and false non-match rate through\nrecognition threshold adjustment define the recognition accuracy, and hence the\nsecurity of the system. To our knowledge, few works provide a formal treatment\nof security in the case of minimal information leakage, i.e., the binary outcome of\na comparison with a threshold. In this paper, we focus on untargeted attacks\nthat can be carried out both online and offline, and in both identification and\nverification modes. On the one hand, we focus our analysis on the accuracy\nmetrics of biometric systems. We provide the complexity of untargeted attacks\nusing the False Match Rate (FMR) and the False Positive Identification Rate\n(FPIR) to address the security of these systems. Studying near-collisions with\nthese metrics allows us to estimate the maximum number of users in a database,\ngiven a chosen FMR, to preserve the security and the accuracy. These results\nare evaluated on systems from the literature. On the other hand, we rely on\nprobabilistic modelling to assess the theoretical security limits of biometric\nsystems. The study of this metric space, and system parameters (template size,\nthreshold and database size), gives us the complexity of untargeted attacks and\nthe probability of a near-collision.\n","authors":["Axel Durbet","Paul-Marie Grollemund","Kevin Thiry-Atighehchi"],"pdf_url":"https://arxiv.org/pdf/2304.01580v4.pdf","comment":"Addition of results"},{"id":"http://arxiv.org/abs/2402.01456v1","updated":"2024-02-02T14:44:50Z","published":"2024-02-02T14:44:50Z","title":"Convolution kernel adaptation to calibrated fisheye","summary":" Convolution kernels are the basic structural component of convolutional\nneural networks (CNNs). 
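A toy rendering of the mesh-pinned idea in the GaMeS summary above: derive each splat's position, orientation, and scale from a mesh face, so the splats follow the mesh as it deforms. The parameterization below is an illustrative assumption, not the paper's exact construction.

```python
# Toy sketch: Gaussian splat parameters derived from mesh faces.
import torch

def face_gaussians(verts, faces):
    # verts: (V, 3) float vertices; faces: (F, 3) long vertex indices
    tri = verts[faces]                       # (F, 3, 3) triangle corners
    mean = tri.mean(dim=1)                   # splat center = face centroid
    e1 = tri[:, 1] - tri[:, 0]
    e2 = tri[:, 2] - tri[:, 0]
    normal = torch.cross(e1, e2, dim=1)      # orientation from the face normal
    scale = e1.norm(dim=1, keepdim=True)     # splat size tracks the face size
    return mean, normal, scale

verts = torch.rand(5, 3)
faces = torch.tensor([[0, 1, 2], [1, 2, 3], [2, 3, 4]])
mu, n, s = face_gaussians(verts, faces)      # recompute after any mesh edit
```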
In recent years there has been growing interest in\nfisheye cameras for many applications. However, the radially symmetric\nprojection model of these cameras produces high distortions that affect the\nperformance of CNNs, especially when the field of view is very large. In this\nwork, we tackle this problem by proposing a method that leverages the\ncalibration of cameras to deform the convolution kernel accordingly and adapt\nto the distortion. That way, the receptive field of the convolution is similar\nto that of standard convolutions in perspective images, allowing us to take advantage\nof pre-trained networks in large perspective datasets. We show how, with just a\nbrief fine-tuning stage in a small dataset, we improve the performance of the\nnetwork for the calibrated fisheye with respect to standard convolutions in\ndepth estimation and semantic segmentation.\n","authors":["Bruno Berenguel-Baeta","Maria Santos-Villafranca","Jesus Bermudez-Cameo","Alejandro Perez-Yus","Jose J. Guerrero"],"pdf_url":"https://arxiv.org/pdf/2402.01456v1.pdf","comment":"Previously presented at BMVC: https://proceedings.bmvc2023.org/721/"},{"id":"http://arxiv.org/abs/2402.01444v1","updated":"2024-02-02T14:36:50Z","published":"2024-02-02T14:36:50Z","title":"Mission Critical -- Satellite Data is a Distinct Modality in Machine\n Learning","summary":" Satellite data has the potential to inspire a seismic shift for machine\nlearning -- one in which we rethink existing practices designed for traditional\ndata modalities. As machine learning for satellite data (SatML) gains traction\nfor its real-world impact, our field is at a crossroads. We can either continue\napplying ill-suited approaches, or we can initiate a new research agenda that\ncenters around the unique characteristics and challenges of satellite data.\nThis position paper argues that satellite data constitutes a distinct modality\nfor machine learning research and that we must recognize it as such to advance\nthe quality and impact of SatML research across theory, methods, and\ndeployment. We outline critical discussion questions and actionable suggestions\nto transform SatML from merely an intriguing application area to a dedicated\nresearch discipline that helps move the needle on big challenges for machine\nlearning and society.\n","authors":["Esther Rolf","Konstantin Klemmer","Caleb Robinson","Hannah Kerner"],"pdf_url":"https://arxiv.org/pdf/2402.01444v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.01422v1","updated":"2024-02-02T14:04:18Z","published":"2024-02-02T14:04:18Z","title":"EmoSpeaker: One-shot Fine-grained Emotion-Controlled Talking Face\n Generation","summary":" Implementing fine-grained emotion control is crucial for emotion generation\ntasks because it enhances the expressive capability of the generative model,\nallowing it to accurately and comprehensively capture and express various\nnuanced emotional states, thereby improving the emotional quality and\npersonalization of generated content. Generating fine-grained facial animations\nthat accurately portray emotional expressions using only a portrait and an\naudio recording presents a challenge. In order to address this challenge, we\npropose a visual attribute-guided audio decoupler. This enables obtaining\ncontent vectors solely related to the audio content, enhancing the stability\nof subsequent lip movement coefficient predictions. To achieve more precise\nemotional expression, we introduce a fine-grained emotion coefficient\nprediction module. 
Additionally, we propose an emotion intensity control method\nusing a fine-grained emotion matrix. Through these, we accomplish effective control\nover emotional expression in the generated videos and finer classification of\nemotion intensity. Subsequently, a series of 3DMM coefficient\ngeneration networks are designed to predict 3D coefficients, followed by the\nutilization of a rendering network to generate the final video. Our\nexperimental results demonstrate that our proposed method, EmoSpeaker,\noutperforms existing emotional talking face generation methods in terms of\nexpression variation and lip synchronization. Project page:\nhttps://peterfanfan.github.io/EmoSpeaker/\n","authors":["Guanwen Feng","Haoran Cheng","Yunan Li","Zhiyuan Ma","Chaoneng Li","Zhihao Qian","Qiguang Miao","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2402.01422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01410v1","updated":"2024-02-02T13:42:45Z","published":"2024-02-02T13:42:45Z","title":"XAI for Skin Cancer Detection with Prototypes and Non-Expert Supervision","summary":" Skin cancer detection through dermoscopy image analysis is a critical task.\nHowever, existing models used for this purpose often lack interpretability and\nreliability, raising concern among physicians due to their black-box nature.\nIn this paper, we propose a novel approach for the diagnosis of melanoma using\nan interpretable prototypical-part model. We introduce a guided supervision\nbased on non-expert feedback through the incorporation of: 1) binary masks,\nobtained automatically using a segmentation network; and 2) user-refined\nprototypes. These two distinct information pathways aim to ensure that the\nlearned prototypes correspond to relevant areas within the skin lesion,\nexcluding confounding factors beyond its boundaries. Experimental results\ndemonstrate that, even without expert supervision, our approach achieves\nsuperior performance and generalization compared to non-interpretable models.\n","authors":["Miguel Correia","Alceu Bissoto","Carlos Santiago","Catarina Barata"],"pdf_url":"https://arxiv.org/pdf/2402.01410v1.pdf","comment":"Accepted in the iMIMIC Workshop @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2312.02197v3","updated":"2024-02-02T13:37:52Z","published":"2023-12-02T13:35:48Z","title":"Test-Time Degradation Adaption for Open-Set Image Restoration","summary":" In contrast to close-set scenarios that restore images from a predefined set\nof degradations, open-set image restoration aims to handle the unknown\ndegradations that were unforeseen during the pretraining phase, which has been\nlittle explored as far as we know. In this work, we explicitly study this\nchallenging problem and reveal its essence, i.e., the unidentified distribution\nshifts between test and training data. Recently, test-time adaptation has emerged\nas a fundamental method to address these inherent disparities. Inspired by this,\nwe propose a test-time degradation adaption framework for open-set image\nrestoration, which involves three components, i.e., i) a pre-trained and\ndegradation-agnostic diffusion model to generate clean images, ii) a test-time\ndegradation adapter that adapts to the unknown degradations based on the input image\nduring the testing phase, and iii) the adapter-guided image restoration that guides\nthe model through the adapter to produce the corresponding clean image. 
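The three components enumerated above can be arranged into a simple test-time loop. The sketch below is schematic and heavily simplified: `prior` stands in for the frozen degradation-agnostic generator, and the convolutional adapter modeling the unknown degradation is an assumption made for illustration, not the paper's API.

```python
# Schematic sketch of test-time degradation adaptation; hypothetical modules.
import torch
import torch.nn as nn

def test_time_adapt(y, prior, n_iters=200, lr=1e-3):
    # y: observed degraded image (B, 3, H, W); prior: frozen clean-image model
    adapter = nn.Conv2d(3, 3, kernel_size=5, padding=2)  # unknown degradation
    opt = torch.optim.Adam(adapter.parameters(), lr=lr)
    x_hat = prior(y).detach()                            # clean-image proposal
    for _ in range(n_iters):
        opt.zero_grad()
        loss = ((adapter(x_hat) - y) ** 2).mean()        # re-degrade, compare
        loss.backward()
        opt.step()
    return x_hat, adapter   # the fitted adapter then guides the restoration
```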
Through\nexperiments on multiple degradations absent from the training data, we show\nthat our method achieves comparable or even better performance than\ntask-specific methods.\n","authors":["Yuanbiao Gou","Haiyu Zhao","Boyun Li","Xinyan Xiao","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2312.02197v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01393v1","updated":"2024-02-02T13:17:19Z","published":"2024-02-02T13:17:19Z","title":"ALERT-Transformer: Bridging Asynchronous and Synchronous Machine\n Learning for Real-Time Event-based Spatio-Temporal Data","summary":" We seek to enable classic processing of continuous ultra-sparse\nspatiotemporal data generated by event-based sensors with dense machine\nlearning models. We propose a novel hybrid pipeline composed of asynchronous\nsensing and synchronous processing that combines several ideas: (1) an\nembedding based on PointNet models -- the ALERT module -- that can continuously\nintegrate new and dismiss old events thanks to a leakage mechanism, (2) a\nflexible readout of the embedded data that allows feeding any downstream model\nwith always up-to-date features at any sampling rate, (3) exploiting the input\nsparsity in a patch-based approach inspired by Vision Transformer to optimize\nthe efficiency of the method. These embeddings are then processed by a\ntransformer model trained for object and gesture recognition. Using this\napproach, we achieve state-of-the-art performance with a lower latency\nthan competitors. We also demonstrate that our asynchronous model can operate\nat any desired sampling rate.\n","authors":["Carmen Martin-Turrero","Maxence Bouvier","Manuel Breitenstein","Pietro Zanuttigh","Vincent Parret"],"pdf_url":"https://arxiv.org/pdf/2402.01393v1.pdf","comment":"Preprint version. 8 pages, 7 figures, under review"},{"id":"http://arxiv.org/abs/2402.01389v1","updated":"2024-02-02T13:14:20Z","published":"2024-02-02T13:14:20Z","title":"SiMA-Hand: Boosting 3D Hand-Mesh Reconstruction by Single-to-Multi-View\n Adaptation","summary":" Estimating 3D hand mesh from RGB images is a long-standing task, in which\nocclusion is one of the most challenging problems. Existing attempts towards\nthis task often fail when the occlusion dominates the image space. In this\npaper, we propose SiMA-Hand, aiming to boost the mesh reconstruction\nperformance by Single-to-Multi-view Adaptation. First, we design a multi-view\nhand reconstructor to fuse information across multiple views by holistically\nadopting feature fusion at image, joint, and vertex levels. Then, we introduce\na single-view hand reconstructor equipped with SiMA. Though taking only one\nview as input at inference, the shape and orientation features in the\nsingle-view reconstructor can be enriched by learning non-occluded knowledge\nfrom the extra views at training, enhancing the reconstruction precision on the\noccluded regions. We conduct experiments on the Dex-YCB and HanCo benchmarks\nwith challenging object- and self-caused occlusion cases, manifesting that\nSiMA-Hand consistently achieves superior performance over the state of the\nart. 
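The single-to-multi-view adaptation just summarized amounts, at training time, to a distillation-style objective; the loss below is a hypothetical sketch with an invented weighting, not the authors' exact formulation.

```python
# Toy sketch: a single-view student mimics a multi-view teacher's features.
import torch.nn.functional as F

def sima_style_loss(student_feat, teacher_feat, pred_verts, gt_verts):
    distil = F.mse_loss(student_feat, teacher_feat.detach())  # cross-view cue
    recon = F.l1_loss(pred_verts, gt_verts)                   # mesh supervision
    return recon + 0.1 * distil                               # 0.1 is made up
```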
Code will be released on https://github.com/JoyboyWang/SiMA-Hand Pytorch.\n","authors":["Yinqiao Wang","Hao Xu","Pheng-Ann Heng","Chi-Wing Fu"],"pdf_url":"https://arxiv.org/pdf/2402.01389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01380v1","updated":"2024-02-02T13:03:20Z","published":"2024-02-02T13:03:20Z","title":"Efficient Dynamic-NeRF Based Volumetric Video Coding with Rate\n Distortion Optimization","summary":" Volumetric videos, benefiting from immersive 3D realism and interactivity,\nhold vast potential for various applications, while the tremendous data volume\nposes significant challenges for compression. Recently, NeRF has demonstrated\nremarkable potential in volumetric video compression thanks to its simple\nrepresentation and powerful 3D modeling capabilities, where a notable work is\nReRF. However, ReRF separates the modeling from the compression process, resulting\nin suboptimal compression efficiency. In contrast, in this paper, we propose a\nvolumetric video compression method based on dynamic NeRF in a more compact\nmanner. Specifically, we decompose the NeRF representation into the coefficient\nfields and the basis fields, incrementally updating the basis fields in the\ntemporal domain to achieve dynamic modeling. Additionally, we perform\nend-to-end joint optimization on the modeling and compression process to\nfurther improve the compression efficiency. Extensive experiments demonstrate\nthat our method achieves higher compression efficiency compared to ReRF on\nvarious datasets.\n","authors":["Zhiyu Zhang","Guo Lu","Huanxiong Liang","Anni Tang","Qiang Hu","Li Song"],"pdf_url":"https://arxiv.org/pdf/2402.01380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14506v2","updated":"2024-02-02T12:48:45Z","published":"2023-11-24T14:26:07Z","title":"Multi-Class Anomaly Detection based on Regularized Discriminative\n Coupled hypersphere-based Feature Adaptation","summary":" In anomaly detection, identification of anomalies across diverse product\ncategories is a complex task. This paper introduces a new model by including\nclass discriminative properties obtained by a modified Regularized\nDiscriminative Variational Auto-Encoder (RD-VAE) in the feature extraction\nprocess of Coupled-hypersphere-based Feature Adaptation (CFA). By doing so, the\nproposed Regularized Discriminative Coupled-hypersphere-based Feature\nAdaptation (RD-CFA) forms a solution for multi-class anomaly detection. 
By\nusing the discriminative power of RD-VAE to capture intricate class\ndistributions, combined with CFA's robust anomaly detection capability, the\nproposed method excels in discerning anomalies across various classes.\nExtensive evaluations on multi-class anomaly detection and localization using\nthe MVTec AD and BeanTech AD datasets showcase the effectiveness of RD-CFA\ncompared to eight leading contemporary methods.\n","authors":["Mehdi Rafiei","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2311.14506v2.pdf","comment":"14 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.01369v1","updated":"2024-02-02T12:39:49Z","published":"2024-02-02T12:39:49Z","title":"Cheating Suffix: Targeted Attack to Text-To-Image Diffusion Models with\n Multi-Modal Priors","summary":" Diffusion models have been widely deployed in various image generation tasks,\ndemonstrating an extraordinary connection between image and text modalities.\nHowever, they face challenges of being maliciously exploited to generate\nharmful or sensitive images by appending a specific suffix to the original\nprompt. Existing works mainly focus on using single-modal information to\nconduct attacks, which fails to utilize multi-modal features and results in\nless than satisfactory performance. Integrating multi-modal priors (MMP), i.e.\nboth text and image features, we propose a targeted attack method named\nMMP-Attack in this work. Specifically, the goal of MMP-Attack is to add a\ntarget object into the image content while simultaneously removing the original\nobject. The MMP-Attack shows a notable advantage over existing works with\nsuperior universality and transferability, which can effectively attack\ncommercial text-to-image (T2I) models such as DALL-E 3. To the best of our\nknowledge, this marks the first successful transfer-based attack on\ncommercial T2I models. Our code is publicly available at\n\url{https://github.com/ydc123/MMP-Attack}.\n","authors":["Dingcheng Yang","Yang Bai","Xiaojun Jia","Yang Liu","Xiaochun Cao","Wenjian Yu"],"pdf_url":"https://arxiv.org/pdf/2402.01369v1.pdf","comment":"10 figures"},{"id":"http://arxiv.org/abs/2402.01368v1","updated":"2024-02-02T12:39:47Z","published":"2024-02-02T12:39:47Z","title":"LIR: Efficient Degradation Removal for Lightweight Image Restoration","summary":" Recently, there have been significant advancements in Image Restoration based\non CNN and transformer. However, the inherent characteristics of the Image\nRestoration task are often overlooked in many works. These works often focus on\nthe basic block design and stack numerous basic blocks into the model, leading\nto redundant parameters and unnecessary computations and hindering the\nefficiency of image restoration. In this paper, we propose a Lightweight Image\nRestoration network called LIR to efficiently remove degradation (blur, rain,\nnoise, haze, etc.). A key component in LIR is the Efficient Adaptive Attention\n(EAA) Block, which is mainly composed of Adaptive Filters and Attention Blocks.\nIt is capable of adaptively sharpening contours, removing degradation, and\ncapturing global information in various image restoration scenes in an\nefficient and computation-friendly manner. In addition, through a simple\nstructural design, LIR addresses the degradations existing in the local and\nglobal residual connections that are ignored by modern networks. 
Extensive\nexperiments demonstrate that our LIR achieves comparable performance to\nstate-of-the-art networks on most benchmarks with fewer parameters and\ncomputations. It is worth noting that our LIR produces visual results that are\nmore in line with the human aesthetic than those of state-of-the-art networks.\n","authors":["Dongqi Fan","Ting Yue","Xin Zhao","Liang Chang"],"pdf_url":"https://arxiv.org/pdf/2402.01368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01355v1","updated":"2024-02-02T12:22:41Z","published":"2024-02-02T12:22:41Z","title":"FindingEmo: An Image Dataset for Emotion Recognition in the Wild","summary":" We introduce FindingEmo, a new image dataset containing annotations for 25k\nimages, specifically tailored to Emotion Recognition. Contrary to existing\ndatasets, it focuses on complex scenes depicting multiple people in various\nnaturalistic, social settings, with images being annotated as a whole, thereby\ngoing beyond the traditional focus on faces or single individuals. Annotated\ndimensions include Valence, Arousal and Emotion label, with annotations\ngathered using Prolific. Together with the annotations, we release the list of\nURLs pointing to the original images, as well as all associated source code.\n","authors":["Laurent Mertens","Elahe' Yargholi","Hans Op de Beeck","Jan Van den Stock","Joost Vennekens"],"pdf_url":"https://arxiv.org/pdf/2402.01355v1.pdf","comment":"30 pages, 21 figures, 12 tables"},{"id":"http://arxiv.org/abs/2208.12259v3","updated":"2024-02-02T12:21:32Z","published":"2022-08-25T17:59:29Z","title":"Pix4Point: Image Pretrained Standard Transformers for 3D Point Cloud\n Understanding","summary":" While Transformers have achieved impressive success in natural language\nprocessing and computer vision, their performance on 3D point clouds is\nrelatively poor. This is mainly due to the limitation of Transformers: a\ndemanding need for extensive training data. Unfortunately, in the realm of 3D\npoint clouds, the availability of large datasets is a challenge, exacerbating\nthe issue of training Transformers for 3D tasks. In this work, we solve the\ndata issue of point cloud Transformers from two perspectives: (i) introducing\nmore inductive bias to reduce the dependency of Transformers on data, and (ii)\nrelying on cross-modality pretraining. More specifically, we first present\nProgressive Point Patch Embedding and a new point cloud Transformer\nmodel named PViT. PViT shares the same backbone as Transformer but is shown to\nbe less hungry for data, enabling Transformer to achieve performance comparable\nto the state-of-the-art. Second, we formulate a simple yet effective pipeline\ndubbed \"Pix4Point\" that allows harnessing Transformers pretrained in the image\ndomain to enhance downstream point cloud understanding. This is achieved\nthrough a modality-agnostic Transformer backbone with the help of a tokenizer\nand decoder specialized in the different domains. Pretrained on a large number\nof widely available images, PViT achieves significant gains in the tasks\nof 3D point cloud classification, part segmentation, and semantic segmentation\non ScanObjectNN, ShapeNetPart, and S3DIS, respectively. 
Our code and models are\navailable at https://github.com/guochengqian/Pix4Point .\n","authors":["Guocheng Qian","Abdullah Hamdi","Xingdi Zhang","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2208.12259v3.pdf","comment":"camera-ready version at 3DV 2024"},{"id":"http://arxiv.org/abs/2402.01352v1","updated":"2024-02-02T12:11:16Z","published":"2024-02-02T12:11:16Z","title":"Describing Images $\textit{Fast and Slow}$: Quantifying and Predicting\n the Variation in Human Signals during Visuo-Linguistic Processes","summary":" There is an intricate relation between the properties of an image and how\nhumans behave while describing the image. This behavior shows ample variation,\nas manifested in human signals such as eye movements and when humans start to\ndescribe the image. Despite the value of such signals of visuo-linguistic\nvariation, they are virtually disregarded in the training of current pretrained\nmodels, which motivates further investigation. Using a corpus of Dutch image\ndescriptions with concurrently collected eye-tracking data, we explore the\nnature of the variation in visuo-linguistic signals, and find that they\ncorrelate with each other. Given this result, we hypothesize that variation\nstems partly from the properties of the images, and explore whether image\nrepresentations encoded by pretrained vision encoders can capture such\nvariation. Our results indicate that pretrained models do so to a\nweak-to-moderate degree, suggesting that the models lack biases about what\nmakes a stimulus complex for humans and what leads to variations in human\noutputs.\n","authors":["Ece Takmaz","Sandro Pezzelle","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2402.01352v1.pdf","comment":"To appear in EACL 2024"},{"id":"http://arxiv.org/abs/2402.01345v1","updated":"2024-02-02T12:02:46Z","published":"2024-02-02T12:02:46Z","title":"Skip $\textbackslash n$: A simple method to reduce hallucination in\n Large Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nfor multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks ('$\textbackslash n\textbackslash n$'),\nwhere the contents before and after '$\textbackslash n\textbackslash n$' in the\ntraining data frequently exhibit significant semantic changes. This pattern\nleads the model to infer that the contents following '$\textbackslash\nn\textbackslash n$' should be obviously different from the preceding contents\nwith less hallucinatory descriptions, thereby increasing the probability of\nhallucinatory descriptions subsequent to the '$\textbackslash n\textbackslash\nn$'. We have validated this hypothesis on multiple publicly available LVLMs.\nBesides, we find that deliberately inserting '$\textbackslash n\textbackslash\nn$' into the generated description can induce more hallucinations. 
A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of `\\textbackslash n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2311.02826v2","updated":"2024-02-02T11:56:41Z","published":"2023-11-06T02:21:11Z","title":"InstructPix2NeRF: Instructed 3D Portrait Editing from a Single Image","summary":" With the success of Neural Radiance Field (NeRF) in 3D-aware portrait\nediting, a variety of works have achieved promising results regarding both\nquality and 3D consistency. However, these methods heavily rely on per-prompt\noptimization when handling natural language as editing instructions. Due to the\nlack of labeled human face 3D datasets and effective architectures, the area of\nhuman-instructed 3D-aware editing for open-world portraits in an end-to-end\nmanner remains under-explored. To solve this problem, we propose an end-to-end\ndiffusion-based framework termed InstructPix2NeRF, which enables instructed\n3D-aware portrait editing from a single open-world image with human\ninstructions. At its core lies a conditional latent 3D diffusion process that\nlifts 2D editing to 3D space by learning the correlation between the paired\nimages' difference and the instructions via triplet data. With the help of our\nproposed token position randomization strategy, we could even achieve\nmulti-semantic editing through one single pass with the portrait identity\nwell-preserved. Besides, we further propose an identity consistency module that\ndirectly modulates the extracted identity signals into our diffusion process,\nwhich increases the multi-view 3D identity consistency. Extensive experiments\nverify the effectiveness of our method and show its superiority against strong\nbaselines quantitatively and qualitatively. Source code and pre-trained models\ncan be found on our project page:\n\\url{https://mybabyyh.github.io/InstructPix2NeRF}.\n","authors":["Jianhui Li","Shilong Liu","Zidong Liu","Yikai Wang","Kaiwen Zheng","Jinghui Xu","Jianmin Li","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.02826v2.pdf","comment":"https://github.com/mybabyyh/InstructPix2NeRF"},{"id":"http://arxiv.org/abs/2402.01335v1","updated":"2024-02-02T11:40:27Z","published":"2024-02-02T11:40:27Z","title":"Simulator-Free Visual Domain Randomization via Video Games","summary":" Domain randomization is an effective computer vision technique for improving\ntransferability of vision models across visually distinct domains exhibiting\nsimilar content. Existing approaches, however, rely extensively on tweaking\ncomplex and specialized simulation engines that are difficult to construct,\nsubsequently affecting their feasibility and scalability. This paper introduces\nBehAVE, a video understanding framework that uniquely leverages the plethora of\nexisting commercial video games for domain randomization, without requiring\naccess to their simulation engines. Under BehAVE (1) the inherent rich visual\ndiversity of video games acts as the source of randomization and (2) player\nbehavior -- represented semantically via textual descriptions of actions --\nguides the *alignment* of videos with similar content. We test BehAVE on 25\ngames of the first-person shooter (FPS) genre across various video and text\nfoundation models and we report its robustness for domain randomization. 
BehAVE\nsuccessfully aligns player behavioral patterns and is able to zero-shot\ntransfer them to multiple unseen FPS games when trained on just one FPS game.\nIn a more challenging setting, BehAVE manages to improve the zero-shot\ntransferability of foundation models to unseen FPS games (up to 22%) even when\ntrained on a game of a different genre (Minecraft). Code and dataset can be\nfound at https://github.com/nrasajski/BehAVE.\n","authors":["Chintan Trivedi","Nemanja Rašajski","Konstantinos Makantasis","Antonios Liapis","Georgios N. Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2402.01335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01331v1","updated":"2024-02-02T11:33:05Z","published":"2024-02-02T11:33:05Z","title":"A general framework for rotation invariant point cloud analysis","summary":" We propose a general method for deep learning based point cloud analysis,\nwhich is invariant to rotation on the inputs. Classical methods are vulnerable\nto rotation, as they usually take aligned point clouds as input. Principal\nComponent Analysis (PCA) is a practical approach to achieve rotation\ninvariance. However, there are still some gaps between theory and practical\nalgorithms. In this work, we present a thorough study on designing rotation\ninvariant algorithms for point cloud analysis. We first formulate it as a\npermutation invariant problem, then propose a general framework which can be\ncombined with any backbones. Our method is beneficial for further research such\nas 3D pre-training and multi-modal learning. Experiments show that our method\nhas comparable or better performance compared to state-of-the-art approaches\non common benchmarks. Code is available at\nhttps://github.com/luoshuqing2001/RI_framework.\n","authors":["Shuqing Luo","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2402.01331v1.pdf","comment":"5 pages, 1 figure, accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.01313v1","updated":"2024-02-02T11:07:27Z","published":"2024-02-02T11:07:27Z","title":"AutoGCN -- Towards Generic Human Activity Recognition with Neural\n Architecture Search","summary":" This paper introduces AutoGCN, a generic Neural Architecture Search (NAS)\nalgorithm for Human Activity Recognition (HAR) using Graph Convolution Networks\n(GCNs). HAR has gained attention due to advances in deep learning, increased\ndata availability, and enhanced computational capabilities. At the same time,\nGCNs have shown promising results in modeling relationships between body key\npoints in a skeletal graph. While domain experts often craft dataset-specific\nGCN-based methods, their applicability beyond this specific context is severely\nlimited. AutoGCN seeks to address this limitation by simultaneously searching\nfor the ideal hyperparameters and architecture combination within a versatile\nsearch space using a reinforcement controller while balancing optimal\nexploration and exploitation behavior with a knowledge reservoir during the\nsearch process. We conduct extensive experiments on two large-scale datasets\nfocused on skeleton-based action recognition to assess the proposed algorithm's\nperformance. Our experimental results underscore the effectiveness of AutoGCN\nin constructing optimal GCN architectures for HAR, outperforming conventional\nNAS and GCN methods, as well as random search. 
These findings highlight the\nsignificance of a diverse search space and an expressive input representation\nto enhance the network performance and generalizability.\n","authors":["Felix Tempel","Inga Strümke","Espen Alexander F. Ihlen"],"pdf_url":"https://arxiv.org/pdf/2402.01313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01311v1","updated":"2024-02-02T11:03:33Z","published":"2024-02-02T11:03:33Z","title":"Deep Multimodal Fusion of Data with Heterogeneous Dimensionality via\n Projective Networks","summary":" The use of multimodal imaging has led to significant improvements in the\ndiagnosis and treatment of many diseases. Similar to clinical practice, some\nworks have demonstrated the benefits of multimodal fusion for automatic\nsegmentation and classification using deep learning-based methods. However,\ncurrent segmentation methods are limited to fusion of modalities with the same\ndimensionality (e.g., 3D+3D, 2D+2D), which is not always possible, and the\nfusion strategies implemented by classification methods are incompatible with\nlocalization tasks. In this work, we propose a novel deep learning-based\nframework for the fusion of multimodal data with heterogeneous dimensionality\n(e.g., 3D+2D) that is compatible with localization tasks. The proposed\nframework extracts the features of the different modalities and projects them\ninto the common feature subspace. The projected features are then fused and\nfurther processed to obtain the final prediction. The framework was validated\non the following tasks: segmentation of geographic atrophy (GA), a late-stage\nmanifestation of age-related macular degeneration, and segmentation of retinal\nblood vessels (RBV) in multimodal retinal imaging. Our results show that the\nproposed method outperforms the state-of-the-art monomodal methods on GA and\nRBV segmentation by up to 3.10% and 4.64% Dice, respectively.\n","authors":["José Morano","Guilherme Aresta","Christoph Grechenig","Ursula Schmidt-Erfurth","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2402.01311v1.pdf","comment":"Accepted for publication in the IEEE Journal of Biomedical and Health\n Informatics (JBHI)"},{"id":"http://arxiv.org/abs/2402.01304v1","updated":"2024-02-02T10:48:43Z","published":"2024-02-02T10:48:43Z","title":"Phrase Grounding-based Style Transfer for Single-Domain Generalized\n Object Detection","summary":" Single-domain generalized object detection aims to enhance a model's\ngeneralizability to multiple unseen target domains using only data from a\nsingle source domain during training. This is a practical yet challenging task\nas it requires the model to address domain shift without incorporating target\ndomain data into training. In this paper, we propose a novel phrase\ngrounding-based style transfer (PGST) approach for the task. Specifically, we\nfirst define textual prompts to describe potential objects for each unseen\ntarget domain. Then, we leverage the grounded language-image pre-training\n(GLIP) model to learn the style of these target domains and achieve style\ntransfer from the source to the target domain. The style-transferred source\nvisual features are semantically rich and could be close to imaginary\ncounterparts in the target domain. Finally, we employ these style-transferred\nvisual features to fine-tune GLIP. By introducing imaginary counterparts, the\ndetector could be effectively generalized to unseen target domains using only a\nsingle source domain for training. 
Extensive experimental results on five\ndiverse weather driving benchmarks demonstrate that our proposed approach\nachieves state-of-the-art performance, even surpassing some domain adaptive\nmethods that incorporate target domain images into the training process. The\nsource code and pre-trained models will be made available.\n","authors":["Hao Li","Wei Wang","Cong Wang","Zhigang Luo","Xinwang Liu","Kenli Li","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2402.01304v1.pdf","comment":"22 pages, 13 figures"},{"id":"http://arxiv.org/abs/2402.01303v1","updated":"2024-02-02T10:47:08Z","published":"2024-02-02T10:47:08Z","title":"AGILE: Approach-based Grasp Inference Learned from Element Decomposition","summary":" Humans, as experts in grasp detection, can grasp objects by taking\ninto account hand-object positioning information. This work proposes a method\nto enable a robot manipulator to learn the same, grasping objects in the\noptimal way according to how the gripper has approached the object. Built on\ndeep learning, the proposed method consists of two main stages. In order to\ngeneralize the network on unseen objects, the proposed Approach-based Grasping\nInference involves an element decomposition stage to split an object into its\nmain parts, each with one or more annotated grasps for a particular approach of\nthe gripper. Subsequently, a grasp detection network utilizes the decomposed\nelements by Mask R-CNN and the information on the approach of the gripper in\norder to detect the element the gripper has approached and the optimal\ngrasp. In order to train the networks, the study introduces a robotic grasping\ndataset collected in the Coppeliasim simulation environment. The dataset\ninvolves 10 different objects with annotated element decomposition masks and\ngrasp rectangles. The proposed method acquires a 90% grasp success rate on seen\nobjects and 78% on unseen objects in the Coppeliasim simulation environment.\nLastly, simulation-to-reality domain adaptation is performed by applying\ntransformations on the training set collected in simulation and augmenting the\ndataset, which results in a 70% physical grasp success performance using a\nDelta parallel robot and a 2-fingered gripper.\n","authors":["MohammadHossein Koosheshi","Hamed Hosseini","Mehdi Tale Masouleh","Ahmad Kalhor","Mohammad Reza Hairi Yazdi"],"pdf_url":"https://arxiv.org/pdf/2402.01303v1.pdf","comment":"Conference Paper, ICROM 2023, 8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2303.08714v3","updated":"2024-02-02T10:36:37Z","published":"2023-03-15T15:50:11Z","title":"ResDiff: Combining CNN and Diffusion Model for Image Super-Resolution","summary":" Adapting the Diffusion Probabilistic Model (DPM) for direct image\nsuper-resolution is wasteful, given that a simple Convolutional Neural Network\n(CNN) can recover the main low-frequency content. Therefore, we present\nResDiff, a novel Diffusion Probabilistic Model based on Residual structure for\nSingle Image Super-Resolution (SISR). ResDiff utilizes a combination of a CNN,\nwhich restores primary low-frequency components, and a DPM, which predicts the\nresidual between the ground-truth image and the CNN-predicted image. 
In\ncontrast to the common diffusion-based methods that directly use LR images to\nguide the noise towards HR space, ResDiff utilizes the CNN's initial prediction\nto direct the noise towards the residual space between HR space and\nCNN-predicted space, which not only accelerates the generation process but also\nacquires superior sample quality. Additionally, a frequency-domain-based loss\nfunction for CNN is introduced to facilitate its restoration, and a\nfrequency-domain guided diffusion is designed for DPM for predicting\nhigh-frequency details. The extensive experiments on multiple benchmark\ndatasets demonstrate that ResDiff outperforms previous diffusion-based methods\nin terms of shorter model convergence time, superior generation quality, and\nmore diverse samples.\n","authors":["Shuyao Shang","Zhengyang Shan","Guangxing Liu","LunQian Wang","XingHua Wang","Zekai Zhang","Jinglin Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.08714v3.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.01296v1","updated":"2024-02-02T10:35:05Z","published":"2024-02-02T10:35:05Z","title":"Bi-CryptoNets: Leveraging Different-Level Privacy for Encrypted\n Inference","summary":" Privacy-preserving neural networks have attracted increasing attention in\nrecent years, and various algorithms have been developed to keep the balance\nbetween accuracy, computational complexity and information security from the\ncryptographic view. This work takes a different view from the input data and\nstructure of neural networks. We decompose the input data (e.g., some images)\ninto sensitive and insensitive segments according to importance and privacy.\nThe sensitive segment includes some important and private information such as\nhuman faces and we take strong homomorphic encryption to keep security, whereas\nthe insensitive one contains some background and we add perturbations. We\npropose the bi-CryptoNets, i.e., plaintext and ciphertext branches, to deal\nwith two segments, respectively, and the ciphertext branch could utilize the\ninformation from the plaintext branch by unidirectional connections. We adopt\nknowledge distillation for our bi-CryptoNets by transferring representations\nfrom a well-trained teacher neural network. Empirical studies show the\neffectiveness of our bi-CryptoNets and the decrease in inference latency.\n","authors":["Man-Jie Yuan","Zheng Zou","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2402.01296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01289v1","updated":"2024-02-02T10:25:39Z","published":"2024-02-02T10:25:39Z","title":"UCVC: A Unified Contextual Video Compression Framework with Joint\n P-frame and B-frame Coding","summary":" This paper presents a learned video compression method in response to the\nvideo compression track of the 6th Challenge on Learned Image Compression\n(CLIC), at DCC 2024. Specifically, we propose a unified contextual video\ncompression framework (UCVC) for joint P-frame and B-frame coding. Each\nnon-intra frame refers to two neighboring decoded frames, which can be either\nboth from the past for P-frame compression, or one from the past and one from\nthe future for B-frame compression. In the training stage, the model parameters\nare jointly optimized with both P-frames and B-frames. 
Benefiting from the designs, the\nframework can support both P-frame and B-frame coding and achieve compression\nefficiency comparable to that of methods specifically designed for P-frame or\nB-frame coding. As for the challenge submission, we report the optimal\ncompression efficiency by selecting appropriate frame types for each test\nsequence. Our team name is PKUSZ-LVC.\n","authors":["Jiayu Yang","Wei Jiang","Yongqi Zhai","Chunhui Yang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.01289v1.pdf","comment":"DCC2024, CLIC2024"},{"id":"http://arxiv.org/abs/2304.07193v2","updated":"2024-02-02T10:24:09Z","published":"2023-04-14T15:12:19Z","title":"DINOv2: Learning Robust Visual Features without Supervision","summary":" The recent breakthroughs in natural language processing for model pretraining\non large quantities of data have opened the way for similar foundation models\nin computer vision. These models could greatly simplify the use of images in\nany system by producing all-purpose visual features, i.e., features that work\nacross image distributions and tasks without finetuning. This work shows that\nexisting pretraining methods, especially self-supervised methods, can produce\nsuch features if trained on enough curated data from diverse sources. We\nrevisit existing approaches and combine different techniques to scale our\npretraining in terms of data and model size. Most of the technical\ncontributions aim at accelerating and stabilizing the training at scale. In\nterms of data, we propose an automatic pipeline to build a dedicated, diverse,\nand curated image dataset instead of uncurated data, as typically done in the\nself-supervised literature. In terms of models, we train a ViT model\n(Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of\nsmaller models that surpass the best available all-purpose features, OpenCLIP\n(Ilharco et al., 2021), on most of the benchmarks at image and pixel levels.\n","authors":["Maxime Oquab","Timothée Darcet","Théo Moutakanni","Huy Vo","Marc Szafraniec","Vasil Khalidov","Pierre Fernandez","Daniel Haziza","Francisco Massa","Alaaeldin El-Nouby","Mahmoud Assran","Nicolas Ballas","Wojciech Galuba","Russell Howes","Po-Yao Huang","Shang-Wen Li","Ishan Misra","Michael Rabbat","Vasu Sharma","Gabriel Synnaeve","Hu Xu","Hervé Jegou","Julien Mairal","Patrick Labatut","Armand Joulin","Piotr Bojanowski"],"pdf_url":"https://arxiv.org/pdf/2304.07193v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01287v1","updated":"2024-02-02T10:23:03Z","published":"2024-02-02T10:23:03Z","title":"Spiking CenterNet: A Distillation-boosted Spiking Neural Network for\n Object Detection","summary":" In the era of AI at the edge, self-driving cars, and climate change, the need\nfor energy-efficient, small, embedded AI is growing. Spiking Neural Networks\n(SNNs) are a promising approach to address this challenge, with their\nevent-driven information flow and sparse activations. We propose Spiking\nCenterNet for object detection on event data. It combines an SNN CenterNet\nadaptation with an efficient M2U-Net-based decoder. Our model significantly\noutperforms comparable previous work on Prophesee's challenging GEN1 Automotive\nDetection Dataset while using less than half the energy. 
Distilling the\nknowledge of a non-spiking teacher into our SNN further increases performance.\nTo the best of our knowledge, our work is the first approach that takes\nadvantage of knowledge distillation in the field of spiking object detection.\n","authors":["Lennard Bodden","Franziska Schwaiger","Duc Bach Ha","Lars Kreuzberg","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2402.01287v1.pdf","comment":"8 pages, 5 figures. Submitted to WCCI-2024"},{"id":"http://arxiv.org/abs/2401.17270v2","updated":"2024-02-02T10:06:24Z","published":"2024-01-30T18:59:38Z","title":"YOLO-World: Real-Time Open-Vocabulary Object Detection","summary":" The You Only Look Once (YOLO) series of detectors have established themselves\nas efficient and practical tools. However, their reliance on predefined and\ntrained object categories limits their applicability in open scenarios.\nAddressing this limitation, we introduce YOLO-World, an innovative approach\nthat enhances YOLO with open-vocabulary detection capabilities through\nvision-language modeling and pre-training on large-scale datasets.\nSpecifically, we propose a new Re-parameterizable Vision-Language Path\nAggregation Network (RepVL-PAN) and region-text contrastive loss to facilitate\nthe interaction between visual and linguistic information. Our method excels in\ndetecting a wide range of objects in a zero-shot manner with high efficiency.\nOn the challenging LVIS dataset, YOLO-World achieves 35.4 AP with 52.0 FPS on\nV100, which outperforms many state-of-the-art methods in terms of both accuracy\nand speed. Furthermore, the fine-tuned YOLO-World achieves remarkable\nperformance on several downstream tasks, including object detection and\nopen-vocabulary instance segmentation.\n","authors":["Tianheng Cheng","Lin Song","Yixiao Ge","Wenyu Liu","Xinggang Wang","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.17270v2.pdf","comment":"Work still in progress. Code & models are available at:\n https://github.com/AILab-CVC/YOLO-World"},{"id":"http://arxiv.org/abs/2305.18453v4","updated":"2024-02-02T09:56:45Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis","summary":" Artificial intelligence (AI) in healthcare, especially in medical imaging,\nfaces challenges due to data scarcity and privacy concerns. Addressing these,\nwe introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI\nsynthesis. This model effectively tackles data scarcity and privacy issues by\nintegrating semantic conditioning. This involves the channel-wise concatenation\nof a conditioning image to the model input, enabling control in image\ngeneration. Med-DDPM demonstrates superior stability and performance compared\nto existing 3D brain imaging synthesis methods. It generates diverse,\nanatomically coherent images with high visual fidelity. In terms of dice score\naccuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the\n0.6531 accuracy of real images, and outperforms baseline models. Combined with\nreal images, it further increases segmentation accuracy to 0.6675, showing the\npotential of our proposed method for data augmentation. This model represents\nthe first use of a diffusion model in 3D semantic brain MRI synthesis,\nproducing high-quality images. Its semantic conditioning feature also shows\npotential for image anonymization in biomedical imaging, addressing data and\nprivacy issues. 
We provide the code and model weights for Med-DDPM on our\nGitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support\nreproducibility.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01269v1","updated":"2024-02-02T09:47:26Z","published":"2024-02-02T09:47:26Z","title":"Spectrum-guided Feature Enhancement Network for Event Person\n Re-Identification","summary":" As a cutting-edge biosensor, the event camera holds significant potential in\nthe field of computer vision, particularly regarding privacy preservation.\nHowever, compared to traditional cameras, event streams often contain noise and\npossess extremely sparse semantics, posing a formidable challenge for\nevent-based person re-identification (event Re-ID). To address this, we\nintroduce a novel event person re-identification network: the Spectrum-guided\nFeature Enhancement Network (SFE-Net). This network consists of two innovative\ncomponents: the Multi-grain Spectrum Attention Mechanism (MSAM) and the\nConsecutive Patch Dropout Module (CPDM). MSAM employs a Fourier spectrum\ntransform strategy to filter event noise, while also utilizing an event-guided\nmulti-granularity attention strategy to enhance and capture discriminative\nperson semantics. CPDM employs a consecutive patch dropout strategy to generate\nmultiple incomplete feature maps, encouraging the deep Re-ID model to equally\nperceive each effective region of the person's body and capture robust person\ndescriptors. Extensive experiments on Event Re-ID datasets demonstrate that our\nSFE-Net achieves the best performance in this task.\n","authors":["Hongchen Tan","Yi Zhang","Xiuping Liu","Baocai Yin","Nan Ma","Xin Li","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2402.01269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14005v2","updated":"2024-02-02T09:46:22Z","published":"2023-08-27T04:50:05Z","title":"Calibrating Panoramic Depth Estimation for Practical Localization and\n Mapping","summary":" The absolute depth values of surrounding environments provide crucial cues\nfor various assistive technologies, such as localization, navigation, and 3D\nstructure estimation. We propose that accurate depth estimated from panoramic\nimages can serve as a powerful and light-weight input for a wide range of\ndownstream tasks requiring 3D information. While panoramic images can easily\ncapture the surrounding context from commodity devices, the estimated depth\nshares the limitations of conventional image-based depth estimation; the\nperformance deteriorates under large domain shifts and the absolute values are\nstill ambiguous to infer from 2D observations. By taking advantage of the\nholistic view, we mitigate such effects in a self-supervised way and fine-tune\nthe network with geometric consistency during the test phase. Specifically, we\nconstruct a 3D point cloud from the current depth prediction and project the\npoint cloud at various viewpoints or apply stretches on the current input image\nto generate synthetic panoramas. Then we minimize the discrepancy of the 3D\nstructure estimated from synthetic images without collecting additional data.\nWe empirically evaluate our method in robot navigation and map-free\nlocalization where our method shows large performance enhancements. 
Our\ncalibration method can therefore widen the applicability under various external\nconditions, serving as a key component for practical panorama-based machine\nvision systems. Code is available through the following link:\n\url{https://github.com/82magnolia/panoramic-depth-calibration}.\n","authors":["Junho Kim","Eun Sun Lee","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14005v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08143v3","updated":"2024-02-02T09:37:57Z","published":"2023-08-16T04:31:33Z","title":"IIANet: An Intra- and Inter-Modality Attention Network for Audio-Visual\n Speech Separation","summary":" Recent research has made significant progress in designing fusion modules for\naudio-visual speech separation. However, they predominantly focus on\nmulti-modal fusion at a single temporal scale of auditory and visual features\nwithout employing selective attention mechanisms, which is in sharp contrast\nwith the brain. To address this issue, we propose a novel model called Intra-\nand Inter-Attention Network (IIANet), which leverages the attention mechanism\nfor efficient audio-visual feature fusion. IIANet consists of two types of\nattention blocks: intra-attention (IntraA) and inter-attention (InterA) blocks,\nwhere the InterA blocks are distributed at the top, middle and bottom of\nIIANet. Heavily inspired by the way the human brain selectively focuses on\nrelevant content at various temporal scales, these blocks maintain the ability\nto learn modality-specific features and enable the extraction of different\nsemantics from audio-visual features. Comprehensive experiments on three\nstandard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)\ndemonstrate the effectiveness of IIANet, outperforming previous\nstate-of-the-art methods while maintaining comparable inference time. In\nparticular, the fast version of IIANet (IIANet-fast) has only 7% of CTCNet's\nMACs and is 40% faster than CTCNet on CPUs while achieving better separation\nquality, showing the great potential of attention mechanisms for efficient and\neffective multimodal fusion.\n","authors":["Kai Li","Runxuan Yang","Fuchun Sun","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08143v3.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.01262v1","updated":"2024-02-02T09:33:07Z","published":"2024-02-02T09:33:07Z","title":"Cascaded Scaling Classifier: class incremental learning with probability\n scaling","summary":" Humans are capable of acquiring new knowledge and transferring learned\nknowledge into different domains, incurring little forgetting. The same\nability, called Continual Learning, is challenging to achieve when operating\nwith neural networks due to the forgetting affecting past learned tasks when\nlearning new ones. This forgetting can be mitigated by replaying stored samples\nfrom past tasks, but a large memory size may be needed for long sequences of\ntasks; moreover, this could lead to overfitting on saved samples. In this\npaper, we propose a novel regularisation approach and a novel incremental\nclassifier called, respectively, Margin Dampening and Cascaded Scaling\nClassifier. The first combines a soft constraint and a knowledge distillation\napproach to preserve past learned knowledge while allowing the model to learn\nnew patterns effectively. The latter is a gated incremental classifier, helping\nthe model modify past predictions without directly interfering with them. 
This\nis achieved by modifying the output of the model with auxiliary scaling\nfunctions. We empirically show that our approach performs well on multiple\nbenchmarks against well-established baselines, and we also study each component\nof our proposal and how the combinations of such components affect the final\nresults.\n","authors":["Jary Pomponi","Alessio Devoto","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2402.01262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01241v1","updated":"2024-02-02T09:09:23Z","published":"2024-02-02T09:09:23Z","title":"Can Shape-Infused Joint Embeddings Improve Image-Conditioned 3D\n Diffusion?","summary":" Recent advancements in deep generative models, particularly with the\napplication of CLIP (Contrastive Language-Image Pretraining) to Denoising\nDiffusion Probabilistic Models (DDPMs), have demonstrated remarkable\neffectiveness in text-to-image generation. The well-structured embedding space\nof CLIP has also been extended to image-to-shape generation with DDPMs,\nyielding notable results. Despite these successes, some fundamental questions\narise: Does CLIP ensure the best results in shape generation from images? Can\nwe leverage conditioning to bring explicit 3D knowledge into the generative\nprocess and obtain better quality? This study introduces CISP (Contrastive\nImage Shape Pre-training), designed to enhance 3D shape synthesis guided by 2D\nimages. CISP aims to enrich the CLIP framework by aligning 2D images with 3D\nshapes in a shared embedding space, specifically capturing 3D characteristics\npotentially overlooked by CLIP's text-image focus. Our comprehensive analysis\nassesses CISP's guidance performance against CLIP-guided models, focusing on\ngeneration quality, diversity, and coherence of the produced shapes with the\nconditioning image. We find that, while matching CLIP in generation quality and\ndiversity, CISP substantially improves coherence with input images,\nunderscoring the value of incorporating 3D knowledge into generative models.\nThese findings suggest a promising direction for advancing the synthesis of 3D\nvisual content by integrating multimodal systems with 3D representations.\n","authors":["Cristian Sbrolli","Paolo Cudrano","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2402.01241v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.01239v1","updated":"2024-02-02T09:07:00Z","published":"2024-02-02T09:07:00Z","title":"PRIME: Protect Your Videos From Malicious Editing","summary":" With the development of generative models, the quality of generated content\nkeeps increasing. Recently, open-source models have made it surprisingly easy\nto manipulate and edit photos and videos, with just a few simple prompts. While\nthese cutting-edge technologies have gained popularity, they have also given\nrise to concerns regarding the privacy and portrait rights of individuals.\nMalicious users can exploit these tools for deceptive or illegal purposes.\nAlthough some previous works focus on protecting photos against generative\nmodels, we find there are still gaps between protecting videos and images in\nthe aspects of efficiency and effectiveness. Therefore, we introduce our\nprotection method, PRIME, to significantly reduce the time cost and improve the\nprotection performance. Moreover, to evaluate our proposed protection method,\nwe consider both objective metrics and human subjective metrics. 
Our evaluation\nresults indicate that PRIME costs only 8.3% of the GPU hours required by the\nprevious state-of-the-art method and achieves better protection results on both\nhuman evaluation and objective metrics. Code can be found at\nhttps://github.com/GuanlinLee/prime.\n","authors":["Guanlin Li","Shuai Yang","Jie Zhang","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.06854v3","updated":"2024-02-02T08:59:26Z","published":"2022-06-14T13:49:08Z","title":"On the explainable properties of 1-Lipschitz Neural Networks: An Optimal\n Transport Perspective","summary":" Input gradients have a pivotal role in a variety of applications, including\nadversarial attack algorithms for evaluating model robustness, explainable AI\ntechniques for generating Saliency Maps, and counterfactual\nexplanations. However, Saliency Maps generated by traditional neural networks\nare often noisy and provide limited insights. In this paper, we demonstrate\nthat, on the contrary, the Saliency Maps of 1-Lipschitz neural networks,\nlearned with the dual loss of an optimal transportation problem, exhibit\ndesirable XAI properties: they are highly concentrated on the essential parts of\nthe image with low noise, significantly outperforming state-of-the-art\nexplanation approaches across various models and metrics. We also prove that\nthese maps align unprecedentedly well with human explanations on ImageNet. To\nexplain the particularly beneficial properties of the Saliency Map for such\nmodels, we prove that this gradient encodes both the direction of the\ntransportation plan and the direction towards the nearest adversarial attack.\nFollowing the gradient down to the decision boundary is no longer considered an\nadversarial attack, but rather a counterfactual explanation that explicitly\ntransports the input from one class to another. Thus, learning with such a loss\njointly optimizes the classification objective and the alignment of the\ngradient, i.e. the Saliency Map, to the transportation plan direction. These\nnetworks were previously known to be certifiably robust by design, and we\ndemonstrate that they scale well for large problems and models, and are\ntailored for explainability using a fast and straightforward method.\n","authors":["Mathieu Serrurier","Franck Mamalet","Thomas Fel","Louis Béthune","Thibaut Boissin"],"pdf_url":"https://arxiv.org/pdf/2206.06854v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06030v2","updated":"2024-02-02T08:44:00Z","published":"2023-09-12T08:04:56Z","title":"Federated Learning for Large-Scale Scene Modeling with Neural Radiance\n Fields","summary":" We envision a system to continuously build and maintain a map based on\nearth-scale neural radiance fields (NeRF) using data collected from vehicles\nand drones in a lifelong learning manner. However, existing large-scale\nmodeling by NeRF has problems in terms of scalability and maintainability when\nmodeling earth-scale environments. Therefore, to address these problems, we\npropose a federated learning pipeline for large-scale modeling with NeRF. We\ntailor the model aggregation pipeline in federated learning for NeRF, thereby\nallowing local updates of NeRF. In the aggregation step, the accuracy of the\nclients' global pose is critical. Thus, we also propose global pose alignment\nto align the noisy global pose of clients before the aggregation step. 
In\nexperiments, we show the effectiveness of the proposed pose alignment and the\nfederated learning pipeline on the large-scale scene dataset, Mill19.\n","authors":["Teppei Suzuki"],"pdf_url":"https://arxiv.org/pdf/2309.06030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01220v1","updated":"2024-02-02T08:42:45Z","published":"2024-02-02T08:42:45Z","title":"Delving into Decision-based Black-box Attacks on Semantic Segmentation","summary":" Semantic segmentation is a fundamental visual task that finds extensive\ndeployment in applications with security-sensitive considerations. Nonetheless,\nrecent work illustrates the adversarial vulnerability of semantic segmentation\nmodels to white-box attacks. However, their adversarial robustness against\nblack-box attacks has not been fully explored. In this paper, we present the\nfirst exploration of black-box decision-based attacks on semantic segmentation.\nFirst, we analyze the challenges that semantic segmentation brings to\ndecision-based attacks through a case study. Then, to address these\nchallenges, we propose a decision-based attack on semantic segmentation,\ncalled Discrete Linear Attack (DLA). Based on random search and proxy index, we\nutilize the discrete linear noises for perturbation exploration and calibration\nto achieve high attack efficiency. We conduct an adversarial robustness\nevaluation on 5 models from Cityscapes and ADE20K under 8 attacks. DLA shows\nits formidable power on Cityscapes by dramatically reducing PSPNet's mIoU from\nan impressive 77.83% to a mere 2.14% with just 50 queries.\n","authors":["Zhaoyu Chen","Zhengyang Shan","Jingwen Chang","Kaixun Jiang","Dingkang Yang","Yiting Cheng","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01217v1","updated":"2024-02-02T08:39:51Z","published":"2024-02-02T08:39:51Z","title":"Taming Uncertainty in Sparse-view Generalizable NeRF via Indirect\n Diffusion Guidance","summary":" Neural Radiance Fields (NeRF) have demonstrated effectiveness in synthesizing\nnovel views. However, their reliance on dense inputs and scene-specific\noptimization has limited their broader applicability. Generalizable NeRFs\n(Gen-NeRF), while intended to address this, often produce blurring artifacts in\nunobserved regions with sparse inputs, which are full of uncertainty. In this\npaper, we aim to diminish the uncertainty in Gen-NeRF for plausible renderings.\nWe assume that NeRF's inability to effectively mitigate this uncertainty stems\nfrom its inherent lack of generative capacity. Therefore, we innovatively\npropose an Indirect Diffusion-guided NeRF framework, termed ID-NeRF, to address\nthis uncertainty from a generative perspective by leveraging a distilled\ndiffusion prior as guidance. Specifically, to avoid model confusion caused by\ndirectly regularizing with inconsistent samplings as in previous methods, our\napproach introduces a strategy to indirectly inject the inherently missing\nimagination into the learned implicit function through a diffusion-guided\nlatent space. 
Empirical evaluation across various benchmarks demonstrates the\nsuperior performance of our approach in handling uncertainty with sparse\ninputs.\n","authors":["Yaokun Li","Chao Gou","Guang Tan"],"pdf_url":"https://arxiv.org/pdf/2402.01217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01212v1","updated":"2024-02-02T08:37:38Z","published":"2024-02-02T08:37:38Z","title":"TSJNet: A Multi-modality Target and Semantic Awareness Joint-driven\n Image Fusion Network","summary":" Multi-modality image fusion involves integrating complementary information\nfrom different modalities into a single image. Current methods primarily focus\non enhancing image fusion with a single advanced task such as incorporating\nsemantic or object-related information into the fusion process. This approach\ncreates challenges in achieving multiple objectives simultaneously. We\nintroduce a target and semantic awareness joint-driven fusion network called\nTSJNet. TSJNet comprises fusion, detection, and segmentation subnetworks\narranged in a series structure. It leverages object and semantically relevant\ninformation derived from dual high-level tasks to guide the fusion network.\nAdditionally, we propose a local significant feature extraction module with a\ndouble parallel branch structure to fully capture the fine-grained features of\ncross-modal images and foster interaction among modalities, targets, and\nsegmentation information. We conducted extensive experiments on four publicly\navailable datasets (MSRS, M3FD, RoadScene, and LLVIP). The results demonstrate\nthat TSJNet can generate visually pleasing fused results, achieving an average\nincrease of 2.84% and 7.47% in object detection and segmentation mAP@0.5 and\nmIoU, respectively, compared to the state-of-the-art methods.\n","authors":["Yuchan Jie","Yushen Xu","Xiaosong Li","Haishu Tan"],"pdf_url":"https://arxiv.org/pdf/2402.01212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07856v2","updated":"2024-02-02T08:25:16Z","published":"2023-12-13T02:51:26Z","title":"DTL: Disentangled Transfer Learning for Visual Recognition","summary":" As pre-trained models rapidly become larger, the cost of fine-tuning on\ndownstream tasks steadily increases, too. To economically fine-tune these\nmodels, parameter-efficient transfer learning (PETL) is proposed, which only\ntunes a tiny subset of trainable parameters to efficiently learn quality\nrepresentations. However, current PETL methods are facing the dilemma that\nduring training the GPU memory footprint is not effectively reduced as\ntrainable parameters. PETL will likely fail, too, if the full fine-tuning\nencounters the out-of-GPU-memory issue. This phenomenon happens because\ntrainable parameters from these methods are generally entangled with the\nbackbone, such that a lot of intermediate states have to be stored in GPU\nmemory for gradient propagation. To alleviate this problem, we introduce\nDisentangled Transfer Learning (DTL), which disentangles the trainable\nparameters from the backbone using a lightweight Compact Side Network (CSN). By\nprogressively extracting task-specific information with a few low-rank linear\nmappings and appropriately adding the information back to the backbone, CSN\neffectively realizes knowledge transfer in various downstream tasks. 
We\nconducted extensive experiments to validate the effectiveness of our method.\nThe proposed method not only reduces a large amount of GPU memory usage and\ntrainable parameters, but also outperforms existing PETL methods by a\nsignificant margin in accuracy, achieving new state-of-the-art on several\nstandard benchmarks. The code is available at https://github.com/heekhero/DTL.\n","authors":["Minghao Fu","Ke Zhu","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2312.07856v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2402.01203v1","updated":"2024-02-02T08:13:18Z","published":"2024-02-02T08:13:18Z","title":"Structured World Modeling via Semantic Vector Quantization","summary":" Neural discrete representations are crucial components of modern neural\nnetworks. However, their main limitation is that the primary strategies such as\nVQ-VAE can only provide representations at the patch level. Therefore, one of\nthe main goals of representation learning, acquiring structured, semantic, and\ncompositional abstractions such as the color and shape of an object, remains\nelusive. In this paper, we present the first approach to semantic neural\ndiscrete representation learning. The proposed model, called Semantic\nVector-Quantized Variational Autoencoder (SVQ), leverages recent advances in\nunsupervised object-centric learning to address this limitation. Specifically,\nwe observe that a simple approach quantizing at the object level poses a\nsignificant challenge and propose constructing scene representations\nhierarchically, from low-level discrete concept schemas to object\nrepresentations. Additionally, we suggest a novel method for structured\nsemantic world modeling by training a prior over these representations,\nenabling the ability to generate images by sampling the semantic properties of\nthe objects in the scene. In experiments on various 2D and 3D object-centric\ndatasets, we find that our model achieves superior generation performance\ncompared to non-semantic vector quantization methods such as VQ-VAE and\nprevious object-centric generative models. Furthermore, we find that the\nsemantic discrete representations can solve downstream scene understanding\ntasks that require reasoning about the properties of different objects in the\nscene.\n","authors":["Yi-Fu Wu","Minseung Lee","Sungjin Ahn"],"pdf_url":"https://arxiv.org/pdf/2402.01203v1.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2311.12052v2","updated":"2024-02-02T08:06:00Z","published":"2023-11-18T10:22:44Z","title":"MagicPose: Realistic Human Poses and Facial Expressions Retargeting with\n Identity-aware Diffusion","summary":" In this work, we propose MagicPose, a diffusion-based model for 2D human pose\nand facial expression retargeting. Specifically, given a reference image, we\naim to generate a person's new images by controlling the poses and facial\nexpressions while keeping the identity unchanged. To this end, we propose a\ntwo-stage training strategy to disentangle human motions and appearance (e.g.,\nfacial expressions, skin tone and dressing), consisting of (1) the pre-training\nof an appearance-control block and (2) learning appearance-disentangled pose\ncontrol. Our novel design enables robust appearance control over generated\nhuman images, including body, facial attributes, and even background. By\nleveraging the prior knowledge of image diffusion models, MagicPose generalizes\nwell to unseen human identities and complex poses without the need for\nadditional fine-tuning. 
Moreover, the proposed model is easy to use and can be\nconsidered as a plug-in module/extension to Stable Diffusion.\n","authors":["Di Chang","Yichun Shi","Quankai Gao","Jessica Fu","Hongyi Xu","Guoxian Song","Qing Yan","Yizhe Zhu","Xiao Yang","Mohammad Soleymani"],"pdf_url":"https://arxiv.org/pdf/2311.12052v2.pdf","comment":"Project Page:https://boese0601.github.io/magicdance/\n Code:https://github.com/Boese0601/MagicDance"},{"id":"http://arxiv.org/abs/2307.07184v2","updated":"2024-02-02T08:05:10Z","published":"2023-07-14T06:34:00Z","title":"TVPR: Text-to-Video Person Retrieval and a New Benchmark","summary":" Most existing methods for text-based person retrieval focus on text-to-image\nperson retrieval. Nevertheless, due to the lack of dynamic information provided\nby isolated frames, the performance is hampered when the person is obscured in\nisolated frames or variable motion details are given in the textual\ndescription. In this paper, we propose a new task called Text-to-Video Person\nRetrieval(TVPR) which aims to effectively overcome the limitations of isolated\nframes. Since there is no dataset or benchmark that describes person videos\nwith natural language, we construct a large-scale cross-modal person video\ndataset containing detailed natural language annotations, such as person's\nappearance, actions and interactions with environment, etc., termed as\nText-to-Video Person Re-identification (TVPReid) dataset, which will be\npublicly available. To this end, a Text-to-Video Person Retrieval Network\n(TVPRN) is proposed. Specifically, TVPRN acquires video representations by\nfusing visual and motion representations of person videos, which can deal with\ntemporal occlusion and the absence of variable motion details in isolated\nframes. Meanwhile, we employ the pre-trained BERT to obtain caption\nrepresentations and the relationship between caption and video representations\nto reveal the most relevant person videos. To evaluate the effectiveness of the\nproposed TVPRN, extensive experiments have been conducted on TVPReid dataset.\nTo the best of our knowledge, TVPRN is the first successful attempt to use\nvideo for text-based person retrieval task and has achieved state-of-the-art\nperformance on TVPReid dataset. The TVPReid dataset will be publicly available\nto benefit future research.\n","authors":["Fan Ni","Xu Zhang","Jianhui Wu","Guan-Nan Dong","Aichun Zhu","Hui Liu","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.07184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07929v3","updated":"2024-02-02T08:02:35Z","published":"2023-09-13T05:43:35Z","title":"Prompting Segmentation with Sound Is Generalizable Audio-Visual Source\n Localizer","summary":" Never having seen an object and heard its sound simultaneously, can the model\nstill accurately localize its visual position from the input audio? In this\nwork, we concentrate on the Audio-Visual Localization and Segmentation tasks\nbut under the demanding zero-shot and few-shot scenarios. To achieve this goal,\ndifferent from existing approaches that mostly employ the\nencoder-fusion-decoder paradigm to decode localization information from the\nfused audio-visual feature, we introduce the encoder-prompt-decoder paradigm,\naiming to better fit the data scarcity and varying data distribution dilemmas\nwith the help of abundant knowledge from pre-trained models. 
Specifically, we\nfirst propose to construct a Semantic-aware Audio Prompt (SAP) to help the visual\nfoundation model focus on sounding objects; meanwhile, the semantic gap between\nthe visual and audio modalities is also encouraged to shrink. Then, we develop\na Correlation Adapter (ColA) to keep training efforts minimal while maintaining\nadequate knowledge of the visual foundation model. With these components in\nplace, extensive experiments demonstrate that this new paradigm\noutperforms other fusion-based methods in both the unseen class and\ncross-dataset settings. We hope that our work can further promote the\ngeneralization study of Audio-Visual Localization and Segmentation in practical\napplication scenarios.\n","authors":["Yaoting Wang","Weisong Liu","Guangyao Li","Jian Ding","Di Hu","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2309.07929v3.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2402.00863v2","updated":"2024-02-02T07:39:54Z","published":"2024-02-01T18:58:44Z","title":"Geometry Transfer for Stylizing Radiance Fields","summary":" Shape and geometric patterns are essential in defining stylistic identity.\nHowever, current 3D style transfer methods predominantly focus on transferring\ncolors and textures, often overlooking geometric aspects. In this paper, we\nintroduce Geometry Transfer, a novel method that leverages geometric\ndeformation for 3D style transfer. This technique employs depth maps to extract\na style guide, subsequently applied to stylize the geometry of radiance fields.\nMoreover, we propose new techniques that utilize geometric cues from the 3D\nscene, thereby enhancing aesthetic expressiveness and more accurately\nreflecting intended styles. Our extensive experiments show that Geometry\nTransfer enables a broader and more expressive range of stylizations, thereby\nsignificantly expanding the scope of 3D style transfer.\n","authors":["Hyunyoung Jung","Seonghyeon Nam","Nikolaos Sarafianos","Sungjoo Yoo","Alexander Sorkine-Hornung","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2402.00863v2.pdf","comment":"project page: https://hyblue.github.io/geo-srf/"},{"id":"http://arxiv.org/abs/2402.01191v1","updated":"2024-02-02T07:26:56Z","published":"2024-02-02T07:26:56Z","title":"Unsupervised Generation of Pseudo Normal PET from MRI with Diffusion\n Model for Epileptic Focus Localization","summary":" [$^{18}$F]fluorodeoxyglucose (FDG) positron emission tomography (PET) has\nemerged as a crucial tool in identifying the epileptic focus, especially in\ncases where magnetic resonance imaging (MRI) diagnosis yields indeterminate\nresults. FDG PET can provide metabolic information about glucose and help\nidentify abnormal areas that are not easily found through MRI. However, the\neffectiveness of FDG PET-based assessment and diagnosis depends on the\nselection of a healthy control group. The healthy control group typically\nconsists of healthy individuals who are similar to the epilepsy patients in age,\ngender, and other aspects; their normal FDG PET data serve as a reference for\nenhancing the accuracy and reliability of the epilepsy diagnosis. However,\nsignificant challenges arise when a healthy PET control group is unattainable.\nYaakub et al. previously introduced a Pix2PixGAN-based method for MRI to PET\ntranslation. This method used paired MRI and FDG PET scans from healthy\nindividuals for training and produced pseudo normal FDG PET images from patient\nMRIs, which are subsequently used for lesion detection. 
However, this approach requires a large amount of high-quality,\npaired MRI and PET images from healthy control subjects, which may not always\nbe available. In this study, we investigated unsupervised learning methods for\nunpaired MRI to PET translation for generating pseudo normal FDG PET for\nepileptic focus localization. Two deep learning methods, CycleGAN and SynDiff,\nwere employed, and we found that the diffusion-based method achieved better\nperformance in accurately localizing the epileptic focus.\n","authors":["Wentao Chen","Jiwei Li","Xichen Xu","Hui Huang","Siyu Yuan","Miao Zhang","Tianming Xu","Jie Luo","Weimin Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.01191v1.pdf","comment":"SPIE Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2402.01188v1","updated":"2024-02-02T07:17:39Z","published":"2024-02-02T07:17:39Z","title":"Segment Any Change","summary":" Visual foundation models have achieved remarkable results in zero-shot image\nclassification and segmentation, but zero-shot change detection remains an open\nproblem. In this paper, we propose the segment any change models (AnyChange), a\nnew type of change detection model that supports zero-shot prediction and\ngeneralization on unseen change types and data distributions. AnyChange is\nbuilt on the segment anything model (SAM) via our training-free adaptation\nmethod, bitemporal latent matching. By revealing and exploiting intra-image and\ninter-image semantic similarities in SAM's latent space, bitemporal latent\nmatching endows SAM with zero-shot change detection capabilities in a\ntraining-free way. We also propose a point query mechanism to enable\nAnyChange's zero-shot object-centric change detection capability. We perform\nextensive experiments to confirm the effectiveness of AnyChange for zero-shot\nchange detection. AnyChange sets a new record on the SECOND benchmark for\nunsupervised change detection, exceeding the previous SOTA by up to 4.4% F$_1$\nscore, and achieving comparable accuracy with negligible manual annotations (1\npixel per image) for supervised change detection.\n","authors":["Zhuo Zheng","Yanfei Zhong","Liangpei Zhang","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2402.01188v1.pdf","comment":"technical report, 12 pages"},{"id":"http://arxiv.org/abs/2310.00808v2","updated":"2024-02-02T07:14:19Z","published":"2023-10-01T22:25:40Z","title":"Completing Visual Objects via Bridging Generation and Segmentation","summary":" This paper presents a novel approach to object completion, with the primary\ngoal of reconstructing a complete object from its partially visible components.\nOur method, named MaskComp, delineates the completion process through iterative\nstages of generation and segmentation. In each iteration, the object mask is\nprovided as an additional condition to boost image generation, and, in return,\nthe generated images can lead to a more accurate mask by fusing their\nsegmentations. We demonstrate that the combination of one generation\nand one segmentation stage effectively functions as a mask denoiser. Through\nalternation between the generation and segmentation stages, the partial object\nmask is progressively refined, providing precise shape guidance and yielding\nsuperior object completion results. 
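The alternation just described fits in a few lines of Python; in the sketch below, `generate` and `segment` are placeholders for a mask-conditioned generator and a segmentation model, and the toy stand-ins exist only to make the sketch executable:

```python
import numpy as np

def maskcomp_refine(image, partial_mask, generate, segment, n_iters=5, n_samples=4):
    """Alternate mask-conditioned generation and segmentation; fusing the
    segmentations of several generated samples acts as a mask denoiser."""
    mask = partial_mask.astype(float)
    for _ in range(n_iters):
        samples = [generate(image, mask) for _ in range(n_samples)]  # mask guides generation
        fused = np.mean([segment(s) for s in samples], axis=0)       # fuse segmentations
        mask = (fused > 0.5).astype(float)                           # refined mask for next round
    return mask

# Toy demo: the hidden object is a disk, and only its left half is given.
yy, xx = np.mgrid[:64, :64]
full = ((yy - 32) ** 2 + (xx - 32) ** 2 < 15 ** 2).astype(float)
partial = full * (xx < 32)
rng = np.random.default_rng(0)
toy_generate = lambda img, m: full + 0.3 * rng.standard_normal(full.shape)
toy_segment = lambda s: (s > 0.5).astype(float)
refined = maskcomp_refine(None, partial, toy_generate, toy_segment)
print(int(partial.sum()), "->", int(refined.sum()), "mask pixels")
```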
Our experiments demonstrate the superiority\nof MaskComp over existing approaches, e.g., ControlNet and Stable Diffusion,\nestablishing it as an effective solution for object completion.\n","authors":["Xiang Li","Yinpeng Chen","Chung-Ching Lin","Hao Chen","Kai Hu","Rita Singh","Bhiksha Raj","Lijuan Wang","Zicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2310.00808v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01187v1","updated":"2024-02-02T07:13:07Z","published":"2024-02-02T07:13:07Z","title":"DeepBranchTracer: A Generally-Applicable Approach to Curvilinear\n Structure Reconstruction Using Multi-Feature Learning","summary":" Curvilinear structures, which include line-like continuous objects, are\nfundamental geometrical elements in image-based applications. Reconstructing\nthese structures from images constitutes a pivotal research area in computer\nvision. However, the complex topology and ambiguous image evidence render this\nprocess a challenging task. In this paper, we introduce DeepBranchTracer, a\nnovel method that learns both external image features and internal geometric\ncharacteristics to reconstruct curvilinear structures. Firstly, we formulate\nthe curvilinear structures extraction as a geometric attribute estimation\nproblem. Then, a curvilinear structure feature learning network is designed to\nextract essential branch attributes, including the image features of centerline\nand boundary, and the geometric features of direction and radius. Finally,\nutilizing a multi-feature fusion tracing strategy, our model iteratively traces\nthe entire branch by integrating the extracted image and geometric features. We\nextensively evaluated our model on both 2D and 3D datasets, demonstrating its\nsuperior performance over existing segmentation and reconstruction methods in\nterms of accuracy and continuity.\n","authors":["Chao Liu","Ting Zhao","Nenggan Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.01187v1.pdf","comment":"10 pages, 6 figures, AAAI 2024 accepted"},{"id":"http://arxiv.org/abs/2402.01186v1","updated":"2024-02-02T07:11:07Z","published":"2024-02-02T07:11:07Z","title":"Ambient-Pix2PixGAN for Translating Medical Images from Noisy Data","summary":" Image-to-image translation is a common task in computer vision and has been\nrapidly increasing the impact on the field of medical imaging. Deep\nlearning-based methods that employ conditional generative adversarial networks\n(cGANs), such as Pix2PixGAN, have been extensively explored to perform\nimage-to-image translation tasks. However, when noisy medical image data are\nconsidered, such methods cannot be directly applied to produce clean images.\nRecently, an augmented GAN architecture named AmbientGAN has been proposed that\ncan be trained on noisy measurement data to synthesize high-quality clean\nmedical images. Inspired by AmbientGAN, in this work, we propose a new cGAN\narchitecture, Ambient-Pix2PixGAN, for performing medical image-to-image\ntranslation tasks by use of noisy measurement data. Numerical studies that\nconsider MRI-to-PET translation are conducted. Both traditional image quality\nmetrics and task-based image quality metrics are employed to assess the\nproposed Ambient-Pix2PixGAN. 
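The AmbientGAN-style trick at the heart of this setup is to pass the generator's clean output through the known measurement operator before the discriminator ever sees it. A stripped-down, unconditional PyTorch sketch follows; the toy linear networks, the additive-Gaussian operator, and the random data are assumptions, and the Pix2Pix-style conditioning on the source modality is omitted for brevity:

```python
import torch
import torch.nn as nn

def measurement(x, sigma=0.2):
    """Known forward model; additive Gaussian noise is just one example."""
    return x + sigma * torch.randn_like(x)

G = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 32))
D = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 1))
opt_g = torch.optim.Adam(G.parameters(), lr=1e-3)
opt_d = torch.optim.Adam(D.parameters(), lr=1e-3)
bce = nn.BCEWithLogitsLoss()
ones, zeros = torch.ones(8, 1), torch.zeros(8, 1)

for step in range(100):
    noisy_real = measurement(torch.randn(8, 32))  # only measurements are observed
    fake = G(torch.randn(8, 16))
    # The discriminator compares measurements of real and generated images,
    # so G is pushed to output *clean* images whose measurements match.
    d_loss = bce(D(noisy_real), ones) + bce(D(measurement(fake.detach())), zeros)
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    g_loss = bce(D(measurement(fake)), ones)
    opt_g.zero_grad(); g_loss.backward(); opt_g.step()

print(float(d_loss), float(g_loss))
```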
It is demonstrated that our proposed\nAmbient-Pix2PixGAN can be successfully trained on noisy measurement data to\nproduce high-quality translated images in target imaging modality.\n","authors":["Wentao Chen","Xichen Xu","Jie Luo","Weimin Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.01186v1.pdf","comment":"SPIE Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2308.06341v2","updated":"2024-02-02T06:30:46Z","published":"2023-08-11T18:29:28Z","title":"Surrogate Model for Geological CO2 Storage and Its Use in Hierarchical\n MCMC History Matching","summary":" Deep-learning-based surrogate models show great promise for use in geological\ncarbon storage operations. In this work we target an important application -\nthe history matching of storage systems characterized by a high degree of\n(prior) geological uncertainty. Toward this goal, we extend the recently\nintroduced recurrent R-U-Net surrogate model to treat geomodel realizations\ndrawn from a wide range of geological scenarios. These scenarios are defined by\na set of metaparameters, which include the horizontal correlation length, mean\nand standard deviation of log-permeability, permeability anisotropy ratio, and\nconstants in the porosity-permeability relationship. An infinite number of\nrealizations can be generated for each set of metaparameters, so the range of\nprior uncertainty is large. The surrogate model is trained with flow simulation\nresults, generated using the open-source simulator GEOS, for 2000 random\nrealizations. The flow problems involve four wells, each injecting 1 Mt\nCO2/year, for 30 years. The trained surrogate model is shown to provide\naccurate predictions for new realizations over the full range of geological\nscenarios, with median relative error of 1.3% in pressure and 4.5% in\nsaturation. The surrogate model is incorporated into a hierarchical Markov\nchain Monte Carlo history matching workflow, where the goal is to generate\nhistory matched geomodel realizations and posterior estimates of the\nmetaparameters. We show that, using observed data from monitoring wells in\nsynthetic `true' models, geological uncertainty is reduced substantially. This\nleads to posterior 3D pressure and saturation fields that display much closer\nagreement with the true-model responses than do prior predictions.\n","authors":["Yifu Han","Francois P. Hamon","Su Jiang","Louis J. Durlofsky"],"pdf_url":"https://arxiv.org/pdf/2308.06341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01171v1","updated":"2024-02-02T06:30:33Z","published":"2024-02-02T06:30:33Z","title":"AmbientCycleGAN for Establishing Interpretable Stochastic Object Models\n Based on Mathematical Phantoms and Medical Imaging Measurements","summary":" Medical imaging systems that are designed for producing diagnostically\ninformative images should be objectively assessed via task-based measures of\nimage quality (IQ). Ideally, computation of task-based measures of IQ needs to\naccount for all sources of randomness in the measurement data, including the\nvariability in the ensemble of objects to be imaged. To address this need,\nstochastic object models (SOMs) that can generate an ensemble of synthesized\nobjects or phantoms can be employed. Various mathematical SOMs or phantoms were\ndeveloped that can interpretably synthesize objects, such as lumpy object\nmodels and parameterized torso phantoms. However, such SOMs that are purely\nmathematically defined may not be able to comprehensively capture realistic\nobject variations. 
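For reference, the plain lumpy object model mentioned above is simple enough to state in full: a Poisson-distributed number of Gaussian lumps is dropped at uniformly random positions. The parameter values below are illustrative assumptions, and the clustered (CLB) variant adds a second sampling stage around cluster centers:

```python
import numpy as np

def lumpy_background(size=64, mean_lumps=20, amplitude=1.0, width=5.0, rng=None):
    """Sample one realization of the lumpy object model:
    N ~ Poisson(mean_lumps) Gaussian blobs at uniform random positions."""
    rng = rng if rng is not None else np.random.default_rng()
    n = rng.poisson(mean_lumps)
    yy, xx = np.mgrid[:size, :size]
    img = np.zeros((size, size))
    for cy, cx in rng.uniform(0, size, size=(n, 2)):
        img += amplitude * np.exp(-((yy - cy) ** 2 + (xx - cx) ** 2) / (2 * width ** 2))
    return img

sample = lumpy_background(rng=np.random.default_rng(0))
print(sample.shape, round(float(sample.max()), 3))
```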
To establish realistic SOMs, it is desirable to use\nexperimental data. An augmented generative adversarial network (GAN),\nAmbientGAN, was recently proposed for establishing SOMs from medical imaging\nmeasurements. However, it remains unclear to what extent the\nAmbientGAN-produced objects can be interpretably controlled. This work\nintroduces a novel approach called AmbientCycleGAN that translates mathematical\nSOMs to realistic SOMs by use of noisy measurement data. Numerical studies that\nconsider clustered lumpy background (CLB) models and real mammograms are\nconducted. It is demonstrated that our proposed method can stably establish\nSOMs based on mathematical models and noisy measurement data. Moreover, the\nability of the proposed AmbientCycleGAN to interpretably control image features\nin the synthesized objects is investigated.\n","authors":["Xichen Xu","Wentao Chen","Weimin Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.01171v1.pdf","comment":"SPIE Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2402.01169v1","updated":"2024-02-02T06:23:00Z","published":"2024-02-02T06:23:00Z","title":"Faster Inference of Integer SWIN Transformer by Removing the GELU\n Activation","summary":" The SWIN transformer is a prominent vision transformer model that has\nstate-of-the-art accuracy in image classification tasks. Despite this success,\nits unique architecture causes slower inference compared with similar deep\nneural networks. Integer quantization of the model is one of the methods used\nto improve its inference latency. However, state-of-the-art methods have not\nbeen able to fully quantize the model. In this work, we improve upon the\ninference latency of state-of-the-art methods by removing the floating-point\noperations associated with the GELU activation in the SWIN transformer.\nWhile previous work proposed to replace the non-integer operations with linear\napproximation functions, we propose to replace GELU with the ReLU activation. The\nadvantage of ReLU over previous methods is its low memory and computation\ncomplexity. We use iterative knowledge distillation to compensate for the lost\naccuracy due to replacing GELU with ReLU. We quantize our GELU-less SWIN\ntransformer and show that on an RTX 4090 NVIDIA GPU we can improve the\ninference latency of the quantized SWIN transformer by at least $11\\%$ while\nmaintaining an accuracy drop of under $0.5\\%$ on the ImageNet evaluation\ndataset.\n","authors":["Mohammadreza Tayaranian","Seyyed Hasan Mozafari","James J. Clark","Brett Meyer","Warren Gross"],"pdf_url":"https://arxiv.org/pdf/2402.01169v1.pdf","comment":"5 pages, 1 figure. Submitted to Edge Intelligence Workshop III, an\n AAAI 2024 workshop"},{"id":"http://arxiv.org/abs/2402.01166v1","updated":"2024-02-02T06:20:44Z","published":"2024-02-02T06:20:44Z","title":"A Comprehensive Survey on 3D Content Generation","summary":" Recent years have witnessed remarkable advances in artificial intelligence\ngenerated content (AIGC), with diverse input modalities, e.g., text, image,\nvideo, audio and 3D. 3D is the visual modality closest to the real-world 3D\nenvironment and carries enormous knowledge. 3D content generation has both\nacademic and practical value while also presenting formidable technical\nchallenges. This review aims to consolidate developments within the burgeoning\ndomain of 3D content generation. 
Specifically, a new taxonomy is proposed that\ncategorizes existing approaches into three types: 3D native generative methods,\n2D prior-based 3D generative methods, and hybrid 3D generative methods. The\nsurvey covers approximately 60 papers spanning the major techniques. In addition,\nwe discuss the limitations of current 3D content generation techniques and point\nout open challenges as well as promising directions for future work.\nTo accompany this survey, we have established a project website where\nresources on 3D content generation research are provided. The project page is\navailable at https://github.com/hitcslj/Awesome-AIGC-3D.\n","authors":["Jian Liu","Xiaoshui Huang","Tianyu Huang","Lu Chen","Yuenan Hou","Shixiang Tang","Ziwei Liu","Wanli Ouyang","Wangmeng Zuo","Junjun Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01163v1","updated":"2024-02-02T06:06:45Z","published":"2024-02-02T06:06:45Z","title":"Enhanced Urban Region Profiling with Adversarial Self-Supervised\n Learning","summary":" Urban region profiling is pivotal for smart cities, but mining fine-grained\nsemantics from noisy and incomplete urban data remains challenging. In\nresponse, we propose a novel self-supervised graph collaborative filtering\nmodel for urban region embedding called EUPAS. Specifically, region\nheterogeneous graphs containing human mobility data, point-of-interest (POI)\ninformation, and geographic neighborhood details for each region are fed into\nthe model, which generates region embeddings that preserve intra-region and\ninter-region dependencies through GCNs and multi-head attention. Meanwhile, we\nintroduce spatial perturbation augmentation to generate positive samples that\nare semantically similar and spatially close to the anchor, preparing for\nsubsequent contrastive learning. Furthermore, adversarial training is employed\nto construct an effective pretext task by generating strong positive pairs and\nmining hard negative pairs for the region embeddings. Finally, we jointly\noptimize supervised and self-supervised learning to encourage the model to\ncapture the high-level semantics of region embeddings while ignoring the noisy\nand unimportant details. Extensive experiments on real-world datasets\ndemonstrate the superiority of our model over state-of-the-art methods.\n","authors":["Weiliang Chan","Qianqian Ren","Jinbao Li"],"pdf_url":"https://arxiv.org/pdf/2402.01163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01162v1","updated":"2024-02-02T06:05:18Z","published":"2024-02-02T06:05:18Z","title":"2AFC Prompting of Large Multimodal Models for Image Quality Assessment","summary":" While abundant research has been conducted on improving high-level visual\nunderstanding and reasoning capabilities of large multimodal models (LMMs),\ntheir image quality assessment (IQA) ability has been relatively\nunder-explored. Here we take initial steps towards this goal by employing\ntwo-alternative forced choice (2AFC) prompting, as 2AFC is widely regarded as\nthe most reliable way of collecting human opinions of visual quality.\nSubsequently, the global quality score of each image estimated by a particular\nLMM can be efficiently aggregated using maximum a posteriori estimation.\nMeanwhile, we introduce three evaluation criteria: consistency, accuracy, and\ncorrelation, to provide comprehensive quantifications and deeper insights into\nthe IQA capability of five LMMs. 
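To illustrate the aggregation step, the sketch below turns a matrix of pairwise 2AFC preferences into global quality scores using the standard Bradley-Terry maximum-likelihood update; the paper's exact MAP formulation may differ, so treat this as a closely related stand-in:

```python
import numpy as np

def bradley_terry(wins, n_iters=200):
    """wins[i, j] = how often image i was preferred over image j.
    Returns latent quality scores, normalized to sum to 1."""
    n = wins.shape[0]
    s = np.ones(n)
    for _ in range(n_iters):
        for i in range(n):
            total_wins = wins[i].sum()
            denom = sum((wins[i, j] + wins[j, i]) / (s[i] + s[j])
                        for j in range(n) if j != i)
            if denom > 0:
                s[i] = total_wins / denom  # classic minorize-maximize update
        s /= s.sum()
    return s

wins = np.array([[0, 8, 9],
                 [2, 0, 7],
                 [1, 3, 0]])  # toy 2AFC outcomes for three images
print(bradley_terry(wins))    # image 0 receives the highest score
```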
Extensive experiments show that existing LMMs\nexhibit remarkable IQA ability on coarse-grained quality comparison, but there\nis room for improvement on fine-grained quality discrimination. The proposed\ndataset sheds light on the future development of IQA models based on LMMs. The\ncodes will be made publicly available at https://github.com/h4nwei/2AFC-LMMs.\n","authors":["Hanwei Zhu","Xiangjie Sui","Baoliang Chen","Xuelin Liu","Peilin Chen","Yuming Fang","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2402.01162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01157v1","updated":"2024-02-02T05:53:22Z","published":"2024-02-02T05:53:22Z","title":"Source-Free Unsupervised Domain Adaptation with Hypothesis Consolidation\n of Prediction Rationale","summary":" Source-Free Unsupervised Domain Adaptation (SFUDA) is a challenging task\nwhere a model needs to be adapted to a new domain without access to target\ndomain labels or source domain data. The primary difficulty in this task is\nthat the model's predictions may be inaccurate, and using these inaccurate\npredictions for model adaptation can lead to misleading results. To address\nthis issue, this paper proposes a novel approach that considers multiple\nprediction hypotheses for each sample and investigates the rationale behind\neach hypothesis. By consolidating these hypothesis rationales, we identify the\nmost likely correct hypotheses, which we then use as a pseudo-labeled set to\nsupport a semi-supervised learning procedure for model adaptation. To achieve\nthe optimal performance, we propose a three-step adaptation process: model\npre-adaptation, hypothesis consolidation, and semi-supervised learning.\nExtensive experimental results demonstrate that our approach achieves\nstate-of-the-art performance in the SFUDA task and can be easily integrated\ninto existing approaches to improve their performance. The codes are available\nat \\url{https://github.com/GANPerf/HCPR}.\n","authors":["Yangyang Shu","Xiaofeng Cao","Qi Chen","Bowen Zhang","Ziqin Zhou","Anton van den Hengel","Lingqiao Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16150v2","updated":"2024-02-02T05:40:41Z","published":"2023-03-27T14:05:49Z","title":"Multimodal video and IMU kinematic dataset on daily life activities\n using affordable devices (VIDIMU)","summary":" Human activity recognition and clinical biomechanics are challenging problems\nin physical telerehabilitation medicine. However, most publicly available\ndatasets on human body movements cannot be used to study both problems in an\nout-of-the-lab movement acquisition setting. The objective of the VIDIMU\ndataset is to pave the way towards affordable patient gross motor tracking\nsolutions for daily life activities recognition and kinematic analysis. The\ndataset includes 13 activities registered using a commodity camera and five\ninertial sensors. The video recordings were acquired in 54 subjects, of which\n16 also had simultaneous recordings of inertial sensors. The novelty of dataset\nlies in: (i) the clinical relevance of the chosen movements, (ii) the combined\nutilization of affordable video and custom sensors, and (iii) the\nimplementation of state-of-the-art tools for multimodal data processing of 3D\nbody pose tracking and motion reconstruction in a musculoskeletal model from\ninertial data. 
The validation confirms that a minimally disturbing acquisition\nprotocol, performed according to real-life conditions can provide a\ncomprehensive picture of human joint angles during daily life activities.\n","authors":["Mario Martínez-Zarzuela","Javier González-Alonso","Míriam Antón-Rodríguez","Francisco J. Díaz-Pernas","Henning Müller","Cristina Simón-Martínez"],"pdf_url":"https://arxiv.org/pdf/2303.16150v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01149v1","updated":"2024-02-02T05:25:51Z","published":"2024-02-02T05:25:51Z","title":"Scale Equalization for Multi-Level Feature Fusion","summary":" Deep neural networks have exhibited remarkable performance in a variety of\ncomputer vision fields, especially in semantic segmentation tasks. Their\nsuccess is often attributed to multi-level feature fusion, which enables them\nto understand both global and local information from an image. However, we\nfound that multi-level features from parallel branches are on different scales.\nThe scale disequilibrium is a universal and unwanted flaw that leads to\ndetrimental gradient descent, thereby degrading performance in semantic\nsegmentation. We discover that scale disequilibrium is caused by bilinear\nupsampling, which is supported by both theoretical and empirical evidence.\nBased on this observation, we propose injecting scale equalizers to achieve\nscale equilibrium across multi-level features after bilinear upsampling. Our\nproposed scale equalizers are easy to implement, applicable to any\narchitecture, hyperparameter-free, implementable without requiring extra\ncomputational cost, and guarantee scale equilibrium for any dataset.\nExperiments showed that adopting scale equalizers consistently improved the\nmIoU index across various target datasets, including ADE20K, PASCAL VOC 2012,\nand Cityscapes, as well as various decoder choices, including UPerHead,\nPSPHead, ASPPHead, SepASPPHead, and FCNHead.\n","authors":["Bum Jun Kim","Sang Woo Kim"],"pdf_url":"https://arxiv.org/pdf/2402.01149v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2108.06545v3","updated":"2024-02-02T05:13:52Z","published":"2021-08-14T14:19:37Z","title":"PICCOLO: Point Cloud-Centric Omnidirectional Localization","summary":" We present PICCOLO, a simple and efficient algorithm for omnidirectional\nlocalization. Given a colored point cloud and a 360 panorama image of a scene,\nour objective is to recover the camera pose at which the panorama image is\ntaken. Our pipeline works in an off-the-shelf manner with a single image given\nas a query and does not require any training of neural networks or collecting\nground-truth poses of images. Instead, we match each point cloud color to the\nholistic view of the panorama image with gradient-descent optimization to find\nthe camera pose. Our loss function, called sampling loss, is point\ncloud-centric, evaluated at the projected location of every point in the point\ncloud. In contrast, conventional photometric loss is image-centric, comparing\ncolors at each pixel location. With a simple change in the compared entities,\nsampling loss effectively overcomes the severe visual distortion of\nomnidirectional images, and enjoys the global context of the 360 view to handle\nchallenging scenarios for visual localization. PICCOLO outperforms existing\nomnidirectional localization algorithms in both accuracy and stability when\nevaluated in various environments. 
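A simplified version of the point-cloud-centric sampling loss can be written in a dozen lines of PyTorch: every colored 3D point is projected into the equirectangular panorama and its color is compared at the projected location, sampled differentiably so the pose can be refined by gradient descent. The equirectangular mapping details and the toy data below are assumptions, not the authors' code:

```python
import torch
import torch.nn.functional as F

def sampling_loss(points, colors, pano, R, t):
    """Color error at the projection of every point (pano: H x W x 3)."""
    p = (points - t) @ R.T                                   # world -> camera frame
    theta = torch.atan2(p[:, 0], p[:, 2])                    # azimuth in [-pi, pi]
    ratio = torch.clamp(p[:, 1] / p.norm(dim=1).clamp(min=1e-8), -1.0, 1.0)
    phi = torch.asin(ratio)                                  # elevation in [-pi/2, pi/2]
    grid = torch.stack([theta / torch.pi, 2 * phi / torch.pi], dim=-1)
    img = pano.permute(2, 0, 1)[None]                        # (1, 3, H, W)
    sampled = F.grid_sample(img, grid[None, None], align_corners=True)  # differentiable
    return ((sampled[0, :, 0].T - colors) ** 2).mean()

points, colors = torch.randn(100, 3), torch.rand(100, 3)
pano = torch.rand(32, 64, 3)
t = torch.zeros(3, requires_grad=True)                       # pose to optimize
loss = sampling_loss(points, colors, pano, torch.eye(3), t)
loss.backward()
print(float(loss), t.grad.shape)                             # gradients flow to the pose
```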
Code is available at\n\\url{https://github.com/82magnolia/panoramic-localization/}.\n","authors":["Junho Kim","Changwoon Choi","Hojun Jang","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2108.06545v3.pdf","comment":"Accepted to ICCV 2021"},{"id":"http://arxiv.org/abs/2312.07586v4","updated":"2024-02-02T04:56:56Z","published":"2023-12-11T02:40:40Z","title":"Characteristic Guidance: Non-linear Correction for Diffusion Model at\n Large Guidance Scale","summary":" Popular guidance for denoising diffusion probabilistic model (DDPM) linearly\ncombines distinct conditional models together to provide enhanced control over\nsamples. However, this approach overlooks nonlinear effects that become\nsignificant when guidance scale is large. To address this issue, we propose\ncharacteristic guidance, a guidance method that provides first-principle\nnon-linear correction for classifier-free guidance. Such correction forces the\nguided DDPMs to respect the Fokker-Planck (FP) equation of diffusion process,\nin a way that is training-free and compatible with existing sampling methods.\nExperiments show that characteristic guidance enhances semantic characteristics\nof prompts and mitigate irregularities in image generation, proving effective\nin diverse applications ranging from simulating magnet phase transitions to\nlatent space sampling.\n","authors":["Candi Zheng","Yuan Lan"],"pdf_url":"https://arxiv.org/pdf/2312.07586v4.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2207.05317v2","updated":"2024-02-02T04:46:34Z","published":"2022-07-12T05:10:32Z","title":"CPO: Change Robust Panorama to Point Cloud Localization","summary":" We present CPO, a fast and robust algorithm that localizes a 2D panorama with\nrespect to a 3D point cloud of a scene possibly containing changes. To robustly\nhandle scene changes, our approach deviates from conventional feature point\nmatching, and focuses on the spatial context provided from panorama images.\nSpecifically, we propose efficient color histogram generation and subsequent\nrobust localization using score maps. By utilizing the unique equivariance of\nspherical projections, we propose very fast color histogram generation for a\nlarge number of camera poses without explicitly rendering images for all\ncandidate poses. We accumulate the regional consistency of the panorama and\npoint cloud as 2D/3D score maps, and use them to weigh the input color values\nto further increase robustness. The weighted color distribution quickly finds\ngood initial poses and achieves stable convergence for gradient-based\noptimization. CPO is lightweight and achieves effective localization in all\ntested scenarios, showing stable performance despite scene changes, repetitive\nstructures, or featureless regions, which are typical challenges for visual\nlocalization with perspective cameras. Code is available at\n\\url{https://github.com/82magnolia/panoramic-localization/}.\n","authors":["Junho Kim","Hojun Jang","Changwoon Choi","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2207.05317v2.pdf","comment":"Accepted to ECCV 2022"},{"id":"http://arxiv.org/abs/2310.03337v3","updated":"2024-02-02T04:45:00Z","published":"2023-10-05T06:44:13Z","title":"Denoising Diffusion Step-aware Models","summary":" Denoising Diffusion Probabilistic Models (DDPMs) have garnered popularity for\ndata generation across various domains. 
However, a significant bottleneck is\nthe necessity for whole-network computation during every step of the generative\nprocess, leading to high computational overheads. This paper presents a novel\nframework, Denoising Diffusion Step-aware Models (DDSM), to address this\nchallenge. Unlike conventional approaches, DDSM employs a spectrum of neural\nnetworks whose sizes are adapted according to the importance of each generative\nstep, as determined through evolutionary search. This step-wise network\nvariation effectively circumvents redundant computational efforts, particularly\nin less critical steps, thereby enhancing the efficiency of the diffusion\nmodel. Furthermore, the step-aware design can be seamlessly integrated with\nother efficiency-geared diffusion models such as DDIMs and latent diffusion,\nthus broadening the scope of computational savings. Empirical evaluations\ndemonstrate that DDSM achieves computational savings of 49% for CIFAR-10, 61%\nfor CelebA-HQ, 59% for LSUN-bedroom, 71% for AFHQ, and 76% for ImageNet, all\nwithout compromising the generation quality.\n","authors":["Shuai Yang","Yukang Chen","Luozhou Wang","Shu Liu","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2310.03337v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01134v1","updated":"2024-02-02T04:17:02Z","published":"2024-02-02T04:17:02Z","title":"DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping","summary":" Automated Aerial Triangulation (AAT), aiming to restore image pose and\nreconstruct sparse points simultaneously, plays a pivotal role in earth\nobservation. With its rich research heritage spanning several decades in\nphotogrammetry, AAT has evolved into a fundamental process widely applied in\nlarge-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its\nadvancements, classic AAT methods still face challenges like low efficiency and\nlimited robustness. This paper introduces DeepAAT, a deep learning network\ndesigned specifically for AAT of UAV imagery. DeepAAT considers both spatial\nand spectral characteristics of imagery, enhancing its capability to resolve\nerroneous matching pairs and accurately predict image poses. DeepAAT marks a\nsignificant leap in AAT's efficiency, ensuring thorough scene coverage and\nprecision. Its processing speed outpaces incremental AAT methods by hundreds of\ntimes and global AAT methods by tens of times while maintaining a comparable\nlevel of reconstruction accuracy. Additionally, DeepAAT's scene clustering and\nmerging strategy facilitate rapid localization and pose determination for\nlarge-scale UAV images, even under constrained computing resources. The\nexperimental results demonstrate DeepAAT's substantial improvements over\nconventional AAT methods, highlighting its potential in the efficiency and\naccuracy of UAV-based 3D reconstruction tasks. 
To benefit the photogrammetry\ncommunity, the code of DeepAAT will be released at:\nhttps://github.com/WHU-USI3DV/DeepAAT.\n","authors":["Zequan Chen","Jianping Li","Qusheng Li","Bisheng Yang","Zhen Dong"],"pdf_url":"https://arxiv.org/pdf/2402.01134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01126v1","updated":"2024-02-02T03:57:11Z","published":"2024-02-02T03:57:11Z","title":"Seeing Objects in a Cluttered World: Computational Objectness from\n Motion in Video","summary":" Perception of the visually disjoint surfaces of our cluttered world as whole\nobjects, physically distinct from those overlapping them, is a cognitive\nphenomenon called objectness that forms the basis of our visual perception.\nShared by all vertebrates and present at birth in humans, it enables\nobject-centric representation and reasoning about the visual world. We present\na computational approach to objectness that leverages motion cues and\nspatio-temporal attention using a pair of supervised spatio-temporal\nR(2+1)U-Nets. The first network detects motion boundaries and classifies the\npixels at those boundaries in terms of their local foreground-background sense.\nThis motion boundary sense (MBS) information is passed, along with a\nspatio-temporal object attention cue, to an attentional surface perception\n(ASP) module which infers the form of the attended object over a sequence of\nframes and classifies its 'pixels' as visible or obscured. The spatial form of\nthe attention cue is flexible, but it must loosely track the attended object,\nwhich need not be visible. We demonstrate the ability of this simple but novel\napproach to infer objectness from phenomenology without object models, and show\nthat it delivers robust perception of individual attended objects in cluttered\nscenes, even with blur and camera shake. We show that our data diversity and\naugmentation minimize bias and facilitate transfer to real video. Finally, we\ndescribe how this computational objectness capability can grow in\nsophistication and anchor a robust modular video object perception framework.\n","authors":["Douglas Poland","Amar Saini"],"pdf_url":"https://arxiv.org/pdf/2402.01126v1.pdf","comment":"10 pages, 11 figures, plus 18 pages of Supplemental Information"},{"id":"http://arxiv.org/abs/2402.01123v1","updated":"2024-02-02T03:50:45Z","published":"2024-02-02T03:50:45Z","title":"A Single Simple Patch is All You Need for AI-generated Image Detection","summary":" The recent development of generative models unleashes the potential of\ngenerating hyper-realistic fake images. To prevent the malicious usage of fake\nimages, AI-generated image detection aims to distinguish fake images from real\nimages. Nevertheless, existing methods usually suffer from poor\ngeneralizability across different generators. In this work, we propose an\nembarrassingly simple approach named SSP, i.e., feeding the noise pattern of a\nSingle Simple Patch (SSP) to a binary classifier, which achieves a 14.6%\nrelative improvement over a recent method on the GenImage dataset. 
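The pipeline shape is easy to sketch: pick a low-texture ("simple") patch, strip its content with a smoothing filter, and hand the residual to a binary real-vs-fake classifier. In the sketch below, the median filter and the lowest-variance patch rule are assumptions; the actual SSP denoiser and classifier may differ:

```python
import numpy as np
from scipy.ndimage import median_filter

def patch_noise_pattern(image, patch_size=32):
    """Select the lowest-variance ("simplest") patch and return its noise
    residual (patch minus a smoothed copy); the binary classifier sees the
    residual, not the image content."""
    h, w = image.shape
    best, best_var = None, np.inf
    for y in range(0, h - patch_size + 1, patch_size):
        for x in range(0, w - patch_size + 1, patch_size):
            patch = image[y:y + patch_size, x:x + patch_size]
            if patch.var() < best_var:
                best, best_var = patch, patch.var()
    return best - median_filter(best, size=3)  # high-frequency noise residual

image = np.random.default_rng(0).random((128, 128))
residual = patch_noise_pattern(image)
print(residual.shape, round(float(residual.std()), 4))  # classifier input
```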
Our SSP method\nis robust and generalizable, and could serve as a simple yet competitive\nbaseline for future methods.\n","authors":["Jiaxuan Chen","Jieteng Yao","Li Niu"],"pdf_url":"https://arxiv.org/pdf/2402.01123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00752v2","updated":"2024-02-02T03:35:04Z","published":"2024-02-01T16:43:58Z","title":"Optimal Projection for 3D Gaussian Splatting","summary":" 3D Gaussian Splatting has garnered extensive attention and application in\nreal-time neural rendering. Concurrently, concerns have been raised about the\nlimitations of this technology in aspects such as point cloud storage,\nperformance, and robustness in sparse viewpoints, leading to various\nimprovements. However, there has been a notable lack of attention to the\nprojection errors introduced by the local affine approximation inherent in the\nsplatting itself, and the consequential impact of these errors on the quality\nof photo-realistic rendering. This paper addresses the projection error\nfunction of 3D Gaussian Splatting, commencing with the residual error from the\nfirst-order Taylor expansion of the projection function $\\phi$. The analysis\nestablishes a correlation between the error and the Gaussian mean position.\nSubsequently, leveraging function optimization theory, this paper analyzes the\nfunction's minima to provide an optimal projection strategy for Gaussian\nSplatting, referred to as Optimal Gaussian Splatting. Experimental validation\nfurther confirms that this projection methodology reduces artifacts, resulting\nin a more convincingly realistic rendering.\n","authors":["Letian Huang","Jiayang Bai","Jie Guo","Yanwen Guo"],"pdf_url":"https://arxiv.org/pdf/2402.00752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00321v2","updated":"2024-02-02T03:32:19Z","published":"2024-02-01T04:15:39Z","title":"SmartCooper: Vehicular Collaborative Perception with Adaptive Fusion and\n Judger Mechanism","summary":" In recent years, autonomous driving has garnered significant attention due to\nits potential for improving road safety through collaborative perception among\nconnected and autonomous vehicles (CAVs). However, time-varying channel\nconditions in vehicular transmission environments demand dynamic allocation of\ncommunication resources. Moreover, in the context of collaborative perception,\nit is important to recognize that not all CAVs contribute valuable data, and\nsome CAV data even have detrimental effects on collaborative perception. In\nthis paper, we introduce SmartCooper, an adaptive collaborative perception\nframework that incorporates communication optimization and a judger mechanism\nto facilitate CAV data fusion. Our approach begins with optimizing the\nconnectivity of vehicles while considering communication constraints. We then\ntrain a learnable encoder to dynamically adjust the compression ratio based on\nthe channel state information (CSI). Subsequently, we devise a judger mechanism\nto filter the detrimental image data reconstructed by adaptive decoders. We\nevaluate the effectiveness of our proposed algorithm on the OpenCOOD platform.\nOur results demonstrate a substantial reduction in communication costs by\n23.10\\% compared to the non-judger scheme. 
Additionally, we achieve a\nsignificant improvement on the average precision of Intersection over Union\n(AP@IoU) by 7.15\\% compared with state-of-the-art schemes.\n","authors":["Yuang Zhang","Haonan An","Zhengru Fang","Guowen Xu","Yuan Zhou","Xianhao Chen","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2402.00321v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00351v2","updated":"2024-02-02T03:27:08Z","published":"2024-02-01T05:35:25Z","title":"Machine Unlearning for Image-to-Image Generative Models","summary":" Machine unlearning has emerged as a new paradigm to deliberately forget data\nsamples from a given model in order to adhere to stringent regulations.\nHowever, existing machine unlearning methods have been primarily focused on\nclassification models, leaving the landscape of unlearning for generative\nmodels relatively unexplored. This paper serves as a bridge, addressing the gap\nby providing a unifying framework of machine unlearning for image-to-image\ngenerative models. Within this framework, we propose a\ncomputationally-efficient algorithm, underpinned by rigorous theoretical\nanalysis, that demonstrates negligible performance degradation on the retain\nsamples, while effectively removing the information from the forget samples.\nEmpirical studies on two large-scale datasets, ImageNet-1K and Places-365,\nfurther show that our algorithm does not rely on the availability of the retain\nsamples, which further complies with data retention policy. To our best\nknowledge, this work is the first that represents systemic, theoretical,\nempirical explorations of machine unlearning specifically tailored for\nimage-to-image generative models. Our code is available at\nhttps://github.com/jpmorganchase/l2l-generator-unlearning.\n","authors":["Guihong Li","Hsiang Hsu","Chun-Fu Chen","Radu Marculescu"],"pdf_url":"https://arxiv.org/pdf/2402.00351v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.15877v2","updated":"2024-02-02T03:22:08Z","published":"2024-01-29T04:16:37Z","title":"3DPFIX: Improving Remote Novices' 3D Printing Troubleshooting through\n Human-AI Collaboration","summary":" The widespread consumer-grade 3D printers and learning resources online\nenable novices to self-train in remote settings. While troubleshooting plays an\nessential part of 3D printing, the process remains challenging for many remote\nnovices even with the help of well-developed online sources, such as online\ntroubleshooting archives and online community help. We conducted a formative\nstudy with 76 active 3D printing users to learn how remote novices leverage\nonline resources in troubleshooting and their challenges. We found that remote\nnovices cannot fully utilize online resources. For example, the online archives\nstatically provide general information, making it hard to search and relate\ntheir unique cases with existing descriptions. Online communities can\npotentially ease their struggles by providing more targeted suggestions, but a\nhelper who can provide custom help is rather scarce, making it hard to obtain\ntimely assistance. We propose 3DPFIX, an interactive 3D troubleshooting system\npowered by the pipeline to facilitate Human-AI Collaboration, designed to\nimprove novices' 3D printing experiences and thus help them easily accumulate\ntheir domain knowledge. We built 3DPFIX that supports automated diagnosis and\nsolution-seeking. 3DPFIX was built upon shared dialogues about failure cases\nfrom Q&A discourses accumulated in online communities. 
We leverage social\nannotations (i.e., comments) to build an annotated failure image dataset for AI\nclassifiers and extract a solution pool. Our summative study revealed that\nusing 3DPFIX helped participants spend significantly less effort in diagnosing\nfailures and finding a more accurate solution than relying on their common\npractice. We also found that 3DPFIX users learn about 3D printing\ndomain-specific knowledge. We discuss the implications of leveraging\ncommunity-driven data in developing future Human-AI Collaboration designs.\n","authors":["Nahyun Kwon","Tong Sun","Yuyang Gao","Liang Zhao","Xu Wang","Jeeeun Kim","Sungsoo Ray Hong"],"pdf_url":"https://arxiv.org/pdf/2401.15877v2.pdf","comment":"CSCW2024"},{"id":"http://arxiv.org/abs/2311.15040v2","updated":"2024-02-02T03:18:45Z","published":"2023-11-25T14:38:54Z","title":"InstaStyle: Inversion Noise of a Stylized Image is Secretly a Style\n Adviser","summary":" Stylized text-to-image generation focuses on creating images from textual\ndescriptions while adhering to a style specified by a few reference images.\nHowever, subtle style variations within different reference images can hinder\nthe model from accurately learning the target style. In this paper, we propose\nInstaStyle, a novel approach that excels in generating high-fidelity stylized\nimages with only a single reference image. Our approach is based on the finding\nthat the inversion noise from a stylized reference image inherently carries the\nstyle signal, as evidenced by their non-zero signal-to-noise ratio. We employ\nDDIM inversion to extract this noise from the reference image and leverage a\ndiffusion model to generate new stylized images from the \"style\" noise.\nAdditionally, the inherent ambiguity and bias of textual prompts impede the\nprecise conveying of style. To address this, we introduce a learnable style\ntoken via prompt refinement, which enhances the accuracy of the style\ndescription for the reference image. Qualitative and quantitative experimental\nresults demonstrate that InstaStyle achieves superior performance compared to\ncurrent benchmarks. Furthermore, our approach also showcases its capability in\nthe creative task of style combination with mixed inversion noise.\n","authors":["Xing Cui","Zekun Li","Pei Pei Li","Huaibo Huang","Zhaofeng He"],"pdf_url":"https://arxiv.org/pdf/2311.15040v2.pdf","comment":"21 pages,20 figures"},{"id":"http://arxiv.org/abs/2312.10109v2","updated":"2024-02-02T03:08:07Z","published":"2023-12-15T06:57:05Z","title":"Enlighten-Your-Voice: When Multimodal Meets Zero-shot Low-light Image\n Enhancement","summary":" Low-light image enhancement is a crucial visual task, and many unsupervised\nmethods tend to overlook the degradation of visible information in low-light\nscenes, which adversely affects the fusion of complementary information and\nhinders the generation of satisfactory results. To address this, our study\nintroduces \"Enlighten-Your-Voice\", a multimodal enhancement framework that\ninnovatively enriches user interaction through voice and textual commands. This\napproach does not merely signify a technical leap but also represents a\nparadigm shift in user engagement. 
Our model is equipped with a Dual\nCollaborative Attention Module (DCAM) that meticulously caters to distinct\ncontent and color discrepancies, thereby facilitating nuanced enhancements.\nComplementarily, we introduce a Semantic Feature Fusion (SFM) plug-and-play\nmodule that synergizes semantic context with low-light enhancement operations,\nsharpening the algorithm's efficacy. Crucially, \"Enlighten-Your-Voice\"\nshowcases remarkable generalization in unsupervised zero-shot scenarios. The\nsource code can be accessed from\nhttps://github.com/zhangbaijin/Enlighten-Your-Voice\n","authors":["Xiaofeng Zhang","Zishan Xu","Hao Tang","Chaochen Gu","Wei Chen","Shanying Zhu","Xinping Guan"],"pdf_url":"https://arxiv.org/pdf/2312.10109v2.pdf","comment":"It needs revised"},{"id":"http://arxiv.org/abs/2402.00281v2","updated":"2024-02-02T02:56:43Z","published":"2024-02-01T02:13:49Z","title":"Guided Interpretable Facial Expression Recognition via Spatial Action\n Unit Cues","summary":" While state-of-the-art facial expression recognition (FER) classifiers\nachieve a high level of accuracy, they lack interpretability, an important\naspect for end-users. To recognize basic facial expressions, experts resort to\na codebook associating a set of spatial action units to a facial expression. In\nthis paper, we follow in these experts' footsteps and propose a learning\nstrategy that allows us to explicitly incorporate spatial action unit (aus)\ncues into the classifier's training to build a deep interpretable model. In\nparticular, using this aus codebook, the input image's expression label, and\nfacial landmarks, a single action-unit heatmap is built to indicate the most\ndiscriminative regions of interest in the image w.r.t. the facial expression. We\nleverage this valuable spatial cue to train a deep interpretable classifier for\nFER. This is achieved by constraining the spatial layer features of a\nclassifier to be correlated with the aus map. Using a composite loss, the\nclassifier is trained to correctly classify an image while yielding\ninterpretable visual layer-wise attention correlated with aus maps, simulating\nthe experts' decision process. This is achieved using only the image class\nexpression as supervision and without any extra manual annotations. Moreover,\nour method is generic. It can be applied to any CNN- or transformer-based deep\nclassifier without the need for architectural change or adding significant\ntraining time. Our extensive evaluation on two public benchmark datasets, RAFDB\nand AFFECTNET, shows that our proposed strategy can improve layer-wise\ninterpretability without degrading classification performance. In addition, we\nexplore a common type of interpretable classifiers that rely on\nClass-Activation Mapping methods (CAMs), and we show that our training\ntechnique improves CAM interpretability.\n","authors":["Soufiane Belharbi","Marco Pedersoli","Alessandro Lameiras Koerich","Simon Bacon","Eric Granger"],"pdf_url":"https://arxiv.org/pdf/2402.00281v2.pdf","comment":"11"},{"id":"http://arxiv.org/abs/2402.01105v1","updated":"2024-02-02T02:44:59Z","published":"2024-02-02T02:44:59Z","title":"A Survey for Foundation Models in Autonomous Driving","summary":" The advent of foundation models has revolutionized the fields of natural\nlanguage processing and computer vision, paving the way for their application\nin autonomous driving (AD). This survey presents a comprehensive review of more\nthan 40 research papers, demonstrating the role of foundation models in\nenhancing AD. 
Large language models contribute to planning and simulation in\nAD, particularly through their proficiency in reasoning, code generation and\ntranslation. In parallel, vision foundation models are increasingly adapted for\ncritical tasks such as 3D object detection and tracking, as well as creating\nrealistic driving scenarios for simulation and testing. Multi-modal foundation\nmodels, integrating diverse inputs, exhibit exceptional visual understanding\nand spatial reasoning, crucial for end-to-end AD. This survey not only provides\na structured taxonomy, categorizing foundation models based on their modalities\nand functionalities within the AD domain but also delves into the methods\nemployed in current research. It identifies the gaps between existing\nfoundation models and cutting-edge AD approaches, thereby charting future\nresearch directions and proposing a roadmap for bridging these gaps.\n","authors":["Haoxiang Gao","Yaqian Li","Kaiwen Long","Ming Yang","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2402.01105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01103v1","updated":"2024-02-02T02:40:51Z","published":"2024-02-02T02:40:51Z","title":"Compositional Generative Modeling: A Single Model is Not All You Need","summary":" Large monolithic generative models trained on massive amounts of data have\nbecome an increasingly dominant approach in AI research. In this paper, we\nargue that we should instead construct large generative systems by composing\nsmaller generative models together. We show how such a compositional generative\napproach enables us to learn distributions in a more data-efficient manner,\nenabling generalization to parts of the data distribution unseen at training\ntime. We further show how this enables us to program and construct new\ngenerative models for tasks completely unseen at training. Finally, we show\nthat in many cases, we can discover separate compositional components from\ndata.\n","authors":["Yilun Du","Leslie Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2402.01103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12425v2","updated":"2024-02-02T02:06:20Z","published":"2024-01-23T01:25:00Z","title":"The Neglected Tails of Vision-Language Models","summary":" Vision-language models (VLMs) excel in zero-shot recognition but their\nperformance varies greatly across different visual concepts. For example,\nalthough CLIP achieves impressive accuracy on ImageNet (60-80%), its\nperformance drops below 10% for more than ten concepts like night snake,\npresumably due to their limited presence in the pretraining data. However,\nmeasuring the frequency of concepts in VLMs' large-scale datasets is\nchallenging. We address this by using large language models (LLMs) to count the\nnumber of pretraining texts that contain synonyms of these concepts. Our\nanalysis confirms that popular datasets, such as LAION, exhibit a long-tailed\nconcept distribution, yielding biased performance in VLMs. We also find that\ndownstream applications of VLMs, including visual chatbots (e.g., GPT-4V) and\ntext-to-image models (e.g., Stable Diffusion), often fail to recognize or\ngenerate images of rare concepts identified by our method. To mitigate the\nimbalanced performance of zero-shot VLMs, we propose REtrieval-Augmented\nLearning (REAL). First, instead of prompting VLMs using the original class\nnames, REAL uses their most frequent synonyms found in pretraining texts. 
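The first REAL step is mechanically simple, as the toy sketch below shows: count how often each synonym of a concept appears in (a sample of) the pretraining texts and prompt with the most frequent one. The corpus, synonym list, and naive substring counting are illustrative assumptions:

```python
from collections import Counter

def most_frequent_synonym(synonyms, corpus_texts):
    """Pick the synonym that occurs most often in the pretraining texts."""
    counts = Counter()
    for text in corpus_texts:
        lowered = text.lower()
        for syn in synonyms:
            counts[syn] += lowered.count(syn)  # naive substring count
    return counts.most_common(1)[0][0]

corpus = ["a cute kitty sleeping", "my cat and another cat", "tabby cat photo"]
best = most_frequent_synonym(["cat", "kitty", "tabby"], corpus)
print(f"a photo of a {best}")  # prompts with "cat", the most frequent synonym
```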
This\nsimple change already outperforms costly human-engineered and LLM-enriched\nprompts over nine benchmark datasets. Second, REAL trains a linear classifier\non a small yet balanced set of pretraining data retrieved using concept\nsynonyms. REAL surpasses the previous zero-shot SOTA, using 400x less storage\nand 10,000x less training time!\n","authors":["Shubham Parashar","Zhiqiu Lin","Tian Liu","Xiangjue Dong","Yanan Li","Deva Ramanan","James Caverlee","Shu Kong"],"pdf_url":"https://arxiv.org/pdf/2401.12425v2.pdf","comment":"Project Page:\n https://shubhamprshr27.github.io/neglected-tails-of-vlms/"},{"id":"http://arxiv.org/abs/2302.08062v3","updated":"2024-02-02T02:04:32Z","published":"2023-02-16T03:57:21Z","title":"Fossil Image Identification using Deep Learning Ensembles of Data\n Augmented Multiviews","summary":" Identification of fossil species is crucial to evolutionary studies. Recent\nadvances from deep learning have shown promising prospects in fossil image\nidentification. However, the quantity and quality of labeled fossil images are\noften limited due to fossil preservation, conditioned sampling, and expensive\nand inconsistent label annotation by domain experts, which pose great\nchallenges to training deep learning based image classification models. To\naddress these challenges, we follow the idea of the wisdom of crowds and\npropose a multiview ensemble framework, which collects Original (O), Gray (G),\nand Skeleton (S) views of each fossil image reflecting its different\ncharacteristics to train multiple base models, and then makes the final\ndecision via soft voting. Experiments on the largest fusulinid dataset with\n2400 images show that the proposed OGS consistently outperforms baselines\n(using a single model for each view), and obtains superior or comparable\nperformance compared to OOO (using three base models for the same three\nOriginal views). Moreover, as the training data decreases, the proposed\nframework achieves larger gains. When considering the identification consistency\nestimation with respect to human experts, OGS receives the highest agreement\nwith the original labels of the dataset and with the re-identifications of two\nhuman experts. The validation performance provides a quantitative estimation of\nconsistency across different experts and genera. We conclude that the proposed\nframework can present state-of-the-art performance in the fusulinid fossil\nidentification case study. This framework is designed for general fossil\nidentification and is expected to see applications to other fossil datasets\nin future work. The source code is publicly available at\nhttps://github.com/houchengbin/Fossil-Image-Identification to benefit future\nresearch in fossil image identification.\n","authors":["Chengbin Hou","Xinyu Lin","Hanhui Huang","Sheng Xu","Junxuan Fan","Yukun Shi","Hairong Lv"],"pdf_url":"https://arxiv.org/pdf/2302.08062v3.pdf","comment":"published in Methods in Ecology and Evolution"},{"id":"http://arxiv.org/abs/2401.14832v2","updated":"2024-02-02T02:01:39Z","published":"2024-01-26T13:01:28Z","title":"Text Image Inpainting via Global Structure-Guided Diffusion Models","summary":" Real-world text can be damaged by corrosion issues caused by environmental or\nhuman factors, which hinder the preservation of the complete styles of texts,\ne.g., texture and structure. 
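To make the two-step REAL recipe above concrete, here is a minimal sketch of its first step, assuming a CLIP-style encoder pair; embed_text, embed_image, and the precomputed synonym_freq counts are illustrative stand-ins, not the authors' released code:

    # Hedged sketch: prompt with each class's most frequent pretraining-text
    # synonym instead of the original class name, then classify by cosine
    # similarity. `embed_text`/`embed_image` stand in for any CLIP-style encoders.
    import numpy as np

    def pick_prompt_synonym(synonyms, synonym_freq):
        # Choose the synonym that appears most often in the pretraining corpus.
        return max(synonyms, key=lambda s: synonym_freq.get(s, 0))

    def zero_shot_classify(image, class_synonyms, synonym_freq, embed_text, embed_image):
        prompts = ["a photo of a " + pick_prompt_synonym(syns, synonym_freq)
                   for syns in class_synonyms]
        T = np.stack([embed_text(p) for p in prompts])    # (num_classes, dim)
        T = T / np.linalg.norm(T, axis=1, keepdims=True)
        v = embed_image(image)
        v = v / np.linalg.norm(v)
        return int(np.argmax(T @ v))                      # index of predicted class

The second step of the recipe would then fit an ordinary linear classifier on features of the retrieved, synonym-balanced subset.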
These corrosion issues, such as graffiti signs and\nincomplete signatures, bring difficulties in understanding the texts, thereby\nposing significant challenges to downstream applications, e.g., scene text\nrecognition and signature identification. Notably, current inpainting\ntechniques often fail to adequately address this problem and have difficulties\nrestoring accurate text images along with reasonable and consistent styles.\nFormulating this as an open problem of text image inpainting, this paper aims\nto build a benchmark to facilitate its study. In doing so, we establish two\nspecific text inpainting datasets which contain scene text images and\nhandwritten text images, respectively. Each of them includes images revamped by\nreal-life and synthetic datasets, featuring pairs of original images, corrupted\nimages, and other assistant information. On top of the datasets, we further\ndevelop a novel neural framework, Global Structure-guided Diffusion Model\n(GSDM), as a potential solution. Leveraging the global structure of the text as\na prior, the proposed GSDM develops an efficient diffusion model to recover\nclean texts. The efficacy of our approach is demonstrated by thorough empirical\nstudy, including a substantial boost in both recognition accuracy and image\nquality. These findings not only highlight the effectiveness of our method but\nalso underscore its potential to enhance the broader field of text image\nunderstanding and processing. Code and datasets are available at:\nhttps://github.com/blackprotoss/GSDM.\n","authors":["Shipeng Zhu","Pengfei Fang","Chenjie Zhu","Zuoyan Zhao","Qiang Xu","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2401.14832v2.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2402.01095v1","updated":"2024-02-02T01:58:16Z","published":"2024-02-02T01:58:16Z","title":"How many views does your deep neural network use for prediction?","summary":" The generalization ability of Deep Neural Networks (DNNs) is still not fully\nunderstood, despite numerous theoretical and empirical analyses. Recently,\nAllen-Zhu & Li (2023) introduced the concept of multi-views to explain the\ngeneralization ability of DNNs, but their main target is ensemble or distilled\nmodels, and no method for estimating multi-views used in a prediction of a\nspecific input is discussed. In this paper, we propose Minimal Sufficient Views\n(MSVs), which is similar to multi-views but can be efficiently computed for\nreal images. MSVs is a set of minimal and distinct features in an input, each\nof which preserves a model's prediction for the input. We empirically show that\nthere is a clear relationship between the number of MSVs and prediction\naccuracy across models, including convolutional and transformer models,\nsuggesting that a multi-view like perspective is also important for\nunderstanding the generalization ability of (non-ensemble or non-distilled)\nDNNs.\n","authors":["Keisuke Kawano","Takuro Kutsuna","Keisuke Sano"],"pdf_url":"https://arxiv.org/pdf/2402.01095v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2402.00225v2","updated":"2024-02-02T01:55:32Z","published":"2024-01-31T23:06:39Z","title":"Geometry aware 3D generation from in-the-wild images in ImageNet","summary":" Generating accurate 3D models is a challenging problem that traditionally\nrequires explicit learning from 3D datasets using supervised learning. 
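One plausible greedy reading of the MSVs definition above (a minimal feature subset that preserves the model's prediction) is sketched below; predict, mask, and the feature partition are assumptions, not the paper's exact procedure:

    # Hedged sketch: greedily drop features while the model's prediction on the
    # masked input is preserved, yielding one inclusion-minimal sufficient view.
    def minimal_sufficient_view(x, features, predict, mask):
        target = predict(x)                   # prediction to preserve
        keep = set(features)
        for f in list(features):
            trial = keep - {f}
            if predict(mask(x, keep=trial)) == target:
                keep = trial                  # f was not needed; drop it
        return keep

Repeating the search on the remaining features would produce further distinct views, whose count the abstract relates to prediction accuracy.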
Although\nrecent advances have shown promise in learning 3D models from 2D images, these\nmethods often rely on well-structured datasets with multi-view images of each\ninstance or camera pose information. Furthermore, these datasets usually\ncontain clean backgrounds with simple shapes, making them expensive to acquire\nand hard to generalize, which limits the applicability of these methods. To\novercome these limitations, we propose a method for reconstructing 3D geometry\nfrom the diverse and unstructured Imagenet dataset without camera pose\ninformation. We use an efficient triplane representation to learn 3D models\nfrom 2D images and modify the architecture of the generator backbone based on\nStyleGAN2 to adapt to the highly diverse dataset. To prevent mode collapse and\nimprove the training stability on diverse data, we propose to use multi-view\ndiscrimination. The trained generator can produce class-conditional 3D models\nas well as renderings from arbitrary viewpoints. The class-conditional\ngeneration results demonstrate significant improvement over the current\nstate-of-the-art method. Additionally, using PTI, we can efficiently\nreconstruct the whole 3D geometry from single-view images.\n","authors":["Qijia Shen","Guangrun Wang"],"pdf_url":"https://arxiv.org/pdf/2402.00225v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08594v2","updated":"2024-02-02T01:41:21Z","published":"2023-12-14T01:33:18Z","title":"CT-MVSNet: Efficient Multi-View Stereo with Cross-scale Transformer","summary":" Recent deep multi-view stereo (MVS) methods have widely incorporated\ntransformers into cascade network for high-resolution depth estimation,\nachieving impressive results. However, existing transformer-based methods are\nconstrained by their computational costs, preventing their extension to finer\nstages. In this paper, we propose a novel cross-scale transformer (CT) that\nprocesses feature representations at different stages without additional\ncomputation. Specifically, we introduce an adaptive matching-aware transformer\n(AMT) that employs different interactive attention combinations at multiple\nscales. This combined strategy enables our network to capture intra-image\ncontext information and enhance inter-image feature relationships. Besides, we\npresent a dual-feature guided aggregation (DFGA) that embeds the coarse global\nsemantic information into the finer cost volume construction to further\nstrengthen global and local feature awareness. Meanwhile, we design a feature\nmetric loss (FM Loss) that evaluates the feature bias before and after\ntransformation to reduce the impact of feature mismatch on depth estimation.\nExtensive experiments on DTU dataset and Tanks and Temples (T\\&T) benchmark\ndemonstrate that our method achieves state-of-the-art results. Code is\navailable at https://github.com/wscstrive/CT-MVSNet.\n","authors":["Sicheng Wang","Hao Jiang","Lei Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.08594v2.pdf","comment":"Accepted at the 30th International Conference on Multimedia\n Modeling(MMM'24 Oral)"},{"id":"http://arxiv.org/abs/2401.17874v2","updated":"2024-02-02T00:21:31Z","published":"2024-01-31T14:32:56Z","title":"VR-based generation of photorealistic synthetic data for training\n hand-object tracking models","summary":" Supervised learning models for precise tracking of hand-object interactions\n(HOI) in 3D require large amounts of annotated data for training. Moreover, it\nis not intuitive for non-experts to label 3D ground truth (e.g. 
6DoF object\npose) on 2D images. To address these issues, we present \"blender-hoisynth\", an\ninteractive synthetic data generator based on the Blender software.\nBlender-hoisynth can scalably generate and automatically annotate visual HOI\ntraining data. Other competing approaches usually generate synthetic HOI data\ncompletely without human input. While this may be beneficial in some\nscenarios, HOI applications inherently necessitate direct control over the HOIs\nas an expression of human intent. With blender-hoisynth, it is possible for\nusers to interact with objects via virtual hands using standard Virtual Reality\nhardware. The synthetically generated data are characterized by a high degree\nof photorealism and contain visually plausible and physically realistic videos\nof hands grasping objects and moving them around in 3D. To demonstrate the\nefficacy of our data generation, we replace large parts of the training data in\nthe well-known DexYCB dataset with hoisynth data and train a state-of-the-art\nHOI reconstruction model with it. We show that there is no significant\ndegradation in the model performance despite the data replacement.\n","authors":["Chengyan Zhang","Rahul Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2401.17874v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.05975v3","updated":"2024-02-02T17:30:51Z","published":"2024-01-11T15:22:55Z","title":"End-to-end Learnable Clustering for Intent Learning in Recommendation","summary":" Intent learning, which aims to learn users' intents for user understanding\nand item recommendation, has become a hot research topic in recent years.\nHowever, existing methods suffer from complex and cumbersome alternating\noptimization, limiting their performance and scalability. To this end, we propose\na novel intent learning method, ELCRec, which unifies behavior\nrepresentation learning into an End-to-end Learnable Clustering framework\nfor effective and efficient Recommendation. Concretely, we encode users' behavior sequences and\ninitialize the cluster centers (latent intents) as learnable neurons. Then, we\ndesign a novel learnable clustering module to separate different cluster\ncenters, thus decoupling users' complex intents. Meanwhile, it guides the\nnetwork to learn intents from behaviors by forcing behavior embeddings close to\ncluster centers. This allows simultaneous optimization of recommendation and\nclustering via mini-batch data. Moreover, we propose intent-assisted\ncontrastive learning that uses cluster centers as self-supervision signals,\nfurther enhancing mutual promotion. Both experimental results and theoretical\nanalyses demonstrate the superiority of ELCRec from six perspectives. Compared\nto the runner-up, ELCRec improves NDCG@5 by 8.9% and reduces computational\ncosts by 22.5% on the Beauty dataset. 
Furthermore, due to the scalability and\nuniversal applicability, we deploy this method on the industrial recommendation\nsystem with 130 million page views and achieve promising results.\n","authors":["Yue Liu","Shihao Zhu","Jun Xia","Yingwei Ma","Jian Ma","Wenliang Zhong","Xinwang Liu","Guannan Zhang","Kejun Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.05975v3.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2306.05817v5","updated":"2024-02-02T12:11:44Z","published":"2023-06-09T11:31:50Z","title":"How Can Recommender Systems Benefit from Large Language Models: A Survey","summary":" With the rapid development of online services, recommender systems (RS) have\nbecome increasingly indispensable for mitigating information overload. Despite\nremarkable progress, conventional recommendation models (CRM) still have some\nlimitations, e.g., lacking open-world knowledge, and difficulties in\ncomprehending users' underlying preferences and motivations. Meanwhile, large\nlanguage models (LLM) have shown impressive general intelligence and human-like\ncapabilities, which mainly stem from their extensive open-world knowledge,\nreasoning ability, as well as their comprehension of human culture and society.\nConsequently, the emergence of LLM is inspiring the design of recommender\nsystems and pointing out a promising research direction, i.e., whether we can\nincorporate LLM and benefit from their knowledge and capabilities to compensate\nfor the limitations of CRM. In this paper, we conduct a comprehensive survey on\nthis research direction from the perspective of the whole pipeline in\nreal-world recommender systems. Specifically, we summarize existing works from\ntwo orthogonal aspects: where and how to adapt LLM to RS. For the WHERE\nquestion, we discuss the roles that LLM could play in different stages of the\nrecommendation pipeline, i.e., feature engineering, feature encoder,\nscoring/ranking function, user interaction, and pipeline controller. For the\nHOW question, we investigate the training and inference strategies, resulting\nin two fine-grained taxonomy criteria, i.e., whether to tune LLM or not, and\nwhether to involve conventional recommendation models for inference. Then, we\nhighlight key challenges in adapting LLM to RS from three aspects, i.e.,\nefficiency, effectiveness, and ethics. Finally, we summarize the survey and\ndiscuss the future prospects. We actively maintain a GitHub repository for\npapers and other related resources:\nhttps://github.com/CHIANGEL/Awesome-LLM-for-RecSys/.\n","authors":["Jianghao Lin","Xinyi Dai","Yunjia Xi","Weiwen Liu","Bo Chen","Hao Zhang","Yong Liu","Chuhan Wu","Xiangyang Li","Chenxu Zhu","Huifeng Guo","Yong Yu","Ruiming Tang","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.05817v5.pdf","comment":"New version released with 27-page main content; Look-up table in\n appendix"},{"id":"http://arxiv.org/abs/2402.01339v1","updated":"2024-02-02T11:52:07Z","published":"2024-02-02T11:52:07Z","title":"Improving Sequential Recommendations with LLMs","summary":" The sequential recommendation problem has attracted considerable research\nattention in the past few years, leading to the rise of numerous recommendation\nmodels. In this work, we explore how Large Language Models (LLMs), which are\nnowadays introducing disruptive effects in many AI-based applications, can be\nused to build or improve sequential recommendation approaches. Specifically, we\ndesign three orthogonal approaches and hybrids of those to leverage the power\nof LLMs in different ways. 
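The end-to-end learnable clustering idea in the ELCRec abstract above can be sketched roughly as follows; the cosine-similarity losses and their weighting are assumptions, not the released implementation:

    # Hedged sketch: cluster centers are ordinary parameters, trained jointly
    # with the recommendation loss; one term pulls each behavior embedding
    # toward its nearest center, another pushes centers apart to decouple intents.
    import torch
    import torch.nn.functional as F

    class LearnableClustering(torch.nn.Module):
        def __init__(self, num_intents, dim):
            super().__init__()
            self.centers = torch.nn.Parameter(torch.randn(num_intents, dim))

        def forward(self, z):                          # z: (batch, dim) embeddings
            c = F.normalize(self.centers, dim=1)
            z = F.normalize(z, dim=1)
            sim = z @ c.t()                            # (batch, num_intents)
            pull = (1.0 - sim.max(dim=1).values).mean()        # embeddings -> centers
            sep = (c @ c.t()).triu(diagonal=1).relu().mean()   # centers apart
            return pull + sep

    # used as: total_loss = rec_loss + lambda_cluster * cluster_module(z)

Because both terms are differentiable in the centers and the encoder, recommendation and clustering can indeed be optimized together on mini-batches, as the abstract describes.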
In addition, we investigate the potential of each\napproach by focusing on its comprising technical aspects and determining an\narray of alternative choices for each one. We conduct extensive experiments on\nthree datasets and explore a large variety of configurations, including\ndifferent language models and baseline recommendation models, to obtain a\ncomprehensive picture of the performance of each approach. Among other\nobservations, we highlight that initializing state-of-the-art sequential\nrecommendation models such as BERT4Rec or SASRec with embeddings obtained from\nan LLM can lead to substantial performance gains in terms of accuracy.\nFurthermore, we find that fine-tuning an LLM for recommendation tasks enables\nit to learn not only the tasks, but also concepts of a domain to some extent.\nWe also show that fine-tuning OpenAI GPT leads to considerably better\nperformance than fine-tuning Google PaLM 2. Overall, our extensive experiments\nindicate a huge potential value of leveraging LLMs in future recommendation\napproaches. We publicly share the code and data of our experiments to ensure\nreproducibility.\n","authors":["Artun Boz","Wouter Zorgdrager","Zoe Kotti","Jesse Harte","Panos Louridas","Dietmar Jannach","Marios Fragkoulis"],"pdf_url":"https://arxiv.org/pdf/2402.01339v1.pdf","comment":"33 pages, 12 figures, 7 tables"},{"id":"http://arxiv.org/abs/2402.01294v1","updated":"2024-02-02T10:31:28Z","published":"2024-02-02T10:31:28Z","title":"Minimizing Regret in Billboard Advertisement under Zonal Influence\n Constraint","summary":" In a typical billboard advertisement technique, a number of digital\nbillboards are owned by an influence provider, and many advertisers approach\nthe influence provider for a specific number of views of their advertisement\ncontent on a payment basis. If the influence provider provides the demanded or\nmore influence, then he will receive the full payment or else a partial\npayment. In the context of an influence provider, if he provides more or less\nthan an advertiser's demanded influence, it is a loss for him. This is\nformalized as 'Regret', and naturally, in the context of the influence\nprovider, the goal will be to allocate the billboard slots among the\nadvertisers such that the total regret is minimized. In this paper, we study\nthis problem as a discrete optimization problem and propose four solution\napproaches. The first one selects the billboard slots from the available ones\nin an incremental greedy manner, and we call this method the Budget Effective\nGreedy approach. In the second one, we introduce randomness with the first one,\nwhere we perform the marginal gain computation for a sample of randomly chosen\nbillboard slots. The remaining two approaches are further improvements over the\nsecond one. We analyze all the algorithms to understand their time and space\ncomplexity. We implement them with real-life trajectory and billboard datasets\nand conduct a number of experiments. 
It has been observed that the randomized\nbudget effective greedy approach takes reasonable computational time while\nminimizing the regret.\n","authors":["Dildar Ali","Suman Banerjee","Yamuna Prasad"],"pdf_url":"https://arxiv.org/pdf/2402.01294v1.pdf","comment":"32 Pages"},{"id":"http://arxiv.org/abs/2401.06683v2","updated":"2024-02-02T09:54:18Z","published":"2024-01-12T16:43:28Z","title":"DQNC2S: DQN-based Cross-stream Crisis event Summarizer","summary":" Summarizing multiple disaster-relevant data streams simultaneously is\nparticularly challenging as existing Retrieve&Re-ranking strategies suffer from\nthe inherent redundancy of multi-stream data and limited scalability in a\nmulti-query setting. This work proposes an online approach to crisis timeline\ngeneration based on weak annotation with Deep Q-Networks. It selects on-the-fly\nthe relevant pieces of text without requiring neither human annotations nor\ncontent re-ranking. This makes the inference time independent of the number of\ninput queries. The proposed approach also incorporates a redundancy filter into\nthe reward function to effectively handle cross-stream content overlaps. The\nachieved ROUGE and BERTScore results are superior to those of best-performing\nmodels on the CrisisFACTS 2022 benchmark.\n","authors":["Daniele Rege Cambrin","Luca Cagliero","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2401.06683v2.pdf","comment":"accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2312.09901v3","updated":"2024-02-02T09:50:31Z","published":"2023-12-15T15:53:45Z","title":"Temporally and Distributionally Robust Optimization for Cold-Start\n Recommendation","summary":" Collaborative Filtering (CF) recommender models highly depend on user-item\ninteractions to learn CF representations, thus falling short of recommending\ncold-start items. To address this issue, prior studies mainly introduce item\nfeatures (e.g., thumbnails) for cold-start item recommendation. They learn a\nfeature extractor on warm-start items to align feature representations with\ninteractions, and then leverage the feature extractor to extract the feature\nrepresentations of cold-start items for interaction prediction. Unfortunately,\nthe features of cold-start items, especially the popular ones, tend to diverge\nfrom those of warm-start ones due to temporal feature shifts, preventing the\nfeature extractor from accurately learning feature representations of\ncold-start items.\n To alleviate the impact of temporal feature shifts, we consider using\nDistributionally Robust Optimization (DRO) to enhance the generation ability of\nthe feature extractor. Nonetheless, existing DRO methods face an inconsistency\nissue: the worse-case warm-start items emphasized during DRO training might not\nalign well with the cold-start item distribution. To capture the temporal\nfeature shifts and combat this inconsistency issue, we propose a novel temporal\nDRO with new optimization objectives, namely, 1) to integrate a worst-case\nfactor to improve the worst-case performance, and 2) to devise a shifting\nfactor to capture the shifting trend of item features and enhance the\noptimization of the potentially popular groups in cold-start items. Substantial\nexperiments on three real-world datasets validate the superiority of our\ntemporal DRO in enhancing the generalization ability of cold-start recommender\nmodels. 
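As a toy illustration of the Budget Effective Greedy idea described above, the sketch below adds billboard slots one at a time by marginal regret reduction; regret() is an assumed black-box over allocations, and the real problem additionally assigns slots to advertisers:

    # Hedged sketch: incremental greedy selection of billboard slots.
    def budget_effective_greedy(slots, regret, budget):
        chosen = set()
        while len(chosen) < budget:
            remaining = [s for s in slots if s not in chosen]
            if not remaining:
                break
            best = min(remaining, key=lambda s: regret(chosen | {s}))
            if regret(chosen | {best}) >= regret(chosen):
                break                        # no slot reduces total regret further
            chosen.add(best)
        return chosen

The randomized variant in the abstract would evaluate the marginal gain only on a random sample of the remaining slots, trading a little solution quality for computation time.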
The code is available at https://github.com/Linxyhaha/TDRO/.\n","authors":["Xinyu Lin","Wenjie Wang","Jujia Zhao","Yongqi Li","Fuli Feng","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2312.09901v3.pdf","comment":"Accepted by AAAI'24"},{"id":"http://arxiv.org/abs/2402.01253v1","updated":"2024-02-02T09:20:48Z","published":"2024-02-02T09:20:48Z","title":"HimiRec: Modeling Hierarchical Multi-interest for Recommendation","summary":" Industrial recommender systems usually consist of a retrieval stage and a\nranking stage, in order to handle billions of users and items. The retrieval\nstage retrieves candidate items relevant to user interests for recommendations\nand has attracted much attention. Users frequently show hierarchical\nmulti-interests: for example, a heavy user of a certain NBA team (the Golden State\nWarriors) within Sports may also be a light user of almost the whole Animation\ncategory, where Sports and Animation sit at the same level. However, most existing methods\nlearn this hierarchical difference only implicitly, causing more fine-grained\ninterest information to be averaged out and limiting detailed understanding of the\nuser's different needs across heavy and light interests. Therefore,\nwe propose a novel two-stage approach to explicitly model hierarchical\nmulti-interest for recommendation in this work. In the first hierarchical\nmulti-interest mining stage, a hierarchical clustering and transformer-based\nmodel adaptively generates circles or sub-circles that users are interested in.\nIn the second stage, the partition of the retrieval space allows the EBR models to\nonly deal with items within each circle and accurately capture users' refined\ninterests. Experimental results show that the proposed approach achieves\nstate-of-the-art performance. Our framework has also been successfully deployed at\nLofter (one of the largest derivative content communities with 10 million\nmonthly active users) for over four months.\n","authors":["Haolei Pei","Yuanyuan Xu","Yangping Zhu","Yuan Nie"],"pdf_url":"https://arxiv.org/pdf/2402.01253v1.pdf","comment":"4 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.17878v2","updated":"2024-02-02T08:32:20Z","published":"2024-01-31T14:36:44Z","title":"A Survey on Data-Centric Recommender Systems","summary":" Recommender systems (RSs) have become an essential tool for mitigating\ninformation overload in a range of real-world applications. Recent trends in\nRSs have revealed a major paradigm shift, moving the spotlight from\nmodel-centric innovations to data-centric efforts (e.g., improving data quality\nand quantity). This evolution has given rise to the concept of data-centric\nrecommender systems (Data-Centric RSs), marking a significant development in\nthe field. 
This survey provides the first systematic overview of Data-Centric\nRSs, covering 1) the foundational concepts of recommendation data and\nData-Centric RSs; 2) three primary issues of recommendation data; 3) recent\nresearch developed to address these issues; and 4) several potential future\ndirections of Data-Centric RSs.\n","authors":["Riwei Lai","Li Chen","Rui Chen","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.17878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01176v1","updated":"2024-02-02T06:44:22Z","published":"2024-02-02T06:44:22Z","title":"Towards a Unified Language Model for Knowledge-Intensive Tasks Utilizing\n External Corpus","summary":" The advent of large language models (LLMs) has showcased their efficacy\nacross various domains, yet they often hallucinate, especially in\nknowledge-intensive tasks that require external knowledge sources. To improve\nfactual accuracy of language models, retrieval-augmented generation (RAG) has\nemerged as a popular solution. However, traditional retrieval modules often\nrely on large-scale document indexes, which can be disconnected from generative\ntasks. Through generative retrieval (GR) approach, language models can achieve\nsuperior retrieval performance by directly generating relevant document\nidentifiers (DocIDs). However, the relationship between GR and downstream\ntasks, as well as the potential of LLMs in GR, remains unexplored. In this\npaper, we present a unified language model that utilizes external corpus to\nhandle various knowledge-intensive tasks by seamlessly integrating generative\nretrieval, closed-book generation, and RAG. In order to achieve effective\nretrieval and generation through a unified continuous decoding process, we\nintroduce the following mechanisms: (1) a ranking-oriented DocID decoding\nstrategy, which improves ranking ability by directly learning from a DocID\nranking list; (2) a continuous generation strategy to facilitate effective and\nefficient RAG; (3) well-designed auxiliary DocID understanding tasks to enhance\nthe model's comprehension of DocIDs and their relevance to downstream tasks.\nOur approach is evaluated on the widely used KILT benchmark using two variants\nof backbone models: an encoder-decoder T5 model and a decoder-only LLM, Llama2.\nExperimental results showcase the superior performance of our models in both\nretrieval and downstream knowledge-intensive tasks.\n","authors":["Xiaoxi Li","Zhicheng Dou","Yujia Zhou","Fangchao Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01135v1","updated":"2024-02-02T04:20:13Z","published":"2024-02-02T04:20:13Z","title":"A Multi-Agent Conversational Recommender System","summary":" Due to strong capabilities in conducting fluent, multi-turn conversations\nwith users, Large Language Models (LLMs) have the potential to further improve\nthe performance of Conversational Recommender System (CRS). Unlike the aimless\nchit-chat that LLM excels at, CRS has a clear target. So it is imperative to\ncontrol the dialogue flow in the LLM to successfully recommend appropriate\nitems to the users. Furthermore, user feedback in CRS can assist the system in\nbetter modeling user preferences, which has been ignored by existing studies.\nHowever, simply prompting LLM to conduct conversational recommendation cannot\naddress the above two key challenges.\n In this paper, we propose Multi-Agent Conversational Recommender System\n(MACRS) which contains two essential modules. 
First, we design a multi-agent\nact planning framework, which controls the dialogue flow using four\nLLM-based agents. This cooperative multi-agent framework generates various\ncandidate responses based on different dialogue acts and then chooses the most\nappropriate one as the system response, helping MACRS plan suitable\ndialogue acts. Second, we propose a user feedback-aware reflection mechanism,\nwhich leverages user feedback to reason about errors made in previous turns, adjusting\nthe dialogue act planning, and to extract higher-level user information from implicit\nsemantics. We conduct extensive experiments based on a user simulator to\ndemonstrate the effectiveness of MACRS in recommendation and user preference\ncollection. Experimental results illustrate that MACRS achieves an\nimproved user interaction experience compared to directly using LLMs.\n","authors":["Jiabao Fang","Shen Gao","Pengjie Ren","Xiuying Chen","Suzan Verberne","Zhaochun Ren"],"pdf_url":"https://arxiv.org/pdf/2402.01135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01124v1","updated":"2024-02-02T03:52:21Z","published":"2024-02-02T03:52:21Z","title":"TransFR: Transferable Federated Recommendation with Pre-trained Language\n Models","summary":" Federated recommendations (FRs), facilitating multiple local clients to\ncollectively learn a global model without disclosing user private data, have\nemerged as a prevalent architecture for privacy-preserving recommendations. In\nconventional FRs, a dominant paradigm is to utilize discrete identities to\nrepresent users/clients and items, which are subsequently mapped to\ndomain-specific embeddings to participate in model training. Despite\nconsiderable performance, we reveal three inherent limitations that cannot be\nignored in federated settings, i.e., non-transferability across domains,\nunavailability in cold-start settings, and potential privacy violations during\nfederated training. To this end, we propose a transferable federated\nrecommendation model with universal textual representations, TransFR, which\ndelicately incorporates the general capabilities empowered by pre-trained\nlanguage models and the personalized abilities obtained by fine-tuning on local private\ndata. Specifically, it first learns domain-agnostic representations of items by\nexploiting pre-trained models with public textual corpora. To tailor it for\nfederated recommendation, we further introduce an efficient federated\nfine-tuning and a local training mechanism. This facilitates personalized local\nheads for each client by utilizing their private behavior data. By\nincorporating pre-training and fine-tuning within FRs, it greatly improves the\nadaptation efficiency when transferring to a new domain and the generalization\ncapacity to address cold-start issues. Through extensive experiments on several\ndatasets, we demonstrate that our TransFR model surpasses several\nstate-of-the-art FRs in terms of accuracy, transferability, and privacy.\n","authors":["Honglei Zhang","He Liu","Haoxuan Li","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2402.01124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01934v1","updated":"2024-02-02T22:16:56Z","published":"2024-02-02T22:16:56Z","title":"Clarifying the Path to User Satisfaction: An Investigation into\n Clarification Usefulness","summary":" Clarifying questions are an integral component of modern information\nretrieval systems, directly impacting user satisfaction and overall system\nperformance. 
Poorly formulated questions can lead to user frustration and\nconfusion, negatively affecting the system's performance. This research\naddresses the urgent need to identify and leverage key features that contribute\nto the classification of clarifying questions, enhancing user satisfaction. To\ngain deeper insights into how different features influence user satisfaction,\nwe conduct a comprehensive analysis, considering a broad spectrum of lexical,\nsemantic, and statistical features, such as question length and sentiment\npolarity. Our empirical results provide three main insights into the qualities\nof effective query clarification: (1) specific questions are more effective\nthan generic ones; (2) the subjectivity and emotional tone of a question play a\nrole; and (3) shorter and more ambiguous queries benefit significantly from\nclarification. Based on these insights, we implement feature-integrated user\nsatisfaction prediction using various classifiers, both traditional and\nneural-based, including random forest, BERT, and large language models. Our\nexperiments show a consistent and significant improvement, particularly in\ntraditional classifiers, with a minimum performance boost of 45\\%. This study\npresents invaluable guidelines for refining the formulation of clarifying\nquestions and enhancing both user satisfaction and system performance.\n","authors":["Hossein A. Rahmani","Xi Wang","Mohammad Aliannejadi","Mohammadmehdi Naghiaei","Emine Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2402.01934v1.pdf","comment":"EACL"},{"id":"http://arxiv.org/abs/2402.01916v1","updated":"2024-02-02T21:36:03Z","published":"2024-02-02T21:36:03Z","title":"CoLe and LYS at BioASQ MESINESP8 Task: similarity based descriptor\n assignment in Spanish","summary":" In this paper, we describe our participation in the MESINESP Task of the\nBioASQ biomedical semantic indexing challenge. The participating system follows\nan approach based solely on conventional information retrieval tools. We have\nevaluated various alternatives for extracting index terms from IBECS/LILACS\ndocuments in order to be stored in an Apache Lucene index. Those indexed\nrepresentations are queried using the contents of the article to be annotated\nand a ranked list of candidate labels is created from the retrieved documents.\nWe also have evaluated a sort of limited Label Powerset approach which creates\nmeta-labels joining pairs of DeCS labels with high co-occurrence scores, and an\nalternative method based on label profile matching. Results obtained in\nofficial runs seem to confirm the suitability of this approach for languages\nlike Spanish.\n","authors":["Francisco J. Ribadas-Pena","Shuyuan Cao","Elmurod Kuriyozov"],"pdf_url":"https://arxiv.org/pdf/2402.01916v1.pdf","comment":"Accepted at the 8th BioASQ Workshop at the 11th Conference and Labs\n of the Evaluation Forum (CLEF) 2020. 11 pages"},{"id":"http://arxiv.org/abs/2402.03370v1","updated":"2024-02-02T08:15:43Z","published":"2024-02-02T08:15:43Z","title":"Detection of tortured phrases in scientific literature","summary":" This paper presents various automatic detection methods to extract so called\ntortured phrases from scientific papers. These tortured phrases, e.g. flag to\nclamor instead of signal to noise, are the results of paraphrasing tools used\nto escape plagiarism detection. We built a dataset and evaluated several\nstrategies to flag previously undocumented tortured phrases. 
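A minimal version of the feature-integrated satisfaction prediction described in the clarification study above might look like the following; the three features and the labels are illustrative, not the paper's feature set:

    # Hedged sketch: hand-crafted lexical/statistical question features feeding
    # a traditional classifier (here a random forest) to predict satisfaction.
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    def question_features(q):
        words = q.split()
        return [len(words),                      # question length
                float(q.strip().endswith("?")),  # crude well-formedness proxy
                sum(w.islower() for w in words) / max(len(words), 1)]

    def fit_satisfaction_model(questions, satisfied_labels):
        X = np.array([question_features(q) for q in questions])
        return RandomForestClassifier(n_estimators=200).fit(X, satisfied_labels)

In the study itself, richer lexical, semantic, and sentiment features and neural classifiers such as BERT are compared against such traditional baselines.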
The proposed and\ntested methods are based on language models and rely either on embedding\nsimilarities or on masked-token predictions. We found that an approach that uses\ntoken prediction and propagates the scores to the chunk level gives the\nbest results. With a recall value of .87 and a precision value of .61, it could\nretrieve new tortured phrases to be submitted to domain experts for validation.\n","authors":["Eléna Martel","Martin Lentschat","Cyril Labbé"],"pdf_url":"https://arxiv.org/pdf/2402.03370v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2402.01635v1","updated":"2024-02-02T18:54:18Z","published":"2024-02-02T18:54:18Z","title":"kNN Algorithm for Conditional Mean and Variance Estimation with\n Automated Uncertainty Quantification and Variable Selection","summary":" In this paper, we introduce a kNN-based regression method that synergizes the\nscalability and adaptability of traditional non-parametric kNN models with a\nnovel variable selection technique. This method focuses on accurately\nestimating the conditional mean and variance of random response variables,\nthereby effectively characterizing conditional distributions across diverse\nscenarios. Our approach incorporates a robust uncertainty quantification\nmechanism, leveraging our prior estimation work on conditional mean and\nvariance. The employment of kNN ensures scalable computational efficiency in\npredicting intervals and statistical accuracy in line with optimal\nnon-parametric rates. Additionally, we introduce a new kNN semi-parametric\nalgorithm for estimating ROC curves, accounting for covariates. For selecting\nthe smoothing parameter k, we propose an algorithm with theoretical\nguarantees. Incorporation of variable selection enhances the performance of the\nmethod significantly over conventional kNN techniques in various modeling\ntasks. We validate the approach through simulations in low, moderate, and\nhigh-dimensional covariate spaces. The algorithm's effectiveness is\nparticularly notable in biomedical applications, as demonstrated in two case\nstudies. Concluding with a theoretical analysis, we highlight the consistency\nand convergence rate of our method over traditional kNN models, particularly\nwhen the underlying regression model takes values in a low-dimensional space.\n","authors":["Marcos Matabuena","Juan C. Vidal","Oscar Hernan Madrid Padilla","Jukka-Pekka Onnela"],"pdf_url":"https://arxiv.org/pdf/2402.01635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11435v5","updated":"2024-02-02T18:52:51Z","published":"2023-03-20T20:28:17Z","title":"Inversion by Direct Iteration: An Alternative to Denoising Diffusion for\n Image Restoration","summary":" Inversion by Direct Iteration (InDI) is a new formulation for supervised\nimage restoration that avoids the so-called \"regression to the mean\" effect and\nproduces more realistic and detailed images than existing regression-based\nmethods. It does this by gradually improving image quality in small steps,\nsimilar to generative denoising diffusion models. Image restoration is an\nill-posed problem where multiple high-quality images are plausible\nreconstructions of a given low-quality input. Therefore, the outcome of a\nsingle-step regression model is typically an aggregate of all possible\nexplanations, and thus lacks details and realism. 
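The core estimator in the kNN abstract above reduces to local moments of the response; a minimal sketch (without the paper's variable selection, k-selection, or interval machinery) follows:

    # Hedged sketch: conditional mean = average response of the k nearest
    # neighbours; conditional variance = their sample variance.
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    def knn_mean_var(X_train, y_train, X_query, k=50):
        nn = NearestNeighbors(n_neighbors=k).fit(X_train)
        _, idx = nn.kneighbors(X_query)          # (n_query, k) neighbour indices
        neigh_y = np.asarray(y_train)[idx]
        return neigh_y.mean(axis=1), neigh_y.var(axis=1, ddof=1)

The pair (mean, variance) then characterizes the local conditional distribution, which is what the paper's uncertainty quantification builds on.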
The main advantage of InDI\nis that it does not try to predict the clean target image in a single step but\ninstead gradually improves the image in small steps, resulting in better\nperceptual quality. While generative denoising diffusion models also work in\nsmall steps, our formulation is distinct in that it does not require knowledge\nof any analytic form of the degradation process. Instead, we directly learn an\niterative restoration process from low-quality and high-quality paired\nexamples. InDI can be applied to virtually any image degradation, given paired\ntraining data. In conditional denoising diffusion image restoration the\ndenoising network generates the restored image by repeatedly denoising an\ninitial image of pure noise, conditioned on the degraded input. Contrary to\nconditional denoising formulations, InDI directly proceeds by iteratively\nrestoring the input low-quality image, producing high-quality results on a\nvariety of image restoration tasks, including motion and out-of-focus\ndeblurring, super-resolution, compression artifact removal, and denoising.\n","authors":["Mauricio Delbracio","Peyman Milanfar"],"pdf_url":"https://arxiv.org/pdf/2303.11435v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01632v1","updated":"2024-02-02T18:52:16Z","published":"2024-02-02T18:52:16Z","title":"Beyond Lengthscales: No-regret Bayesian Optimisation With Unknown\n Hyperparameters Of Any Type","summary":" Bayesian optimisation requires fitting a Gaussian process model, which in\nturn requires specifying hyperparameters - most of the theoretical literature\nassumes those hyperparameters are known. The commonly used maximum likelihood\nestimator for hyperparameters of the Gaussian process is consistent only if the\ndata fills the space uniformly, which does not have to be the case in Bayesian\noptimisation. Since no guarantees exist regarding the correctness of\nhyperparameter estimation, and those hyperparameters can significantly affect\nthe Gaussian process fit, theoretical analysis of Bayesian optimisation with\nunknown hyperparameters is very challenging. Previously proposed algorithms\nwith the no-regret property were only able to handle the special case of\nunknown lengthscales, reproducing kernel Hilbert space norm and applied only to\nthe frequentist case. We propose a novel algorithm, HE-GP-UCB, which is the\nfirst algorithm enjoying the no-regret property in the case of unknown\nhyperparameters of arbitrary form, and which supports both Bayesian and\nfrequentist settings. Our proof idea is novel and can easily be extended to\nother variants of Bayesian optimisation. We show this by extending our\nalgorithm to the adversarially robust optimisation setting under unknown\nhyperparameters. Finally, we empirically evaluate our algorithm on a set of toy\nproblems and show that it can outperform the maximum likelihood estimator.\n","authors":["Juliusz Ziomek","Masaki Adachi","Michael A. Osborne"],"pdf_url":"https://arxiv.org/pdf/2402.01632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01629v1","updated":"2024-02-02T18:44:37Z","published":"2024-02-02T18:44:37Z","title":"Position Paper: Generalized grammar rules and structure-based\n generalization beyond classical equivariance for lexical tasks and\n transduction","summary":" Compositional generalization is one of the main properties which\ndifferentiates lexical learning in humans from state-of-art neural networks. 
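The small-step restoration loop that the InDI abstract describes can be sketched as below, assuming the interpolation x_t = (1 - t) * clean + t * degraded and a network f(x_t, t) trained to predict the clean image; the step count and uniform schedule are assumptions:

    # Hedged sketch: start from the degraded input at t = 1 and repeatedly move
    # a fraction delta / t toward the network's current clean-image estimate.
    import torch

    @torch.no_grad()
    def indi_restore(y_degraded, f, steps=20):
        x_t, t = y_degraded.clone(), 1.0
        delta = 1.0 / steps
        for _ in range(steps):
            x_hat = f(x_t, t)                              # predicted clean image
            x_t = (delta / t) * x_hat + (1.0 - delta / t) * x_t
            t -= delta
        return x_t

With a perfect predictor, each update lands exactly on the interpolation point at time t - delta, which is what makes the iteration a gradual, rather than one-shot, restoration.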
We\npropose a general framework for building models that can generalize\ncompositionally using the concept of Generalized Grammar Rules (GGRs), a class\nof symmetry-based compositional constraints for transduction tasks, which we\nview as a transduction analogue of equivariance constraints in physics-inspired\ntasks. Besides formalizing generalized notions of symmetry for language\ntransduction, our framework is general enough to contain many existing works as\nspecial cases. We present ideas on how GGRs might be implemented, and in the\nprocess draw connections to reinforcement learning and other areas of research.\n","authors":["Mircea Petrache","Shubhendu Trivedi"],"pdf_url":"https://arxiv.org/pdf/2402.01629v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2402.01621v1","updated":"2024-02-02T18:39:40Z","published":"2024-02-02T18:39:40Z","title":"Stochastic Two Points Method for Deep Model Zeroth-order Optimization","summary":" Large foundation models, such as large language models, have performed\nexceptionally well in various application scenarios. Building or fully\nfine-tuning such large models is usually prohibitive due to either hardware\nbudget or lack of access to backpropagation. The zeroth-order methods offer a\npromising direction for tackling this challenge, where only forward passes are\nneeded to update the model. This paper introduces an efficient Stochastic\nTwo-Point (S2P) approach within the gradient-free regime. We present the\ntheoretical convergence properties of S2P under the general and relaxed\nsmoothness assumptions. The theoretical properties also shed light on a faster\nand more stable S2P variant, Accelerated S2P (AS2P), through exploiting our new\nconvergence properties that better represent the dynamics of deep models in\ntraining. Our comprehensive empirical results show that AS2P is highly\neffective in optimizing objectives for large deep models, including language\nmodels, and outperforms standard methods across various model types and scales,\nwith 2 $\\times$ speed-up in training over most conducted tasks.\n","authors":["Yijiang Pang","Jiayu Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.01621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13886v2","updated":"2024-02-02T18:35:11Z","published":"2023-10-21T01:34:30Z","title":"Nonlinear Filtering with Brenier Optimal Transport Maps","summary":" This paper is concerned with the problem of nonlinear filtering, i.e.,\ncomputing the conditional distribution of the state of a stochastic dynamical\nsystem given a history of noisy partial observations. Conventional sequential\nimportance resampling (SIR) particle filters suffer from fundamental\nlimitations, in scenarios involving degenerate likelihoods or high-dimensional\nstates, due to the weight degeneracy issue. In this paper, we explore an\nalternative method, which is based on estimating the Brenier optimal transport\n(OT) map from the current prior distribution of the state to the posterior\ndistribution at the next time step. Unlike SIR particle filters, the OT\nformulation does not require the analytical form of the likelihood. Moreover,\nit allows us to harness the approximation power of neural networks to model\ncomplex and multi-modal distributions and employ stochastic optimization\nalgorithms to enhance scalability. 
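The forward-passes-only regime that the S2P abstract works in is built on the classical two-point gradient estimator; a textbook sketch (not the paper's accelerated AS2P variant) is:

    # Hedged sketch: estimate the directional derivative from two loss probes
    # along a random unit direction and step against it; no backpropagation.
    import numpy as np

    def two_point_step(theta, loss, mu=1e-3, lr=1e-2, rng=None):
        rng = rng or np.random.default_rng()
        u = rng.standard_normal(theta.shape)
        u /= np.linalg.norm(u)
        g = (loss(theta + mu * u) - loss(theta - mu * u)) / (2.0 * mu)
        return theta - lr * g * u

Only two forward evaluations of the loss are needed per update, which is why such methods suit settings where backpropagation through a large model is unavailable or too expensive.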
Extensive numerical experiments are\npresented that compare the OT method to the SIR particle filter and the\nensemble Kalman filter, evaluating the performance in terms of sample\nefficiency, high-dimensional scalability, and the ability to capture complex\nand multi-modal distributions.\n","authors":["Mohammad Al-Jarrah","Niyizhen Jin","Bamdad Hosseini","Amirhossein Taghvaei"],"pdf_url":"https://arxiv.org/pdf/2310.13886v2.pdf","comment":"25 pages, 16 figures, 1 Table"},{"id":"http://arxiv.org/abs/2311.17929v5","updated":"2024-02-02T18:33:43Z","published":"2023-11-25T22:26:58Z","title":"New Online Communities: Graph Deep Learning on Anonymous Voting Networks\n to Identify Sybils in Polycentric Governance","summary":" This research examines the polycentric governance of digital assets in\nblockchain-based Decentralized Autonomous Organizations (DAOs). It offers a\ntheoretical framework and addresses a critical challenge facing decentralized\ngovernance by developing a method to identify sybils, or spurious identities.\nSybils pose significant organizational sustainability threats to DAOs and\nother, commons-based online communities, and threat models are identified. The\nexperimental method uses graph deep learning techniques to identify sybil\nactivity in a DAO governance dataset (snapshot.org). Specifically, a Graph\nConvolutional Neural Network (GCNN) learned voting behaviours and a fast\nk-means vector clustering algorithm (FAISS) used high-dimensional embeddings to\nidentify similar nodes in a graph. The results reveal that deep learning can\neffectively identify sybils, reducing the voting graph by 2-5%. This research\nunderscores the importance of sybil resistance in DAOs and offers a novel\nperspective on decentralized governance, informing future policy, regulation,\nand governance practices.\n","authors":["Quinn DuPont"],"pdf_url":"https://arxiv.org/pdf/2311.17929v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08204v2","updated":"2024-02-02T18:31:52Z","published":"2023-10-12T10:50:21Z","title":"STELLA: Continual Audio-Video Pre-training with Spatio-Temporal\n Localized Alignment","summary":" Continuously learning a variety of audio-video semantics over time is crucial\nfor audio-related reasoning tasks in our ever-evolving world. However, this is\na nontrivial problem and poses two critical challenges: sparse spatio-temporal\ncorrelation between audio-video pairs and multimodal correlation overwriting\nthat forgets audio-video relations. To tackle this problem, we propose a new\ncontinual audio-video pre-training method with two novel ideas: (1) Localized\nPatch Importance Scoring: we introduce a multimodal encoder to determine the\nimportance score for each patch, emphasizing semantically intertwined\naudio-video patches. (2) Replay-guided Correlation Assessment: to reduce the\ncorruption of previously learned audiovisual knowledge due to drift, we propose\nto assess the correlation of the current patches on the past steps to identify\nthe patches exhibiting high correlations with the past steps. Based on the\nresults from the two ideas, we perform probabilistic patch selection for\neffective continual audio-video pre-training. 
Experimental validation on\nmultiple benchmarks shows that our method achieves a 3.69%p of relative\nperformance gain in zero-shot retrieval tasks compared to strong continual\nlearning baselines, while reducing memory consumption by ~45%.\n","authors":["Jaewoo Lee","Jaehong Yoon","Wonjae Kim","Yunji Kim","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2310.08204v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2110.03155v5","updated":"2024-02-02T18:31:23Z","published":"2021-10-07T03:14:46Z","title":"The Benefits of Being Categorical Distributional: Uncertainty-aware\n Regularized Exploration in Reinforcement Learning","summary":" The theoretical advantages of distributional reinforcement learning~(RL) over\nclassical RL remain elusive despite its remarkable empirical performance.\nStarting from Categorical Distributional RL~(CDRL), we attribute the potential\nsuperiority of distributional RL to a derived distribution-matching\nregularization by applying a return density function decomposition technique.\nThis unexplored regularization in the distributional RL context is aimed at\ncapturing additional return distribution information regardless of only its\nexpectation, contributing to an augmented reward signal in the policy\noptimization. Compared with the entropy regularization in MaxEnt RL that\nexplicitly optimizes the policy to encourage the exploration, the resulting\nregularization in CDRL implicitly optimizes policies guided by the new reward\nsignal to align with the uncertainty of target return distributions, leading to\nan uncertainty-aware exploration effect. Finally, extensive experiments\nsubstantiate the importance of this uncertainty-aware regularization in\ndistributional RL on the empirical benefits over classical RL.\n","authors":["Ke Sun","Yingnan Zhao","Enze Shi","Yafei Wang","Xiaodong Yan","Bei Jiang","Linglong Kong"],"pdf_url":"https://arxiv.org/pdf/2110.03155v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01617v1","updated":"2024-02-02T18:27:21Z","published":"2024-02-02T18:27:21Z","title":"A GP-based Robust Motion Planning Framework for Agile Autonomous Robot\n Navigation and Recovery in Unknown Environments","summary":" For autonomous mobile robots, uncertainties in the environment and system\nmodel can lead to failure in the motion planning pipeline, resulting in\npotential collisions. In order to achieve a high level of robust autonomy,\nthese robots should be able to proactively predict and recover from such\nfailures. To this end, we propose a Gaussian Process (GP) based model for\nproactively detecting the risk of future motion planning failure. When this\nrisk exceeds a certain threshold, a recovery behavior is triggered that\nleverages the same GP model to find a safe state from which the robot may\ncontinue towards the goal. 
The proposed approach is trained in simulation only\nand can generalize to real-world environments on different robotic platforms.\nSimulations and physical experiments demonstrate that our framework is capable\nof both predicting planner failures and recovering the robot to states where\nplanner success is likely, all while producing agile motion.\n","authors":["Nicholas Mohammad","Jacob Higgins","Nicola Bezzo"],"pdf_url":"https://arxiv.org/pdf/2402.01617v1.pdf","comment":"To Appear in 2024 IEEE/RSJ International Conference on Robotics and\n Automation (ICRA), 2024"},{"id":"http://arxiv.org/abs/2402.01614v1","updated":"2024-02-02T18:24:37Z","published":"2024-02-02T18:24:37Z","title":"L2G2G: a Scalable Local-to-Global Network Embedding with Graph\n Autoencoders","summary":" For analysing real-world networks, graph representation learning is a popular\ntool. These methods, such as a graph autoencoder (GAE), typically rely on\nlow-dimensional representations, also called embeddings, which are obtained\nthrough minimising a loss function; these embeddings are used with a decoder\nfor downstream tasks such as node classification and edge prediction. While\nGAEs tend to be fairly accurate, they suffer from scalability issues. For\nimproved speed, a Local2Global approach, which combines graph patch embeddings\nbased on eigenvector synchronisation, was shown to be fast and achieve good\naccuracy. Here we propose L2G2G, a Local2Global method which improves GAE\naccuracy without sacrificing scalability. This improvement is achieved by\ndynamically synchronising the latent node representations while training the\nGAEs. It also benefits from the decoder computing only a local patch loss.\nHence, aligning the local embeddings in each epoch utilises more information\nfrom the graph than a single post-training alignment does, while maintaining\nscalability. We illustrate on synthetic benchmarks, as well as real-world\nexamples, that L2G2G achieves higher accuracy than the standard Local2Global\napproach and scales efficiently on larger data sets. We find that for large\nand dense networks, it even outperforms the slow, but assumed more accurate,\nGAEs.\n","authors":["Ruikang Ouyang","Andrew Elliott","Stratis Limnios","Mihai Cucuringu","Gesine Reinert"],"pdf_url":"https://arxiv.org/pdf/2402.01614v1.pdf","comment":"13 pages, 4 figures, Complex Networks 2023, Volume I, SCI 1141"},{"id":"http://arxiv.org/abs/2402.01608v1","updated":"2024-02-02T18:14:16Z","published":"2024-02-02T18:14:16Z","title":"Contingency Analysis of a Grid of Connected EVs for Primary Frequency\n Control of an Industrial Microgrid Using Efficient Control Scheme","summary":" After over a century of internal combustion engines ruling the transport\nsector, electric vehicles appear to be on the verge of gaining traction due to\na slew of advantages, including lower operating costs and lower CO2 emissions.\nBy using the Vehicle-to-Grid (or Grid-to-Vehicle, if electric vehicles (EVs) are\nutilized as loads) approach, EVs can operate as both a load and a source.\nPrimary frequency regulation and congestion management are two essential\ncharacteristics of this technology that are added to an industrial microgrid.\nIndustrial microgrids are made up of different energy sources such as wind\nfarms and PV farms, storage systems, and loads. EVs have gained a lot of\ninterest as a technique for frequency management because of their ability to\nregulate quickly. Grid reliability depends on this quick reaction. 
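A stripped-down form of the GP failure-risk idea in the motion-planning abstract above could, for example, regress past failure outcomes on state features and trigger recovery above a threshold; the features, labels, kernel, and threshold here are assumptions, not the authors' design:

    # Hedged sketch: a GP maps planner/state features to observed failure (1) or
    # success (0); recovery triggers when predicted risk crosses a threshold.
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    def fit_risk_model(state_features, failed):
        gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0), alpha=1e-2)
        return gp.fit(np.asarray(state_features), np.asarray(failed, dtype=float))

    def should_recover(gp, current_features, threshold=0.5):
        risk = gp.predict(np.asarray(current_features).reshape(1, -1))[0]
        return risk > threshold

The same fitted model can then be queried over candidate states to pick a low-risk recovery target, mirroring how the abstract reuses one GP for both detection and recovery.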
Different\ncontingencies, states of charge of the electric vehicles, and varying numbers of\nEVs in an EV fleet are considered in this work, and a proposed control scheme\nfor frequency management is presented. This control scheme enables\nbidirectional power flow, allowing for primary frequency regulation during the\nvarious scenarios that an industrial microgrid may encounter over the course of\na 24-h period. The presented controller will provide dependable frequency\nregulation support to the industrial microgrid during contingencies, as will be\ndemonstrated by simulation results, achieving a more reliable system. However,\nsimulation results will show that by increasing the number of EVs in a fleet\nfor the Vehicle-to-Grid approach, an industrial microgrid's frequency can be\nenhanced even further.\n","authors":["J. N. Sabhahit","S. S. Solanke","V. K. Jadoun","H. Malik","F. P. García Márquez","J. M. Pinar-Pérez"],"pdf_url":"https://arxiv.org/pdf/2402.01608v1.pdf","comment":"Published in energies (MDPI) 2022"},{"id":"http://arxiv.org/abs/2402.01607v1","updated":"2024-02-02T18:11:43Z","published":"2024-02-02T18:11:43Z","title":"Natural Counterfactuals With Necessary Backtracking","summary":" Counterfactual reasoning is pivotal in human cognition and especially\nimportant for providing explanations and making decisions. While Judea Pearl's\ninfluential approach is theoretically elegant, its generation of a\ncounterfactual scenario often requires interventions that are too detached from\nthe real scenarios to be feasible. In response, we propose a framework of\nnatural counterfactuals and a method for generating counterfactuals that are\nnatural with respect to the actual world's data distribution. Our methodology\nrefines counterfactual reasoning, allowing changes in causally preceding\nvariables to minimize deviations from realistic scenarios. To generate natural\ncounterfactuals, we introduce an innovative optimization framework that permits\nbut controls the extent of backtracking with a naturalness criterion. Empirical\nexperiments indicate the effectiveness of our method.\n","authors":["Guang-Yuan Hao","Jiji Zhang","Biwei Huang","Hao Wang","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15963v2","updated":"2024-02-02T18:11:27Z","published":"2024-01-29T08:47:31Z","title":"NoFunEval: Funny How Code LMs Falter on Requirements Beyond Functional\n Correctness","summary":" Existing evaluation benchmarks of language models of code (code LMs) focus\nalmost exclusively on whether the LMs can generate functionally-correct code.\nIn real-world software engineering, developers think beyond functional\ncorrectness. They have requirements on \"how\" a functionality should be\nimplemented to meet overall system design objectives like efficiency, security,\nand maintainability. They would also trust the code LMs more if the LMs\ndemonstrate robust understanding of requirements and code semantics.\n We propose a new benchmark NoFunEval to evaluate code LMs on non-functional\nrequirements and simple classification instances for both functional and\nnon-functional requirements. We propose a prompting method, Coding Concepts\n(CoCo), as a way for a developer to communicate the domain knowledge to the\nLMs. We conduct an extensive evaluation of twenty-two code LMs. Our finding is\nthat they generally falter when tested on our benchmark, hinting at fundamental\nblindspots in their training setups. 
Surprisingly, even the classification\naccuracy on functional-correctness instances derived from the popular HumanEval\nbenchmark is low, calling into question the depth of their comprehension and the\nsource of their success in generating functionally-correct code in the first\nplace. We will release our benchmark and evaluation scripts publicly at\nhttps://aka.ms/NoFunEval.\n","authors":["Manav Singhal","Tushar Aggarwal","Abhijeet Awasthi","Nagarajan Natarajan","Aditya Kanade"],"pdf_url":"https://arxiv.org/pdf/2401.15963v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.13836v2","updated":"2024-02-02T18:07:37Z","published":"2023-10-20T22:20:50Z","title":"Foundation Model's Embedded Representations May Detect Distribution\n Shift","summary":" Sampling biases can cause distribution shifts between train and test datasets\nfor supervised learning tasks, obscuring our ability to understand the\ngeneralization capacity of a model. This is especially important considering\nthe wide adoption of pre-trained foundational neural networks -- whose behavior\nremains poorly understood -- for transfer learning (TL) tasks. We present a\ncase study for TL on the Sentiment140 dataset and show that many pre-trained\nfoundation models encode different representations of Sentiment140's manually\ncurated test set $M$ from the automatically labeled training set $P$,\nconfirming that a distribution shift has occurred. We argue training on $P$ and\nmeasuring performance on $M$ is a biased measure of generalization. Experiments\non pre-trained GPT-2 show that the features learnable from $P$ do not improve\n(and in fact hamper) performance on $M$. Linear probes on pre-trained GPT-2's\nrepresentations are robust and may even outperform overall fine-tuning,\nimplying a fundamental importance for discerning distribution shift in\ntrain/test splits for model interpretation.\n","authors":["Max Vargas","Adam Tsou","Andrew Engel","Tony Chiang"],"pdf_url":"https://arxiv.org/pdf/2310.13836v2.pdf","comment":"17 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.17026v3","updated":"2024-02-02T18:04:58Z","published":"2023-05-26T15:35:43Z","title":"How Powerful are Decoder-Only Transformer Neural Models?","summary":" In this article we prove that the general transformer neural model\nundergirding modern large language models (LLMs) is Turing complete under\nreasonable assumptions. This is the first work to directly address the Turing\ncompleteness of the underlying technology employed in GPT-x as past work has\nfocused on the more expressive, full auto-encoder transformer architecture.\nFrom this theoretical analysis, we show that the sparsity/compressibility of\nthe word embedding is an important consideration for Turing completeness to\nhold. We also show that Transformers are a variant of B machines studied by\nHao Wang.\n","authors":["Jesse Roberts"],"pdf_url":"https://arxiv.org/pdf/2305.17026v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03674v2","updated":"2024-02-02T18:04:02Z","published":"2023-04-07T14:47:13Z","title":"Machine Learning with Requirements: a Manifesto","summary":" In the recent years, machine learning has made great advancements that have\nbeen at the root of many breakthroughs in different application domains.\nHowever, it is still an open issue how to make them applicable to high-stakes or\nsafety-critical application domains, as they can often be brittle and\nunreliable. 
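The linear-probe methodology in the distribution-shift abstract above can be sketched quickly: freeze the foundation model's embeddings, fit only a linear head on the training split, and compare accuracy across splits. The Gaussian "embeddings" below merely stand in for frozen GPT-2 features, so everything except the probe-on-frozen-features recipe is an assumption.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(1)
X_p = rng.normal(0.0, 1.0, size=(2000, 64))       # stand-in embeddings of train set P
y_p = (X_p[:, 0] > 0).astype(int)
X_m = rng.normal(0.5, 1.2, size=(500, 64))        # shifted stand-in for test set M
y_m = (X_m[:, 0] > 0).astype(int)

probe = LogisticRegression(max_iter=1000).fit(X_p, y_p)   # linear head only
print("accuracy on P:", round(probe.score(X_p, y_p), 3))
print("accuracy on M:", round(probe.score(X_m, y_m), 3))  # a gap hints at shift
```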
In this paper, we argue that requirements definition and\nsatisfaction can go a long way to make machine learning models even more\nfitting to the real world, especially in critical domains. To this end, we\npresent two problems in which (i) requirements arise naturally, (ii) machine\nlearning models are or can be fruitfully deployed, and (iii) neglecting the\nrequirements can have dramatic consequences. We show how the requirements\nspecification can be fruitfully integrated into the standard machine learning\ndevelopment pipeline, proposing a novel pyramid development process in which\nrequirements definition may impact all the subsequent phases in the pipeline,\nand vice versa.\n","authors":["Eleonora Giunchiglia","Fergus Imrie","Mihaela van der Schaar","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2304.03674v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.00769v4","updated":"2024-02-02T17:59:50Z","published":"2022-02-01T21:27:51Z","title":"Distributional Reinforcement Learning by Sinkhorn Divergence","summary":" The empirical success of distributional reinforcement learning~(RL) highly\ndepends on the distribution representation and the choice of distribution\ndivergence. In this paper, we propose \\textit{Sinkhorn distributional\nRL~(SinkhornDRL)} that learns unrestricted statistics from return distributions\nand leverages Sinkhorn divergence to minimize the difference between current\nand target Bellman return distributions. Theoretically, we prove the\ncontraction properties of SinkhornDRL, consistent with the interpolation nature\nof Sinkhorn divergence between Wasserstein distance and Maximum Mean\nDiscrepancy~(MMD). We also establish the equivalence between Sinkhorn\ndivergence and a regularized MMD with a regularized Moment Matching behavior,\ncontributing to explaining the superiority of SinkhornDRL. Empirically, we show\nthat SinkhornDRL is consistently better or comparable to existing algorithms on\nthe Atari games suite.\n","authors":["Ke Sun","Yingnan Zhao","Wulong Liu","Bei Jiang","Linglong Kong"],"pdf_url":"https://arxiv.org/pdf/2202.00769v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2110.03155"},{"id":"http://arxiv.org/abs/2402.01598v1","updated":"2024-02-02T17:51:49Z","published":"2024-02-02T17:51:49Z","title":"Learning from Two Decades of Blood Pressure Data: Demography-Specific\n Patterns Across 75 Million Patient Encounters","summary":" Hypertension remains a global health concern with a rising prevalence,\nnecessitating effective monitoring and understanding of blood pressure (BP)\ndynamics. This study delves into the wealth of information derived from BP\nmeasurement, a crucial approach in informing our understanding of hypertensive\ntrends. Numerous studies have reported on the relationship between BP variation\nand various factors. In this research, we leveraged an extensive dataset\ncomprising 75 million records spanning two decades, offering a unique\nopportunity to explore and analyze BP variations across demographic features\nsuch as age, race, and gender. Our findings revealed that gender-based BP\nvariation was not statistically significant, challenging conventional\nassumptions. Interestingly, systolic blood pressure (SBP) consistently\nincreased with age, while diastolic blood pressure (DBP) displayed a\ndistinctive peak in the forties age group. Moreover, our analysis uncovered\nintriguing similarities in the distribution of BP among some of the racial\ngroups. 
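The core primitive behind SinkhornDRL-style training, an entropy-regularized optimal-transport cost between samples of the current and target Bellman return distributions, looks roughly like this. Plain Sinkhorn scaling only; the debiasing terms of the full Sinkhorn divergence and the paper's contraction analysis are omitted.

```python
import numpy as np

def sinkhorn_cost(x, y, eps=0.5, iters=200):
    """Entropy-regularized OT cost between two 1-D samples (uniform weights)."""
    c = (x[:, None] - y[None, :]) ** 2           # pairwise squared distances
    k = np.exp(-c / eps)
    a = np.full(len(x), 1.0 / len(x))
    b = np.full(len(y), 1.0 / len(y))
    v = np.ones(len(y))
    for _ in range(iters):                       # alternating Sinkhorn scalings
        u = a / (k @ v)
        v = b / (k.T @ u)
    plan = u[:, None] * k * v[None, :]           # regularized transport plan
    return float((plan * c).sum())

returns_now = np.random.default_rng(2).normal(0.0, 1.0, 64)
returns_tgt = np.random.default_rng(3).normal(0.5, 1.0, 64)
print(sinkhorn_cost(returns_now, returns_tgt))   # shrinks as the distributions match
```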
This comprehensive investigation contributes to the ongoing discourse\non hypertension and underscores the importance of considering diverse\ndemographic factors in understanding BP variations. Our results provide\nvaluable insights that may inform personalized healthcare approaches tailored\nto specific demographic profiles.\n","authors":["Seyedeh Somayyeh Mousavi","Yuting Guo","Abeed Sarker","Reza Sameni"],"pdf_url":"https://arxiv.org/pdf/2402.01598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00786v2","updated":"2024-02-02T17:43:41Z","published":"2024-02-01T17:17:55Z","title":"CroissantLLM: A Truly Bilingual French-English Language Model","summary":" We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T\nEnglish and French tokens, to bring to the research and industrial community a\nhigh-performance, fully open-sourced bilingual model that runs swiftly on\nconsumer-grade local hardware. To that end, we pioneer the approach of training\nan intrinsically bilingual model with a 1:1 English-to-French pretraining data\nratio, a custom tokenizer, and bilingual finetuning datasets. We release the\ntraining dataset, notably containing a French split with manually curated,\nhigh-quality, and varied data sources. To assess performance outside of\nEnglish, we craft a novel benchmark, FrenchBench, consisting of an array of\nclassification and generation tasks, covering various orthogonal aspects of\nmodel performance in the French Language. Additionally, rooted in transparency\nand to foster further Large Language Model research, we release codebases, and\ndozens of checkpoints across various model sizes, training data distributions,\nand training steps, as well as fine-tuned Chat models, and strong translation\nmodels. We evaluate our model through the FMTI framework, and validate 81 % of\nthe transparency criteria, far beyond the scores of even most open initiatives.\nThis work enriches the NLP landscape, breaking away from previous\nEnglish-centric work in order to strengthen our understanding of\nmultilinguality in language models.\n","authors":["Manuel Faysse","Patrick Fernandes","Nuno M. Guerreiro","António Loison","Duarte M. Alves","Caio Corro","Nicolas Boizard","João Alves","Ricardo Rei","Pedro H. Martins","Antoni Bigata Casademunt","François Yvon","André F. T. Martins","Gautier Viaud","Céline Hudelot","Pierre Colombo"],"pdf_url":"https://arxiv.org/pdf/2402.00786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11313v3","updated":"2024-02-02T17:36:32Z","published":"2023-06-20T06:15:19Z","title":"Deep graph kernel point processes","summary":" Point process models are widely used for continuous asynchronous event data,\nwhere each data point includes time and additional information called \"marks\",\nwhich can be locations, nodes, or event types. This paper presents a novel\npoint process model for discrete event data over graphs, where the event\ninteraction occurs within a latent graph structure. Our model builds upon\nHawkes's classic influence kernel-based formulation in the original\nself-exciting point processes work to capture the influence of historical\nevents on future events' occurrence. The key idea is to represent the influence\nkernel by Graph Neural Networks (GNN) to capture the underlying graph structure\nwhile harvesting the strong representation power of GNNs. 
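An analysis like the blood-pressure study above is, at its core, demography-stratified aggregation. The sketch below shows the shape of such a pipeline on synthetic data with hypothetical column names (`age`, `sbp`, `dbp`); the study's actual 75-million-record pipeline is not public here, and the synthetic DBP curve is built to peak in the forties only to mirror the reported finding.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(4)
age = rng.integers(18, 90, size=10_000)
df = pd.DataFrame({
    "age": age,
    "sbp": 100 + 0.5 * age + rng.normal(0, 10, size=age.size),   # rises with age
    "dbp": 70 + 8 * np.exp(-((age - 45) ** 2) / 400) + rng.normal(0, 8, age.size),
})
bands = pd.cut(df["age"], bins=range(10, 100, 10))               # decade age bands
print(df.groupby(bands, observed=True)[["sbp", "dbp"]].mean().round(1))
```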
Compared with prior\nworks focusing on directly modeling the conditional intensity function using\nneural networks, our kernel representation herds the repeated event influence\npatterns more effectively by combining statistical and deep models, achieving\nbetter model estimation/learning efficiency and superior predictive\nperformance. Our work significantly extends the existing deep spatio-temporal\nkernel for point process data, which is inapplicable to our setting due to the\nfundamental difference in the nature of the observation space being Euclidean\nrather than a graph. We present comprehensive experiments on synthetic and\nreal-world data to show the superior performance of the proposed approach\nagainst the state-of-the-art in predicting future events and uncovering the\nrelational structure among data.\n","authors":["Zheng Dong","Matthew Repasky","Xiuyuan Cheng","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2306.11313v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06380v2","updated":"2024-02-02T17:31:05Z","published":"2023-10-10T07:46:54Z","title":"CAST: Cluster-Aware Self-Training for Tabular Data","summary":" Self-training has gained traction because of its simplicity and\nversatility, yet it is vulnerable to noisy pseudo-labels caused by erroneous\nconfidence. Several solutions have been proposed to handle the problem, but\nthey require significant modifications in self-training algorithms or model\narchitecture, and most have limited applicability in tabular domains. To\naddress this issue, we explore a novel direction of reliable confidence in\nself-training contexts and conclude that the confidence, which represents the\nvalue of the pseudo-label, should be aware of the cluster assumption. In this\nregard, we propose Cluster-Aware Self-Training (CAST) for tabular data, which\nenhances existing self-training algorithms at a negligible cost without\nsignificant modifications. Concretely, CAST regularizes the confidence of the\nclassifier by leveraging local density for each class in the labeled training\ndata, forcing the pseudo-labels in low-density regions to have lower\nconfidence. Extensive empirical evaluations on up to 21 real-world datasets\nconfirm not only the superior performance of CAST but also its robustness in\nvarious setups in self-training contexts.\n","authors":["Minwook Kim","Juseong Kim","Ki Beom Kim","Giltae Song"],"pdf_url":"https://arxiv.org/pdf/2310.06380v2.pdf","comment":"10 pages for main body, and 16 additional pages for reference and\n appendix"},{"id":"http://arxiv.org/abs/2402.00306v2","updated":"2024-02-02T17:29:21Z","published":"2024-02-01T03:39:15Z","title":"An Accurate and Low-Parameter Machine Learning Architecture for Next\n Location Prediction","summary":" Next location prediction is a discipline that involves predicting a user's\nnext location. Its applications include resource allocation, quality of\nservice, energy efficiency, and traffic management. This paper proposes an\nenergy-efficient, small, and low parameter machine learning (ML) architecture\nfor accurate next location prediction, deployable on modest base stations and\nedge devices. To accomplish this we ran a hundred hyperparameter experiments on\nthe full human mobility patterns of an entire city, to determine an exact ML\narchitecture that reached a plateau of accuracy with the least amount of model\nparameters. We successfully achieved a reduction in the number of model\nparameters within published ML architectures from 202 million down to 2\nmillion. 
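CAST's cluster-aware confidence, described above, can be approximated in a few lines: down-weight a pseudo-label's confidence when the sample lies in a low-density region of its predicted class. The k-NN density proxy below is our simplification of the paper's local-density regularizer, not its exact form.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def density_aware_confidence(x, probs, labeled_by_class, k=5):
    """Scale the classifier's confidence by a crude local-density proxy."""
    cls = int(np.argmax(probs))
    nn = NearestNeighbors(n_neighbors=k).fit(labeled_by_class[cls])
    dist, _ = nn.kneighbors(x[None, :])
    density = 1.0 / (1.0 + dist.mean())          # high near labeled clusters
    return probs[cls] * density                  # regularized confidence

rng = np.random.default_rng(5)
labeled = {0: rng.normal(0, 1, (100, 2)), 1: rng.normal(4, 1, (100, 2))}
probs = np.array([0.1, 0.9])
print(density_aware_confidence(np.array([4.0, 4.0]), probs, labeled))  # dense region
print(density_aware_confidence(np.array([9.0, 9.0]), probs, labeled))  # sparse region
```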
This reduced the total size of the model parameters from 791 MB down\nto 8 MB. Additionally, this decreased the training time by a factor of four,\nthe amount of graphics processing unit (GPU) memory needed for training by a\nfactor of twenty, and increased the overall accuracy from 80.16% to 82.54%.\nThis improvement allows modest base stations and edge devices, which do not\nhave a large amount of memory or storage, to deploy and utilize the proposed ML\narchitecture for next location prediction.\n","authors":["Calvin Jary","Nafiseh Kahani"],"pdf_url":"https://arxiv.org/pdf/2402.00306v2.pdf","comment":"Paper was accepted and presented in person at the 2023 IEEE Future\n Networks World Forum, in Baltimore, Maryland, USA"},{"id":"http://arxiv.org/abs/2402.01586v1","updated":"2024-02-02T17:26:23Z","published":"2024-02-02T17:26:23Z","title":"TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent\n Constitution","summary":" The emergence of LLM-based agents has garnered considerable attention, yet\ntheir trustworthiness remains an under-explored area. As agents can directly\ninteract with the physical environment, their reliability and safety are\ncritical. This paper presents an Agent-Constitution-based agent framework,\nTrustAgent, an initial investigation into improving the safety dimension of\ntrustworthiness in LLM-based agents. This framework consists of three\nstrategies: a pre-planning strategy which injects safety knowledge into the model\nprior to plan generation, an in-planning strategy which bolsters safety during\nplan generation, and a post-planning strategy which ensures safety by\npost-planning inspection. Through experimental analysis, we demonstrate how\nthese approaches can effectively elevate an LLM agent's safety by identifying\nand preventing potential dangers. Furthermore, we explore the intricate\nrelationships between safety and helpfulness, and between the model's reasoning\nability and its efficacy as a safe agent. This paper underscores the imperative\nof integrating safety awareness and trustworthiness into the design and\ndeployment of LLM-based agents, not only to enhance their performance but also\nto ensure their responsible integration into human-centric environments. Data\nand code are available at https://github.com/agiresearch/TrustAgent.\n","authors":["Wenyue Hua","Xianjun Yang","Zelong Li","Cheng Wei","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01586v1.pdf","comment":"16 pages, 3 figures, 5 tables, comments and suggestions are welcome"},{"id":"http://arxiv.org/abs/2312.15101v2","updated":"2024-02-02T17:26:07Z","published":"2023-12-22T22:46:48Z","title":"Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model\n Conversions","summary":" Converting deep learning models between frameworks is a common step to\nmaximize model compatibility across devices and leverage optimization features\nthat may be exclusively provided in one deep learning framework. However, this\nconversion process may be riddled with bugs, making the converted models either\nundeployable or problematic, considerably degrading their prediction\ncorrectness.\n We propose an automated approach for fault localization and repair, Fix-Con,\nduring model conversion between deep learning frameworks. 
Fix-Con is capable of\ndetecting and fixing faults introduced in model input, parameters,\nhyperparameters, and the model graph during conversion.\n Fix-Con uses a set of fault types mined from surveying conversion issues\nraised to localize potential conversion faults in the converted target model,\nand then repairs them appropriately, e.g., replacing the parameters of the\ntarget model with those from the source model. This is done iteratively for\nevery image in the dataset with output label differences between the source\nmodel and the converted target model until all differences are resolved. We\nevaluate the effectiveness of Fix-Con in fixing model conversion bugs of three\nwidely used image recognition models converted across four different deep\nlearning frameworks. Overall, Fix-Con was able to either completely repair, or\nsignificantly improve the performance of 14 out of the 15 erroneous conversion\ncases.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2312.15101v2.pdf","comment":"12 pages, 3 figures, 4 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2402.00318v2","updated":"2024-02-02T17:08:39Z","published":"2024-02-01T04:05:24Z","title":"Analog-digital Scheduling for Federated Learning: A\n Communication-Efficient Approach","summary":" Over-the-air (OTA) computation has recently emerged as a\ncommunication-efficient Federated Learning (FL) paradigm to train machine\nlearning models over wireless networks. However, its performance is limited by\nthe device with the worst SNR, resulting in fast yet noisy updates. On the\nother hand, allocating orthogonal resource blocks (RB) to individual devices\nvia digital channels mitigates the noise problem, at the cost of increased\ncommunication latency. In this paper, we address this discrepancy and present\nADFL, a novel Analog-Digital FL scheme: in each round, the parameter server\n(PS) schedules each device to either upload its gradient via the analog OTA\nscheme or transmit its quantized gradient over an orthogonal RB using the\n\"digital\" scheme. Focusing on a single FL round, we cast the optimal\nscheduling problem as the minimization of the mean squared error (MSE) on the\nestimated global gradient at the PS, subject to a delay constraint, yielding\nthe optimal device scheduling configuration and quantization bits for the\ndigital devices. Our simulation results show that ADFL, by scheduling most of\nthe devices in the OTA scheme while also occasionally employing the digital\nscheme for a few devices, consistently outperforms OTA-only and digital-only\nschemes, in both i.i.d. and non-i.i.d. settings.\n","authors":["Muhammad Faraz Ul Abrar","Nicolò Michelusi"],"pdf_url":"https://arxiv.org/pdf/2402.00318v2.pdf","comment":"Appeared at the 2023 Asilomar Conference on Signals, Systems, and\n Computers"},{"id":"http://arxiv.org/abs/2402.01571v1","updated":"2024-02-02T17:07:39Z","published":"2024-02-02T17:07:39Z","title":"Spiking Music: Audio Compression with Event Based Auto-encoders","summary":" Neurons in the brain communicate information via punctual events called\nspikes. The timing of spikes is thought to carry rich information, but it is\nnot clear how to leverage this in digital systems. We demonstrate that\nevent-based encoding is efficient for audio compression. 
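The iterative repair loop Fix-Con describes can be sketched with toy stand-ins for the source and converted models: keep applying candidate fixes (here, copying parameters back from the source) while any input still receives a different label. Everything below, including the single-fault setup, is illustrative rather than the tool's actual logic.

```python
import numpy as np

rng = np.random.default_rng(6)
W_SRC = rng.normal(size=(4, 3))                  # "source framework" weights
W_TGT = W_SRC.copy()
W_TGT[:, 1] += 0.8                               # an injected conversion fault
DATA = rng.normal(size=(200, 4))

def predict(w, x):
    return (x @ w).argmax(axis=1)

def repair(w_src, w_tgt, data, max_rounds=10):
    for _ in range(max_rounds):
        if not (predict(w_src, data) != predict(w_tgt, data)).any():
            break                                # all label differences resolved
        col = int(np.abs(w_src - w_tgt).sum(axis=0).argmax())
        w_tgt[:, col] = w_src[:, col]            # candidate fix: restore parameters
    return w_tgt, int((predict(w_src, data) != predict(w_tgt, data)).sum())

_, remaining = repair(W_SRC, W_TGT, DATA)
print("label differences remaining:", remaining)  # 0 once the fault is repaired
```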
To build this\nevent-based representation we use a deep binary auto-encoder, and under high\nsparsity pressure, the model enters a regime where the binary event matrix is\nstored more efficiently with sparse matrix storage algorithms. We test this on\nthe large MAESTRO dataset of piano recordings against vector quantized\nauto-encoders. Not only does our \"Spiking Music compression\" algorithm achieve\na competitive compression/reconstruction trade-off, but selectivity and\nsynchrony between encoded events and piano key strikes emerge without\nsupervision in the sparse regime.\n","authors":["Martim Lisboa","Guillaume Bellec"],"pdf_url":"https://arxiv.org/pdf/2402.01571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01567v1","updated":"2024-02-02T17:00:17Z","published":"2024-02-02T17:00:17Z","title":"Understanding Adam Optimizer via Online Learning of Updates: Adam is\n FTRL in Disguise","summary":" Despite the success of the Adam optimizer in practice, the theoretical\nunderstanding of its algorithmic components still remains limited. In\nparticular, most existing analyses of Adam show the convergence rate that can\nbe simply achieved by non-adaptive algorithms like SGD. In this work, we provide\na different perspective based on online learning that underscores the\nimportance of Adam's algorithmic components. Inspired by Cutkosky et al.\n(2023), we consider the framework called online learning of updates, where we\nchoose the updates of an optimizer based on an online learner. With this\nframework, the design of a good optimizer is reduced to the design of a good\nonline learner. Our main observation is that Adam corresponds to a principled\nonline learning framework called Follow-the-Regularized-Leader (FTRL). Building\non this observation, we study the benefits of its algorithmic components from\nthe online learning perspective.\n","authors":["Kwangjun Ahn","Zhiyu Zhang","Yunbum Kook","Yan Dai"],"pdf_url":"https://arxiv.org/pdf/2402.01567v1.pdf","comment":"Comments would be appreciated!"},{"id":"http://arxiv.org/abs/2309.06895v2","updated":"2024-02-02T16:55:00Z","published":"2023-09-13T11:37:04Z","title":"MagiCapture: High-Resolution Multi-Concept Portrait Customization","summary":" Large-scale text-to-image models including Stable Diffusion are capable of\ngenerating high-fidelity photorealistic portrait images. There is an active\nresearch area dedicated to personalizing these models, aiming to synthesize\nspecific subjects or styles using provided sets of reference images. However,\ndespite the plausible results from these personalization methods, they tend to\nproduce images that often fall short of realism and are not yet on a\ncommercially viable level. This is particularly noticeable in portrait image\ngeneration, where any unnatural artifact in human faces is easily discernible\ndue to our inherent human bias. To address this, we introduce MagiCapture, a\npersonalization method for integrating subject and style concepts to generate\nhigh-resolution portrait images using just a few subject and style references.\nFor instance, given a handful of random selfies, our fine-tuned model can\ngenerate high-quality portrait images in specific styles, such as passport or\nprofile photos. The main challenge with this task is the absence of ground\ntruth for the composed concepts, leading to a reduction in the quality of the\nfinal output and an identity shift of the source subject. 
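The storage claim in the Spiking Music abstract is easy to sanity-check: a highly sparse binary event matrix costs far less in a sparse format such as CSR than as a dense array. The shapes and the 1% spike rate below are illustrative, not the paper's settings.

```python
import numpy as np
from scipy import sparse

rng = np.random.default_rng(7)
events = rng.random((512, 4096)) < 0.01          # ~1% of entries are spikes
dense_bytes = events.astype(np.uint8).nbytes

csr = sparse.csr_matrix(events)                  # store only the nonzeros
csr_bytes = csr.data.nbytes + csr.indices.nbytes + csr.indptr.nbytes
print(f"dense: {dense_bytes} B, CSR: {csr_bytes} B")  # CSR is far smaller here
```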
To address these\nissues, we present a novel Attention Refocusing loss coupled with auxiliary\npriors, both of which facilitate robust learning within this weakly supervised\nlearning setting. Our pipeline also includes additional post-processing steps\nto ensure the creation of highly realistic outputs. MagiCapture outperforms\nother baselines in both quantitative and qualitative evaluations and can also\nbe generalized to other non-human objects.\n","authors":["Junha Hyung","Jaeyo Shin","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2309.06895v2.pdf","comment":"18 pages, 17 figures"},{"id":"http://arxiv.org/abs/2310.13164v4","updated":"2024-02-02T16:43:57Z","published":"2023-10-19T21:31:11Z","title":"Almost Equivariance via Lie Algebra Convolutions","summary":" Recently, the equivariance of models with respect to a group action has\nbecome an important topic of research in machine learning. Analysis of the\nbuilt-in equivariance of existing neural network architectures, as well as the\nstudy of building models that explicitly \"bake in\" equivariance, have become\nsignificant research areas in their own right. However, imbuing an architecture\nwith a specific group equivariance imposes a strong prior on the types of data\ntransformations that the model expects to see. While strictly-equivariant\nmodels enforce symmetries, real-world data does not always conform to such\nstrict equivariances. In such cases, the prior of strict equivariance can\nactually prove too strong and cause models to underperform. Therefore, in this\nwork we study a closely related topic, that of almost equivariance. We provide\na definition of almost equivariance and give a practical method for encoding\nalmost equivariance in models by appealing to the Lie algebra of a Lie group.\nSpecifically, we define Lie algebra convolutions and demonstrate that they\noffer several benefits over Lie group convolutions, including being\nwell-defined for non-compact Lie groups having non-surjective exponential map.\nFrom there, we demonstrate connections between the notions of equivariance and\nisometry and those of almost equivariance and almost isometry. We prove two\nexistence theorems, one showing the existence of almost isometries within\nbounded distance of isometries of a manifold, and another showing the converse\nfor Hilbert spaces. We extend these theorems to prove the existence of almost\nequivariant manifold embeddings within bounded distance of fully equivariant\nembedding functions, subject to certain constraints on the group action and the\nfunction class. Finally, we demonstrate the validity of our approach by\nbenchmarking against datasets in fully equivariant and almost equivariant\nsettings.\n","authors":["Daniel McNeela"],"pdf_url":"https://arxiv.org/pdf/2310.13164v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13734v3","updated":"2024-02-02T16:42:19Z","published":"2023-01-31T16:12:31Z","title":"Improving Monte Carlo Evaluation with Offline Data","summary":" Most reinforcement learning practitioners evaluate their policies with online\nMonte Carlo estimators for either hyperparameter tuning or testing different\nalgorithmic design choices, where the policy is repeatedly executed in the\nenvironment to get the average outcome. Such massive interactions with the\nenvironment are prohibitive in many scenarios. In this paper, we propose novel\nmethods that improve the data efficiency of online Monte Carlo estimators while\nmaintaining their unbiasedness. 
We first propose a tailored closed-form\nbehavior policy that provably reduces the variance of an online Monte Carlo\nestimator. We then design efficient algorithms to learn this closed-form\nbehavior policy from previously collected offline data. Theoretical analysis is\nprovided to characterize how the behavior policy learning error affects the\namount of reduced variance. Compared with previous works, our method achieves\nbetter empirical performance in a broader set of environments, with fewer\nrequirements for offline data.\n","authors":["Shuze Liu","Shangtong Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.13734v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01546v1","updated":"2024-02-02T16:39:08Z","published":"2024-02-02T16:39:08Z","title":"Privacy-Preserving Distributed Learning for Residential Short-Term Load\n Forecasting","summary":" In the realm of power systems, the increasing involvement of residential\nusers in load forecasting applications has heightened concerns about data\nprivacy. Specifically, the load data can inadvertently reveal the daily\nroutines of residential users, thereby posing a risk to their property\nsecurity. While federated learning (FL) has been employed to safeguard user\nprivacy by enabling model training without the exchange of raw data, these FL\nmodels have shown vulnerabilities to emerging attack techniques, such as Deep\nLeakage from Gradients and poisoning attacks. To counteract these, we initially\nemploy a Secure-Aggregation (SecAgg) algorithm that leverages multiparty\ncomputation cryptographic techniques to mitigate the risk of gradient leakage.\nHowever, the introduction of SecAgg necessitates the deployment of additional\nsub-center servers for executing the multiparty computation protocol, thereby\nescalating computational complexity and reducing system robustness, especially\nin scenarios where one or more sub-centers are unavailable. To address these\nchallenges, we introduce a Markovian Switching-based distributed training\nframework, the convergence of which is substantiated through rigorous\ntheoretical analysis. The Distributed Markovian Switching (DMS) topology shows\nstrong robustness towards the poisoning attacks as well. Case studies employing\nreal-world power system load data validate the efficacy of our proposed\nalgorithm. It not only significantly minimizes communication complexity but\nalso maintains accuracy levels comparable to traditional FL methods, thereby\nenhancing the scalability of our load forecasting algorithm.\n","authors":["Yi Dong","Yingjie Wang","Mariana Gama","Mustafa A. Mustafa","Geert Deconinck","Xiaowei Huang"],"pdf_url":"https://arxiv.org/pdf/2402.01546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15659v2","updated":"2024-02-02T16:38:35Z","published":"2023-05-25T02:12:33Z","title":"How to escape sharp minima with random perturbations","summary":" Modern machine learning applications have witnessed the remarkable success of\noptimization algorithms that are designed to find flat minima. Motivated by\nthis design choice, we undertake a formal study that (i) formulates the notion\nof flat minima, and (ii) studies the complexity of finding them. Specifically,\nwe adopt the trace of the Hessian of the cost function as a measure of\nflatness, and use it to formally define the notion of approximate flat minima.\nUnder this notion, we then analyze algorithms that find approximate flat minima\nefficiently. 
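Returning to the Monte Carlo policy-evaluation abstract above: the mechanism that keeps a behavior-policy estimator unbiased is ordinary importance sampling, sketched here on a two-armed bandit. The paper's contribution is learning a variance-minimizing behavior policy; in this toy, `mu` is simply fixed by hand.

```python
import numpy as np

rng = np.random.default_rng(8)
pi = np.array([0.5, 0.5])            # target policy over two actions
mu = np.array([0.2, 0.8])            # behavior policy actually executed
reward_mean = np.array([1.0, 3.0])

n = 100_000
actions = rng.choice(2, size=n, p=mu)
rewards = reward_mean[actions] + rng.normal(0, 1, size=n)
weights = pi[actions] / mu[actions]  # likelihood ratios keep the estimate unbiased
print("IS estimate:", (weights * rewards).mean())   # ~= 0.5*1 + 0.5*3 = 2.0
```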
For general cost functions, we discuss a gradient-based algorithm\nthat finds an approximate flat local minimum efficiently. The main component of\nthe algorithm is to use gradients computed from randomly perturbed iterates to\nestimate a direction that leads to flatter minima. For the setting where the\ncost function is an empirical risk over training data, we present a faster\nalgorithm that is inspired by a recently proposed practical algorithm called\nsharpness-aware minimization, supporting its success in practice.\n","authors":["Kwangjun Ahn","Ali Jadbabaie","Suvrit Sra"],"pdf_url":"https://arxiv.org/pdf/2305.15659v2.pdf","comment":"Comments would be appreciated!"},{"id":"http://arxiv.org/abs/2306.05185v2","updated":"2024-02-02T16:37:31Z","published":"2023-06-08T13:33:20Z","title":"On the Identification and Optimization of Nonsmooth Superposition\n Operators in Semilinear Elliptic PDEs","summary":" We study an infinite-dimensional optimization problem that aims to identify\nthe Nemytskii operator in the nonlinear part of a prototypical semilinear\nelliptic partial differential equation (PDE) which minimizes the distance\nbetween the PDE-solution and a given desired state. In contrast to previous\nworks, we consider this identification problem in a low-regularity regime in\nwhich the function inducing the Nemytskii operator is a-priori only known to be\nan element of $H^1_{loc}(\\mathbb{R})$. This makes the studied problem class a\nsuitable point of departure for the rigorous analysis of training problems for\nlearning-informed PDEs in which an unknown superposition operator is\napproximated by means of a neural network with nonsmooth activation functions\n(ReLU, leaky-ReLU, etc.). We establish that, despite the low regularity of the\ncontrols, it is possible to derive a classical stationarity system for local\nminimizers and to solve the considered problem by means of a gradient\nprojection method. The convergence of the resulting algorithm is proven in the\nfunction space setting. It is also shown that the established first-order\nnecessary optimality conditions imply that locally optimal superposition\noperators share various characteristic properties with commonly used activation\nfunctions: They are always sigmoidal, continuously differentiable away from the\norigin, and typically possess a distinct kink at zero. The paper concludes with\nnumerical experiments which confirm the theoretical findings.\n","authors":["Constantin Christof","Julia Kowalczyk"],"pdf_url":"https://arxiv.org/pdf/2306.05185v2.pdf","comment":"Minor revision; to appear in ESAIM COCV"},{"id":"http://arxiv.org/abs/2402.01543v1","updated":"2024-02-02T16:35:51Z","published":"2024-02-02T16:35:51Z","title":"Adaptive Optimization for Prediction with Missing Data","summary":" When training predictive models on data with missing entries, the most widely\nused and versatile approach is a pipeline technique where we first impute\nmissing entries and then compute predictions. In this paper, we view prediction\nwith missing data as a two-stage adaptive optimization problem and propose a\nnew class of models, adaptive linear regression models, where the regression\ncoefficients adapt to the set of observed features. We show that some adaptive\nlinear regression models are equivalent to learning an imputation rule and a\ndownstream linear regression model simultaneously instead of sequentially. We\nleverage this joint-impute-then-regress interpretation to generalize our\nframework to non-linear models. 
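The randomly-perturbed-gradient idea in the flat-minima abstract above can be demonstrated on a one-dimensional toy loss (ours, not the paper's algorithm): averaging gradients over perturbed iterates descends a smoothed loss in which a narrow sharp minimum all but vanishes while a wide flat one survives. The loss shape, sigma, and step size are all our choices.

```python
import numpy as np

def grad(x):
    """Gradient of a 1-D loss with a narrow well at 0 and a wide one near 4."""
    sharp = np.exp(-(x / 0.2) ** 2) * (2 * x / 0.2 ** 2)
    flat = 0.8 * np.exp(-(((x - 4) / 2) ** 2)) * (x - 4) / 2
    return sharp + flat

rng = np.random.default_rng(10)
x = 0.0                                  # start inside the sharp minimum
for _ in range(800):
    xs = x + 1.0 * rng.normal(size=512)  # sigma=1, 512 perturbed iterates
    x -= 0.2 * grad(xs).mean()           # step on the averaged (smoothed) gradient
print("ends near:", round(x, 1))         # typically ~4.0, the flat basin
```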
In settings where data is strongly not missing\nat random, our methods achieve a 2-10% improvement in out-of-sample accuracy.\n","authors":["Dimitris Bertsimas","Arthur Delarue","Jean Pauphilet"],"pdf_url":"https://arxiv.org/pdf/2402.01543v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2104.03158"},{"id":"http://arxiv.org/abs/2402.01542v1","updated":"2024-02-02T16:35:02Z","published":"2024-02-02T16:35:02Z","title":"Learning Collective Variables for Protein Folding with Labeled Data\n Augmentation through Geodesic Interpolation","summary":" In molecular dynamics (MD) simulations, rare events, such as protein folding,\nare typically studied by means of enhanced sampling techniques, most of which\nrely on the definition of a collective variable (CV) along which the\nacceleration occurs. Obtaining an expressive CV is crucial, but often hindered\nby the lack of information about the particular event, e.g., the transition\nfrom unfolded to folded conformation. We propose a simulation-free data\naugmentation strategy using physics-inspired metrics to generate geodesic\ninterpolations resembling protein folding transitions, thereby improving\nsampling efficiency without true transition state samples. Leveraging\ninterpolation progress parameters, we introduce a regression-based learning\nscheme for CV models, which outperforms classifier-based methods when\ntransition state data is limited and noisy.\n","authors":["Soojung Yang","Juno Nam","Johannes C. B. Dietschreit","Rafael Gómez-Bombarelli"],"pdf_url":"https://arxiv.org/pdf/2402.01542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01537v1","updated":"2024-02-02T16:27:45Z","published":"2024-02-02T16:27:45Z","title":"Closing the Gap in Human Behavior Analysis: A Pipeline for Synthesizing\n Trimodal Data","summary":" In pervasive machine learning, especially in Human Behavior Analysis (HBA),\nRGB has been the primary modality due to its accessibility and richness of\ninformation. However, linked with its benefits are challenges, including\nsensitivity to lighting conditions and privacy concerns. One possibility to\novercome these vulnerabilities is to resort to different modalities. For\ninstance, thermal is particularly adept at accentuating human forms, while\ndepth adds crucial contextual layers. Despite their known benefits, only a few\nHBA-specific datasets that integrate these modalities exist. To address this\nshortage, our research introduces a novel generative technique for creating\ntrimodal, i.e., RGB, thermal, and depth, human-focused datasets. This technique\ncapitalizes on human segmentation masks derived from RGB images, combined with\nthermal and depth backgrounds that are sourced automatically. With these two\ningredients, we synthesize depth and thermal counterparts from existing RGB\ndata utilizing conditional image-to-image translation. By employing this\napproach, we generate trimodal data that can be leveraged to train models for\nsettings with limited data, bad lighting conditions, or privacy-sensitive\nareas.\n","authors":["Christian Stippel","Thomas Heitzinger","Rafael Sterzinger","Martin Kampel"],"pdf_url":"https://arxiv.org/pdf/2402.01537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12616v2","updated":"2024-02-02T16:24:14Z","published":"2023-12-19T21:45:38Z","title":"Online Variational Sequential Monte Carlo","summary":" Being the most classical generative model for serial data, state-space models\n(SSM) are fundamental in AI and statistical machine learning. 
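Circling back to the adaptive-regression abstract above: one concrete way a linear model's coefficients can "adapt to the set of observed features" is to augment zero-imputed inputs with missingness indicators, a simplified rendition of the joint impute-then-regress view. The MNAR censoring rule below is invented for the demo.

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(11)
X = rng.normal(size=(5000, 3))
y = X @ np.array([2.0, -1.0, 0.5]) + 0.1 * rng.normal(size=5000)

mask = np.zeros(X.shape, dtype=bool)
mask[:, 0] = X[:, 0] > 0.5               # feature 0 censored when large (MNAR)
X_obs = np.where(mask, 0.0, X)           # zero-imputation
with_ind = np.hstack([X_obs, mask.astype(float)])   # append indicators

r2_plain = LinearRegression().fit(X_obs, y).score(X_obs, y)
r2_adapt = LinearRegression().fit(with_ind, y).score(with_ind, y)
print(f"zero-impute only R^2: {r2_plain:.3f}, with indicators: {r2_adapt:.3f}")
```

With informative missingness, the indicator coefficient absorbs the conditional mean of the censored signal, which is why the augmented model scores visibly higher here.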
In SSM, any form\nof parameter learning or latent state inference typically involves the\ncomputation of complex latent-state posteriors. In this work, we build upon the\nvariational sequential Monte Carlo (VSMC) method, which provides\ncomputationally efficient and accurate model parameter estimation and Bayesian\nlatent-state inference by combining particle methods and variational inference.\nWhile standard VSMC operates in the offline mode, by re-processing repeatedly a\ngiven batch of data, we distribute the approximation of the gradient of the\nVSMC surrogate ELBO in time using stochastic approximation, allowing for online\nlearning in the presence of streams of data. This results in an algorithm,\nonline VSMC, that is capable of performing efficiently, entirely on-the-fly,\nboth parameter estimation and particle proposal adaptation. In addition, we\nprovide rigorous theoretical results describing the algorithm's convergence\nproperties as the number of data tends to infinity as well as numerical\nillustrations of its excellent convergence properties and usefulness also in\nbatch-processing settings.\n","authors":["Alessandro Mastrototaro","Jimmy Olsson"],"pdf_url":"https://arxiv.org/pdf/2312.12616v2.pdf","comment":"In this version there are additional simulations in Section 5.1, some\n added references, and minor typos fixed"},{"id":"http://arxiv.org/abs/2303.05092v3","updated":"2024-02-02T16:18:10Z","published":"2023-03-09T08:04:16Z","title":"Task Aware Dreamer for Task Generalization in Reinforcement Learning","summary":" A long-standing goal of reinforcement learning is to acquire agents that can\nlearn on training tasks and generalize well on unseen tasks that may share a\nsimilar dynamic but with different reward functions. The ability to generalize\nacross tasks is important as it determines an agent's adaptability to\nreal-world scenarios where reward mechanisms might vary. In this work, we first\nshow that training a general world model can utilize similar structures in\nthese tasks and help train more generalizable agents. Extending world models\ninto the task generalization setting, we introduce a novel method named Task\nAware Dreamer (TAD), which integrates reward-informed features to identify\nconsistent latent characteristics across tasks. Within TAD, we compute the\nvariational lower bound of sample data log-likelihood, which introduces a new\nterm designed to differentiate tasks using their states, as the optimization\nobjective of our reward-informed world models. To demonstrate the advantages of\nthe reward-informed policy in TAD, we introduce a new metric called Task\nDistribution Relevance (TDR) which quantitatively measures the relevance of\ndifferent tasks. 
For tasks exhibiting a high TDR, i.e., tasks that differ\nsignificantly, we illustrate that Markovian policies struggle to distinguish\nthem; thus it is necessary to utilize reward-informed policies in TAD.\nExtensive experiments in both image-based and state-based tasks show that TAD\ncan significantly improve the performance of handling different tasks\nsimultaneously, especially for those with high TDR, and display a strong\ngeneralization ability to unseen tasks.\n","authors":["Chengyang Ying","Zhongkai Hao","Xinning Zhou","Hang Su","Songming Liu","Dong Yan","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.05092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01528v1","updated":"2024-02-02T16:15:24Z","published":"2024-02-02T16:15:24Z","title":"Decoding Speculative Decoding","summary":" Speculative Decoding is a widely used technique to speed up inference for\nLarge Language Models (LLMs) without modifying the outcome. When performing\ninference on an LLM, speculative decoding uses a smaller draft model which\ngenerates speculative tokens and then uses the target LLM to verify those draft\ntokens. The speedup provided by speculative decoding heavily depends on the\nchoice of the draft model. It has been widely suggested to select a draft model\nthat provides a high probability of the generated token being accepted by the\nLLM to achieve the highest throughput. However, our experiments indicate the\ncontrary, with throughput diminishing as the probability that generated tokens\nare accepted by the target model increases. To understand this phenomenon, we\nperform extensive experiments to characterize the different factors that affect\nspeculative decoding and how those factors interact and affect the speedups.\nBased on our experiments, we describe an analytical model which can be used to\ndecide the right draft model for a given workload. Further, using our insights\nwe design a new draft model for LLaMA-65B which can provide 30% higher\nthroughput than existing draft models.\n","authors":["Minghao Yan","Saurabh Agarwal","Shivaram Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2402.01528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18449v2","updated":"2024-02-02T16:14:01Z","published":"2023-10-27T19:47:26Z","title":"Conditional Generative Representation for Black-Box Optimization with\n Implicit Constraints","summary":" Black-box optimization (BBO) has become increasingly relevant for tackling\ncomplex decision-making problems, especially in public policy domains such as\npolice districting. However, its broader application in public policymaking is\nhindered by the complexity of defining feasible regions and the\nhigh-dimensionality of decisions. This paper introduces a novel BBO framework,\ntermed as the Conditional And Generative Black-box Optimization (CageBO). This\napproach leverages a conditional variational autoencoder to learn the\ndistribution of feasible decisions, enabling a two-way mapping between the\noriginal decision space and a simplified, constraint-free latent space. The\nCageBO efficiently handles the implicit constraints often found in public\npolicy applications, allowing for optimization in the latent space while\nevaluating objectives in the original space. We validate our method through a\ncase study on large-scale police districting problems in Atlanta, Georgia. 
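A back-of-envelope version of the speculative-decoding trade-off discussed above: the expected number of tokens per draft-verify cycle with per-token acceptance rate p and draft length k is (1 - p^(k+1)) / (1 - p), a standard result. We then add the assumption that higher-acceptance draft models cost more per token; that alone is enough to make raw acceptance a poor proxy for throughput, echoing the paper's observation. The cost curve below is invented.

```python
def throughput(p, k=4, c_target=1.0):
    """Tokens per unit time for one draft-verify cycle under a toy cost model."""
    tokens = (1 - p ** (k + 1)) / (1 - p)    # expected accepted tokens per cycle
    c_draft = 0.02 + 0.9 * p ** 3            # ASSUMED cost/acceptance link
    return tokens / (k * c_draft + c_target)

for p in (0.5, 0.7, 0.9):
    print(f"acceptance {p:.1f} -> throughput {throughput(p):.2f} tokens/unit")
# With this cost curve, chasing acceptance rate actually lowers throughput.
```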
Our\nresults reveal that our CageBO offers notable improvements in performance and\nefficiency compared to the baselines.\n","authors":["Wenqian Xing","Jungho Lee","Chong Liu","Shixiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.18449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01524v1","updated":"2024-02-02T16:10:29Z","published":"2024-02-02T16:10:29Z","title":"HyperPlanes: Hypernetwork Approach to Rapid NeRF Adaptation","summary":" Neural radiance fields (NeRFs) are a widely accepted standard for\nsynthesizing new 3D object views from a small number of base images. However,\nNeRFs have limited generalization properties, which means that we need to use\nsignificant computational resources to train individual architectures for each\nitem we want to represent. To address this issue, we propose a few-shot\nlearning approach based on the hypernetwork paradigm that does not require\ngradient optimization during inference. The hypernetwork gathers information\nfrom the training data and generates an update for universal weights. As a\nresult, we have developed an efficient method for generating a high-quality 3D\nobject representation from a small number of images in a single step. This has\nbeen confirmed by direct comparison with the state-of-the-art solutions and a\ncomprehensive ablation study.\n","authors":["Paweł Batorski","Dawid Malarz","Marcin Przewięźlikowski","Marcin Mazur","Sławomir Tadeja","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2402.01524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01520v1","updated":"2024-02-02T16:06:24Z","published":"2024-02-02T16:06:24Z","title":"Low-Resource Cross-Domain Singing Voice Synthesis via Reduced\n Self-Supervised Speech Representations","summary":" In this paper, we propose a singing voice synthesis model, Karaoker-SSL, that\nis trained only on text and speech data as a typical multi-speaker acoustic\nmodel. It is a low-resource pipeline that does not utilize any singing data\nend-to-end, since its vocoder is also trained on speech data. Karaoker-SSL is\nconditioned by self-supervised speech representations in an unsupervised\nmanner. We preprocess these representations by selecting only a subset of their\ntask-correlated dimensions. The conditioning module is indirectly guided to\ncapture style information during training by multi-tasking. This is achieved\nwith a Conformer-based module, which predicts the pitch from the acoustic\nmodel's output. Thus, Karaoker-SSL allows singing voice synthesis without\nreliance on hand-crafted and domain-specific features. There are also no\nrequirements for text alignments or lyrics timestamps. To refine the voice\nquality, we employ a U-Net discriminator that is conditioned on the target\nspeaker and follows a Diffusion GAN training scheme.\n","authors":["Panos Kakoulidis","Nikolaos Ellinas","Georgios Vamvoukakis","Myrsini Christidou","Alexandra Vioni","Georgia Maniati","Junkwang Oh","Gunu Jho","Inchul Hwang","Pirros Tsiakoulis","Aimilios Chalamandaris"],"pdf_url":"https://arxiv.org/pdf/2402.01520v1.pdf","comment":"Accepted to IEEE ICASSP SASB 2024"},{"id":"http://arxiv.org/abs/2402.01515v1","updated":"2024-02-02T15:55:25Z","published":"2024-02-02T15:55:25Z","title":"Enhancing Stochastic Gradient Descent: A Unified Framework and Novel\n Acceleration Methods for Faster Convergence","summary":" Based on SGD, previous works have proposed many algorithms that have improved\nconvergence speed and generalization in stochastic optimization, such as SGDm,\nAdaGrad, Adam, etc. 
However, their convergence analysis under non-convex\nconditions is challenging. In this work, we propose a unified framework to\naddress this issue. For any first-order method, we interpret the update\ndirection $g_t$ as the sum of the stochastic subgradient $\\nabla f_t(x_t)$ and\nan additional acceleration term $\\frac{2|\\langle v_t, \\nabla f_t(x_t)\n\\rangle|}{\\|v_t\\|_2^2} v_t$; thus we can discuss the convergence by analyzing\n$\\langle v_t, \\nabla f_t(x_t) \\rangle$. Through our framework, we have\ndiscovered two plug-and-play acceleration methods: \\textbf{Reject Accelerating}\nand \\textbf{Random Vector Accelerating}; we theoretically demonstrate that\nthese two methods can directly lead to an improvement in convergence rate.\n","authors":["Yichuan Deng","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2402.01515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01514v1","updated":"2024-02-02T15:54:53Z","published":"2024-02-02T15:54:53Z","title":"Mapping the Multiverse of Latent Representations","summary":" Echoing recent calls to counter reliability and robustness concerns in\nmachine learning via multiverse analysis, we present PRESTO, a principled\nframework for mapping the multiverse of machine-learning models that rely on\nlatent representations. Although such models enjoy widespread adoption, the\nvariability in their embeddings remains poorly understood, resulting in\nunnecessary complexity and untrustworthy representations. Our framework uses\npersistent homology to characterize the latent spaces arising from different\ncombinations of diverse machine-learning methods, (hyper)parameter\nconfigurations, and datasets, allowing us to measure their pairwise\n(dis)similarity and statistically reason about their distributions. As we\ndemonstrate both theoretically and empirically, our pipeline preserves\ndesirable properties of collections of latent representations, and it can be\nleveraged to perform sensitivity analysis, detect anomalous embeddings, or\nefficiently and effectively navigate hyperparameter search spaces.\n","authors":["Jeremy Wayland","Corinna Coupette","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2402.01514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01509v1","updated":"2024-02-02T15:43:51Z","published":"2024-02-02T15:43:51Z","title":"Advancing Brain Tumor Inpainting with Generative Models","summary":" Synthesizing healthy brain scans from diseased brain scans offers a potential\nsolution to address the limitations of general-purpose algorithms, such as\ntissue segmentation and brain extraction algorithms, which may not effectively\nhandle diseased images. We consider this a 3D inpainting task and investigate\nthe adaptation of 2D inpainting methods to meet the requirements of 3D magnetic\nresonance imaging (MRI) data. Our contributions encompass potential\nmodifications tailored to MRI-specific needs, and we conducted evaluations of\nmultiple inpainting techniques using the BraTS2023 Inpainting datasets to\nassess their efficacy and limitations.\n","authors":["Ruizhi Zhu","Xinru Zhang","Haowen Pang","Chundan Xu","Chuyang Ye"],"pdf_url":"https://arxiv.org/pdf/2402.01509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01502v1","updated":"2024-02-02T15:36:43Z","published":"2024-02-02T15:36:43Z","title":"Why do Random Forests Work? 
Understanding Tree Ensembles as\n Self-Regularizing Adaptive Smoothers","summary":" Despite their remarkable effectiveness and broad application, the drivers of\nsuccess underlying ensembles of trees are still not fully understood. In this\npaper, we highlight how interpreting tree ensembles as adaptive and\nself-regularizing smoothers can provide new intuition and deeper insight to\nthis topic. We use this perspective to show that, when studied as smoothers,\nrandomized tree ensembles not only make predictions that are quantifiably more\nsmooth than the predictions of the individual trees they consist of, but also\nfurther regulate their smoothness at test-time based on the dissimilarity\nbetween testing and training inputs. First, we use this insight to revisit,\nrefine and reconcile two recent explanations of forest success by providing a\nnew way of quantifying the conjectured behaviors of tree ensembles objectively\nby measuring the effective degree of smoothing they imply. Then, we move beyond\nexisting explanations for the mechanisms by which tree ensembles improve upon\nindividual trees and challenge the popular wisdom that the superior performance\nof forests should be understood as a consequence of variance reduction alone.\nWe argue that the current high-level dichotomy into bias- and\nvariance-reduction prevalent in statistics is insufficient to understand tree\nensembles -- because the prevailing definition of bias does not capture\ndifferences in the expressivity of the hypothesis classes formed by trees and\nforests. Instead, we show that forests can improve upon trees by three distinct\nmechanisms that are usually implicitly entangled. In particular, we demonstrate\nthat the smoothing effect of ensembling can reduce variance in predictions due\nto noise in outcome generation, reduce variability in the quality of the\nlearned function given fixed input data and reduce potential bias in learnable\nfunctions by enriching the available hypothesis space.\n","authors":["Alicia Curth","Alan Jeffares","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2402.01502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15351v2","updated":"2024-02-02T15:28:17Z","published":"2023-10-23T20:30:44Z","title":"Random Exploration in Bayesian Optimization: Order-Optimal Regret and\n Computational Efficiency","summary":" We consider Bayesian optimization using Gaussian Process models, also\nreferred to as kernel-based bandit optimization. We study the methodology of\nexploring the domain using random samples drawn from a distribution. We show\nthat this random exploration approach achieves the optimal error rates. Our\nanalysis is based on novel concentration bounds in an infinite dimensional\nHilbert space established in this work, which may be of independent interest.\nWe further develop an algorithm based on random exploration with domain\nshrinking and establish its order-optimal regret guarantees under both\nnoise-free and noisy settings. In the noise-free setting, our analysis closes\nthe existing gap in regret performance and thereby resolves a COLT open\nproblem. 
The proposed algorithm also enjoys a computational advantage over\nprevailing methods due to the random exploration that obviates the expensive\noptimization of a non-convex acquisition function for choosing the query points\nat each iteration.\n","authors":["Sudeep Salgia","Sattar Vakili","Qing Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.15351v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09200v3","updated":"2024-02-02T15:23:17Z","published":"2023-11-15T18:43:29Z","title":"Are Normalizing Flows the Key to Unlocking the Exponential Mechanism? A\n Path through the Accuracy-Privacy Ceiling Constraining Differentially Private\n ML","summary":" The state of the art and de facto standard for differentially private machine\nlearning (ML) is differentially private stochastic gradient descent (DPSGD).\nYet, the method is inherently wasteful. By adding noise to every gradient, it\ndiminishes the overall privacy with every gradient step. Despite 15 years of\nfruitful research advancing the composition theorems, sub-sampling methods, and\nimplementation techniques, adequate accuracy and privacy are often unattainable\nwith current private ML methods. Meanwhile, the Exponential Mechanism (ExpM),\ndesigned for private optimization, has been historically sidelined from\nprivately training modern ML algorithms primarily because ExpM requires\nsampling from a historically intractable density. Despite the recent discovery\nof Normalizing Flow models (NFs), expressive deep networks for approximating\nintractable distributions, ExpM remains in the background. Our position is that\nleveraging NFs to circumvent historic obstructions of ExpM is a potentially\ntransformational solution for differentially private ML worth attention. We\nintroduce a new training method, ExpM+NF, as a potential alternative to DPSGD,\nand we provide experiments with logistic regression and a modern deep learning\nmodel to test whether training via ExpM+NF is viable with \"good\" privacy\nparameters. Under the assumption that the NF output distribution is the ExpM\ndistribution, we are able to achieve $\\varepsilon$ as low as $1\\mathrm{e}{-3}$\n-- three orders of magnitude stronger privacy with similar accuracy. This work\noutlines a new avenue for advancing differentially private ML, namely\ndiscovering NF approximation guarantees. Code to be provided after review.\n","authors":["Robert A. Bridges","Vandy J. Tombs","Christopher B. Stanley"],"pdf_url":"https://arxiv.org/pdf/2311.09200v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01493v1","updated":"2024-02-02T15:22:06Z","published":"2024-02-02T15:22:06Z","title":"Sliced-Wasserstein Estimation with Spherical Harmonics as Control\n Variates","summary":" The Sliced-Wasserstein (SW) distance between probability measures is defined\nas the average of the Wasserstein distances resulting from the associated\none-dimensional projections. As a consequence, the SW distance can be written\nas an integral with respect to the uniform measure on the sphere and the Monte\nCarlo framework can be employed for calculating the SW distance. Spherical\nharmonics are polynomials on the sphere that form an orthonormal basis of the\nset of square-integrable functions on the sphere. Putting these two facts\ntogether, a new Monte Carlo method, hereby referred to as Spherical Harmonics\nControl Variates (SHCV), is proposed for approximating the SW distance using\nspherical harmonics as control variates. 
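The plain Monte Carlo estimator of the Sliced-Wasserstein distance, the baseline that SHCV's control variates improve on, follows directly from the definition above: draw random directions on the sphere, project both samples, and average the closed-form one-dimensional Wasserstein distances (here W2 with equal sample sizes).

```python
import numpy as np

def sw2_monte_carlo(x, y, n_proj=500, rng=np.random.default_rng(13)):
    """Monte Carlo estimate of the Sliced-Wasserstein-2 distance."""
    thetas = rng.normal(size=(n_proj, x.shape[1]))
    thetas /= np.linalg.norm(thetas, axis=1, keepdims=True)  # uniform on the sphere
    total = 0.0
    for th in thetas:
        px, py = np.sort(x @ th), np.sort(y @ th)    # 1-D projections
        total += np.mean((px - py) ** 2)             # W2^2 via sorted samples
    return np.sqrt(total / n_proj)

rng = np.random.default_rng(14)
a = rng.normal(0.0, 1.0, size=(256, 5))
b = rng.normal(1.0, 1.0, size=(256, 5))
print("SW2 estimate:", round(float(sw2_monte_carlo(a, b)), 3))
```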
The resulting approach is shown to\nhave good theoretical properties, e.g., a no-error property for Gaussian\nmeasures under a certain form of linear dependency between the variables.\nMoreover, an improved rate of convergence, compared to Monte Carlo, is\nestablished for general measures. The convergence analysis relies on the\nLipschitz property associated with the SW integrand. Several numerical\nexperiments demonstrate the superior performance of SHCV against\nstate-of-the-art methods for SW distance computation.\n","authors":["Rémi Leluc","Aymeric Dieuleveut","François Portier","Johan Segers","Aigerim Zhuman"],"pdf_url":"https://arxiv.org/pdf/2402.01493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01484v1","updated":"2024-02-02T15:12:16Z","published":"2024-02-02T15:12:16Z","title":"Connecting the Dots: Is Mode-Connectedness the Key to Feasible\n Sample-Based Inference in Bayesian Neural Networks?","summary":" A major challenge in sample-based inference (SBI) for Bayesian neural\nnetworks is the size and structure of the networks' parameter space. Our work\nshows that successful SBI is possible by embracing the characteristic\nrelationship between weight and function space, uncovering a systematic link\nbetween overparameterization and the difficulty of the sampling problem.\nThrough extensive experiments, we establish practical guidelines for sampling\nand convergence diagnosis. As a result, we present a Bayesian deep ensemble\napproach as an effective solution with competitive performance and uncertainty\nquantification.\n","authors":["Emanuel Sommer","Lisa Wimmer","Theodore Papamarkou","Ludwig Bothmann","Bernd Bischl","David Rügamer"],"pdf_url":"https://arxiv.org/pdf/2402.01484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01481v1","updated":"2024-02-02T15:07:09Z","published":"2024-02-02T15:07:09Z","title":"Multi-level protein pre-training with Vabs-Net","summary":" In recent years, there has been a surge in the development of 3D\nstructure-based pre-trained protein models, representing a significant\nadvancement over pre-trained protein language models in various downstream\ntasks. However, most existing structure-based pre-trained models primarily\nfocus on the residue level, i.e., alpha carbon atoms, while ignoring other\natoms like side chain atoms. We argue that modeling proteins at both residue\nand atom levels is important since the side chain atoms can also be crucial for\nnumerous downstream tasks, for example, molecular docking. Nevertheless, we\nfind that naively combining residue and atom information during pre-training\ntypically fails. We identify a key reason: the information leakage caused by\nthe inclusion of atom structure in the input, which renders residue-level\npre-training tasks trivial and results in insufficiently expressive residue\nrepresentations. To address this issue, we introduce a span mask pre-training\nstrategy on 3D protein chains to learn meaningful representations of both\nresidues and atoms. This leads to a simple yet effective approach to learning\nprotein representation suitable for diverse downstream tasks. Extensive\nexperimental results on binding site prediction and function prediction tasks\ndemonstrate that our proposed pre-training approach significantly outperforms other\nmethods. 
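The span-mask idea described above can be illustrated with a small sketch; the mask fraction and span length below are assumed values, not the paper's settings.

```python
import numpy as np

def span_mask(n_residues, mask_frac=0.15, span_len=5, seed=None):
    """Mask contiguous residue spans rather than isolated positions.

    Masking whole spans prevents a masked residue's identity from being
    leaked by its immediate (atom-level) neighbors.
    """
    rng = np.random.default_rng(seed)
    mask = np.zeros(n_residues, dtype=bool)
    target = int(mask_frac * n_residues)
    while mask.sum() < target:
        start = rng.integers(0, max(1, n_residues - span_len))
        mask[start:start + span_len] = True
    return mask
```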
Our code will be made public.\n","authors":["Jiale Zhao","Wanru Zhuang","Jia Song","Yaqi Li","Shuqi Lu"],"pdf_url":"https://arxiv.org/pdf/2402.01481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01476v1","updated":"2024-02-02T15:05:13Z","published":"2024-02-02T15:05:13Z","title":"Self-Attention through Kernel-Eigen Pair Sparse Variational Gaussian\n Processes","summary":" While the great capability of Transformers significantly boosts prediction\naccuracy, it could also yield overconfident predictions and require calibrated\nuncertainty estimation, which can be commonly tackled by Gaussian processes\n(GPs). Existing works apply GPs with symmetric kernels under variational\ninference to the attention kernel; however, they omit the fact that attention\nkernels are in essence asymmetric. Moreover, the complexity of deriving the GP\nposteriors remains high for large-scale data. In this work, we propose\nKernel-Eigen Pair Sparse Variational Gaussian Processes (KEP-SVGP) for building\nuncertainty-aware self-attention where the asymmetry of attention kernels is\ntackled by Kernel SVD (KSVD) and a reduced complexity is acquired. Through\nKEP-SVGP, i) the SVGP pair induced by the two sets of singular vectors from\nKSVD w.r.t. the attention kernel fully characterizes the asymmetry; ii) using\nonly a small set of adjoint eigenfunctions from KSVD, the derivation of SVGP\nposteriors can be based on the inversion of a diagonal matrix containing\nsingular values, contributing to a reduction in time complexity; iii) an\nevidence lower bound is derived so that variational parameters can be optimized\ntowards this objective. Experiments verify the excellent performance and\nefficiency of KEP-SVGP on in-distribution, distribution-shift and out-of-distribution\nbenchmarks.\n","authors":["Yingyi Chen","Qinghua Tao","Francesco Tonin","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2402.01476v1.pdf","comment":"We propose Kernel-Eigen Pair Sparse Variational Gaussian Processes\n (KEP-SVGP) for building uncertainty-aware self-attention where the asymmetry\n of attention kernel is tackled by KSVD and a reduced time complexity is\n acquired"},{"id":"http://arxiv.org/abs/2310.05495v2","updated":"2024-02-02T15:04:51Z","published":"2023-10-09T07:56:56Z","title":"On the Convergence of Federated Averaging under Partial Participation\n for Over-parameterized Neural Networks","summary":" Federated learning (FL) is a widely employed distributed paradigm for\ncollaboratively training machine learning models from multiple clients without\nsharing local data. In practice, FL encounters challenges in dealing with\npartial client participation due to the limited bandwidth, intermittent\nconnection and strict synchronized delay. Simultaneously, there exist few\ntheoretical convergence guarantees in this practical setting, especially when\nassociated with the non-convex optimization of neural networks. To bridge this\ngap, we focus on the training problem of the federated averaging (FedAvg) method\nfor two canonical models: a deep linear network and a two-layer ReLU network.\nUnder the over-parameterized assumption, we provably show that FedAvg converges\nto a global minimum at a linear rate $\\mathcal{O}\\left((1-\\frac{\\min_{i \\in\n[t]}|S_i|}{N^2})^t\\right)$ after $t$ iterations, where $N$ is the number of\nclients and $|S_i|$ is the number of participating clients in the $i$-th\niteration. 
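A minimal sketch of FedAvg under partial participation, matching the setting analyzed above (a random subset $S_i$ of clients per round); `local_grad` is a hypothetical oracle for each client's local gradient, not part of the paper.

```python
import numpy as np

def fedavg(w, local_grad, n_clients, part_frac=0.3, rounds=100,
           local_steps=5, lr=0.1, seed=None):
    """FedAvg with partial client participation (illustrative sketch).

    local_grad(i, w) returns the gradient of client i's loss at w.
    """
    rng = np.random.default_rng(seed)
    for _ in range(rounds):
        # Only a random subset S_i of clients participates this round.
        S = rng.choice(n_clients, size=max(1, int(part_frac * n_clients)),
                       replace=False)
        locals_ = []
        for i in S:
            w_i = w.copy()
            for _ in range(local_steps):  # local SGD on client i
                w_i -= lr * local_grad(i, w_i)
            locals_.append(w_i)
        w = np.mean(locals_, axis=0)  # server averages returned models
    return w
```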
Experimental evaluations confirm our theoretical results.\n","authors":["Xin Liu","Wei li","Dazhi Zhan","Yu Pan","Xin Ma","Yu Ding","Zhisong Pan"],"pdf_url":"https://arxiv.org/pdf/2310.05495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16405v2","updated":"2024-02-02T14:53:14Z","published":"2024-01-29T18:43:49Z","title":"Scaling Sparse Fine-Tuning to Large Language Models","summary":" Large Language Models (LLMs) are difficult to fully fine-tune (e.g., with\ninstructions or human feedback) due to their sheer number of parameters. A\nfamily of parameter-efficient sparse fine-tuning methods have proven promising\nin terms of performance but their memory requirements increase proportionally\nto the size of the LLMs. In this work, we scale sparse fine-tuning to\nstate-of-the-art LLMs like LLaMA 2 7B and 13B. We propose SpIEL, a novel sparse\nfine-tuning method which, for a desired density level, maintains an array of\nparameter indices and the deltas of these parameters relative to their\npretrained values. It iterates over: (a) updating the active deltas, (b)\npruning indices (based on the change of magnitude of their deltas) and (c)\nregrowth of indices. For regrowth, we explore two criteria based on either the\naccumulated gradients of a few candidate parameters or their approximate\nmomenta estimated using the efficient SM3 optimizer. We experiment with\ninstruction-tuning of LLMs on standard dataset mixtures, finding that SpIEL is\noften superior to popular parameter-efficient fine-tuning methods like LoRA\n(low-rank adaptation) in terms of performance and comparable in terms of run\ntime. We additionally show that SpIEL is compatible with both quantization and\nefficient optimizers, to facilitate scaling to ever-larger model sizes. We\nrelease the code for SpIEL at https://github.com/AlanAnsell/peft and for the\ninstruction-tuning experiments at https://github.com/ducdauge/sft-llm.\n","authors":["Alan Ansell","Ivan Vulić","Hannah Sterz","Anna Korhonen","Edoardo M. Ponti"],"pdf_url":"https://arxiv.org/pdf/2401.16405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01460v1","updated":"2024-02-02T14:52:10Z","published":"2024-02-02T14:52:10Z","title":"Deep Conditional Generative Learning: Model and Error Analysis","summary":" We introduce an Ordinary Differential Equation (ODE) based deep generative\nmethod for learning a conditional distribution, named the Conditional Follmer\nFlow. Starting from a standard Gaussian distribution, the proposed flow could\nefficiently transform it into the target conditional distribution at time 1.\nFor effective implementation, we discretize the flow with Euler's method where\nwe estimate the velocity field nonparametrically using a deep neural network.\nFurthermore, we derive a non-asymptotic convergence rate in the Wasserstein\ndistance between the distribution of the learned samples and the target\ndistribution, providing the first comprehensive end-to-end error analysis for\nconditional distribution learning via ODE flow. 
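The SpIEL loop described above (update active deltas, prune, regrow) can be sketched as follows. This toy version prunes on current delta magnitude and regrows on instantaneous gradient magnitude, standing in for the paper's change-of-magnitude and accumulated-gradient/SM3 criteria.

```python
import numpy as np

def spiel_step(n_params, idx, delta, grad, lr=1e-3, prune_frac=0.1):
    """One SpIEL-style sparse fine-tuning iteration (simplified sketch).

    idx/delta: active parameter indices and their offsets relative to the
    frozen pretrained weights; grad: full gradient vector at this step.
    """
    # (a) SGD step on the active deltas only
    delta = delta - lr * grad[idx]
    # (b) prune: drop the active indices with the smallest offsets
    n_drop = int(prune_frac * len(idx))
    keep = np.argsort(np.abs(delta))[n_drop:]
    idx, delta = idx[keep], delta[keep]
    # (c) regrow: activate the inactive indices with the largest gradients
    inactive = np.ones(n_params, dtype=bool)
    inactive[idx] = False
    cand = np.flatnonzero(inactive)
    grow = cand[np.argsort(-np.abs(grad[cand]))[:n_drop]]
    return np.concatenate([idx, grow]), np.concatenate([delta, np.zeros(n_drop)])
```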
Our numerical experiments\nshowcase its effectiveness across a range of scenarios, from standard\nnonparametric conditional density estimation problems to more intricate\nchallenges involving image data, illustrating its superiority over various\nexisting conditional density estimation methods.\n","authors":["Jinyuan Chang","Zhao Ding","Yuling Jiao","Ruoxuan Li","Jerry Zhijian Yang"],"pdf_url":"https://arxiv.org/pdf/2402.01460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07988v2","updated":"2024-02-02T14:43:40Z","published":"2023-03-14T15:44:40Z","title":"Unbalanced and Light Optimal Transport","summary":" While the field of continuous Entropic Optimal Transport (EOT) has been\nactively developing in recent years, it became evident that the classic EOT\nproblem is prone to different issues like the sensitivity to outliers and\nimbalance of classes in the source and target measures. This fact inspired the\ndevelopment of solvers which deal with the unbalanced EOT (UEOT) problem - the\ngeneralization of EOT allowing for mitigating the mentioned issues by relaxing\nthe marginal constraints. Surprisingly, it turns out that the existing solvers\nare either based on heuristic principles or heavyweight, with complex\noptimization objectives involving several neural networks. We address this\nchallenge and propose a novel theoretically-justified and lightweight\nunbalanced EOT solver. Our advancement consists in developing a novel view on\nthe optimization of the UEOT problem yielding a tractable and non-minimax\noptimization objective. We show that, combined with a light parametrization\nrecently proposed in the field, our objective leads to a fast, simple and\neffective solver. It allows solving the continuous UEOT problem in minutes on\nCPU. We provide illustrative examples of the performance of our solver.\n","authors":["Milena Gazdieva","Arip Asadulaev","Alexander Korotin","Evgeny Burnaev"],"pdf_url":"https://arxiv.org/pdf/2303.07988v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01454v1","updated":"2024-02-02T14:43:19Z","published":"2024-02-02T14:43:19Z","title":"Integrating Large Language Models in Causal Discovery: A Statistical\n Causal Approach","summary":" In practical statistical causal discovery (SCD), embedding domain expert\nknowledge as constraints into the algorithm is widely accepted as significant\nfor creating consistent meaningful causal models, despite the recognized\nchallenges in systematic acquisition of the background knowledge. To overcome\nthese challenges, this paper proposes a novel methodology for causal inference,\nin which SCD methods and knowledge-based causal inference (KBCI) with a large\nlanguage model (LLM) are synthesized through \"statistical causal prompting\n(SCP)\" for LLMs and prior knowledge augmentation for SCD. Experiments have\nrevealed that GPT-4 can cause the output of the LLM-KBCI and the SCD result\nwith prior knowledge from LLM-KBCI to approach the ground truth, and that the\nSCD result can be further improved if GPT-4 undergoes SCP. Furthermore, it has\nbeen clarified that an LLM can improve SCD with its background knowledge, even\nif the LLM does not contain information on the dataset. 
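A sketch of how such an ODE flow is sampled once the velocity field has been learned, using the Euler discretization mentioned above; `velocity` is a hypothetical fitted (e.g. neural) velocity field, not the paper's implementation.

```python
import numpy as np

def sample_conditional_flow(velocity, x_cond, dim, n_steps=100,
                            n_samples=64, seed=None):
    """Euler discretization of a learned conditional ODE flow.

    Gaussian noise at t=0 is transported to the conditional target
    distribution at t=1 by repeatedly stepping along velocity(z, x, t).
    """
    rng = np.random.default_rng(seed)
    z = rng.normal(size=(n_samples, dim))
    dt = 1.0 / n_steps
    for k in range(n_steps):
        z = z + dt * velocity(z, x_cond, k * dt)  # one Euler step
    return z
```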
The proposed approach\ncan thus address challenges such as dataset biases and limitations,\nillustrating the potential of LLMs to improve data-driven causal inference\nacross diverse scientific domains.\n","authors":["Masayuki Takayama","Tadahisa Okuda","Thong Pham","Tatsuyoshi Ikenoue","Shingo Fukuma","Shohei Shimizu","Akiyoshi Sannai"],"pdf_url":"https://arxiv.org/pdf/2402.01454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01450v1","updated":"2024-02-02T14:39:39Z","published":"2024-02-02T14:39:39Z","title":"Improving importance estimation in covariate shift for providing\n accurate prediction error","summary":" In traditional Machine Learning, an algorithm's predictions are based on the\nassumption that the data follows the same distribution in both the training and\nthe test datasets. However, in real-world data this condition often does not hold\nand, for instance, the distribution of the covariates changes whereas the\nconditional distribution of the targets remains unchanged. This situation is\ncalled the covariate shift problem, where standard error estimation may no\nlonger be accurate. In this context, the importance is a measure commonly used\nto alleviate the influence of covariate shift on error estimations. Its main\ndrawback is that it is not easy to compute. The Kullback-Leibler Importance\nEstimation Procedure (KLIEP) is capable of estimating the importance in a\npromising way. Despite its good performance, it ignores target information,\nsince it only uses covariate information when computing the importance. This\npaper explores the potential performance improvement when target information is\nconsidered in the computation of the importance, and redefines the importance\naccordingly. Besides the potential improvement in performance, including target\ninformation makes it possible to tackle the plankton classification application\nthat motivates this research, which is characterized by its high\ndimensionality, since considering targets rather than covariates reduces both\nthe computation and the noise in the covariates. The impact of using target\ninformation is also explored when Logistic Regression (LR), Kernel Mean\nMatching (KMM), Ensemble Kernel Mean Matching (EKMM) and the naive predecessor\nof KLIEP, Kernel Density Estimation (KDE), estimate the importance. The\nexperimental results show a more accurate error estimation when using target\ninformation, especially in the case of the most promising method, KLIEP.\n","authors":["Laura Fdez-Díaz","Sara González Tomillo","Elena Montañés","José Ramón Quevedo"],"pdf_url":"https://arxiv.org/pdf/2402.01450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01444v1","updated":"2024-02-02T14:36:50Z","published":"2024-02-02T14:36:50Z","title":"Mission Critical -- Satellite Data is a Distinct Modality in Machine\n Learning","summary":" Satellite data has the potential to inspire a seismic shift for machine\nlearning -- one in which we rethink existing practices designed for traditional\ndata modalities. As machine learning for satellite data (SatML) gains traction\nfor its real-world impact, our field is at a crossroads. 
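For reference, a minimal sketch of the vanilla KLIEP baseline that the paper extends with target information: fit non-negative basis coefficients so the reweighted training density matches the test density. Kernel centers, bandwidth and the plain gradient-ascent loop are simplifying assumptions.

```python
import numpy as np

def kliep_weights(X_tr, X_te, sigma=1.0, iters=500, lr=0.01):
    """Vanilla KLIEP sketch: w(x) = sum_l a_l K(x, c_l) with Gaussian
    kernels centred on test points. Maximizes the test log-likelihood
    of w while constraining the train weights to average to one.

    X_tr, X_te: arrays of shape (n, d).
    """
    centers = X_te[:min(100, len(X_te))]
    kern = lambda A: np.exp(-((A[:, None] - centers[None]) ** 2).sum(-1)
                            / (2 * sigma ** 2))
    K_tr, K_te = kern(X_tr), kern(X_te)
    a = np.full(len(centers), 1.0 / len(centers))
    for _ in range(iters):
        a += lr * (K_te / (K_te @ a)[:, None]).mean(0)  # ascent step
        a = np.clip(a, 0.0, None)                       # keep w >= 0
        a /= K_tr.mean(0) @ a                           # mean train weight = 1
    return K_tr @ a  # importance weight per training point
```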
We can either continue\napplying ill-suited approaches, or we can initiate a new research agenda that\ncenters around the unique characteristics and challenges of satellite data.\nThis position paper argues that satellite data constitutes a distinct modality\nfor machine learning research and that we must recognize it as such to advance\nthe quality and impact of SatML research across theory, methods, and\ndeployment. We outline critical discussion questions and actionable suggestions\nto transform SatML from merely an intriguing application area to a dedicated\nresearch discipline that helps move the needle on big challenges for machine\nlearning and society.\n","authors":["Esther Rolf","Konstantin Klemmer","Caleb Robinson","Hannah Kerner"],"pdf_url":"https://arxiv.org/pdf/2402.01444v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.01441v1","updated":"2024-02-02T14:34:22Z","published":"2024-02-02T14:34:22Z","title":"Learning the Market: Sentiment-Based Ensemble Trading Agents","summary":" We propose the integration of sentiment analysis and deep reinforcement\nlearning ensemble algorithms for stock trading, and design a strategy capable\nof dynamically altering its employed agent given concurrent market sentiment.\nIn particular, we create a simple-yet-effective method for extracting news\nsentiment and combine this with general improvements upon existing works,\nresulting in automated trading agents that effectively consider both\nqualitative market factors and quantitative stock data. We show that our\napproach results in a strategy that is profitable, robust, and risk-minimal --\noutperforming the traditional ensemble strategy as well as single agent\nalgorithms and market metrics. Our findings determine that the conventional\npractice of switching ensemble agents every fixed number of months is\nsub-optimal, and that a dynamic sentiment-based framework greatly unlocks\nadditional performance within these agents. Furthermore, as we have designed\nour algorithm with simplicity and efficiency in mind, we hypothesize that the\ntransition of our method from historical evaluation towards real-time trading\nwith live data should be relatively simple.\n","authors":["Andrew Ye","James Xu","Yi Wang","Yifan Yu","Daniel Yan","Ryan Chen","Bosheng Dong","Vipin Chaudhary","Shuai Xu"],"pdf_url":"https://arxiv.org/pdf/2402.01441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01440v1","updated":"2024-02-02T14:32:42Z","published":"2024-02-02T14:32:42Z","title":"Few-Shot Learning on Graphs: from Meta-learning to Pre-training and\n Prompting","summary":" Graph representation learning, a critical step in graph-centric tasks, has\nseen significant advancements. Earlier techniques often operate in an\nend-to-end setting, where performance heavily relies on the availability of\nample labeled data. This constraint has spurred the emergence of few-shot\nlearning on graphs, where only a few task-specific labels are available for\neach task. Given the extensive literature in this field, this survey endeavors\nto synthesize recent developments, provide comparative insights, and identify\nfuture directions. We systematically categorize existing studies into three\nmajor families: meta-learning approaches, pre-training approaches, and hybrid\napproaches, with a finer-grained classification in each family to aid readers\nin their method selection process. 
Within each category, we analyze the\nrelationships among these methods and compare their strengths and limitations.\nFinally, we outline prospective future directions for few-shot learning on\ngraphs to catalyze continued innovation in this field.\n","authors":["Xingtong Yu","Yuan Fang","Zemin Liu","Yuxia Wu","Zhihao Wen","Jianyuan Bo","Xinming Zhang","Steven C. H. Hoi"],"pdf_url":"https://arxiv.org/pdf/2402.01440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01439v1","updated":"2024-02-02T14:30:48Z","published":"2024-02-02T14:30:48Z","title":"From Words to Molecules: A Survey of Large Language Models in Chemistry","summary":" In recent years, Large Language Models (LLMs) have achieved significant\nsuccess in natural language processing (NLP) and various interdisciplinary\nareas. However, applying LLMs to chemistry is a complex task that requires\nspecialized domain knowledge. This paper provides a thorough exploration of the\nnuanced methodologies employed in integrating LLMs into the field of chemistry,\ndelving into the complexities and innovations at this interdisciplinary\njuncture. Specifically, our analysis begins with examining how molecular\ninformation is fed into LLMs through various representation and tokenization\nmethods. We then categorize chemical LLMs into three distinct groups based on\nthe domain and modality of their input data, and discuss approaches for\nintegrating these inputs for LLMs. Furthermore, this paper delves into the\npretraining objectives with adaptations to chemical LLMs. After that, we\nexplore the diverse applications of LLMs in chemistry, including novel\nparadigms for their application in chemistry tasks. Finally, we identify\npromising research directions, including further integration with chemical\nknowledge, advancements in continual learning, and improvements in model\ninterpretability, paving the way for groundbreaking developments in the field.\n","authors":["Chang Liao","Yemin Yu","Yu Mei","Ying Wei"],"pdf_url":"https://arxiv.org/pdf/2402.01439v1.pdf","comment":"Submitted to IJCAI 2024 survey track"},{"id":"http://arxiv.org/abs/2402.01434v1","updated":"2024-02-02T14:26:32Z","published":"2024-02-02T14:26:32Z","title":"Conditioning non-linear and infinite-dimensional diffusion processes","summary":" Generative diffusion models and many stochastic models in science and\nengineering naturally live in infinite dimensions before discretisation. To\nincorporate observed data for statistical and learning tasks, one needs to\ncondition on observations. While recent work has treated conditioning linear\nprocesses in infinite dimensions, conditioning non-linear processes in infinite\ndimensions has not been explored. This paper conditions function valued\nstochastic processes without prior discretisation. To do so, we use an\ninfinite-dimensional version of Girsanov's theorem to condition a\nfunction-valued stochastic process, leading to a stochastic differential\nequation (SDE) for the conditioned process involving the score. We apply this\ntechnique to do time series analysis for shapes of organisms in evolutionary\nbiology, where we discretise via the Fourier basis and then learn the\ncoefficients of the score function with score matching methods.\n","authors":["Elizabeth Louise Baker","Gefan Yang","Michael L. 
Severinsen","Christy Anna Hipsley","Stefan Sommer"],"pdf_url":"https://arxiv.org/pdf/2402.01434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01431v1","updated":"2024-02-02T14:20:04Z","published":"2024-02-02T14:20:04Z","title":"Approximate Control for Continuous-Time POMDPs","summary":" This work proposes a decision-making framework for partially observable\nsystems in continuous time with discrete state and action spaces. As optimal\ndecision-making becomes intractable for large state spaces we employ\napproximation methods for the filtering and the control problem that scale well\nwith an increasing number of states. Specifically, we approximate the\nhigh-dimensional filtering distribution by projecting it onto a parametric\nfamily of distributions, and integrate it into a control heuristic based on the\nfully observable system to obtain a scalable policy. We demonstrate the\neffectiveness of our approach on several partially observed systems, including\nqueueing systems and chemical reaction networks.\n","authors":["Yannick Eich","Bastian Alt","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2402.01431v1.pdf","comment":"To be published in AISTATS 2024"},{"id":"http://arxiv.org/abs/2308.11842v2","updated":"2024-02-02T14:12:42Z","published":"2023-08-23T00:18:17Z","title":"${\\rm E}(3)$-Equivariant Actor-Critic Methods for Cooperative\n Multi-Agent Reinforcement Learning","summary":" Identification and analysis of symmetrical patterns in the natural world have\nled to significant discoveries across various scientific fields, such as the\nformulation of gravitational laws in physics and advancements in the study of\nchemical structures. In this paper, we focus on exploiting Euclidean symmetries\ninherent in certain cooperative multi-agent reinforcement learning (MARL)\nproblems and prevalent in many applications. We begin by formally\ncharacterizing a subclass of Markov games with a general notion of symmetries\nthat admits the existence of symmetric optimal values and policies. Motivated\nby these properties, we design neural network architectures with symmetric\nconstraints embedded as an inductive bias for multi-agent actor-critic methods.\nThis inductive bias results in superior performance in various cooperative MARL\nbenchmarks and impressive generalization capabilities such as zero-shot\nlearning and transfer learning in unseen scenarios with repeated symmetric\npatterns. The code is available at: https://github.com/dchen48/E3AC.\n","authors":["Dingyang Chen","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11842v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01424v1","updated":"2024-02-02T14:11:23Z","published":"2024-02-02T14:11:23Z","title":"A Data-Driven Analysis of Robust Automatic Piano Transcription","summary":" Algorithms for automatic piano transcription have improved dramatically in\nrecent years due to new datasets and modeling techniques. Recent developments\nhave focused primarily on adapting new neural network architectures, such as\nthe Transformer and Perceiver, in order to yield more accurate systems. In this\nwork, we study transcription systems from the perspective of their training\ndata. By measuring their performance on out-of-distribution annotated piano\ndata, we show how these models can severely overfit to acoustic properties of\nthe training data. We create a new set of audio for the MAESTRO dataset,\ncaptured automatically in a professional studio recording environment via\nYamaha Disklavier playback. 
Using various data augmentation techniques when\ntraining with the original and re-performed versions of the MAESTRO dataset, we\nachieve a state-of-the-art note-onset F1-score of 88.4 on the MAPS\ndataset, without seeing any of its training data. We subsequently analyze these\ndata augmentation techniques in a series of ablation studies to better\nunderstand their influence on the resulting models.\n","authors":["Drew Edwards","Simon Dixon","Emmanouil Benetos","Akira Maezawa","Yuta Kusaka"],"pdf_url":"https://arxiv.org/pdf/2402.01424v1.pdf","comment":"Accepted for publication in IEEE Signal Processing Letters on 31\n January, 2024"},{"id":"http://arxiv.org/abs/2311.06597v2","updated":"2024-02-02T14:03:32Z","published":"2023-11-11T15:45:44Z","title":"Understanding Grokking Through A Robustness Viewpoint","summary":" Recently, an interesting phenomenon called grokking has gained much\nattention, where generalization occurs long after the models have initially\noverfitted the training data. We try to understand this seemingly strange\nphenomenon through the robustness of the neural network. From a robustness\nperspective, we show that the popular $l_2$ weight norm (metric) of the neural\nnetwork is actually a sufficient condition for grokking. Based on the previous\nobservations, we propose perturbation-based methods to speed up the\ngeneralization process. In addition, we examine the standard training process\non the modulo addition dataset and find that it hardly learns other basic group\noperations before grokking, for example, the commutative law. Interestingly,\nthe speed-up of generalization when using our proposed method can be explained\nby learning the commutative law, a necessary condition when the model groks on\nthe test dataset. We also empirically find that the $l_2$ norm does not\ncorrelate with grokking on the test data in a timely way; we therefore propose\nnew metrics based on robustness and information theory and find that our new\nmetrics correlate well with the grokking phenomenon and may be used to predict\nit.\n","authors":["Zhiquan Tan","Weiran Huang"],"pdf_url":"https://arxiv.org/pdf/2311.06597v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08480v2","updated":"2024-02-02T13:57:48Z","published":"2023-08-16T16:38:03Z","title":"Label Propagation Techniques for Artifact Detection in Imbalanced\n Classes using Photoplethysmogram Signals","summary":" Photoplethysmogram (PPG) signals are widely used in healthcare for monitoring\nvital signs, but they are susceptible to motion artifacts that can lead to\ninaccurate interpretations. In this study, the use of label propagation\ntechniques to propagate labels among PPG samples is explored, particularly in\nimbalanced class scenarios where clean PPG samples are significantly\noutnumbered by artifact-contaminated samples. With a precision of 91%, a recall\nof 90% and an F1 score of 90% for the class without artifacts, the results\ndemonstrate its effectiveness in labeling a medical dataset, even when clean\nsamples are rare. For the classification of artifacts our study compares\nsupervised classifiers such as conventional classifiers and neural networks\n(MLP, Transformers, FCN) with the semi-supervised label propagation algorithm.\nWith a precision of 89%, a recall of 95% and an F1 score of 92%, the KNN\nsupervised model gives good results, but the semi-supervised algorithm performs\nbetter in detecting artifacts. 
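A minimal sketch of the label-propagation setup for PPG artifact detection, using scikit-learn's graph-based LabelSpreading; the feature matrix, label counts and hyperparameters here are placeholders, not the study's configuration.

```python
import numpy as np
from sklearn.semi_supervised import LabelSpreading

# Placeholder features: in practice X would hold PPG-derived features
# (e.g. statistical or morphological descriptors per signal segment).
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 16))

# -1 marks unlabeled samples; pretend only 20 segments were expert-labeled,
# with clean samples (class 0) in the minority, as in the imbalanced setting.
y = np.full(1000, -1)
y[:5] = 0      # few clean samples
y[5:20] = 1    # artifact-contaminated samples

# Graph-based label propagation over a KNN similarity graph: labels
# diffuse to similar segments until the whole dataset is labeled.
model = LabelSpreading(kernel="knn", n_neighbors=7, alpha=0.2)
model.fit(X, y)
propagated = model.transduction_  # inferred labels for all 1000 segments
```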
The findings suggest that the semi-supervised\nlabel propagation algorithm holds promise for artifact detection in PPG signals,\nwhich can enhance the reliability of PPG-based health monitoring systems in\nreal-world applications.\n","authors":["Clara Macabiau","Thanh-Dung Le","Kevin Albert","Philippe Jouvet","Rita Noumeir"],"pdf_url":"https://arxiv.org/pdf/2308.08480v2.pdf","comment":"Under preparation to submit to IEEE for possible publication"},{"id":"http://arxiv.org/abs/2210.13954v5","updated":"2024-02-02T13:56:21Z","published":"2022-10-25T12:16:03Z","title":"I Prefer not to Say: Protecting User Consent in Models with Optional\n Personal Data","summary":" We examine machine learning models in a setup where individuals have the\nchoice to share optional personal information with a decision-making system, as\nseen in modern insurance pricing models. Some users consent to their data being\nused whereas others object and keep their data undisclosed. In this work, we\nshow that the decision not to share data can be considered as information in\nitself that should be protected to respect users' privacy. This observation\nraises the overlooked problem of how to ensure that users who protect their\npersonal data do not suffer any disadvantages as a result. To address this\nproblem, we formalize protection requirements for models which only use the\ninformation for which active user consent was obtained. This excludes implicit\ninformation contained in the decision to share data or not. We offer the first\nsolution to this problem by proposing the notion of Protected User Consent\n(PUC), which we prove to be loss-optimal under our protection requirement. We\nobserve that privacy and performance are not fundamentally at odds with each\nother and that it is possible for a decision maker to benefit from additional\ndata while respecting users' consent. To learn PUC-compliant models, we devise\na model-agnostic data augmentation strategy with finite sample convergence\nguarantees. Finally, we analyze the implications of PUC on challenging real\ndatasets, tasks, and models.\n","authors":["Tobias Leemann","Martin Pawelczyk","Christian Thomas Eberle","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2210.13954v5.pdf","comment":"v5: AAAI-24 Camera-Ready Version Including Appendices. v1: NeurIPS\n 2022 Workshop on Algorithmic Fairness through the Lens of Causality and\n Privacy (AFCP)"},{"id":"http://arxiv.org/abs/2206.14051v2","updated":"2024-02-02T13:56:17Z","published":"2022-06-28T14:51:10Z","title":"Enhancing Business Process Simulation Models with Extraneous Activity\n Delays","summary":" Business Process Simulation (BPS) is a common approach to estimate the impact\nof changes to a business process on its performance measures. For example, it\nallows us to estimate what would be the cycle time of a process if we automated\none of its activities, or if some resources become unavailable. The starting\npoint of BPS is a business process model annotated with simulation parameters\n(a BPS model). In traditional approaches, BPS models are manually designed by\nmodeling specialists. This approach is time-consuming and error-prone. To\naddress this shortcoming, several studies have proposed methods to\nautomatically discover BPS models from event logs via process mining\ntechniques. However, current techniques in this space discover BPS models that\nonly capture waiting times caused by resource contention or resource\nunavailability. 
Oftentimes, a considerable portion of the waiting time in a\nbusiness process corresponds to extraneous delays, e.g., a resource waits for\nthe customer to return a phone call. This article proposes a method that\ndiscovers extraneous delays from event logs of business process executions. The\nproposed approach computes, for each pair of causally consecutive activity\ninstances in the event log, the time when the target activity instance should\ntheoretically have started, given the availability of the relevant resource.\nBased on the difference between the theoretical and the actual start times, the\napproach estimates the distribution of extraneous delays, and it enhances the\nBPS model with timer events to capture these delays. An empirical evaluation\ninvolving synthetic and real-life logs shows that the approach produces BPS\nmodels that better reflect the temporal dynamics of the process, relative to\nBPS models that do not capture extraneous delays.\n","authors":["David Chapela-Campa","Marlon Dumas"],"pdf_url":"https://arxiv.org/pdf/2206.14051v2.pdf","comment":"Extended version of the ICPM 2022 publication (see v1)"},{"id":"http://arxiv.org/abs/2401.17780v2","updated":"2024-02-02T13:55:51Z","published":"2024-01-31T12:23:24Z","title":"A Policy Gradient Primal-Dual Algorithm for Constrained MDPs with\n Uniform PAC Guarantees","summary":" We study a primal-dual reinforcement learning (RL) algorithm for the online\nconstrained Markov decision processes (CMDP) problem, wherein the agent\nexplores an optimal policy that maximizes return while satisfying constraints.\nDespite its widespread practical use, the existing theoretical literature on\nprimal-dual RL algorithms for this problem only provides sublinear regret\nguarantees and fails to ensure convergence to optimal policies. In this paper,\nwe introduce a novel policy gradient primal-dual algorithm with uniform\nprobably approximate correctness (Uniform-PAC) guarantees, simultaneously\nensuring convergence to optimal policies, sublinear regret, and polynomial\nsample complexity for any target accuracy. Notably, this represents the first\nUniform-PAC algorithm for the online CMDP problem. In addition to the\ntheoretical guarantees, we empirically demonstrate in a simple CMDP that our\nalgorithm converges to optimal policies, while an existing algorithm exhibits\noscillatory performance and constraint violation.\n","authors":["Toshinori Kitamura","Tadashi Kozuno","Masahiro Kato","Yuki Ichihara","Soichiro Nishimori","Akiyoshi Sannai","Sho Sonoda","Wataru Kumagai","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2401.17780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01416v1","updated":"2024-02-02T13:55:37Z","published":"2024-02-02T13:55:37Z","title":"Sequence Shortening for Context-Aware Machine Translation","summary":" Context-aware Machine Translation aims to improve translations of sentences\nby incorporating surrounding sentences as context. Towards this task, two main\narchitectures have been applied, namely single-encoder (based on concatenation)\nand multi-encoder models. In this study, we show that a special case of\nmulti-encoder architecture, where the latent representation of the source\nsentence is cached and reused as the context in the next step, achieves higher\naccuracy on the contrastive datasets (where the models have to rank the correct\ntranslation among the provided sentences) and comparable BLEU and COMET scores\nas the single- and multi-encoder approaches. 
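Returning to the extraneous-delay method above, its core computation can be sketched as follows. Column names are assumptions, and a real implementation would also have to handle resource calendars and multitasking.

```python
import pandas as pd

def extraneous_delays(log: pd.DataFrame) -> pd.Series:
    """Per-event extraneous delay: actual start minus the earliest
    theoretically possible start (enabling activity finished and the
    assigned resource free). Assumed timestamp columns: resource,
    start, end, enabled_by_end (end of the causally preceding event).
    """
    log = log.sort_values("start")
    resource_free = {}  # resource -> time it next becomes available
    delays = []
    for _, ev in log.iterrows():
        theoretical = max(ev["enabled_by_end"],
                          resource_free.get(ev["resource"],
                                            ev["enabled_by_end"]))
        delays.append((ev["start"] - theoretical).total_seconds())
        resource_free[ev["resource"]] = ev["end"]
    return pd.Series(delays, index=log.index).clip(lower=0.0)
```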
Furthermore, we investigate the\napplication of Sequence Shortening to the cached representations. We test three\npooling-based shortening techniques and introduce two novel methods - Latent\nGrouping and Latent Selecting, where the network learns to group tokens or\nselect the tokens to be cached as context. Our experiments show that the two\nmethods achieve BLEU and COMET scores and accuracies on the contrastive\ndatasets competitive with the other tested methods, while potentially allowing\nfor higher interpretability and reducing the growth of memory requirements with\nincreased context size.\n","authors":["Paweł Mąka","Yusuf Can Semerci","Jan Scholtes","Gerasimos Spanakis"],"pdf_url":"https://arxiv.org/pdf/2402.01416v1.pdf","comment":"Findings of the ACL: EACL 2024"},{"id":"http://arxiv.org/abs/2305.15927v3","updated":"2024-02-02T13:53:56Z","published":"2023-05-25T10:54:36Z","title":"Learning Directed Graphical Models with Optimal Transport","summary":" Estimating the parameters of a probabilistic directed graphical model from\nincomplete data remains a long-standing challenge. This is because, in the\npresence of latent variables, both the likelihood function and posterior\ndistribution are intractable without further assumptions about structural\ndependencies or model classes. While existing learning methods are\nfundamentally based on likelihood maximization, here we offer a new view of the\nparameter learning problem through the lens of optimal transport. This\nperspective licenses a general framework that operates on any directed graphs\nwithout making unrealistic assumptions on the posterior over the latent\nvariables or resorting to black-box variational approximations. We develop a\ntheoretical framework and support it with extensive empirical evidence\ndemonstrating the flexibility and versatility of our approach. Across\nexperiments, we show that not only can our method recover the ground-truth\nparameters but it also performs comparably or better on downstream\napplications, notably the non-trivial task of discrete representation learning.\n","authors":["Vy Vo","Trung Le","Long-Tung Vuong","He Zhao","Edwin Bonilla","Dinh Phung"],"pdf_url":"https://arxiv.org/pdf/2305.15927v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01415v1","updated":"2024-02-02T13:53:29Z","published":"2024-02-02T13:53:29Z","title":"SMLP: Symbolic Machine Learning Prover","summary":" Symbolic Machine Learning Prover (SMLP) is a tool and a library for system\nexploration based on data samples obtained by simulating or executing the\nsystem on a number of input vectors. SMLP aims at exploring the system based on\nthis data by taking a grey-box approach: SMLP combines statistical methods of\ndata exploration with building and exploring machine learning models in close\nfeedback loop with the system's response, and exploring these models by\ncombining probabilistic and formal methods. SMLP has been applied in an\nindustrial setting at Intel for analyzing and optimizing hardware designs at\nthe analog level. SMLP is a general purpose tool and can be applied to systems\nthat can be sampled and modeled by machine learning models.\n","authors":["Franz Brauße","Zurab Khasidashvili","Konstantin Korovin"],"pdf_url":"https://arxiv.org/pdf/2402.01415v1.pdf","comment":"12 pages, 4 figures. 
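A sketch of the pooling-based sequence shortening discussed above, compressing a cached context representation before reuse; the learned Latent Grouping / Latent Selecting variants would replace this fixed pooling with a trained mechanism.

```python
import torch

def shorten_latents(h: torch.Tensor, factor: int = 4, mode: str = "mean"):
    """Compress a cached context representation h of shape (seq, dim)
    by a fixed factor before it is reused as context, trading context
    length for memory.
    """
    seq, dim = h.shape
    pad = (-seq) % factor
    if pad:                      # right-pad so seq divides evenly
        h = torch.cat([h, h.new_zeros(pad, dim)])
    h = h.view(-1, factor, dim)  # (seq/factor, factor, dim)
    return h.mean(dim=1) if mode == "mean" else h.max(dim=1).values
```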
(submitted)"},{"id":"http://arxiv.org/abs/2310.11085v2","updated":"2024-02-02T13:50:42Z","published":"2023-10-17T09:10:27Z","title":"Document-Level In-Context Few-Shot Relation Extraction via Pre-Trained\n Language Models","summary":" Relation extraction aims at inferring structured human knowledge from textual\ndocuments. State-of-the-art methods based on language models commonly have two\nlimitations: (1) they require named entities either to be given as input or to\nbe inferred, which introduces additional noise, and (2) they require human\nannotations of documents. As a remedy, we present a novel framework for\ndocument-level in-context few-shot relation extraction via pre-trained language\nmodels. We achieve crucial benefits in that we eliminate the need for both\nnamed entity recognition and human annotation of documents. Unlike existing\nmethods based on fine-tuning, our framework is flexible in that it can be\neasily updated for a new set of relations without re-training. We evaluate our\nframework using DocRED, the largest publicly available dataset for\ndocument-level relation extraction, and demonstrate that our framework achieves\nstate-of-the-art performance. Finally, we show that our framework actually\nperforms much better than the original labels from the development set of\nDocRED. To the best of our knowledge, we are the first to reformulate the\ndocument-level relation extraction task as a tailored in-context few-shot\nlearning paradigm.\n","authors":["Yilmazcan Ozyurt","Stefan Feuerriegel","Ce Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.11085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01413v1","updated":"2024-02-02T13:45:42Z","published":"2024-02-02T13:45:42Z","title":"Objective and subjective evaluation of speech enhancement methods in the\n UDASE task of the 7th CHiME challenge","summary":" Supervised models for speech enhancement are trained using artificially\ngenerated mixtures of clean speech and noise signals. However, the synthetic\ntraining conditions may not accurately reflect real-world conditions\nencountered during testing. This discrepancy can result in poor performance\nwhen the test domain significantly differs from the synthetic training domain.\nTo tackle this issue, the UDASE task of the 7th CHiME challenge aimed to\nleverage real-world noisy speech recordings from the test domain for\nunsupervised domain adaptation of speech enhancement models. Specifically, this\ntest domain corresponds to the CHiME-5 dataset, characterized by real\nmulti-speaker and conversational speech recordings made in noisy and\nreverberant domestic environments, for which ground-truth clean speech signals\nare not available. In this paper, we present the objective and subjective\nevaluations of the systems that were submitted to the CHiME-7 UDASE task, and\nwe provide an analysis of the results. This analysis reveals a limited\ncorrelation between subjective ratings and several supervised nonintrusive\nperformance metrics recently proposed for speech enhancement. Conversely, the\nresults suggest that more traditional intrusive objective metrics can be used\nfor in-domain performance evaluation using the reverberant LibriCHiME-5 dataset\ndeveloped for the challenge. The subjective evaluation indicates that all\nsystems successfully reduced the background noise, but always at the expense of\nincreased distortion. 
Out of the four speech enhancement methods evaluated\nsubjectively, only one demonstrated an improvement in overall quality compared\nto the unprocessed noisy speech, highlighting the difficulty of the task. The\ntools and audio material created for the CHiME-7 UDASE task are shared with the\ncommunity.\n","authors":["Simon Leglaive","Matthieu Fraticelli","Hend ElGhazaly","Léonie Borne","Mostafa Sadeghi","Scott Wisdom","Manuel Pariente","John R. Hershey","Daniel Pressnitzer","Jon P. Barker"],"pdf_url":"https://arxiv.org/pdf/2402.01413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01412v1","updated":"2024-02-02T13:44:47Z","published":"2024-02-02T13:44:47Z","title":"Bass Accompaniment Generation via Latent Diffusion","summary":" The ability to automatically generate music that appropriately matches an\narbitrary input track is a challenging task. We present a novel controllable\nsystem for generating single stems to accompany musical mixes of arbitrary\nlength. At the core of our method are audio autoencoders that efficiently\ncompress audio waveform samples into invertible latent representations, and a\nconditional latent diffusion model that takes as input the latent encoding of a\nmix and generates the latent encoding of a corresponding stem. To provide\ncontrol over the timbre of generated samples, we introduce a technique to\nground the latent space to a user-provided reference style during diffusion\nsampling. For further improving audio quality, we adapt classifier-free\nguidance to avoid distortions at high guidance strengths when generating an\nunbounded latent space. We train our model on a dataset of pairs of mixes and\nmatching bass stems. Quantitative experiments demonstrate that, given an input\nmix, the proposed system can generate basslines with user-specified timbres.\nOur controllable conditional audio generation framework represents a\nsignificant step forward in creating generative AI tools to assist musicians in\nmusic production.\n","authors":["Marco Pasini","Maarten Grachten","Stefan Lattner"],"pdf_url":"https://arxiv.org/pdf/2402.01412v1.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.01410v1","updated":"2024-02-02T13:42:45Z","published":"2024-02-02T13:42:45Z","title":"XAI for Skin Cancer Detection with Prototypes and Non-Expert Supervision","summary":" Skin cancer detection through dermoscopy image analysis is a critical task.\nHowever, existing models used for this purpose often lack interpretability and\nreliability, raising the concern of physicians due to their black-box nature.\nIn this paper, we propose a novel approach for the diagnosis of melanoma using\nan interpretable prototypical-part model. We introduce a guided supervision\nbased on non-expert feedback through the incorporation of: 1) binary masks,\nobtained automatically using a segmentation network; and 2) user-refined\nprototypes. These two distinct information pathways aim to ensure that the\nlearned prototypes correspond to relevant areas within the skin lesion,\nexcluding confounding factors beyond its boundaries. 
Experimental results\ndemonstrate that, even without expert supervision, our approach achieves\nsuperior performance and generalization compared to non-interpretable models.\n","authors":["Miguel Correia","Alceu Bissoto","Carlos Santiago","Catarina Barata"],"pdf_url":"https://arxiv.org/pdf/2402.01410v1.pdf","comment":"Accepted in the iMIMIC Workshop @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2402.01408v1","updated":"2024-02-02T13:42:12Z","published":"2024-02-02T13:42:12Z","title":"Climbing the Ladder of Interpretability with Counterfactual Concept\n Bottleneck Models","summary":" Current deep learning models are not designed to simultaneously address three\nfundamental questions: predict class labels to solve a given classification\ntask (the \"What?\"), explain task predictions (the \"Why?\"), and imagine\nalternative scenarios that could result in different predictions (the \"What\nif?\"). The inability to answer these questions represents a crucial gap in\ndeploying reliable AI agents, calibrating human trust, and deepening\nhuman-machine interaction. To bridge this gap, we introduce CounterFactual\nConcept Bottleneck Models (CF-CBMs), a class of models designed to efficiently\naddress the above queries all at once without the need to run post-hoc\nsearches. Our results show that CF-CBMs produce: accurate predictions (the\n\"What?\"), simple explanations for task predictions (the \"Why?\"), and\ninterpretable counterfactuals (the \"What if?\"). CF-CBMs can also sample or\nestimate the most probable counterfactual to: (i) explain the effect of concept\ninterventions on tasks, (ii) show users how to get a desired class label, and\n(iii) propose concept interventions via \"task-driven\" interventions.\n","authors":["Gabriele Dominici","Pietro Barbiero","Francesco Giannini","Martin Gjoreski","Giuseppe Marra","Marc Langheinrich"],"pdf_url":"https://arxiv.org/pdf/2402.01408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01401v1","updated":"2024-02-02T13:33:30Z","published":"2024-02-02T13:33:30Z","title":"Zero-Shot Machine Unlearning at Scale via Lipschitz Regularization","summary":" To comply with AI and data regulations, the need to forget private or\ncopyrighted information from trained machine learning models is increasingly\nimportant. The key challenge in unlearning is forgetting the necessary data in\na timely manner, while preserving model performance. In this work, we address\nthe zero-shot unlearning scenario, whereby an unlearning algorithm must be able\nto remove data given only a trained model and the data to be forgotten. Under\nsuch a definition, existing state-of-the-art methods are insufficient. Building\non the concepts of Lipschitz continuity, we present a method that induces\nsmoothing of the forget sample's output, with respect to perturbations of that\nsample. We show this smoothing successfully results in forgetting while\npreserving general model performance. 
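The smoothing objective for zero-shot unlearning described above can be sketched as a local-Lipschitz penalty at the forget samples. This is an illustrative surrogate under assumed hyperparameters, not the paper's exact loss.

```python
import torch

def smoothing_forget_loss(model, x_forget, sigma=0.05, n_noise=8):
    """Local-Lipschitz surrogate at a forget sample: penalize how much
    the model's output moves per unit of input perturbation, flattening
    the output around that sample.
    """
    out = model(x_forget)
    loss = x_forget.new_zeros(())
    for _ in range(n_noise):
        noise = sigma * torch.randn_like(x_forget)
        # ratio of output change to input change around x_forget
        lip = (model(x_forget + noise) - out).norm() / noise.norm()
        loss = loss + lip
    return loss / n_noise  # minimized alongside a utility-preservation term
```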
We perform extensive empirical evaluation\nof our method over a range of contemporary benchmarks, verifying that our\nmethod achieves state-of-the-art performance under the strict constraints of\nzero-shot unlearning.\n","authors":["Jack Foster","Kyle Fogarty","Stefan Schoepf","Cengiz Öztireli","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2402.01401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01400v1","updated":"2024-02-02T13:31:24Z","published":"2024-02-02T13:31:24Z","title":"Query-Efficient Correlation Clustering with Noisy Oracle","summary":" We study a general clustering setting in which we have $n$ elements to be\nclustered, and we aim to perform as few queries as possible to an oracle that\nreturns a noisy sample of the similarity between two elements. Our setting\nencompasses many application domains in which the similarity function is costly\nto compute and inherently noisy. We propose two novel formulations of online\nlearning problems rooted in the paradigm of Pure Exploration in Combinatorial\nMulti-Armed Bandits (PE-CMAB): fixed confidence and fixed budget settings. For\nboth settings, we design algorithms that combine a sampling strategy with a\nclassic approximation algorithm for correlation clustering and study their\ntheoretical guarantees. Our results are the first examples of polynomial-time\nalgorithms that work for the case of PE-CMAB in which the underlying offline\noptimization problem is NP-hard.\n","authors":["Yuko Kuroki","Atsushi Miyauchi","Francesco Bonchi","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2402.01400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01399v1","updated":"2024-02-02T13:31:17Z","published":"2024-02-02T13:31:17Z","title":"A Probabilistic Model to explain Self-Supervised Representation Learning","summary":" Self-supervised learning (SSL) learns representations by leveraging an\nauxiliary unsupervised task, such as classifying semantically related samples,\ne.g. different data augmentations or modalities. Of the many approaches to SSL,\ncontrastive methods, e.g. SimCLR, CLIP and VicREG, have gained attention for\nlearning representations that achieve downstream performance close to that of\nsupervised learning. However, a theoretical understanding of the mechanism\nbehind these methods remains elusive. We propose a generative latent variable\nmodel for the data and show that several families of discriminative\nself-supervised algorithms, including contrastive methods, approximately induce\nits latent structure over representations, providing a unifying theoretical\nframework. We also justify links to mutual information and the use of a\nprojection head. Fitting our model generatively, as SimVE, improves performance\nover previous VAE methods on common benchmarks (e.g. FashionMNIST, CIFAR10,\nCelebA), narrows the gap to discriminative methods on _content_ classification\nand, as our analysis predicts, outperforms them where _style_ information is\nrequired, taking a step toward task-agnostic representations.\n","authors":["Alice Bizeul","Bernhard Schölkopf","Carl Allen"],"pdf_url":"https://arxiv.org/pdf/2402.01399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12565v2","updated":"2024-02-02T13:27:53Z","published":"2023-02-24T10:32:30Z","title":"Variational Linearized Laplace Approximation for Bayesian Deep Learning","summary":" The Linearized Laplace Approximation (LLA) has been recently used to perform\nuncertainty estimation on the predictions of pre-trained deep neural networks\n(DNNs). 
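As a sketch of how a sampling strategy can be paired with a classic correlation-clustering approximation (here KwikCluster) in the spirit of the PE-CMAB formulation above; the fixed per-pair sample budget is a simplification of the adaptive strategies designed in the paper.

```python
import numpy as np

def noisy_kwikcluster(n, query, m=5, seed=None):
    """KwikCluster over a noisy similarity oracle (illustrative sketch).

    query(i, j) returns a noisy +1/-1 similarity sample; we average m
    samples per pair before deciding whether j joins the pivot's cluster.
    """
    rng = np.random.default_rng(seed)
    unclustered = list(range(n))
    clusters = []
    while unclustered:
        # pick a random pivot, then group elements estimated similar to it
        pivot = unclustered.pop(rng.integers(len(unclustered)))
        cluster, rest = [pivot], []
        for j in unclustered:
            est = np.mean([query(pivot, j) for _ in range(m)])
            (cluster if est > 0 else rest).append(j)
        clusters.append(cluster)
        unclustered = rest
    return clusters
```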
However, its widespread application is hindered by significant\ncomputational costs, particularly in scenarios with a large number of training\npoints or DNN parameters. Consequently, additional approximations of LLA, such\nas Kronecker-factored or diagonal approximate GGN matrices, are utilized,\npotentially compromising the model's performance. To address these challenges,\nwe propose a new method for approximating LLA using a variational sparse\nGaussian Process (GP). Our method is based on the dual RKHS formulation of GPs\nand retains as the predictive mean the output of the original DNN. Furthermore,\nit allows for efficient stochastic optimization, which results in sub-linear\ntraining time in the size of the training dataset. Specifically, its training\ncost is independent of the number of training points. We compare our proposed\nmethod against accelerated LLA (ELLA), which relies on the Nystr\\\"om\napproximation, as well as other LLA variants employing the sample-then-optimize\nprinciple. Experimental results, both on regression and classification\ndatasets, show that our method outperforms these already existing efficient\nvariants of LLA, both in terms of the quality of the predictive distribution\nand in terms of total computational time.\n","authors":["Luis A. Ortega","Simón Rodríguez Santana","Daniel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2302.12565v2.pdf","comment":"Pre-print, under revision"},{"id":"http://arxiv.org/abs/2305.15577v2","updated":"2024-02-02T13:26:27Z","published":"2023-05-24T21:23:58Z","title":"Minimizing $f$-Divergences by Interpolating Velocity Fields","summary":" Many machine learning problems can be formulated as approximating a target\ndistribution using a particle distribution by minimizing a statistical\ndiscrepancy. Wasserstein Gradient Flow can be employed to move particles along\na path that minimizes the $f$-divergence between the \\textit{target} and\n\\textit{particle} distributions. To perform such movements we need to calculate\nthe corresponding velocity fields which include a density ratio function\nbetween these two distributions. While previous works estimated the density\nratio function first and then differentiated the estimated ratio, this approach\nmay suffer from overfitting, which leads to a less accurate estimate. Inspired\nby non-parametric curve fitting, we directly estimate these velocity fields\nusing interpolation. We prove that our method is asymptotically consistent\nunder mild conditions. We validate the effectiveness using novel applications\non domain adaptation and missing data imputation.\n","authors":["Song Liu","Jiahao Yu","Jack Simons","Mingxuan Yi","Mark Beaumont"],"pdf_url":"https://arxiv.org/pdf/2305.15577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01393v1","updated":"2024-02-02T13:17:19Z","published":"2024-02-02T13:17:19Z","title":"ALERT-Transformer: Bridging Asynchronous and Synchronous Machine\n Learning for Real-Time Event-based Spatio-Temporal Data","summary":" We seek to enable classic processing of continuous ultra-sparse\nspatiotemporal data generated by event-based sensors with dense machine\nlearning models. 
We propose a novel hybrid pipeline composed of asynchronous\nsensing and synchronous processing that combines several ideas: (1) an\nembedding based on PointNet models -- the ALERT module -- that can continuously\nintegrate new and dismiss old events thanks to a leakage mechanism, (2) a\nflexible readout of the embedded data that allows feeding any downstream model\nwith always up-to-date features at any sampling rate, (3) exploiting the input\nsparsity in a patch-based approach inspired by Vision Transformer to optimize\nthe efficiency of the method. These embeddings are then processed by a\ntransformer model trained for object and gesture recognition. Using this\napproach, we achieve state-of-the-art performance with lower latency than\ncompetitors. We also demonstrate that our asynchronous model can operate\nat any desired sampling rate.\n","authors":["Carmen Martin-Turrero","Maxence Bouvier","Manuel Breitenstein","Pietro Zanuttigh","Vincent Parret"],"pdf_url":"https://arxiv.org/pdf/2402.01393v1.pdf","comment":"Preprint version. 8 pages, 7 figures, under review"},{"id":"http://arxiv.org/abs/2402.01382v1","updated":"2024-02-02T13:06:33Z","published":"2024-02-02T13:06:33Z","title":"Emergence of heavy tails in homogenized stochastic gradient descent","summary":" It has repeatedly been observed that loss minimization by stochastic gradient\ndescent (SGD) leads to heavy-tailed distributions of neural network parameters.\nHere, we analyze a continuous diffusion approximation of SGD, called\nhomogenized stochastic gradient descent, show that it behaves asymptotically\nheavy-tailed, and give explicit upper and lower bounds on its tail-index. We\nvalidate these bounds in numerical experiments and show that they are typically\nclose approximations to the empirical tail-index of SGD iterates. In addition,\ntheir explicit form enables us to quantify the interplay between optimization\nparameters and the tail-index. Doing so, we contribute to the ongoing\ndiscussion on links between heavy tails and the generalization performance of\nneural networks as well as the ability of SGD to avoid suboptimal local minima.\n","authors":["Zhe Jiao","Martin Keller-Ressel"],"pdf_url":"https://arxiv.org/pdf/2402.01382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01379v1","updated":"2024-02-02T13:03:15Z","published":"2024-02-02T13:03:15Z","title":"Regularized boosting with an increasing coefficient magnitude stop\n criterion as meta-learner in hyperparameter optimization stacking ensemble","summary":" In Hyperparameter Optimization (HPO), only the best-performing hyperparameter\nconfiguration is chosen after performing several trials, discarding the effort\nof training a model for every hyperparameter configuration trial rather than\nforming an ensemble of them all. Such an ensemble may consist of simply\naveraging the model predictions or weighting the models by a certain\nprobability. Recently, other more sophisticated ensemble strategies, such as\nthe Caruana method or the stacking strategy, have been proposed. On the one\nhand, the Caruana method performs well in HPO ensembles, since it is not\naffected by the effects of multicollinearity, which is prevalent in HPO. It\njust computes the average over a subset of predictions with replacement. But it\ndoes not benefit from the generalization power of a learning process. On the\nother hand, stacking methods include a learning procedure since a meta-learner\nis required to perform the ensemble. 
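To make the stacking setup concrete, here is a minimal sketch with boosting as the stacking meta-learner, the choice explored in what follows; the base models are hypothetical stand-ins for trained HPO trial configurations.

```python
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

# Stand-ins for models from individual HPO trials; in a real HPO
# ensemble, each base estimator would be one trialled configuration.
X, y = make_regression(n_samples=500, n_features=10, random_state=0)
trials = [("ridge_a1", Ridge(alpha=1.0)),
          ("ridge_a10", Ridge(alpha=10.0)),
          ("svr", SVR(C=1.0))]

# Boosting as the stacking meta-learner: its stagewise fitting is what
# makes it robust to the multicollinearity among trial predictions.
stack = StackingRegressor(estimators=trials,
                          final_estimator=GradientBoostingRegressor())
stack.fit(X, y)
```

Note that scikit-learn refits the base estimators on cross-validated folds here; an HPO-time ensemble would instead reuse the already-trained trial models and fit only the meta-learner on their held-out predictions.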
Yet, one hardly finds advice about which\nmeta-learner is adequate. Besides, some meta-learners may suffer from the\neffects of multicollinearity or need to be tuned to reduce them. This paper\nexplores meta-learners for stacking ensemble in HPO, free of hyperparameter\ntuning, able to reduce the effects of multicollinearity and considering the\ngeneralization power of the ensemble learning process. In this respect, the\nboosting strategy seems promising as a stacking meta-learner. In fact, it\ncompletely removes the effects of multicollinearity. This paper also proposes\nan implicit regularization in the classical boosting method and a novel\nnon-parametric stop criterion suitable only for boosting and specifically\ndesigned for HPO. The synergy between these two improvements over boosting\nexhibits competitive and promising predictive performance compared to other\nexisting meta-learners and ensemble approaches for HPO other than the stacking\nensemble.\n","authors":["Laura Fdez-Díaz","José Ramón Quevedo","Elena Montañés"],"pdf_url":"https://arxiv.org/pdf/2402.01379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01376v1","updated":"2024-02-02T13:00:38Z","published":"2024-02-02T13:00:38Z","title":"LoTR: Low Tensor Rank Weight Adaptation","summary":" In this paper we generalize and extend an idea of low-rank adaptation (LoRA)\nof large language models (LLMs) based on Transformer architecture. Widely used\nLoRA-like methods of fine-tuning LLMs are based on matrix factorization of the\ngradient update. We introduce LoTR, a novel approach for parameter-efficient\nfine-tuning of LLMs which represents a gradient update to parameters in a form\nof tensor decomposition. The low-rank adapter for each layer is constructed as\na product of three matrices, and tensor structure arises from sharing left and\nright multipliers of this product among layers. Simultaneous compression of a\nsequence of layers with low-rank tensor representation allows LoTR to achieve\neven better parameter efficiency than LoRA, especially for deep models.\nMoreover, the core tensor does not depend on the original weight dimension and\ncan be made arbitrarily small, which allows for extremely cheap and fast\ndownstream fine-tuning.\n","authors":["Daniel Bershatsky","Daria Cherniuk","Talgat Daulbaev","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2402.01376v1.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2307.04870v5","updated":"2024-02-02T12:59:24Z","published":"2023-07-10T19:34:41Z","title":"RACH-Space: Reconstructing Adaptive Convex Hull Space with Applications\n in Weak Supervision","summary":" We introduce RACH-Space, an algorithm for labelling unlabelled data in weakly\nsupervised learning, given incomplete, noisy information about the labels.\nRACH-Space offers simplicity in implementation without requiring hard\nassumptions on data or the sources of weak supervision, and is well suited for\npractical applications where fully labelled data is not available. Our method\nis built upon a geometrical interpretation of the space spanned by the set of\nweak signals. We also analyze the theoretical properties underlying the\nrelationship between the convex hulls in this space and the accuracy of our\noutput labels, bridging geometry with machine learning. 
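For the LoTR abstract above, a sketch of the described parameter structure (variable names are ours, not the authors' code): each layer's weight update is U @ C_l @ V, with the left/right factors U, V shared across layers and only a small r x r core stored per layer.

```python
import numpy as np

d, r, n_layers = 768, 8, 12
rng = np.random.default_rng(0)
U = rng.normal(scale=0.02, size=(d, r))              # shared left factor
V = rng.normal(scale=0.02, size=(r, d))              # shared right factor
cores = [np.zeros((r, r)) for _ in range(n_layers)]  # per-layer cores

def delta_w(layer_idx):
    # rank-r update applied to this layer's frozen pretrained weight
    return U @ cores[layer_idx] @ V

lora_params = n_layers * 2 * d * r            # LoRA: separate A, B per layer
lotr_params = 2 * d * r + n_layers * r * r    # shared factors + small cores
print(lora_params, lotr_params)               # 147456 vs. 13056 here
```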
Empirical results\ndemonstrate that RACH-Space works well in practice and compares favorably to\nthe best existing label models for weakly supervised learning.\n","authors":["Woojoo Na","Abiy Tasissa"],"pdf_url":"https://arxiv.org/pdf/2307.04870v5.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2402.01371v1","updated":"2024-02-02T12:48:49Z","published":"2024-02-02T12:48:49Z","title":"Critic-Actor for Average Reward MDPs with Function Approximation: A\n Finite-Time Analysis","summary":" In recent years, there has been a great deal of research activity focused on\ncarrying out asymptotic and non-asymptotic convergence analyses for\ntwo-timescale actor-critic algorithms where the actor updates are performed on\na timescale that is slower than that of the critic. In a recent work, the\ncritic-actor algorithm was presented for the infinite horizon discounted cost\nsetting in the look-up table case, where the timescales of the actor and the\ncritic are reversed, along with an asymptotic convergence analysis. In our\nwork, we present the first critic-actor algorithm with function approximation\nin the long-run average reward setting, together with the first finite-time\n(non-asymptotic) analysis of such a scheme. We obtain optimal learning rates\nand prove that our algorithm achieves a sample complexity of\n$\\mathcal{\\tilde{O}}(\\epsilon^{-2.08})$ for the mean squared error of the\ncritic to be upper bounded by $\\epsilon$ which is better than the one obtained\nfor actor-critic in a similar setting. We also show the results of numerical\nexperiments on three benchmark settings and observe that the critic-actor\nalgorithm competes well with the actor-critic algorithm.\n","authors":["Prashansa Panda","Shalabh Bhatnagar"],"pdf_url":"https://arxiv.org/pdf/2402.01371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01369v1","updated":"2024-02-02T12:39:49Z","published":"2024-02-02T12:39:49Z","title":"Cheating Suffix: Targeted Attack to Text-To-Image Diffusion Models with\n Multi-Modal Priors","summary":" Diffusion models have been widely deployed in various image generation tasks,\ndemonstrating an extraordinary connection between image and text modalities.\nHowever, they face challenges of being maliciously exploited to generate\nharmful or sensitive images by appending a specific suffix to the original\nprompt. Existing works mainly focus on using single-modal information to\nconduct attacks, which fails to utilize multi-modal features and results in\nless than satisfactory performance. Integrating multi-modal priors (MMP), i.e.\nboth text and image features, we propose a targeted attack method named\nMMP-Attack in this work. Specifically, the goal of MMP-Attack is to add a\ntarget object into the image content while simultaneously removing the original\nobject. The MMP-Attack shows a notable advantage over existing works with\nsuperior universality and transferability, which can effectively attack\ncommercial text-to-image (T2I) models such as DALL-E 3. To the best of our\nknowledge, this marks the first successful attempt at a transfer-based attack\non commercial T2I models. 
Our code is publicly available at\n\\url{https://github.com/ydc123/MMP-Attack}.\n","authors":["Dingcheng Yang","Yang Bai","Xiaojun Jia","Yang Liu","Xiaochun Cao","Wenjian Yu"],"pdf_url":"https://arxiv.org/pdf/2402.01369v1.pdf","comment":"10 figures"},{"id":"http://arxiv.org/abs/2311.13870v2","updated":"2024-02-02T12:37:02Z","published":"2023-11-23T09:27:08Z","title":"Multi-intention Inverse Q-learning for Interpretable Behavior\n Representation","summary":" In advancing the understanding of decision-making processes, Inverse\nReinforcement Learning (IRL) has proven instrumental in reconstructing animals'\nmultiple intentions amidst complex behaviors. Given the recent development of a\ncontinuous-time multi-intention IRL framework, there has been persistent\ninquiry into inferring discrete time-varying rewards with IRL. To tackle the\nchallenge, we introduce Latent (Markov) Variable Inverse Q-learning\n(L(M)V-IQL), a novel class of IRL algorithms tailored for accommodating\ndiscrete intrinsic reward functions. Leveraging an Expectation-Maximization\napproach, we cluster observed expert trajectories into distinct intentions and\nindependently solve the IRL problem for each. Demonstrating the efficacy of\nL(M)V-IQL through simulated experiments and its application to different real\nmouse behavior datasets, our approach surpasses current benchmarks in animal\nbehavior prediction, producing interpretable reward functions. This advancement\nholds promise for neuroscience and cognitive science, contributing to a deeper\nunderstanding of decision-making and uncovering underlying brain mechanisms.\n","authors":["Hao Zhu","Brice De La Crompe","Gabriel Kalweit","Artur Schneider","Maria Kalweit","Ilka Diester","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2311.13870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01364v1","updated":"2024-02-02T12:34:09Z","published":"2024-02-02T12:34:09Z","title":"Continual Learning for Large Language Models: A Survey","summary":" Large language models (LLMs) are not amenable to frequent re-training, due to\nhigh training costs arising from their massive scale. However, updates are\nnecessary to endow LLMs with new skills and keep them up-to-date with rapidly\nevolving human knowledge. This paper surveys recent works on continual learning\nfor LLMs. Due to the unique nature of LLMs, we catalog continual learning\ntechniques in a novel multi-staged categorization scheme, involving continual\npretraining, instruction tuning, and alignment. We contrast continual learning\nfor LLMs with simpler adaptation methods used in smaller models, as well as\nwith other enhancement strategies like retrieval-augmented generation and model\nediting. Moreover, informed by a discussion of benchmarks and evaluation, we\nidentify several challenges and future work directions for this crucial task.\n","authors":["Tongtong Wu","Linhao Luo","Yuan-Fang Li","Shirui Pan","Thuy-Trang Vu","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2402.01364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01361v1","updated":"2024-02-02T12:29:18Z","published":"2024-02-02T12:29:18Z","title":"To the Max: Reinventing Reward in Reinforcement Learning","summary":" In reinforcement learning (RL), different rewards can define the same optimal\npolicy but result in drastically different learning performance. For some, the\nagent gets stuck with a suboptimal behavior, and for others, it solves the task\nefficiently. 
Choosing a good reward function is hence an extremely important\nyet challenging problem. In this paper, we explore an alternative approach to\nusing rewards for learning. We introduce max-reward RL, where an agent\noptimizes the maximum rather than the cumulative reward. Unlike earlier works,\nour approach works for deterministic and stochastic environments and can be\neasily combined with state-of-the-art RL algorithms. In the experiments, we\nstudy the performance of max-reward RL algorithms in two goal-reaching\nenvironments from Gymnasium-Robotics and demonstrate its benefits over standard\nRL. The code is publicly available.\n","authors":["Grigorii Veviurko","Wendelin Böhmer","Mathijs de Weerdt"],"pdf_url":"https://arxiv.org/pdf/2402.01361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10249v2","updated":"2024-02-02T12:29:07Z","published":"2022-12-20T13:54:04Z","title":"Learning efficient backprojections across cortical hierarchies in real\n time","summary":" Models of sensory processing and learning in the cortex need to efficiently\nassign credit to synapses in all areas. In deep learning, a known solution is\nerror backpropagation, which however requires biologically implausible weight\ntransport from feed-forward to feedback paths.\n We introduce Phaseless Alignment Learning (PAL), a bio-plausible method to\nlearn efficient feedback weights in layered cortical hierarchies. This is\nachieved by exploiting the noise naturally found in biophysical systems as an\nadditional carrier of information. In our dynamical system, all weights are\nlearned simultaneously with always-on plasticity and using only information\nlocally available to the synapses. Our method is completely phase-free (no\nforward and backward passes or phased learning) and allows for efficient error\npropagation across multi-layer cortical hierarchies, while maintaining\nbiologically plausible signal transport and learning.\n Our method is applicable to a wide class of models and improves on previously\nknown biologically plausible ways of credit assignment: compared to random\nsynaptic feedback, it can solve complex tasks with fewer neurons and learn more\nuseful latent representations. We demonstrate this on various classification\ntasks using a cortical microcircuit model with prospective coding.\n","authors":["Kevin Max","Laura Kriener","Garibaldi Pineda García","Thomas Nowotny","Ismael Jaras","Walter Senn","Mihai A. Petrovici"],"pdf_url":"https://arxiv.org/pdf/2212.10249v2.pdf","comment":"Updated with streamlined main part, CIFAR-10 simulations, including\n DFA and minor fixes"},{"id":"http://arxiv.org/abs/2402.01359v1","updated":"2024-02-02T12:27:32Z","published":"2024-02-02T12:27:32Z","title":"TESSERACT: Eliminating Experimental Bias in Malware Classification\n across Space and Time (Extended Version)","summary":" Machine learning (ML) plays a pivotal role in detecting malicious software.\nDespite the high F1-scores reported in numerous studies reaching upwards of\n0.99, the issue is not completely solved. Malware detectors often experience\nperformance decay due to constantly evolving operating systems and attack\nmethods, which can render previously learned knowledge insufficient for\naccurate decision-making on new inputs. 
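For the max-reward RL abstract above: the abstract fixes the objective (maximum instead of cumulative reward) but not the update rule, so the toy below is only one plausible tabular variant that replaces the additive Bellman backup with max(r, max_a' Q(s', a')) on a small deterministic chain.

```python
import numpy as np

n_states, n_actions = 6, 2
Q = np.zeros((n_states, n_actions))
rng = np.random.default_rng(0)

def step(s, a):                          # a = 1 moves right, a = 0 moves left
    s2 = min(s + 1, n_states - 1) if a == 1 else max(s - 1, 0)
    r = 1.0 if s2 == n_states - 1 else (0.1 if s2 == 0 else 0.0)
    return s2, r

for episode in range(500):
    s = int(rng.integers(n_states))
    for t in range(20):
        a = int(rng.integers(n_actions)) if rng.random() < 0.2 else int(Q[s].argmax())
        s2, r = step(s, a)
        target = max(r, Q[s2].max())     # max-reward backup, not r + V(s')
        Q[s, a] += 0.1 * (target - Q[s, a])
        s = s2

print(Q.argmax(axis=1))                  # greedy policy heads for the big reward
```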
This paper argues that commonly\nreported results are inflated due to two pervasive sources of experimental bias\nin the detection task: spatial bias caused by data distributions that are not\nrepresentative of a real-world deployment; and temporal bias caused by\nincorrect time splits of data, leading to unrealistic configurations. To\naddress these biases, we introduce a set of constraints for fair experiment\ndesign, and propose a new metric, AUT, for classifier robustness in real-world\nsettings. We additionally propose an algorithm designed to tune training data\nto enhance classifier performance. Finally, we present TESSERACT, an\nopen-source framework for realistic classifier comparison. Our evaluation\nencompasses both traditional ML and deep learning methods, examining published\nworks on an extensive Android dataset with 259,230 samples over a five-year\nspan. Additionally, we conduct case studies in the Windows PE and PDF domains.\nOur findings identify the existence of biases in previous studies and reveal\nthat significant performance enhancements are possible through appropriate,\nperiodic tuning. We explore how mitigation strategies may help achieve more\nstable and better performance over time by employing multiple strategies to\ndelay performance decay.\n","authors":["Zeliang Kan","Shae McFadden","Daniel Arp","Feargus Pendlebury","Roberto Jordaney","Johannes Kinder","Fabio Pierazzi","Lorenzo Cavallaro"],"pdf_url":"https://arxiv.org/pdf/2402.01359v1.pdf","comment":"35 pages, submitted to ACM ToPS, under review. arXiv admin note:\n text overlap with arXiv:1807.07838"},{"id":"http://arxiv.org/abs/2401.05015v2","updated":"2024-02-02T12:24:57Z","published":"2024-01-10T09:03:52Z","title":"An Information Theoretic Approach to Interaction-Grounded Learning","summary":" Reinforcement learning (RL) problems where the learner attempts to infer an\nunobserved reward from some feedback variables have been studied in several\nrecent papers. The setting of Interaction-Grounded Learning (IGL) is an example\nof such feedback-based RL tasks where the learner optimizes the return by\ninferring latent binary rewards from the interaction with the environment. In\nthe IGL setting, a relevant assumption used in the RL literature is that the\nfeedback variable $Y$ is conditionally independent of the context-action\n$(X,A)$ given the latent reward $R$. In this work, we propose Variational\nInformation-based IGL (VI-IGL) as an information-theoretic method to enforce\nthe conditional independence assumption in the IGL-based RL problem. The VI-IGL\nframework learns a reward decoder using an information-based objective based on\nthe conditional mutual information (MI) between $(X,A)$ and $Y$. To estimate\nand optimize the information-based terms for the continuous random variables in\nthe RL problem, VI-IGL leverages the variational representation of mutual\ninformation to obtain a min-max optimization problem. Also, we extend the\nVI-IGL framework to general $f$-Information measures leading to the generalized\n$f$-VI-IGL framework for the IGL-based RL problems. 
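For the TESSERACT abstract above: AUT is named but not defined there; to our understanding it is the trapezoidal area under a performance metric observed over N time slots, normalised to [0, 1]. A minimal sketch on a toy F1-over-time curve:

```python
def aut(f):
    # trapezoidal area under the metric curve, one point per time slot
    n = len(f)
    return sum((f[k] + f[k + 1]) / 2 for k in range(n - 1)) / (n - 1)

f1_per_month = [0.95, 0.90, 0.74, 0.61, 0.55, 0.52]   # decaying detector
print(round(aut(f1_per_month), 3))                    # well below the initial 0.95
```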
We present numerical\nresults on several reinforcement learning settings indicating an improved\nperformance compared to the existing IGL-based RL algorithm.\n","authors":["Xiaoyan Hu","Farzan Farnia","Ho-fung Leung"],"pdf_url":"https://arxiv.org/pdf/2401.05015v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16513v2","updated":"2024-02-02T12:16:12Z","published":"2023-07-31T09:27:01Z","title":"Deception Abilities Emerged in Large Language Models","summary":" Large language models (LLMs) are currently at the forefront of intertwining\nartificial intelligence (AI) systems with human communication and everyday\nlife. Thus, aligning them with human values is of great importance. However,\ngiven the steady increase in reasoning abilities, future LLMs are under\nsuspicion of becoming able to deceive human operators and utilizing this\nability to bypass monitoring efforts. As a prerequisite to this, LLMs need to\npossess a conceptual understanding of deception strategies. This study reveals\nthat such strategies emerged in state-of-the-art LLMs, such as GPT-4, but were\nnon-existent in earlier LLMs. We conduct a series of experiments showing that\nstate-of-the-art LLMs are able to understand and induce false beliefs in other\nagents, that their performance in complex deception scenarios can be amplified\nutilizing chain-of-thought reasoning, and that eliciting Machiavellianism in\nLLMs can alter their propensity to deceive. In sum, revealing hitherto unknown\nmachine behavior in LLMs, our study contributes to the nascent field of machine\npsychology.\n","authors":["Thilo Hagendorff"],"pdf_url":"https://arxiv.org/pdf/2307.16513v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01350v1","updated":"2024-02-02T12:09:20Z","published":"2024-02-02T12:09:20Z","title":"FedMoE: Data-Level Personalization with Mixture of Experts for\n Model-Heterogeneous Personalized Federated Learning","summary":" Federated learning (FL) is widely employed for collaborative training on\ndecentralized data but faces challenges like data, system, and model\nheterogeneity. This prompted the emergence of model-heterogeneous personalized\nfederated learning (MHPFL). However, concerns persist regarding data and model\nprivacy, model performance, communication, and computational costs in current\nMHPFL methods. To tackle these concerns, we propose a novel model-heterogeneous\npersonalized Federated learning algorithm (FedMoE) with the Mixture of Experts\n(MoE), renowned for enhancing large language models (LLMs). It assigns a shared\nhomogeneous small feature extractor and a local gating network for each\nclient's local heterogeneous large model. (1) During local training, the local\nheterogeneous model's feature extractor acts as a local expert for personalized\nfeature (representation) extraction, while the shared homogeneous small feature\nextractor serves as a global expert for generalized feature extraction. The\nlocal gating network produces personalized weights for extracted\nrepresentations from both experts on each data sample. The three models form a\nlocal heterogeneous MoE. The weighted mixed representation fuses global\ngeneralized and local personalized features and is processed by the local\nheterogeneous large model's header with personalized prediction information for\noutput. The MoE and prediction header are updated synchronously. (2) The\ntrained local homogeneous small feature extractors are sent to the server for\ncross-client information fusion via aggregation. 
Briefly, FedMoE first enhances\nlocal model personalization at a fine-grained data level while supporting model\nheterogeneity.\n","authors":["Liping Yi","Han Yu","Chao Ren","Heng Zhang","Gang Wang","Xiaoguang Liu","Xiaoxiao Li"],"pdf_url":"https://arxiv.org/pdf/2402.01350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16304v2","updated":"2024-02-02T12:06:21Z","published":"2023-07-30T19:14:05Z","title":"You Shall Pass: Dealing with the Zero-Gradient Problem in Predict and\n Optimize for Convex Optimization","summary":" Predict and optimize is an increasingly popular decision-making paradigm that\nemploys machine learning to predict unknown parameters of optimization\nproblems. Instead of minimizing the prediction error of the parameters, it\ntrains predictive models using task performance as a loss function. The key\nchallenge to train such models is the computation of the Jacobian of the\nsolution of the optimization problem with respect to its parameters. For linear\nproblems, this Jacobian is known to be zero or undefined; hence, approximations\nare usually employed. For non-linear convex problems, however, it is common to\nuse the exact Jacobian. This paper demonstrates that the zero-gradient problem\nappears in the non-linear case as well -- the Jacobian can have a sizeable null\nspace, thereby causing the training process to get stuck in suboptimal points.\nThrough formal proofs, this paper shows that smoothing the feasible set\nresolves this problem. Combining this insight with known techniques from the\nliterature, such as quadratic programming approximation and projection distance\nregularization, a novel method to approximate the Jacobian is derived. In\nsimulation experiments, the proposed method increases the performance in the\nnon-linear case and at least matches the existing state-of-the-art methods for\nlinear problems.\n","authors":["Grigorii Veviurko","Wendelin Böhmer","Mathijs de Weerdt"],"pdf_url":"https://arxiv.org/pdf/2307.16304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01348v1","updated":"2024-02-02T12:04:44Z","published":"2024-02-02T12:04:44Z","title":"CORE: Mitigating Catastrophic Forgetting in Continual Learning through\n Cognitive Replay","summary":" This paper introduces a novel perspective to significantly mitigate\ncatastrophic forgetting in continual learning (CL), which emphasizes models'\ncapacity to preserve existing knowledge and assimilate new information. Current\nreplay-based methods treat every task and data sample equally and thus cannot\nfully exploit the potential of the replay buffer. In response, we propose\nCOgnitive REplay (CORE), which draws inspiration from human cognitive review\nprocesses. CORE includes two key strategies: Adaptive Quantity Allocation and\nQuality-Focused Data Selection. The former adaptively modulates the replay\nbuffer allocation for each task based on its forgetting rate, while the latter\nguarantees the inclusion of representative data that best encapsulates the\ncharacteristics of each task within the buffer. Our approach achieves an\naverage accuracy of 37.95% on split-CIFAR10, surpassing the best baseline\nmethod by 6.52%. 
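For the FedMoE abstract above, a schematic of the described per-sample mixture (shapes and names are ours): a local gate weighs personalised features from the local expert against generalised features from the shared global expert before the local header predicts.

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

d_in, d_feat = 32, 16
rng = np.random.default_rng(0)
W_local = rng.normal(size=(d_in, d_feat))   # local heterogeneous expert
W_global = rng.normal(size=(d_in, d_feat))  # shared homogeneous expert
W_gate = rng.normal(size=(d_in, 2))         # local gating network

x = rng.normal(size=(4, d_in))              # a batch of client samples
g = softmax(x @ W_gate)                     # personalised weights per sample
mixed = g[:, :1] * np.tanh(x @ W_local) + g[:, 1:] * np.tanh(x @ W_global)
print(mixed.shape)                          # goes to the local prediction header
```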
Additionally, it significantly enhances the accuracy of the\npoorest-performing task by 6.30% compared to the top baseline.\n","authors":["Jianshu Zhang","Yankai Fu","Ziheng Peng","Dongyu Yao","Kun He"],"pdf_url":"https://arxiv.org/pdf/2402.01348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01345v1","updated":"2024-02-02T12:02:46Z","published":"2024-02-02T12:02:46Z","title":"Skip $\\textbackslash n$: A simple method to reduce hallucination in\n Large Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nfor multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks ('$\\textbackslash n\\textbackslash n$'),\nwhere the content before and after '$\\textbackslash n\\textbackslash n$' in the\ntraining data frequently exhibits significant semantic changes. This pattern\nleads the model to infer that the contents following '$\\textbackslash\nn\\textbackslash n$' should be obviously different from the preceding contents\nwith less hallucinatory descriptions, thereby increasing the probability of\nhallucinatory descriptions subsequent to the '$\\textbackslash n\\textbackslash\nn$'. We have validated this hypothesis on multiple publicly available LVLMs.\nBesides, we find that deliberately inserting '$\\textbackslash n\\textbackslash\nn$' into the generated description can induce more hallucinations. A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of `\\textbackslash n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2402.01344v1","updated":"2024-02-02T12:02:42Z","published":"2024-02-02T12:02:42Z","title":"Monotone, Bi-Lipschitz, and Polyak-Łojasiewicz Networks","summary":" This paper presents a new \\emph{bi-Lipschitz} invertible neural network, the\nBiLipNet, which has the ability to control both its \\emph{Lipschitzness}\n(output sensitivity to input perturbations) and \\emph{inverse Lipschitzness}\n(input distinguishability from different outputs). The main contribution is a\nnovel invertible residual layer with certified strong monotonicity and\nLipschitzness, which we compose with orthogonal layers to build bi-Lipschitz\nnetworks. The certification is based on incremental quadratic constraints,\nwhich achieves much tighter bounds compared to spectral normalization.\nMoreover, we formulate the model inverse calculation as a three-operator\nsplitting problem, for which fast algorithms are known. Based on the proposed\nbi-Lipschitz network, we introduce a new scalar-output network, the PLNet,\nwhich satisfies the Polyak-\\L{}ojasiewicz condition. It can be applied to learn\nnon-convex surrogate losses with favourable properties, e.g., a unique and\nefficiently-computable global minimum.\n","authors":["Ruigang Wang","Krishnamurthy Dvijotham","Ian R. 
Manchester"],"pdf_url":"https://arxiv.org/pdf/2402.01344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01343v1","updated":"2024-02-02T11:57:53Z","published":"2024-02-02T11:57:53Z","title":"Shapelet-based Model-agnostic Counterfactual Local Explanations for Time\n Series Classification","summary":" In this work, we propose a model-agnostic instance-based post-hoc\nexplainability method for time series classification. The proposed algorithm,\nnamely Time-CF, leverages shapelets and TimeGAN to provide counterfactual\nexplanations for arbitrary time series classifiers. We validate the proposed\nmethod on several real-world univariate time series classification tasks from\nthe UCR Time Series Archive. The results indicate that the counterfactual\ninstances generated by Time-CF when compared to state-of-the-art methods,\ndemonstrate better performance in terms of four explainability metrics:\ncloseness, sensibility, plausibility, and sparsity.\n","authors":["Qi Huang","Wei Chen","Thomas Bäck","Niki van Stein"],"pdf_url":"https://arxiv.org/pdf/2402.01343v1.pdf","comment":"The paper has been accepted by the XAI4Sci workshop of AAAI 2024"},{"id":"http://arxiv.org/abs/2402.01342v1","updated":"2024-02-02T11:57:50Z","published":"2024-02-02T11:57:50Z","title":"Training-time Neuron Alignment through Permutation Subspace for\n Improving Linear Mode Connectivity and Model Fusion","summary":" In deep learning, stochastic gradient descent often yields functionally\nsimilar yet widely scattered solutions in the weight space even under the same\ninitialization, causing barriers in the Linear Mode Connectivity (LMC)\nlandscape. Overcoming these barriers is crucial for understanding deep learning\ndynamics and enhancing model-fusion algorithms. Previous studies highlight the\nrole of permutation symmetry in reducing post-training barriers through network\npermutation. However, these post-hoc methods, demanding extra computations, are\nless effective for larger, complex models (e.g., ViT, LLM) due to numerous\npermutation matrices. Thus, in this paper, we study training-time neuron\nalignment. Our hypothesis suggests that training-time permutation subspace can\nreduce LMC barriers for free. We find that pruning at initialization supports\nthis. Beyond pruning, we introduce TNA-PFN, a simple yet lossless algorithm\nusing a partial gradient mask during training. TNA-PFN is theoretically and\nempirically validated for reducing LMC barriers. It excels in wide model fusion\napplications, especially in federated learning, two algorithms based on TNA-FPN\nthat are proposed to show its prospects even under heterogeneous datasets.\nMoreover, TNA-PFN can enhance the generalization of model soup for vision\ntransformers and ColD fusion for pretrained language models.\n","authors":["Zexi Li","Zhiqi Li","Jie Lin","Tao Shen","Tao Lin","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2402.01342v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2402.01341v1","updated":"2024-02-02T11:55:57Z","published":"2024-02-02T11:55:57Z","title":"Fundamental Properties of Causal Entropy and Information Gain","summary":" Recent developments enable the quantification of causal control given a\nstructural causal model (SCM). This has been accomplished by introducing\nquantities which encode changes in the entropy of one variable when intervening\non another. 
These measures, named causal entropy and causal information gain,\naim to address limitations in existing information theoretical approaches for\nmachine learning tasks where causality plays a crucial role. They have not yet\nbeen properly mathematically studied. Our research contributes to the formal\nunderstanding of the notions of causal entropy and causal information gain by\nestablishing and analyzing fundamental properties of these concepts, including\nbounds and chain rules. Furthermore, we elucidate the relationship between\ncausal entropy and stochastic interventions. We also propose definitions for\ncausal conditional entropy and causal conditional information gain. Overall,\nthis exploration paves the way for enhancing causal machine learning tasks\nthrough the study of recently-proposed information theoretic quantities\ngrounded in considerations about causality.\n","authors":["Francisco N. F. Q. Simoes","Mehdi Dastani","Thijs van Ommen"],"pdf_url":"https://arxiv.org/pdf/2402.01341v1.pdf","comment":"Accepted for the conference CLeaR (Causal Learning and Reasoning)\n 2024. To appear in its proceedings"},{"id":"http://arxiv.org/abs/2402.01340v1","updated":"2024-02-02T11:53:27Z","published":"2024-02-02T11:53:27Z","title":"SignSGD with Federated Defense: Harnessing Adversarial Attacks through\n Gradient Sign Decoding","summary":" Distributed learning is an effective approach to accelerate model training\nusing multiple workers. However, substantial communication delays emerge\nbetween workers and a parameter server due to massive costs associated with\ncommunicating gradients. SignSGD with majority voting (signSGD-MV) is a simple\nyet effective optimizer that reduces communication costs through one-bit\nquantization, yet the convergence rates considerably decrease as adversarial\nworkers increase. In this paper, we show that the convergence rate is invariant\nas the number of adversarial workers increases, provided that the number of\nadversarial workers is smaller than that of benign workers. The key idea\nshowing this counter-intuitive result is our novel signSGD with federated\ndefense (signSGD-FD). Unlike the traditional approaches, signSGD-FD exploits\nthe gradient information sent by adversarial workers with the proper weights,\nwhich are obtained through gradient sign decoding. Experimental results\ndemonstrate signSGD-FD achieves superior convergence rates over traditional\nalgorithms in various adversarial attack scenarios.\n","authors":["Chanho Park","Namyoon Lee"],"pdf_url":"https://arxiv.org/pdf/2402.01340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02063v2","updated":"2024-02-02T11:52:22Z","published":"2023-10-03T14:04:45Z","title":"Lessons Learned from EXMOS User Studies: A Technical Report Summarizing\n Key Takeaways from User Studies Conducted to Evaluate The EXMOS Platform","summary":" In the realm of interactive machine-learning systems, the provision of\nexplanations serves as a vital aid in the processes of debugging and enhancing\nprediction models. However, the extent to which various global model-centric\nand data-centric explanations can effectively assist domain experts in\ndetecting and resolving potential data-related issues for the purpose of model\nimprovement has remained largely unexplored. In this technical report, we\nsummarise the key findings of our two user studies. 
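For the signSGD-FD abstract above, a minimal sketch of the baseline optimizer it builds on, signSGD with majority voting: each worker transmits one bit per coordinate (the gradient sign) and the server applies the coordinate-wise majority. The paper's gradient-sign-decoding weights for adversarial workers are not reproduced here.

```python
import numpy as np

def signsgd_mv_step(w, worker_grads, lr=0.01):
    signs = np.sign(worker_grads)           # one-bit message per coordinate
    majority = np.sign(signs.sum(axis=0))   # coordinate-wise majority vote
    return w - lr * majority

rng = np.random.default_rng(0)
w = np.zeros(10)
worker_grads = rng.normal(size=(7, 10)) + 1.0   # 7 workers with a common drift
w = signsgd_mv_step(w, worker_grads)
```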
Our research involved a\ncomprehensive examination of the impact of global explanations rooted in both\ndata-centric and model-centric perspectives within systems designed to support\nhealthcare experts in optimising machine learning models through both automated\nand manual data configurations. To empirically investigate these dynamics, we\nconducted two user studies, comprising quantitative analysis involving a sample\nsize of 70 healthcare experts and qualitative assessments involving 30\nhealthcare experts. These studies were aimed at illuminating the influence of\ndifferent explanation types on three key dimensions: trust, understandability,\nand model improvement. Results show that global model-centric explanations\nalone are insufficient for effectively guiding users during the intricate\nprocess of data configuration. In contrast, data-centric explanations exhibited\ntheir potential by enhancing the understanding of system changes that occur\npost-configuration. However, a combination of both showed the highest level of\nefficacy for fostering trust, improving understandability, and facilitating\nmodel enhancement among healthcare experts. We also present essential\nimplications for developing interactive machine-learning systems driven by\nexplanations. These insights can guide the creation of more effective systems\nthat empower domain experts to harness the full potential of machine learning\n","authors":["Aditya Bhattacharya","Simone Stumpf","Lucija Gosak","Gregor Stiglic","Katrien Verbert"],"pdf_url":"https://arxiv.org/pdf/2310.02063v2.pdf","comment":"It is a technical report only. The contents are not peer-reviewed.\n Please reach out to the main author for any questions"},{"id":"http://arxiv.org/abs/2402.01338v1","updated":"2024-02-02T11:47:56Z","published":"2024-02-02T11:47:56Z","title":"Inferring the Langevin Equation with Uncertainty via Bayesian Neural\n Networks","summary":" Pervasive across diverse domains, stochastic systems exhibit fluctuations in\nprocesses ranging from molecular dynamics to climate phenomena. The Langevin\nequation has served as a common mathematical model for studying such systems,\nenabling predictions of their temporal evolution and analyses of thermodynamic\nquantities, including absorbed heat, work done on the system, and entropy\nproduction. However, inferring the Langevin equation from observed trajectories\nremains challenging, particularly for nonlinear and high-dimensional systems.\nIn this study, we present a comprehensive framework that employs Bayesian\nneural networks for inferring Langevin equations in both overdamped and\nunderdamped regimes. Our framework first provides the drift force and diffusion\nmatrix separately and then combines them to construct the Langevin equation. By\nproviding a distribution of predictions instead of a single value, our approach\nallows us to assess prediction uncertainties, which can prevent potential\nmisunderstandings and erroneous decisions about the system. 
We demonstrate the\neffectiveness of our framework in inferring Langevin equations for various\nscenarios including a neuron model and microscopic engine, highlighting its\nversatility and potential impact.\n","authors":["Youngkyoung Bae","Seungwoong Ha","Hawoong Jeong"],"pdf_url":"https://arxiv.org/pdf/2402.01338v1.pdf","comment":"30 pages, 17 figures"},{"id":"http://arxiv.org/abs/2402.01327v1","updated":"2024-02-02T11:26:18Z","published":"2024-02-02T11:26:18Z","title":"Supervised Algorithmic Fairness in Distribution Shifts: A Survey","summary":" Supervised fairness-aware machine learning under distribution shifts is an\nemerging field that addresses the challenge of maintaining equitable and\nunbiased predictions when faced with changes in data distributions from source\nto target domains. In real-world applications, machine learning models are\noften trained on a specific dataset but deployed in environments where the data\ndistribution may shift over time due to various factors. This shift can lead to\nunfair predictions, disproportionately affecting certain groups characterized\nby sensitive attributes, such as race and gender. In this survey, we provide a\nsummary of various types of distribution shifts and comprehensively investigate\nexisting methods based on these shifts, highlighting six commonly used\napproaches in the literature. Additionally, this survey lists publicly\navailable datasets and evaluation metrics for empirical studies. We further\nexplore the interconnection with related research fields, discuss the\nsignificant challenges, and identify potential directions for future studies.\n","authors":["Yujie Lin","Dong Li","Chen Zhao","Xintao Wu","Qin Tian","Minglai Shao"],"pdf_url":"https://arxiv.org/pdf/2402.01327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01306v1","updated":"2024-02-02T10:53:36Z","published":"2024-02-02T10:53:36Z","title":"KTO: Model Alignment as Prospect Theoretic Optimization","summary":" Kahneman & Tversky's $\\textit{prospect theory}$ tells us that humans perceive\nrandom variables in a biased but well-defined manner; for example, humans are\nfamously loss-averse. We show that objectives for aligning LLMs with human\nfeedback implicitly incorporate many of these biases -- the success of these\nobjectives (e.g., DPO) over cross-entropy minimization can partly be ascribed\nto them being $\\textit{human-aware loss functions}$ (HALOs). However, the\nutility functions these methods attribute to humans still differ from those in\nthe prospect theory literature. Using a Kahneman-Tversky model of human\nutility, we propose a HALO that directly maximizes the utility of generations\ninstead of maximizing the log-likelihood of preferences, as current methods do.\nWe call this approach Kahneman-Tversky Optimization (KTO), and it matches or\nexceeds the performance of preference-based methods at scales from 1B to 30B.\nCrucially, KTO does not need preferences -- only a binary signal of whether an\noutput is desirable or undesirable for a given input. 
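For the Langevin-inference abstract above: the sketch below is not the paper's Bayesian-neural-network estimator, only the forward problem for orientation -- simulate an overdamped Langevin equation with Euler-Maruyama and recover the drift as a binned conditional mean of the increments.

```python
import numpy as np

rng = np.random.default_rng(0)
dt, n, D = 1e-3, 200_000, 0.5
drift = lambda x: -4 * x * (x**2 - 1)          # double-well force
x = np.empty(n)
x[0] = 0.0
for t in range(n - 1):
    x[t + 1] = x[t] + drift(x[t]) * dt + np.sqrt(2 * D * dt) * rng.normal()

dx = (x[1:] - x[:-1]) / dt                     # noisy local drift estimates
bins = np.linspace(-1.5, 1.5, 25)
idx = np.digitize(x[:-1], bins)
for i in (5, 12, 19):                          # a few interior bins
    sel = idx == i
    if sel.any():                              # binned mean vs. true drift
        print(bins[i - 1], dx[sel].mean(), drift(bins[i - 1]))
```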
This makes it far easier\nto use in the real world, where preference data is scarce and expensive.\n","authors":["Kawin Ethayarajh","Winnie Xu","Niklas Muennighoff","Dan Jurafsky","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.01306v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2402.01302v1","updated":"2024-02-02T10:44:42Z","published":"2024-02-02T10:44:42Z","title":"A Unified Framework for Gradient-based Clustering of Distributed Data","summary":" We develop a family of distributed clustering algorithms that work over\nnetworks of users. In the proposed scenario, users hold a local dataset and\ncommunicate only with their immediate neighbours, with the aim of finding a\nclustering of the full, joint data. The proposed family, termed Distributed\nGradient Clustering (DGC-$\\mathcal{F}_\\rho$), is parametrized by $\\rho \\geq 1$,\ncontrolling the proximity of users' center estimates, with $\\mathcal{F}$\ndetermining the clustering loss. Specialized to popular clustering losses like\n$K$-means and Huber loss, DGC-$\\mathcal{F}_\\rho$ gives rise to novel\ndistributed clustering algorithms DGC-KM$_\\rho$ and DGC-HL$_\\rho$, while a\nnovel clustering loss based on the logistic function leads to DGC-LL$_\\rho$. We\nprovide a unified analysis and establish several strong results, under mild\nassumptions. First, the sequence of centers generated by the methods converges\nto a well-defined notion of fixed point, under any center initialization and\nvalue of $\\rho$. Second, as $\\rho$ increases, the family of fixed points\nproduced by DGC-$\\mathcal{F}_\\rho$ converges to a notion of consensus fixed\npoints. We show that consensus fixed points of DGC-$\\mathcal{F}_{\\rho}$ are\nequivalent to fixed points of gradient clustering over the full data,\nguaranteeing a clustering of the full data is produced. For the special case of\nBregman losses, we show that our fixed points converge to the set of Lloyd\npoints. Numerical experiments on real data confirm our theoretical findings and\ndemonstrate strong performance of the methods.\n","authors":["Aleksandar Armacki","Dragana Bajović","Dušan Jakovetić","Soummya Kar"],"pdf_url":"https://arxiv.org/pdf/2402.01302v1.pdf","comment":"35 pages, 5 figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.01297v1","updated":"2024-02-02T10:36:53Z","published":"2024-02-02T10:36:53Z","title":"Characterizing Overfitting in Kernel Ridgeless Regression Through the\n Eigenspectrum","summary":" We derive new bounds for the condition number of kernel matrices, which we\nthen use to enhance existing non-asymptotic test error bounds for kernel\nridgeless regression in the over-parameterized regime for a fixed input\ndimension. For kernels with polynomial spectral decay, we recover the bound\nfrom previous work; for exponential decay, our bound is non-trivial and novel.\n Our conclusion on overfitting is two-fold: (i) kernel regressors whose\neigenspectrum decays polynomially must generalize well, even in the presence of\nnoisy labeled training data; these models exhibit so-called tempered\noverfitting; (ii) if the eigenspectrum of any kernel ridge regressor decays\nexponentially, then it generalizes poorly, i.e., it exhibits catastrophic\noverfitting. This adds to the available characterization of kernel ridge\nregressors exhibiting benign overfitting as the extremal case where the\neigenspectrum of the kernel decays sub-polynomially. 
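For the KTO abstract above, a schematic assembled from the abstract alone -- the exact loss, its constants, and the reference point below are ours, not the paper's definition: a binary desirable/undesirable signal replaces preference pairs, and a sigmoid value function scores the policy's implied reward against a reference model.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def kto_style_value(logp_policy, logp_ref, desirable, beta=0.1, ref_point=0.0):
    reward = beta * (logp_policy - logp_ref)   # implied reward vs. reference
    if desirable:
        return sigmoid(reward - ref_point)     # push desirable outputs up
    return sigmoid(ref_point - reward)         # push undesirable outputs down

# training would maximise the mean value over (input, output, signal) triples
print(kto_style_value(-1.2, -1.5, desirable=True))
```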
Our analysis combines new\nrandom matrix theory (RMT) techniques with recent tools in the kernel ridge\nregression (KRR) literature.\n","authors":["Tin Sum Cheng","Aurelien Lucchi","Anastasis Kratsios","David Belius"],"pdf_url":"https://arxiv.org/pdf/2402.01297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01296v1","updated":"2024-02-02T10:35:05Z","published":"2024-02-02T10:35:05Z","title":"Bi-CryptoNets: Leveraging Different-Level Privacy for Encrypted\n Inference","summary":" Privacy-preserving neural networks have attracted increasing attention in\nrecent years, and various algorithms have been developed to keep the balance\nbetween accuracy, computational complexity and information security from the\ncryptographic view. This work takes a different view from the input data and\nstructure of neural networks. We decompose the input data (e.g., some images)\ninto sensitive and insensitive segments according to importance and privacy.\nThe sensitive segment includes some important and private information such as\nhuman faces, and we apply strong homomorphic encryption to ensure security,\nwhereas the insensitive one contains some background and we add perturbations.\nWe propose the bi-CryptoNets, i.e., plaintext and ciphertext branches, to deal\nwith the two segments, respectively, and the ciphertext branch can utilize\ninformation from the plaintext branch through unidirectional connections. We\nadopt knowledge distillation for our bi-CryptoNets by transferring\nrepresentations from a well-trained teacher neural network. Empirical studies\nshow the effectiveness of our bi-CryptoNets and the decrease in inference\nlatency they achieve.\n","authors":["Man-Jie Yuan","Zheng Zou","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2402.01296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01295v1","updated":"2024-02-02T10:34:13Z","published":"2024-02-02T10:34:13Z","title":"ExtremeCast: Boosting Extreme Value Prediction for Global Weather\n Forecast","summary":" Data-driven weather forecast based on machine learning (ML) has experienced\nrapid development and demonstrated superior performance in the global\nmedium-range forecast compared to traditional physics-based dynamical models.\nHowever, most of these ML models struggle with accurately predicting extreme\nweather, which is closely related to the extreme value prediction. Through\nmathematical analysis, we prove that the use of symmetric losses, such as the\nMean Squared Error (MSE), leads to biased predictions and underestimation of\nextreme values. To address this issue, we introduce Exloss, a novel loss\nfunction that performs asymmetric optimization and highlights extreme values to\nobtain accurate extreme weather forecast. 
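For the ExtremeCast abstract above: Exloss itself is not spelled out there, so the sketch shows only the generic idea of an asymmetric squared error that penalises underestimating large values more than overestimating them (weights are illustrative).

```python
import numpy as np

def asymmetric_mse(pred, target, under_weight=4.0):
    err = pred - target
    w = np.where(err < 0, under_weight, 1.0)   # underestimation costs more
    return float(np.mean(w * err**2))

pred = np.array([0.9, 2.0, 7.0])
target = np.array([1.0, 2.0, 9.5])
print(asymmetric_mse(pred, target))            # dominated by the missed extreme
```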
Furthermore, we introduce a\ntraining-free extreme value enhancement strategy named ExEnsemble, which\nincreases the variance of pixel values and improves the forecast robustness.\nCombined with an advanced global weather forecast model, extensive experiments\nshow that our solution can achieve state-of-the-art performance in extreme\nweather prediction, while maintaining the overall forecast accuracy comparable\nto the top medium-range forecast models.\n","authors":["Wanghan Xu","Kang Chen","Tao Han","Hao Chen","Wanli Ouyang","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2402.01295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01293v1","updated":"2024-02-02T10:30:05Z","published":"2024-02-02T10:30:05Z","title":"Can MLLMs Perform Text-to-Image In-Context Learning?","summary":" The evolution from Large Language Models (LLMs) to Multimodal Large Language\nModels (MLLMs) has spurred research into extending In-Context Learning (ICL) to\nits multimodal counterpart. Existing such studies have primarily concentrated\non image-to-text ICL. However, the Text-to-Image ICL (T2I-ICL), with its unique\ncharacteristics and potential applications, remains underexplored. To address\nthis gap, we formally define the task of T2I-ICL and present CoBSAT, the first\nT2I-ICL benchmark dataset, encompassing ten tasks. Utilizing our dataset to\nbenchmark six state-of-the-art MLLMs, we uncover considerable difficulties\nMLLMs encounter in solving T2I-ICL. We identify the primary challenges as the\ninherent complexity of multimodality and image generation. To overcome these\nchallenges, we explore strategies like fine-tuning and Chain-of-Thought\nprompting, demonstrating notable improvements. Our code and dataset are\navailable at \\url{https://github.com/UW-Madison-Lee-Lab/CoBSAT}.\n","authors":["Yuchen Zeng","Wonjun Kang","Yicong Chen","Hyung Il Koo","Kangwook Lee"],"pdf_url":"https://arxiv.org/pdf/2402.01293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01287v1","updated":"2024-02-02T10:23:03Z","published":"2024-02-02T10:23:03Z","title":"Spiking CenterNet: A Distillation-boosted Spiking Neural Network for\n Object Detection","summary":" In the era of AI at the edge, self-driving cars, and climate change, the need\nfor energy-efficient, small, embedded AI is growing. Spiking Neural Networks\n(SNNs) are a promising approach to address this challenge, with their\nevent-driven information flow and sparse activations. We propose Spiking\nCenterNet for object detection on event data. It combines an SNN CenterNet\nadaptation with an efficient M2U-Net-based decoder. Our model significantly\noutperforms comparable previous work on Prophesee's challenging GEN1 Automotive\nDetection Dataset while using less than half the energy. Distilling the\nknowledge of a non-spiking teacher into our SNN further increases performance.\nTo the best of our knowledge, our work is the first approach that takes\nadvantage of knowledge distillation in the field of spiking object detection.\n","authors":["Lennard Bodden","Franziska Schwaiger","Duc Bach Ha","Lars Kreuzberg","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2402.01287v1.pdf","comment":"8 pages, 5 figures. 
Submitted to WCCI-2024"},{"id":"http://arxiv.org/abs/2402.01282v1","updated":"2024-02-02T10:16:10Z","published":"2024-02-02T10:16:10Z","title":"Differentiable and accelerated wavelet transforms on the sphere and ball","summary":" Directional wavelet dictionaries are hierarchical representations which\nefficiently capture and segment information across scale, location and\norientation. Such representations demonstrate a particular affinity to physical\nsignals, which often exhibit highly anisotropic, localised multiscale\nstructure. Many physically important signals are observed over spherical\ndomains, such as the celestial sky in cosmology. Leveraging recent advances in\ncomputational harmonic analysis, we design new highly distributable and\nautomatically differentiable directional wavelet transforms on the\n$2$-dimensional sphere $\\mathbb{S}^2$ and $3$-dimensional ball $\\mathbb{B}^3 =\n\\mathbb{R}^+ \\times \\mathbb{S}^2$ (the space formed by augmenting the sphere\nwith the radial half-line). We observe up to a $300$-fold and $21800$-fold\nacceleration for signals on the sphere and ball, respectively, compared to\nexisting software, whilst maintaining 64-bit machine precision. Not only do\nthese algorithms dramatically accelerate existing spherical wavelet transforms,\nthe gradient information afforded by automatic differentiation unlocks many\ndata-driven analysis techniques previously not possible for these spaces. We\npublicly release both S2WAV and S2BALL, open-sourced JAX libraries for our\ntransforms that are automatically differentiable and readily deployable both on\nand over clusters of hardware accelerators (e.g. GPUs & TPUs).\n","authors":["Matthew A. Price","Alicja Polanska","Jessica Whitney","Jason D. McEwen"],"pdf_url":"https://arxiv.org/pdf/2402.01282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07303v3","updated":"2024-02-02T10:10:40Z","published":"2023-05-12T08:16:06Z","title":"Multi-Relational Hyperbolic Word Embeddings from Natural Language\n Definitions","summary":" Natural language definitions possess a recursive, self-explanatory semantic\nstructure that can support representation learning methods able to preserve\nexplicit conceptual relations and constraints in the latent space. This paper\npresents a multi-relational model that explicitly leverages such a structure to\nderive word embeddings from definitions. By automatically extracting the\nrelations linking defined and defining terms from dictionaries, we demonstrate\nhow the problem of learning word embeddings can be formalised via a\ntranslational framework in Hyperbolic space and used as a proxy to capture the\nglobal semantic structure of definitions. An extensive empirical analysis\ndemonstrates that the framework can help impose the desired structural\nconstraints while preserving the semantic mapping required for controllable and\ninterpretable traversal. Moreover, the experiments reveal the superiority of\nthe Hyperbolic word embeddings over the Euclidean counterparts and demonstrate\nthat the multi-relational approach can obtain competitive results when compared\nto state-of-the-art neural models, with the advantage of being intrinsically\nmore efficient and interpretable.\n","authors":["Marco Valentino","Danilo S. 
Carvalho","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2305.07303v3.pdf","comment":"Accepted at the 18th Conference of the European Chapter of the\n Association for Computational Linguistics (EACL 2024), camera-ready"},{"id":"http://arxiv.org/abs/2402.01275v1","updated":"2024-02-02T10:04:29Z","published":"2024-02-02T10:04:29Z","title":"Parametric-Task MAP-Elites","summary":" Optimizing a set of functions simultaneously by leveraging their similarity\nis called multi-task optimization. Current black-box multi-task algorithms only\nsolve a finite set of tasks, even when the tasks originate from a continuous\nspace. In this paper, we introduce Parametric-task MAP-Elites (PT-ME), a novel\nblack-box algorithm to solve continuous multi-task optimization problems. This\nalgorithm (1) solves a new task at each iteration, effectively covering the\ncontinuous space, and (2) exploits a new variation operator based on local\nlinear regression. The resulting dataset of solutions makes it possible to\ncreate a function that maps any task parameter to its optimal solution. We show\non two parametric-task toy problems and a more realistic and challenging\nrobotic problem in simulation that PT-ME outperforms all baselines, including\nthe deep reinforcement learning algorithm PPO.\n","authors":["Timothée Anne","Jean-Baptiste Mouret"],"pdf_url":"https://arxiv.org/pdf/2402.01275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01274v1","updated":"2024-02-02T10:00:51Z","published":"2024-02-02T10:00:51Z","title":"On the Transferability of Large-Scale Self-Supervision to Few-Shot Audio\n Classification","summary":" In recent years, self-supervised learning has excelled for its capacity to\nlearn robust feature representations from unlabelled data. Networks pretrained\nthrough self-supervision serve as effective feature extractors for downstream\ntasks, including Few-Shot Learning. While the evaluation of unsupervised\napproaches for few-shot learning is well-established in imagery, it is notably\nabsent in acoustics. This study addresses this gap by assessing large-scale\nself-supervised models' performance in few-shot audio classification.\nAdditionally, we explore the relationship between a model's few-shot learning\ncapability and other downstream task benchmarks. Our findings reveal\nstate-of-the-art performance in some few-shot problems such as\nSpeechCommandsv2, as well as strong correlations between speech-based few-shot\nproblems and various downstream audio tasks.\n","authors":["Calum Heggan","Sam Budgett","Timothy Hosepedales","Mehrdad Yeghoobi"],"pdf_url":"https://arxiv.org/pdf/2402.01274v1.pdf","comment":"Camera Ready version as submitted to ICASSP SASB Workshop 2024. 5\n pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2305.18453v4","updated":"2024-02-02T09:56:45Z","published":"2023-05-29T04:14:38Z","title":"Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis","summary":" Artificial intelligence (AI) in healthcare, especially in medical imaging,\nfaces challenges due to data scarcity and privacy concerns. Addressing these,\nwe introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI\nsynthesis. This model effectively tackles data scarcity and privacy issues by\nintegrating semantic conditioning. This involves the channel-wise concatenation\nof a conditioning image to the model input, enabling control in image\ngeneration. Med-DDPM demonstrates superior stability and performance compared\nto existing 3D brain imaging synthesis methods. 
It generates diverse,\nanatomically coherent images with high visual fidelity. In terms of dice score\naccuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the\n0.6531 accuracy of real images, and outperforms baseline models. Combined with\nreal images, it further increases segmentation accuracy to 0.6675, showing the\npotential of our proposed method for data augmentation. This model represents\nthe first use of a diffusion model in 3D semantic brain MRI synthesis,\nproducing high-quality images. Its semantic conditioning feature also shows\npotential for image anonymization in biomedical imaging, addressing data and\nprivacy issues. We provide the code and model weights for Med-DDPM on our\nGitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support\nreproducibility.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Sodtavilan Odonchimed","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.18453v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06683v2","updated":"2024-02-02T09:54:18Z","published":"2024-01-12T16:43:28Z","title":"DQNC2S: DQN-based Cross-stream Crisis event Summarizer","summary":" Summarizing multiple disaster-relevant data streams simultaneously is\nparticularly challenging as existing Retrieve&Re-ranking strategies suffer from\nthe inherent redundancy of multi-stream data and limited scalability in a\nmulti-query setting. This work proposes an online approach to crisis timeline\ngeneration based on weak annotation with Deep Q-Networks. It selects on-the-fly\nthe relevant pieces of text without requiring either human annotations or\ncontent re-ranking. This makes the inference time independent of the number of\ninput queries. The proposed approach also incorporates a redundancy filter into\nthe reward function to effectively handle cross-stream content overlaps. The\nachieved ROUGE and BERTScore results are superior to those of the\nbest-performing models on the CrisisFACTS 2022 benchmark.\n","authors":["Daniele Rege Cambrin","Luca Cagliero","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2401.06683v2.pdf","comment":"accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2311.02516v2","updated":"2024-02-02T09:46:20Z","published":"2023-11-04T21:46:28Z","title":"Forward $χ^2$ Divergence Based Variational Importance Sampling","summary":" Maximizing the log-likelihood is a crucial aspect of learning latent variable\nmodels, and variational inference (VI) stands as the commonly adopted method.\nHowever, VI can encounter challenges in achieving a high log-likelihood when\ndealing with complicated posterior distributions. In response to this\nlimitation, we introduce a novel variational importance sampling (VIS) approach\nthat directly estimates and maximizes the log-likelihood. VIS leverages the\noptimal proposal distribution, achieved by minimizing the forward $\\chi^2$\ndivergence, to enhance log-likelihood estimation. We apply VIS to various\npopular latent variable models, including mixture models, variational\nauto-encoders, and partially observable generalized linear models. 
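For the VIS abstract above, the quantity being targeted is an importance-sampled log-likelihood; a minimal self-normalised version on a toy model follows (the paper's forward-chi-squared-optimal proposal is replaced here by a hand-picked Gaussian).

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
# model: z ~ N(0, 1), x | z ~ N(z, 0.5^2); we observe a single x
x_obs, n = 1.3, 10_000
q_mu, q_sd = 1.0, 0.8                              # proposal q(z)
z = rng.normal(q_mu, q_sd, size=n)
log_w = (norm.logpdf(x_obs, loc=z, scale=0.5)      # p(x | z)
         + norm.logpdf(z, 0.0, 1.0)                # p(z)
         - norm.logpdf(z, q_mu, q_sd))             # / q(z)
log_px = np.logaddexp.reduce(log_w) - np.log(n)    # log mean of the weights
print(log_px, norm.logpdf(x_obs, 0.0, np.sqrt(1.25)))  # estimate vs. exact
```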
Results\ndemonstrate that our approach consistently outperforms state-of-the-art\nbaselines, both in terms of log-likelihood and model parameter estimation.\n","authors":["Chengrui Li","Yule Wang","Weihan Li","Anqi Wu"],"pdf_url":"https://arxiv.org/pdf/2311.02516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.07365v5","updated":"2024-02-02T09:42:13Z","published":"2022-02-15T12:46:43Z","title":"A Statistical Learning View of Simple Kriging","summary":" In the Big Data era, with the ubiquity of geolocation sensors in particular,\nmassive datasets exhibiting a possibly complex spatial dependence structure are\nbecoming increasingly available. In this context, the standard probabilistic\ntheory of statistical learning does not apply directly and guarantees of the\ngeneralization capacity of predictive rules learned from such data are left to\nestablish. We analyze here the simple Kriging task from a statistical learning\nperspective, i.e. by carrying out a nonparametric finite-sample predictive\nanalysis. Given $d\\geq 1$ values taken by a realization of a square integrable\nrandom field $X=\\{X_s\\}_{s\\in S}$, $S\\subset \\mathbb{R}^2$, with unknown\ncovariance structure, at sites $s_1,\\; \\ldots,\\; s_d$ in $S$, the goal is to\npredict the unknown values it takes at any other location $s\\in S$ with minimum\nquadratic risk. The prediction rule is derived from a training spatial\ndataset: a single realization $X'$ of $X$, independent from those to be\npredicted, observed at $n\\geq 1$ locations $\\sigma_1,\\; \\ldots,\\; \\sigma_n$ in\n$S$. Despite the connection of this minimization problem with kernel ridge\nregression, establishing the generalization capacity of empirical risk\nminimizers is far from straightforward, due to the non-independent and\nidentically distributed nature of the training data $X'_{\\sigma_1},\\; \\ldots,\\;\nX'_{\\sigma_n}$ involved in the learning procedure. In this article,\nnon-asymptotic bounds of order $O_{\\mathbb{P}}(1/\\sqrt{n})$ are proved for the\nexcess risk of a plug-in predictive rule mimicking the true minimizer in the\ncase of isotropic stationary Gaussian processes, observed at locations forming\na regular grid in the learning stage. These theoretical results are illustrated\nby various numerical experiments, on simulated data and on real-world datasets.\n","authors":["Emilia Siviero","Emilie Chautru","Stephan Clémençon"],"pdf_url":"https://arxiv.org/pdf/2202.07365v5.pdf","comment":"41 pages"},{"id":"http://arxiv.org/abs/2402.01264v1","updated":"2024-02-02T09:36:06Z","published":"2024-02-02T09:36:06Z","title":"Direct side information learning for zero-shot regression","summary":" Zero-shot learning provides models for targets for which instances are not\navailable, commonly called unobserved targets. The availability of target side\ninformation becomes crucial in this context in order to properly induce models\nfor these targets. The literature offers plenty of strategies to cope with this\nscenario, but they are specifically designed for a zero-shot classification\nsetting, mostly in computer vision and image classification, and are either not\napplicable or not easily extensible to a zero-shot regression framework in\nwhich a continuous value is required to be predicted rather than a label. In\nfact, there is a considerable lack of methods for zero-shot regression in the\nliterature. Two approaches for zero-shot regression that work in a two-phase\nprocedure were recently proposed. 
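The VIS estimator described above builds on generic importance sampling of the marginal likelihood. The sketch below shows that underlying estimator on a toy model (z ~ N(0,1), x|z ~ N(z,1), with a Gaussian proposal q); the paper's actual contribution, fitting q by minimizing the forward chi-squared divergence, is not reproduced here, and the model and proposal are assumptions for illustration.

```python
import numpy as np
from scipy.special import logsumexp

def log_joint(x, z):
    # log p(x, z) for the toy model z ~ N(0,1), x|z ~ N(z,1)
    return -0.5 * (z**2 + (x - z)**2) - np.log(2 * np.pi)

def log_q(z, mu, sigma):
    # log density of the Gaussian proposal q(z) = N(mu, sigma^2)
    return -0.5 * ((z - mu) / sigma)**2 - np.log(sigma * np.sqrt(2 * np.pi))

def estimate_log_px(x, mu, sigma, n=10_000, seed=0):
    rng = np.random.default_rng(seed)
    z = rng.normal(mu, sigma, size=n)               # z_i ~ q
    log_w = log_joint(x, z) - log_q(z, mu, sigma)   # log importance weights
    return logsumexp(log_w) - np.log(n)             # log (1/n) sum_i w_i

print(estimate_log_px(x=1.0, mu=0.5, sigma=1.0))    # approx log p(x=1) ~ -1.52
```

A better proposal concentrates the weights, lowering estimator variance, which is exactly what choosing q via the forward chi-squared divergence aims at.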
They first learn the observed target models\nthrough classical regression learning, ignoring the target side information.\nThen, they aggregate those observed target models, exploiting the target side\ninformation, and induce the models for the unobserved targets. Although both\nhave shown quite good performance because of the different treatment they grant\nto the common features and to the side information, they exploit features and\nside information separately, precluding a global optimization for providing the\nunobserved target models. This paper proposes a novel method that jointly takes\nfeatures and side information in a one-phase learning process, while treating\nside information properly rather than as common features. A specific kernel\nthat properly merges features and side information is proposed for this\npurpose, resulting in a novel approach that exhibits better performance on both\nartificial and real datasets.\n","authors":["Miriam Fdez-Díaz","Elena Montañés","José Ramón Quevedo"],"pdf_url":"https://arxiv.org/pdf/2402.01264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16775v2","updated":"2024-02-02T09:36:04Z","published":"2024-01-30T06:27:11Z","title":"Activity Detection for Massive Connectivity in Cell-free Networks with\n Unknown Large-scale Fading, Channel Statistics, Noise Variance, and Activity\n Probability: A Bayesian Approach","summary":" Activity detection is an important task in next-generation grant-free\nmultiple access. While there are a number of existing algorithms designed for\nthis purpose, they mostly require precise information about the network, such\nas large-scale fading coefficients, small-scale fading channel statistics,\nnoise variance at the access points, and user activity probability. Acquiring\nthis information would incur a significant overhead, and the estimated values\nmight not be accurate. This problem is even more severe in cell-free networks\nas there are many of these parameters to be acquired. Therefore, this paper\nsets out to investigate the activity detection problem without the\nabove-mentioned information. In order to handle so many unknown parameters,\nthis paper employs the Bayesian approach, where the unknown variables are\nendowed with prior distributions which effectively act as regularizations.\nTogether with the likelihood function, a maximum a posteriori (MAP) estimator\nand a variational inference algorithm are derived. Extensive simulations\ndemonstrate that the proposed methods, even without the knowledge of these\nsystem parameters, perform better than existing state-of-the-art methods, such\nas covariance-based and approximate message passing methods.\n","authors":["Hao Zhang","Qingfeng Lin","Yang Li","Lei Cheng","Yik-Chung Wu"],"pdf_url":"https://arxiv.org/pdf/2401.16775v2.pdf","comment":"16 pages, 9 figures, accepted for publication in IEEE Transactions on\n Signal Processing"},{"id":"http://arxiv.org/abs/2402.01263v1","updated":"2024-02-02T09:34:49Z","published":"2024-02-02T09:34:49Z","title":"A Differentiable POGLM with Forward-Backward Message Passing","summary":" The partially observable generalized linear model (POGLM) is a powerful tool\nfor understanding neural connectivity under the assumption of existing hidden\nneurons. With spike trains only recorded from visible neurons, existing works\nuse variational inference to learn the POGLM, while facing the difficulty of\nlearning this latent variable model. 
There are two main issues: (1) the sampled\nPoisson hidden spike count hinders the use of the pathwise gradient estimator\nin VI; and (2) the existing design of the variational model is neither\nexpressive nor time-efficient, which further affects the performance. For (1),\nwe propose a new differentiable POGLM, which enables the pathwise gradient\nestimator, better than the score function gradient estimator used in existing\nworks. For (2), we propose the forward-backward message-passing sampling scheme\nfor the variational model. Comprehensive experiments show that our\ndifferentiable POGLMs with our forward-backward message passing produce\nbetter performance on one synthetic and two real-world datasets. Furthermore,\nour new method yields more interpretable parameters, underscoring its\nsignificance in neuroscience.\n","authors":["Chengrui Li","Weihan Li","Yule Wang","Anqi Wu"],"pdf_url":"https://arxiv.org/pdf/2402.01263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01262v1","updated":"2024-02-02T09:33:07Z","published":"2024-02-02T09:33:07Z","title":"Cascaded Scaling Classifier: class incremental learning with probability\n scaling","summary":" Humans are capable of acquiring new knowledge and transferring learned\nknowledge into different domains, incurring only minor forgetting. The same\nability, called Continual Learning, is challenging to achieve when operating\nwith neural networks due to the forgetting affecting past learned tasks when\nlearning new ones. This forgetting can be mitigated by replaying stored samples\nfrom past tasks, but a large memory size may be needed for long sequences of\ntasks; moreover, this could lead to overfitting on saved samples. In this\npaper, we propose a novel regularisation approach and a novel incremental\nclassifier called, respectively, Margin Dampening and Cascaded Scaling\nClassifier. The first combines a soft constraint and a knowledge distillation\napproach to preserve past learned knowledge while allowing the model to learn\nnew patterns effectively. The latter is a gated incremental classifier, helping\nthe model modify past predictions without directly interfering with them. This\nis achieved by modifying the output of the model with auxiliary scaling\nfunctions. We empirically show that our approach performs well on multiple\nbenchmarks against well-established baselines, and we also study each component\nof our proposal and how the combinations of such components affect the final\nresults.\n","authors":["Jary Pomponi","Alessio Devoto","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2402.01262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01261v1","updated":"2024-02-02T09:32:03Z","published":"2024-02-02T09:32:03Z","title":"TEDDY: Trimming Edges with Degree-based Discrimination strategY","summary":" Since the pioneering work on the lottery ticket hypothesis for graph neural\nnetworks (GNNs) was proposed in Chen et al. (2021), the study on finding graph\nlottery tickets (GLT) has become a pivotal focus of the GNN community,\ninspiring researchers to discover sparser GLTs while achieving comparable\nperformance to original dense networks. In parallel, the graph structure has\ngained substantial attention as a crucial factor in GNN training dynamics, also\nelucidated by several recent studies. Despite this, contemporary studies on\nGLT have, in general, not fully exploited inherent pathways in the graph\nstructure, and have identified tickets in an iterative manner, which is\ntime-consuming and inefficient. 
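The contrast between the score-function and pathwise gradient estimators mentioned in the POGLM abstract above can be made concrete in a few lines. The toy objective `f`, the choice of Poisson versus Gaussian latents, and the sample sizes are illustrative assumptions, not the paper's model.

```python
import torch

# Goal in both cases: estimate d/d(theta) of E_{z ~ p_theta}[f(z)].
f = lambda z: (z - 2.0) ** 2

# Score-function (REINFORCE) estimator with a Poisson latent: sampling is
# not differentiable, so the gradient flows only through log p(z).
rate = torch.tensor(1.5, requires_grad=True)
z = torch.poisson(rate.detach() * torch.ones(10_000))
loss_sf = (f(z) * torch.distributions.Poisson(rate).log_prob(z)).mean()
loss_sf.backward()
print("score-function grad:", rate.grad)

# Pathwise (reparameterization) estimator with a Gaussian latent:
# z = mu + sigma * eps is differentiable in mu, so the gradient flows through f.
mu = torch.tensor(1.5, requires_grad=True)
z = mu + 1.0 * torch.randn(10_000)
loss_pw = f(z).mean()
loss_pw.backward()
print("pathwise grad:", mu.grad)
```

The pathwise estimate is typically far lower-variance, which is why making the Poisson latent differentiable, as the paper proposes, pays off.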
To address these limitations, we introduce\nTEDDY, a one-shot edge sparsification framework that leverages structural\ninformation by incorporating edge-degree information. Following edge\nsparsification, we encourage parameter sparsity during training via simple\nprojected gradient descent on the $\\ell_0$ ball. Given the target sparsity\nlevels for both the graph structure and the model parameters, our TEDDY\nfacilitates efficient and rapid realization of GLT within a single training.\nRemarkably, our experimental results demonstrate that TEDDY significantly\nsurpasses conventional iterative approaches in generalization, even when\nconducting one-shot sparsification that solely utilizes graph structures,\nwithout taking node features into account.\n","authors":["Hyunjin Seo","Jihun Yun","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2402.01261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01259v1","updated":"2024-02-02T09:30:27Z","published":"2024-02-02T09:30:27Z","title":"Position Aware 60 GHz mmWave Beamforming for V2V Communications\n Utilizing Deep Learning","summary":" Beamforming techniques are considered essential for compensating the severe\npath loss in millimeter-wave (mmWave) communications by adopting large antenna\narrays and forming narrow beams to obtain satisfactory received power. However,\nperforming accurate beam alignment over such narrow beams with traditional beam\nselection approaches, which mainly rely on channel state information, typically\nimposes significant latency and computing overheads, which is often infeasible\nin highly dynamic scenarios such as vehicle-to-vehicle (V2V) communications. In\ncontrast, utilizing out-of-band contextual information, such as vehicular\nposition information, is a potential alternative to reduce such overheads. In\nthis context, this paper presents a deep learning-based solution that utilizes\nvehicular position information to predict the optimal beams with sufficient\nmmWave received power so that the best V2V line-of-sight links can be ensured\nproactively. After experimental evaluation of the proposed solution on\nreal-world measured mmWave sensing and communications datasets, the results\nshow that the solution can achieve up to 84.58% of the received power of the\nlink status on average, which confirms it as a promising solution for\nbeamforming in 60 GHz mmWave-enabled V2V communications.\n","authors":["Muhammad Baqer Mollah","Honggang Wang","Hua Fang"],"pdf_url":"https://arxiv.org/pdf/2402.01259v1.pdf","comment":"2024 IEEE International Conference on Communications (ICC), Denver,\n CO, USA"},{"id":"http://arxiv.org/abs/2402.01258v1","updated":"2024-02-02T09:29:40Z","published":"2024-02-02T09:29:40Z","title":"Transformers Learn Nonlinear Features In Context: Nonconvex Mean-field\n Dynamics on the Attention Landscape","summary":" Large language models based on the Transformer architecture have demonstrated\nimpressive capabilities to learn in context. However, existing theoretical\nstudies on how this phenomenon arises are limited to the dynamics of a single\nlayer of attention trained on linear regression tasks. In this paper, we study\nthe optimization of a Transformer consisting of a fully connected layer\nfollowed by a linear attention layer. The MLP acts as a common nonlinear\nrepresentation or feature map, greatly enhancing the power of in-context\nlearning. 
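Projected gradient descent on the $\ell_0$ ball, which TEDDY uses above for parameter sparsity, amounts to hard thresholding: after each gradient step, keep only the k largest-magnitude entries and zero out the rest. A minimal sketch on a dummy least-squares problem follows; the model, data, step size, and k are assumptions, not values from the paper.

```python
import torch

def project_l0(w: torch.Tensor, k: int) -> torch.Tensor:
    """Euclidean projection onto {w : ||w||_0 <= k} = keep top-k magnitudes."""
    out = torch.zeros_like(w)
    idx = w.abs().flatten().topk(k).indices
    out.view(-1)[idx] = w.view(-1)[idx]
    return out

w = torch.randn(64, requires_grad=True)
X, y = torch.randn(256, 64), torch.randn(256)
for step in range(100):
    loss = ((X @ w - y) ** 2).mean()
    loss.backward()
    with torch.no_grad():
        w -= 0.05 * w.grad          # gradient step
        w.copy_(project_l0(w, k=8))  # projection step
    w.grad = None
print(int((w != 0).sum()))           # at most 8 non-zero parameters
```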
We prove in the mean-field and two-timescale limit that the\ninfinite-dimensional loss landscape for the distribution of parameters, while\nhighly nonconvex, becomes quite benign. We also analyze the second-order\nstability of mean-field dynamics and show that Wasserstein gradient flow almost\nalways avoids saddle points. Furthermore, we establish novel methods for\nobtaining concrete improvement rates both away from and near critical points.\nThis represents the first saddle point analysis of mean-field dynamics in\ngeneral, and the techniques are of independent interest.\n","authors":["Juno Kim","Taiji Suzuki"],"pdf_url":"https://arxiv.org/pdf/2402.01258v1.pdf","comment":"32 pages, 1 figure"},{"id":"http://arxiv.org/abs/2306.05366v3","updated":"2024-02-02T09:25:41Z","published":"2023-06-08T17:08:52Z","title":"Ordinal Potential-based Player Rating","summary":" It was recently observed that Elo ratings fail at preserving transitive\nrelations among strategies and therefore cannot correctly extract the\ntransitive component of a game. We provide a characterization of transitive\ngames as a weak variant of ordinal potential games and show that Elo ratings\nactually do preserve transitivity when computed in the right space, using\nsuitable invertible mappings. Leveraging this insight, we introduce a new game\ndecomposition of an arbitrary game into transitive and cyclic components that\nis learnt using a neural network-based architecture and that prioritises\ncapturing the sign pattern of the game, namely transitive and cyclic relations\namong strategies. We link our approach to the known concept of sign-rank, and\nevaluate our methodology using both toy examples and empirical data from\nreal-world games.\n","authors":["Nelson Vadori","Rahul Savani"],"pdf_url":"https://arxiv.org/pdf/2306.05366v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01252v1","updated":"2024-02-02T09:19:45Z","published":"2024-02-02T09:19:45Z","title":"Target inductive methods for zero-shot regression","summary":" This research arises from the need to predict the amount of air pollutants at\nmeteorological stations. Air pollution depends on the location of the stations\n(weather conditions and activities in the surroundings). Frequently, the\nsurrounding information is not considered in the learning process. This\ninformation is known beforehand in the absence of unobserved weather conditions\nand remains constant for the same station. Considering the surrounding\ninformation as side information facilitates the generalization for predicting\npollutants at new stations, leading to a zero-shot regression scenario.\nAvailable zero-shot methods typically lean towards classification and are\nnot easily extensible to regression. This paper proposes two zero-shot methods\nfor regression. The first method is a similarity-based approach that learns\nmodels from features and aggregates them using side information. However,\npotential knowledge of the feature models may be lost in the aggregation. The\nsecond method overcomes this drawback by replacing the aggregation procedure\nand learning the correspondence between side information and feature-induced\nmodels, instead. Both proposals are compared with a baseline procedure using\nartificial datasets, UCI repository communities and crime datasets, and the\npollutants. 
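As background for the Elo discussion above, a standard Elo update looks as follows; the paper's point is that ratings computed this way preserve transitivity only after mapping to a suitable space, which this plain version does not attempt. The K-factor of 32 is a conventional choice, not taken from the paper.

```python
# Minimal Elo rating update (illustrative background, not the paper's method).
def elo_update(r_a: float, r_b: float, score_a: float, k: float = 32.0):
    """score_a: 1 for a win by player A, 0.5 for a draw, 0 for a loss."""
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))  # logistic model
    r_a_new = r_a + k * (score_a - expected_a)
    r_b_new = r_b + k * ((1.0 - score_a) - (1.0 - expected_a))
    return r_a_new, r_b_new

# A (1600) beats B (1500): A gains fewer points than an upset would grant.
print(elo_update(1600.0, 1500.0, score_a=1.0))
```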
Both approaches outperform the baseline method, and the parameter-learning\napproach proves superior to the similarity-based method.\n","authors":["Miriam Fdez-Díaz","José Ramón Quevedo","Elena Montañés"],"pdf_url":"https://arxiv.org/pdf/2402.01252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01242v1","updated":"2024-02-02T09:10:35Z","published":"2024-02-02T09:10:35Z","title":"Two Heads Are Better Than One: Boosting Graph Sparse Training via\n Semantic and Topological Awareness","summary":" Graph Neural Networks (GNNs) excel in various graph learning tasks but face\ncomputational challenges when applied to large-scale graphs. A promising\nsolution is to remove non-essential edges to reduce the computational overheads\nin GNNs. Previous literature generally falls into two categories:\ntopology-guided and semantic-guided. The former maintains certain graph\ntopological properties yet often underperforms on GNNs due to low integration\nwith neural network training. The latter performs well at lower sparsity on\nGNNs but faces performance collapse at higher sparsity levels. With this in\nmind, we take the first step to propose a new research line and concept termed\nGraph Sparse Training (GST), which dynamically manipulates sparsity at the data\nlevel. Specifically, GST initially constructs a topology & semantic anchor at a\nlow training cost, followed by performing dynamic sparse training to align the\nsparse graph with the anchor. We introduce the Equilibria Sparsification\nPrinciple to guide this process, effectively balancing the preservation of both\ntopological and semantic information. Ultimately, GST produces a sparse graph\nwith maximum topological integrity and no performance degradation. Extensive\nexperiments on 6 datasets and 5 backbones showcase that GST (I) identifies\nsubgraphs at higher graph sparsity levels (1.67%~15.85% $\\uparrow$) than\nstate-of-the-art sparsification methods, (II) preserves more key spectral\nproperties, (III) achieves 1.27-3.42$\\times$ speedup in GNN inference and (IV)\nsuccessfully helps graph adversarial defense and graph lottery tickets.\n","authors":["Guibin Zhang","Yanwei Yue","Kun Wang","Junfeng Fang","Yongduo Sui","Kai Wang","Yuxuan Liang","Dawei Cheng","Shirui Pan","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2402.01242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01240v1","updated":"2024-02-02T09:07:09Z","published":"2024-02-02T09:07:09Z","title":"Beyond the Request: Harnessing HTTP Response Headers for Cross-Browser\n Web Tracker Classification in an Imbalanced Setting","summary":" The World Wide Web's connectivity is greatly attributed to the HTTP protocol,\nwith HTTP messages offering informative header fields that appeal to\ndisciplines like web security and privacy, especially concerning web tracking.\nDespite existing research employing HTTP/S request messages to identify web\ntrackers, HTTP/S response headers are often overlooked. This study endeavors to\ndesign effective machine learning classifiers for web tracker detection using\nHTTP/S response headers. Data from the Chrome, Firefox, and Brave browsers,\nobtained through the traffic monitoring browser extension T.EX, serves as our\ndata set. Eleven supervised models were trained on Chrome data and tested\nacross all browsers. The results demonstrated high accuracy, F1-score,\nprecision, recall, and minimal log-loss error for Chrome and Firefox, but\nsubpar performance on Brave, potentially due to its distinct data distribution\nand feature set. 
The research suggests that these classifiers are viable for\ndetecting web trackers in Chrome and Firefox. However, real-world application\ntesting remains pending, and the distinction between tracker types and broader\nlabel sources could be explored in future studies.\n","authors":["Wolf Rieder","Philip Raschke","Thomas Cory"],"pdf_url":"https://arxiv.org/pdf/2402.01240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01238v1","updated":"2024-02-02T09:03:38Z","published":"2024-02-02T09:03:38Z","title":"Flexible Variational Information Bottleneck: Achieving Diverse\n Compression with a Single Training","summary":" Information Bottleneck (IB) is a widely used framework that enables the\nextraction of information related to a target random variable from a source\nrandom variable. In the objective function, IB controls the trade-off between\ndata compression and predictiveness through the Lagrange multiplier $\\beta$.\nTraditionally, to find the trade-off to be learned, IB requires a search for\n$\\beta$ through multiple training cycles, which is computationally expensive.\nIn this study, we introduce Flexible Variational Information Bottleneck (FVIB),\nan innovative framework for classification tasks that can obtain optimal models\nfor all values of $\\beta$ with a single, computationally efficient training. We\ntheoretically demonstrate that across all reasonable values of $\\beta$, FVIB\ncan simultaneously maximize an approximation of the objective function for\nVariational Information Bottleneck (VIB), the conventional IB method. Then we\nempirically show that FVIB can learn the VIB objective as effectively as VIB.\nFurthermore, in terms of calibration performance, FVIB outperforms other IB and\ncalibration methods by enabling continuous optimization of $\\beta$. Our code is\navailable at https://github.com/sotakudo/fvib.\n","authors":["Sota Kudo","Naoaki Ono","Shigehiko Kanaya","Ming Huang"],"pdf_url":"https://arxiv.org/pdf/2402.01238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.06854v3","updated":"2024-02-02T08:59:26Z","published":"2022-06-14T13:49:08Z","title":"On the explainable properties of 1-Lipschitz Neural Networks: An Optimal\n Transport Perspective","summary":" Input gradients have a pivotal role in a variety of applications, including\nadversarial attack algorithms for evaluating model robustness, explainable AI\ntechniques for generating Saliency Maps, and counterfactual\nexplanations. However, Saliency Maps generated by traditional neural networks\nare often noisy and provide limited insights. In this paper, we demonstrate\nthat, on the contrary, the Saliency Maps of 1-Lipschitz neural networks,\nlearned with the dual loss of an optimal transportation problem, exhibit\ndesirable XAI properties: They are highly concentrated on the essential parts of\nthe image with low noise, significantly outperforming state-of-the-art\nexplanation approaches across various models and metrics. We also prove that\nthese maps align unprecedentedly well with human explanations on ImageNet. To\nexplain the particularly beneficial properties of the Saliency Map for such\nmodels, we prove this gradient encodes both the direction of the transportation\nplan and the direction towards the nearest adversarial attack. Following the\ngradient down to the decision boundary is no longer considered an adversarial\nattack, but rather a counterfactual explanation that explicitly transports the\ninput from one class to another. 
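FVIB above builds on the standard VIB objective: a cross-entropy term plus $\beta$ times the KL divergence between the encoder's Gaussian posterior and a standard normal prior. Below is a minimal sketch of that conventional objective; the shapes, the random linear head, and the value of $\beta$ are illustrative assumptions, and FVIB's single-training trick over all $\beta$ is not shown.

```python
import torch
import torch.nn.functional as F

def vib_loss(logits, labels, mu, logvar, beta):
    ce = F.cross_entropy(logits, labels)
    # KL( N(mu, diag(exp(logvar))) || N(0, I) ), averaged over the batch
    kl = 0.5 * (mu.pow(2) + logvar.exp() - 1.0 - logvar).sum(dim=1).mean()
    return ce + beta * kl

mu, logvar = torch.randn(32, 16), torch.zeros(32, 16)   # encoder outputs
z = mu + (0.5 * logvar).exp() * torch.randn_like(mu)    # reparameterized sample
logits = z @ torch.randn(16, 10)                        # toy linear classifier
labels = torch.randint(0, 10, (32,))
print(vib_loss(logits, labels, mu, logvar, beta=1e-3))
```

Sweeping $\beta$ traces the compression/prediction trade-off; the paper's contribution is obtaining that whole sweep from one training run.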
Thus, learning with such a loss jointly\noptimizes the classification objective and the alignment of the gradient, i.e.\nthe Saliency Map, to the transportation plan direction. These networks were\npreviously known to be certifiably robust by design, and we demonstrate that\nthey scale well for large problems and models, and are tailored for\nexplainability using a fast and straightforward method.\n","authors":["Mathieu Serrurier","Franck Mamalet","Thomas Fel","Louis Béthune","Thibaut Boissin"],"pdf_url":"https://arxiv.org/pdf/2206.06854v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01231v1","updated":"2024-02-02T08:55:23Z","published":"2024-02-02T08:55:23Z","title":"Unveiling Delay Effects in Traffic Forecasting: A Perspective from\n Spatial-Temporal Delay Differential Equations","summary":" Traffic flow forecasting is a fundamental research issue for transportation\nplanning and management, which serves as a canonical and typical example of\nspatial-temporal predictions. In recent years, Graph Neural Networks (GNNs) and\nRecurrent Neural Networks (RNNs) have achieved great success in capturing\nspatial-temporal correlations for traffic flow forecasting. Yet, two\nnon-negligible issues have not been well solved: 1) The message passing in GNNs\nis immediate, while in reality the spatial message interactions among\nneighboring nodes can be delayed. The change of traffic flow at one node takes\nseveral minutes, i.e., a time delay, to influence its connected neighbors.\n2) Traffic conditions undergo continuous changes. The prediction frequency for\ntraffic flow forecasting may vary based on specific scenario requirements. Most\nexisting discretized models require retraining for each prediction horizon,\nrestricting their applicability. To tackle the above issues, we propose a\nneural Spatial-Temporal Delay Differential Equation model, namely STDDE. It\nincorporates both delay effects and continuity into a unified delay differential\nequation framework, which explicitly models the time delay in spatial\ninformation propagation. Furthermore, theoretical proofs are provided to show\nits stability. Then we design a learnable traffic-graph time-delay estimator,\nwhich utilizes the continuity of the hidden states to achieve the gradient\nbackward process. Finally, we propose a continuous output module, allowing us\nto accurately predict traffic flow at various frequencies, which provides more\nflexibility and adaptability to different scenarios. Extensive experiments show\nthe superiority of the proposed STDDE along with competitive computational\nefficiency.\n","authors":["Qingqing Long","Zheng Fang","Chen Fang","Chong Chen","Pengfei Wang","Yuanchun Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.01231v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.01226v1","updated":"2024-02-02T08:45:38Z","published":"2024-02-02T08:45:38Z","title":"HW-SW Optimization of DNNs for Privacy-preserving People Counting on\n Low-resolution Infrared Arrays","summary":" Low-resolution infrared (IR) array sensors enable people counting\napplications such as monitoring the occupancy of spaces and people flows while\npreserving privacy and minimizing energy consumption. Deep Neural Networks\n(DNNs) have been shown to be well-suited to process these sensor data in an\naccurate and efficient manner. Nevertheless, the space of DNNs' architectures\nis huge and its manual exploration is burdensome and often leads to sub-optimal\nsolutions. 
To overcome this problem, in this work, we propose a highly\nautomated full-stack optimization flow for DNNs that goes from neural\narchitecture search, mixed-precision quantization, and post-processing, down to\nthe realization of a new smart sensor prototype, including a Microcontroller\nwith a customized instruction set. Integrating these cross-layer optimizations,\nwe obtain a large set of Pareto-optimal solutions in the 3D-space of energy,\nmemory, and accuracy. Deploying such solutions on our hardware platform, we\nimprove the state of the art, achieving up to 4.2x model size reduction, 23.8x\ncode size reduction, and 15.38x energy reduction at iso-accuracy.\n","authors":["Matteo Risso","Chen Xie","Francesco Daghero","Alessio Burrello","Seyedmorteza Mollaei","Marco Castellano","Enrico Macii","Massimo Poncino","Daniele Jahier Pagliari"],"pdf_url":"https://arxiv.org/pdf/2402.01226v1.pdf","comment":"This paper has been accepted for publication in the DATE 2024\n conference IEEE"},{"id":"http://arxiv.org/abs/2401.00744v6","updated":"2024-02-02T08:45:25Z","published":"2024-01-01T12:57:15Z","title":"Harmonizing Covariance and Expressiveness for Deep Hamiltonian\n Regression in Crystalline Material Research: a Hybrid Cascaded Regression\n Framework","summary":" Deep learning for Hamiltonian regression of quantum systems in material\nresearch necessitates satisfying the covariance laws, among which achieving\nSO(3)-equivariance without sacrificing the expressiveness capability of\nnetworks remains unsolved due to the restriction on non-linear mappings in\nassuring theoretical equivariance. To alleviate the covariance-expressiveness\ndilemma, we explore non-linear covariant deep learning with a\nhybrid framework consisting of two cascaded regression stages. The first stage,\ni.e., a theoretically-guaranteed covariant neural network modeling symmetry\nproperties of 3D atom systems, predicts baseline Hamiltonians with\ntheoretically covariant features extracted, assisting the second stage in\nlearning covariance. Meanwhile, the second stage, powered by a non-linear 3D\ngraph Transformer network we propose for structural modeling of atomic systems,\nrefines the first stage's output as a fine-grained prediction of Hamiltonians\nwith better expressiveness capability. The novel combination of a theoretically\ncovariant yet inevitably less expressive model with a highly expressive\nnon-linear network enables precise, generalizable predictions while maintaining\nrobust covariance under coordinate transformations. We achieve state-of-the-art\nperformance in Hamiltonian prediction, confirmed through experiments on six\ncrystalline material databases.\n","authors":["Shi Yin","Xinyang Pan","Xudong Zhu","Tianyu Gao","Haochong Zhang","Feng Wu","Lixin He"],"pdf_url":"https://arxiv.org/pdf/2401.00744v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01208v1","updated":"2024-02-02T08:26:42Z","published":"2024-02-02T08:26:42Z","title":"Location Agnostic Adaptive Rain Precipitation Prediction using Deep\n Learning","summary":" Rain precipitation prediction is a challenging task as it depends on weather\nand meteorological features which vary from location to location. As a result,\na prediction model that performs well at one location does not perform well at\nother locations due to distribution shifts. 
In addition, due to global\nwarming, weather patterns are changing very rapidly year by year, which can\nrender those models ineffective even at the same location as time passes. In\nour work, we have proposed an adaptive deep learning-based framework in order\nto provide a solution to the aforementioned challenges. Our method can\ngeneralize the model for the prediction of precipitation for any location where\nthe methods without adaptation fail. Our method has shown 43.51%, 5.09%, and\n38.62% improvement after adaptation using a deep neural network for predicting\nthe precipitation of Paris, Los Angeles, and Tokyo, respectively.\n","authors":["Md Shazid Islam","Md Saydur Rahman","Md Saad Ul Haque","Farhana Akter Tumpa","Md Sanzid Bin Hossain","Abul Al Arabi"],"pdf_url":"https://arxiv.org/pdf/2402.01208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01207v1","updated":"2024-02-02T08:25:32Z","published":"2024-02-02T08:25:32Z","title":"Efficient Causal Graph Discovery Using Large Language Models","summary":" We propose a novel framework that leverages LLMs for full causal graph\ndiscovery. While previous LLM-based methods have used a pairwise query\napproach, this requires a quadratic number of queries, which quickly becomes\nimpractical for larger causal graphs. In contrast, the proposed framework uses\na breadth-first search (BFS) approach, which allows it to use only a linear\nnumber of queries. We also show that the proposed method can easily incorporate\nobservational data when available, to improve performance. In addition to being\nmore time- and data-efficient, the proposed framework achieves state-of-the-art\nresults on real-world causal graphs of varying sizes. The results demonstrate\nthe effectiveness and efficiency of the proposed method in discovering causal\nrelationships, showcasing its potential for broad applicability in causal graph\ndiscovery tasks across different domains.\n","authors":["Thomas Jiralerspong","Xiaoyin Chen","Yash More","Vedant Shah","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2402.01207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01206v1","updated":"2024-02-02T08:25:28Z","published":"2024-02-02T08:25:28Z","title":"Comparative Evaluation of Weather Forecasting using Machine Learning\n Models","summary":" Gaining a deeper understanding of weather and being able to predict its\nfuture behavior have always been considered important endeavors for the growth\nof our society. This research paper explores the advancements in understanding\nand predicting nature's behavior, particularly in the context of weather\nforecasting, through the application of machine learning algorithms. By\nleveraging the power of machine learning, data mining, and data analysis\ntechniques, significant progress has been made in this field. This study\nfocuses on analyzing the contributions of various machine learning algorithms\nin predicting precipitation and temperature patterns using a 20-year dataset\nfrom a single weather station in Dhaka city. Algorithms such as Gradient\nBoosting, AdaBoosting, Artificial Neural Network, Stacking Random Forest,\nStacking Neural Network, and Stacking KNN are evaluated and compared based on\ntheir performance metrics, including Confusion matrix measurements. 
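The BFS idea above, linear rather than quadratic in the number of LLM queries, can be sketched as follows. `llm_says_causes` is a hypothetical stand-in for the paper's prompting (here backed by a hard-coded toy graph so the sketch runs), and a single-root BFS only recovers descendants of that root; the actual framework is more involved.

```python
from collections import deque

TOY_GRAPH = {"rain": ["wet grass"], "sprinkler": ["wet grass"], "wet grass": []}

def llm_says_causes(cause: str, effects: list[str]) -> list[str]:
    # Hypothetical LLM query: "Which of <effects> does <cause> cause?"
    # Replaced here by a toy lookup purely for illustration.
    return [e for e in TOY_GRAPH.get(cause, []) if e in effects]

def discover_graph(variables: list[str], root: str) -> list[tuple[str, str]]:
    edges, visited = [], {root}
    queue = deque([root])
    while queue:
        node = queue.popleft()                      # one query per node visited
        for child in llm_says_causes(node, [v for v in variables if v != node]):
            edges.append((node, child))
            if child not in visited:
                visited.add(child)
                queue.append(child)
    return edges

print(discover_graph(["rain", "sprinkler", "wet grass"], root="rain"))
```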
The\nfindings highlight remarkable achievements and provide valuable insights into\ntheir performance and feature correlations.\n","authors":["Md Saydur Rahman","Farhana Akter Tumpa","Md Shazid Islam","Abul Al Arabi","Md Sanzid Bin Hossain","Md Saad Ul Haque"],"pdf_url":"https://arxiv.org/pdf/2402.01206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01204v1","updated":"2024-02-02T08:17:41Z","published":"2024-02-02T08:17:41Z","title":"A Survey on Self-Supervised Learning for Non-Sequential Tabular Data","summary":" Self-supervised learning (SSL) has been incorporated into many\nstate-of-the-art models in various domains, where SSL defines pretext tasks\nbased on unlabeled datasets to learn contextualized and robust representations.\nRecently, SSL has been a new trend in exploring the representation learning\ncapability in the realm of tabular data, which is more challenging due to the\nlack of explicit relations for learning descriptive representations. This survey\naims to systematically review and summarize the recent progress and challenges\nof SSL for non-sequential tabular data (SSL4NS-TD). We first present a formal\ndefinition of NS-TD and clarify its correlation to related studies. Then, these\napproaches are categorized into three groups -- predictive learning,\ncontrastive learning, and hybrid learning, with the motivations and strengths\nof representative methods within each direction. On top of this, application\nissues of SSL4NS-TD are presented, including automatic data engineering,\ncross-table transferability, and domain knowledge integration. In addition, we\nelaborate on existing benchmarks and datasets for NS-TD applications to discuss\nthe performance of existing tabular models. Finally, we discuss the challenges\nof SSL4NS-TD and provide potential directions for future research. We expect\nour work to be useful in terms of encouraging more research on lowering the\nbarrier to entry for SSL in the tabular domain and improving the foundations\nfor implicit tabular data.\n","authors":["Wei-Yao Wang","Wei-Wei Du","Derek Xu","Wei Wang","Wen-Chih Peng"],"pdf_url":"https://arxiv.org/pdf/2402.01204v1.pdf","comment":"The paper list can be found at\n https://github.com/wwweiwei/awesome-self-supervised-learning-for-tabular-data"},{"id":"http://arxiv.org/abs/2402.01203v1","updated":"2024-02-02T08:13:18Z","published":"2024-02-02T08:13:18Z","title":"Structured World Modeling via Semantic Vector Quantization","summary":" Neural discrete representations are crucial components of modern neural\nnetworks. However, their main limitation is that the primary strategies such as\nVQ-VAE can only provide representations at the patch level. Therefore, one of\nthe main goals of representation learning, acquiring structured, semantic, and\ncompositional abstractions such as the color and shape of an object, remains\nelusive. In this paper, we present the first approach to semantic neural\ndiscrete representation learning. The proposed model, called Semantic\nVector-Quantized Variational Autoencoder (SVQ), leverages recent advances in\nunsupervised object-centric learning to address this limitation. Specifically,\nwe observe that a simple approach that quantizes at the object level poses a\nsignificant challenge and propose constructing scene representations\nhierarchically, from low-level discrete concept schemas to object\nrepresentations. 
Additionally, we suggest a novel method for structured\nsemantic world modeling by training a prior over these representations,\nenabling image generation by sampling the semantic properties of the objects in\nthe scene. In experiments on various 2D and 3D object-centric\ndatasets, we find that our model achieves superior generation performance\ncompared to non-semantic vector quantization methods such as VQ-VAE and\nprevious object-centric generative models. Furthermore, we find that the\nsemantic discrete representations can solve downstream scene understanding\ntasks that require reasoning about the properties of different objects in the\nscene.\n","authors":["Yi-Fu Wu","Minseung Lee","Sungjin Ahn"],"pdf_url":"https://arxiv.org/pdf/2402.01203v1.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2210.11049v3","updated":"2024-02-02T08:11:13Z","published":"2022-10-20T06:44:37Z","title":"How Does a Deep Learning Model Architecture Impact Its Privacy? A\n Comprehensive Study of Privacy Attacks on CNNs and Transformers","summary":" As a booming research area in the past decade, deep learning technologies\nhave been driven by big data collected and processed on an unprecedented scale.\nHowever, privacy concerns arise due to the potential leakage of sensitive\ninformation from the training data. Recent research has revealed that deep\nlearning models are vulnerable to various privacy attacks, including membership\ninference attacks, attribute inference attacks, and gradient inversion attacks.\nNotably, the efficacy of these attacks varies from model to model. In this\npaper, we answer a fundamental question: Does model architecture affect model\nprivacy? By investigating representative model architectures from convolutional\nneural networks (CNNs) to Transformers, we demonstrate that Transformers\ngenerally exhibit higher vulnerability to privacy attacks than CNNs.\nAdditionally, we identify the micro design of activation layers, stem layers,\nand LN layers as major factors contributing to the resilience of CNNs against\nprivacy attacks, while the presence of attention modules is another main factor\nthat exacerbates the privacy vulnerability of Transformers. Our discovery\nreveals valuable insights for deep learning models to defend against privacy\nattacks and inspires the research community to develop privacy-friendly model\narchitectures.\n","authors":["Guangsheng Zhang","Bo Liu","Huan Tian","Tianqing Zhu","Ming Ding","Wanlei Zhou"],"pdf_url":"https://arxiv.org/pdf/2210.11049v3.pdf","comment":"To appear in USENIX Security 2024"},{"id":"http://arxiv.org/abs/2311.17795v2","updated":"2024-02-02T08:06:51Z","published":"2023-11-29T16:45:43Z","title":"Marginal Laplacian Score","summary":" High-dimensional imbalanced data poses a machine learning challenge. In the\nabsence of sufficient or high-quality labels, unsupervised feature selection\nmethods are crucial for the success of subsequent algorithms. Therefore, we\nintroduce a Marginal Laplacian Score (MLS), a modification of the well-known\nLaplacian Score (LS) tailored to better address imbalanced data. We introduce\nan assumption that the minority class or anomalous samples appear more\nfrequently in the margins of the features. Consequently, MLS aims to preserve\nthe local structure of the dataset's margin. We propose its integration into\nmodern feature selection methods that utilize the Laplacian score. 
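For context on the score that MLS above modifies, here is a minimal sketch of the classical Laplacian Score: features whose variation respects a kNN affinity graph receive lower (better) scores. Binary edge weights and the neighbor count are simplifying assumptions; MLS's margin-focused variant is not shown.

```python
import numpy as np

def laplacian_score(X: np.ndarray, n_neighbors: int = 5) -> np.ndarray:
    n = X.shape[0]
    d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)   # pairwise sq. distances
    W = np.zeros((n, n))
    for i in range(n):
        W[i, np.argsort(d2[i])[1:n_neighbors + 1]] = 1.0  # kNN edges (binary)
    W = np.maximum(W, W.T)                                # symmetrize
    d = W.sum(1)                                          # degrees
    L = np.diag(d) - W                                    # graph Laplacian
    scores = []
    for r in range(X.shape[1]):
        f = X[:, r]
        f = f - (f @ d) / d.sum()                         # D-weighted centering
        scores.append((f @ L @ f) / ((f * f) @ d))        # f'Lf / f'Df
    return np.array(scores)  # lower = feature better preserves local structure

print(laplacian_score(np.random.default_rng(0).normal(size=(100, 8))))
```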
We integrate the\nMLS algorithm into the Differentiable Unsupervised Feature Selection (DUFS),\nresulting in DUFS-MLS. The proposed methods demonstrate robust and improved\nperformance on synthetic and public datasets.\n","authors":["Guy Hay","Ohad Volk"],"pdf_url":"https://arxiv.org/pdf/2311.17795v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2402.01201v1","updated":"2024-02-02T08:05:35Z","published":"2024-02-02T08:05:35Z","title":"Few-Shot Class-Incremental Learning with Prior Knowledge","summary":" To tackle the issues of catastrophic forgetting and overfitting in few-shot\nclass-incremental learning (FSCIL), previous work has primarily concentrated on\npreserving the memory of old knowledge during the incremental phase. The role\nof the pre-trained model in shaping the effectiveness of incremental learning\nis frequently underestimated in these studies. Therefore, to enhance the\ngeneralization ability of the pre-trained model, we propose Learning with Prior\nKnowledge (LwPK) by introducing nearly free prior knowledge from a few\nunlabeled data of subsequent incremental classes. We cluster unlabeled\nincremental class samples to produce pseudo-labels, then jointly train these\nwith labeled base class samples, effectively allocating embedding space for\nboth old and new class data. Experimental results indicate that LwPK\neffectively enhances model resilience against catastrophic forgetting, with\ntheoretical analysis based on empirical risk minimization and class distance\nmeasurement corroborating its operational principles. The source code of LwPK\nis publicly available at: \\url{https://github.com/StevenJ308/LwPK}.\n","authors":["Wenhao Jiang","Duo Li","Menghan Hu","Guangtao Zhai","Xiaokang Yang","Xiao-Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01201v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.08143v3","updated":"2024-02-02T09:37:57Z","published":"2023-08-16T04:31:33Z","title":"IIANet: An Intra- and Inter-Modality Attention Network for Audio-Visual\n Speech Separation","summary":" Recent research has made significant progress in designing fusion modules for\naudio-visual speech separation. However, they predominantly focus on\nmulti-modal fusion at a single temporal scale of auditory and visual features\nwithout employing selective attention mechanisms, which is in sharp contrast\nwith the brain. To address this issue, we propose a novel model called Intra-\nand Inter-Attention Network (IIANet), which leverages the attention mechanism\nfor efficient audio-visual feature fusion. IIANet consists of two types of\nattention blocks: intra-attention (IntraA) and inter-attention (InterA) blocks,\nwhere the InterA blocks are distributed at the top, middle and bottom of\nIIANet. Heavily inspired by the way the human brain selectively focuses on\nrelevant content at various temporal scales, these blocks maintain the ability\nto learn modality-specific features and enable the extraction of different\nsemantics from audio-visual features. Comprehensive experiments on three\nstandard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)\ndemonstrate the effectiveness of IIANet, outperforming previous\nstate-of-the-art methods while maintaining comparable inference time. 
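The pseudo-labeling step LwPK describes above, clustering unlabeled samples of future classes and training them jointly with labeled base classes, can be sketched in a few lines. The feature arrays, the cluster count, and the use of k-means are illustrative assumptions, not the paper's exact pipeline.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
base_X = rng.normal(size=(500, 64))            # labeled base-class features
base_y = rng.integers(0, 10, 500)              # base classes 0..9
novel_X = rng.normal(size=(200, 64))           # unlabeled incremental classes

# Cluster the unlabeled samples to obtain pseudo-labels, offset past the
# base-class label range so they occupy new label slots.
pseudo = KMeans(n_clusters=5, n_init=10, random_state=0).fit_predict(novel_X)
novel_y = pseudo + 10

# Joint training set: real labels for base classes, pseudo-labels for the rest.
X = np.concatenate([base_X, novel_X])
y = np.concatenate([base_y, novel_y])
print(X.shape, np.unique(y))
```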
In\nparticular, the fast version of IIANet (IIANet-fast) has only 7% of CTCNet's\nMACs and is 40% faster than CTCNet on CPUs while achieving better separation\nquality, showing the great potential of attention mechanisms for efficient and\neffective multimodal fusion.\n","authors":["Kai Li","Runxuan Yang","Fuchun Sun","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08143v3.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.07929v3","updated":"2024-02-02T08:02:35Z","published":"2023-09-13T05:43:35Z","title":"Prompting Segmentation with Sound Is Generalizable Audio-Visual Source\n Localizer","summary":" Never having seen an object and heard its sound simultaneously, can the model\nstill accurately localize its visual position from the input audio? In this\nwork, we concentrate on the Audio-Visual Localization and Segmentation tasks\nbut under the demanding zero-shot and few-shot scenarios. To achieve this goal,\ndifferent from existing approaches that mostly employ the\nencoder-fusion-decoder paradigm to decode localization information from the\nfused audio-visual feature, we introduce the encoder-prompt-decoder paradigm,\naiming to better fit the data scarcity and varying data distribution dilemmas\nwith the help of abundant knowledge from pre-trained models. Specifically, we\nfirst propose to construct a Semantic-aware Audio Prompt (SAP) to help the\nvisual foundation model focus on sounding objects; meanwhile, the semantic gap\nbetween the visual and audio modalities is also encouraged to shrink. Then, we\ndevelop a Correlation Adapter (ColA) to keep training efforts minimal while\nmaintaining adequate knowledge of the visual foundation model. Equipped with\nthese means, this new paradigm outperforms other fusion-based methods in both\nthe unseen-class and cross-dataset settings, as extensive experiments\ndemonstrate. We hope that our work can further promote the\ngeneralization study of Audio-Visual Localization and Segmentation in practical\napplication scenarios.\n","authors":["Yaoting Wang","Weisong Liu","Guangyao Li","Jian Ding","Di Hu","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2309.07929v3.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2402.01180v1","updated":"2024-02-02T06:54:06Z","published":"2024-02-02T06:54:06Z","title":"Real-time Extended Reality Video Transmission Optimization Based on\n Frame-priority Scheduling","summary":" Extended Reality (XR) is an important service in the 5G network and in future\n6G networks. In contrast to traditional video on demand services, real-time XR\nvideo is transmitted frame by frame, requiring low latency and being highly\nsensitive to network fluctuations. In this paper, we model the quality of\nexperience (QoE) for real-time XR video transmission on a frame-by-frame basis.\nBased on the proposed QoE model, we formulate an optimization problem that\nmaximizes QoE with constraints on wireless resources and long-term energy\nconsumption. We utilize Lyapunov optimization to transform the original problem\ninto a single-frame optimization problem and then allocate wireless\nsubchannels. We propose an adaptive XR video bitrate algorithm that employs a\nLong Short Term Memory (LSTM) based Deep Q-Network (DQN) algorithm for video\nbitrate selection. 
Through numerical results, we show that our proposed\nalgorithm outperforms the baseline algorithms, with the average QoE\nimprovements of 5.9% to 80.0%.\n","authors":["Guangjin Pan","Shugong Xu","Shunqing Zhang","Xiaojing Chen","Yanzan Sun"],"pdf_url":"https://arxiv.org/pdf/2402.01180v1.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.00045v2","updated":"2024-02-02T02:50:59Z","published":"2024-01-22T15:08:19Z","title":"Detecting Multimedia Generated by Large AI Models: A Survey","summary":" The rapid advancement of Large AI Models (LAIMs), particularly diffusion\nmodels and large language models, has marked a new era where AI-generated\nmultimedia is increasingly integrated into various aspects of daily life.\nAlthough beneficial in numerous fields, this content presents significant\nrisks, including potential misuse, societal disruptions, and ethical concerns.\nConsequently, detecting multimedia generated by LAIMs has become crucial, with\na marked rise in related research. Despite this, there remains a notable gap in\nsystematic surveys that focus specifically on detecting LAIM-generated\nmultimedia. Addressing this, we provide the first survey to comprehensively\ncover existing research on detecting multimedia (such as text, images, videos,\naudio, and multimodal content) created by LAIMs. Specifically, we introduce a\nnovel taxonomy for detection methods, categorized by media modality, and\naligned with two perspectives: pure detection (aiming to enhance detection\nperformance) and beyond detection (adding attributes like generalizability,\nrobustness, and interpretability to detectors). Additionally, we have presented\na brief overview of generation mechanisms, public datasets, and online\ndetection tools to provide a valuable resource for researchers and\npractitioners in this field. Furthermore, we identify current challenges in\ndetection and propose directions for future research that address unexplored,\nongoing, and emerging issues in detecting multimedia generated by LAIMs. Our\naim for this survey is to fill an academic gap and contribute to global AI\nsecurity efforts, helping to ensure the integrity of information in the digital\nrealm. The project link is\nhttps://github.com/Purdue-M2/Detect-LAIM-generated-Multimedia-Survey.\n","authors":["Li Lin","Neeraj Gupta","Yue Zhang","Hainan Ren","Chun-Hao Liu","Feng Ding","Xin Wang","Xin Li","Luisa Verdoliva","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2402.00045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15136v2","updated":"2024-02-02T19:07:40Z","published":"2023-09-26T14:28:16Z","title":"A multi-modal approach for identifying schizophrenia using cross-modal\n attention","summary":" This study focuses on how different modalities of human communication can be\nused to distinguish between healthy controls and subjects with schizophrenia\nwho exhibit strong positive symptoms. We developed a multi-modal schizophrenia\nclassification system using audio, video, and text. Facial action units and\nvocal tract variables were extracted as low-level features from video and audio\nrespectively, which were then used to compute high-level coordination features\nthat served as the inputs to the audio and video modalities.\nContext-independent text embeddings extracted from transcriptions of speech\nwere used as the input for the text modality. 
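The bitrate-selection step of the XR algorithm described above is a standard DQN action choice. A sketch with epsilon-greedy selection follows; the bitrate ladder, the state features, and the plain MLP (the paper uses an LSTM-based DQN) are all assumptions for illustration.

```python
import random
import torch
import torch.nn as nn

BITRATES = [1.0, 2.5, 5.0, 8.0, 16.0, 25.0]  # Mbps, hypothetical ladder

# Toy Q-network mapping a 6-dim state to one Q-value per candidate bitrate.
q_net = nn.Sequential(nn.Linear(6, 64), nn.ReLU(), nn.Linear(64, len(BITRATES)))

def select_bitrate(state: torch.Tensor, epsilon: float = 0.1) -> float:
    if random.random() < epsilon:                 # explore
        return random.choice(BITRATES)
    with torch.no_grad():                         # exploit: argmax_a Q(s, a)
        return BITRATES[q_net(state).argmax().item()]

# Hypothetical state: e.g., recent throughput, buffer level, last QoE terms.
state = torch.randn(6)
print(select_bitrate(state))
```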
The multi-modal system is\ndeveloped by fusing a segment-to-session-level classifier for video and audio\nmodalities with a text model based on a Hierarchical Attention Network (HAN)\nwith cross-modal attention. The proposed multi-modal system outperforms the\nprevious state-of-the-art multi-modal system by 8.53% in the weighted average\nF1 score.\n","authors":["Gowtham Premananth","Yashish M. Siriwardena","Philip Resnik","Carol Espy-Wilson"],"pdf_url":"https://arxiv.org/pdf/2309.15136v2.pdf","comment":"Submitted to Annual International Conference of the IEEE Engineering\n in Medicine and Biology Society 2024"}]},"2024-02-05T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2402.03303v1","updated":"2024-02-05T18:58:19Z","published":"2024-02-05T18:58:19Z","title":"Nevermind: Instruction Override and Moderation in Large Language Models","summary":" Given the impressive capabilities of recent Large Language Models (LLMs), we\ninvestigate and benchmark the most popular proprietary and different-sized open\nsource models on the task of explicit instruction following in conflicting\nsituations, e.g. overrides. These include the ability of the model to override\nthe knowledge within the weights of the model, the ability to override (or\nmoderate) extracted knowledge in the prompt, and lastly the ability to perform\na full jailbreak. The experimentation performed suggests several key findings\nfor improving instruction following - larger models perform the best in\nfollowing instructions that override internal and contextual instructions, and\nare obedient, even to a fault. When scaling to longer contexts via rope\nscaling, a significant buffer needs to be maintained from the edge of the\nperplexity cliff in order to maintain instruction following capabilities.\nFinally, we observe that improving instruction following, and subsequently\ninstruction overrides/jailbreaks, is fundamentally at odds with the ability of\na language model to follow given safety filters or guidelines. Thus, we\npostulate that the most effective approach for safe, trustworthy AI should be\nhandled externally to the LLM itself.\n","authors":["Edward Kim"],"pdf_url":"https://arxiv.org/pdf/2402.03303v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2402.03300v1","updated":"2024-02-05T18:55:32Z","published":"2024-02-05T18:55:32Z","title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open\n Language Models","summary":" Mathematical reasoning poses a significant challenge for language models due\nto its complex and structured nature. In this paper, we introduce DeepSeekMath\n7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B\nmath-related tokens sourced from Common Crawl, together with natural language\nand code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the\ncompetition-level MATH benchmark without relying on external toolkits and\nvoting techniques, approaching the performance level of Gemini-Ultra and GPT-4.\nSelf-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH.\nThe mathematical reasoning capability of DeepSeekMath is attributed to two key\nfactors: First, we harness the significant potential of publicly available web\ndata through a meticulously engineered data selection pipeline. 
Second, we\nintroduce Group Relative Policy Optimization (GRPO), a variant of Proximal\nPolicy Optimization (PPO), that enhances mathematical reasoning abilities while\nconcurrently optimizing the memory usage of PPO.\n","authors":["Zhihong Shao","Peiyi Wang","Qihao Zhu","Runxin Xu","Junxiao Song","Mingchuan Zhang","Y. K. Li","Y. Wu","Daya Guo"],"pdf_url":"https://arxiv.org/pdf/2402.03300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03299v1","updated":"2024-02-05T18:54:43Z","published":"2024-02-05T18:54:43Z","title":"GUARD: Role-playing to Generate Natural-language Jailbreakings to Test\n Guideline Adherence of Large Language Models","summary":" The discovery of \"jailbreaks\" that bypass the safety filters of Large\nLanguage Models (LLMs) and elicit harmful responses has encouraged the\ncommunity to implement safety measures. One major safety measure is to\nproactively test LLMs with jailbreaks prior to release. Therefore, such testing\nrequires a method that can generate jailbreaks massively and efficiently. In\nthis paper, we follow a novel yet intuitive strategy to generate jailbreaks in\nthe style of human generation. We propose a role-playing system that assigns four\ndifferent roles to the user LLMs to collaborate on new jailbreaks. Furthermore,\nwe collect existing jailbreaks and split them into different independent\ncharacteristics using clustering frequency and semantic patterns sentence by\nsentence. We organize these characteristics into a knowledge graph, making them\nmore accessible and easier to retrieve. Our system of different roles will\nleverage this knowledge graph to generate new jailbreaks, which have proved\neffective in inducing LLMs to generate unethical or guideline-violating\nresponses. In addition, we also pioneer a setting in our system that will\nautomatically follow the government-issued guidelines to generate jailbreaks to\ntest whether LLMs follow the guidelines accordingly. We refer to our system as\nGUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have\nempirically validated the effectiveness of GUARD on three cutting-edge\nopen-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a\nwidely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the\nrealm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing\nGUARD's versatility and contributing valuable insights for the development of\nsafer, more reliable LLM-based applications across diverse modalities.\n","authors":["Haibo Jin","Ruoxi Chen","Andy Zhou","Jinyin Chen","Yang Zhang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03299v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2402.03284v1","updated":"2024-02-05T18:39:47Z","published":"2024-02-05T18:39:47Z","title":"Deal, or no deal (or who knows)? Forecasting Uncertainty in\n Conversations using Large Language Models","summary":" Effective interlocutors account for the uncertain goals, beliefs, and\nemotions of others. But even the best human conversationalist cannot perfectly\nanticipate the trajectory of a dialogue. How well can language models represent\ninherent uncertainty in conversations? We propose FortUne Dial, an expansion of\nthe long-standing \"conversation forecasting\" task: instead of just accuracy,\nevaluation is conducted with uncertainty-aware metrics, effectively enabling\nabstention on individual instances. 
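The group-relative idea in GRPO above can be sketched concretely: sample a group of outputs per prompt, then normalize each output's reward within its group, so no learned value baseline is needed. The surrounding PPO-style clipped policy update is omitted here, and the reward values are dummies.

```python
import torch

def group_relative_advantages(rewards: torch.Tensor) -> torch.Tensor:
    """rewards: (num_prompts, group_size) scalar reward per sampled output.

    Returns per-output advantages computed relative to the group's own
    mean and standard deviation, in place of a critic's value estimate.
    """
    mean = rewards.mean(dim=1, keepdim=True)
    std = rewards.std(dim=1, keepdim=True)
    return (rewards - mean) / (std + 1e-8)

# Two prompts, four sampled solutions each; 1.0 = correct, 0.0 = incorrect.
rewards = torch.tensor([[0.0, 1.0, 1.0, 0.0],
                        [1.0, 1.0, 1.0, 0.0]])
print(group_relative_advantages(rewards))
```

Because the baseline comes from the group statistics rather than a separate value network, memory usage drops relative to standard PPO, which matches the abstract's framing.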
We study two ways in which language models\npotentially represent outcome uncertainty (internally, using scores and\ndirectly, using tokens) and propose fine-tuning strategies to improve\ncalibration of both representations. Experiments on eight difficult negotiation\ncorpora demonstrate that our proposed fine-tuning strategies (a traditional\nsupervision strategy and an off-policy reinforcement learning strategy) can\ncalibrate smaller open-source models to compete with pre-trained models 10x\ntheir size.\n","authors":["Anthony Sicilia","Hyunwoo Kim","Khyathi Raghavi Chandu","Malihe Alikhani","Jack Hessel"],"pdf_url":"https://arxiv.org/pdf/2402.03284v1.pdf","comment":"2 Figures; 7 Tables; 27 pages"},{"id":"http://arxiv.org/abs/2310.05707v3","updated":"2024-02-05T18:33:44Z","published":"2023-10-09T13:29:37Z","title":"Guiding Language Model Math Reasoning with Planning Tokens","summary":" Large language models (LLMs) have recently attracted considerable interest\nfor their ability to perform complex reasoning tasks, such as chain-of-thought\nreasoning. However, most of the existing approaches to enhance this ability\nrely heavily on data-driven methods, while neglecting the structural aspects of\nthe model's reasoning capacity. We find that while LLMs can manage individual\nreasoning steps well, they struggle with maintaining consistency across an\nentire reasoning chain. To solve this, we introduce planning tokens at the\nstart of each reasoning step, serving as a guide for the model, and add their\nembeddings to the model parameters. Our approach requires a negligible increase\nin trainable parameters (just 0.001%) and can be applied through either full\nfine-tuning or a more parameter-efficient scheme. We demonstrate our method's\neffectiveness by applying it to three different LLMs, showing notable accuracy\nimprovements across three math word problem datasets w.r.t. standard\nfine-tuning baselines.\n","authors":["Xinyi Wang","Lucas Caccia","Oleksiy Ostapenko","Xingdi Yuan","William Yang Wang","Alessandro Sordoni"],"pdf_url":"https://arxiv.org/pdf/2310.05707v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14652v2","updated":"2024-02-05T18:30:30Z","published":"2023-11-24T18:35:00Z","title":"One Pass Streaming Algorithm for Super Long Token Attention\n Approximation in Sublinear Space","summary":" Attention computation takes both the time complexity of $O(n^2)$ and the\nspace complexity of $O(n^2)$ simultaneously, which makes deploying Large\nLanguage Models (LLMs) in streaming applications that involve long contexts\nrequire substantial computational resources. At the recent OpenAI DevDay (Nov 6,\n2023), OpenAI released a new model that is able to support a 128K-long\ndocument. In our paper, we focus on the memory-efficiency issue when the context\nlength $n$ is much greater than 128K ($n \\gg 2^d$). Considering a single-layer\nself-attention with Query, Key, and Value matrices $Q, K, V \\in \\mathbb{R}^{n\n\\times d}$, the polynomial method approximates the attention output $T \\in\n\\mathbb{R}^{n \\times d}$. It accomplishes this by constructing $U_1, U_2 \\in\n\\mathbb{R}^{n \\times t}$ to expedite attention ${\\sf Attn}(Q, K, V)$\ncomputation within $n^{1+o(1)}$ time. Despite this, computing the\napproximated attention matrix $U_1U_2^\\top \\in \\mathbb{R}^{n \\times n}$ still\nnecessitates $O(n^2)$ space, leading to significant memory usage. In response\nto these challenges, we introduce a new algorithm that reads the data in only\none pass, in a streaming fashion.
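The planning-token scheme above works by giving each reasoning-step type its own vocabulary entry whose embedding is trained. A minimal sketch of registering such tokens with Hugging Face transformers; the token names and base model are placeholder assumptions, and the paper also describes a more parameter-efficient variant:

```python
# Sketch: new special tokens get fresh embedding rows, which become the
# small set of trainable parameters guiding each reasoning step.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")             # placeholder base LM
model = AutoModelForCausalLM.from_pretrained("gpt2")

planning_tokens = ["<plan_add>", "<plan_sub>", "<plan_mul>"]  # hypothetical step types
tokenizer.add_special_tokens({"additional_special_tokens": planning_tokens})
model.resize_token_embeddings(len(tokenizer))                 # adds trainable rows

# Training text would prefix each reasoning step with its planning token:
step = "<plan_add> Combine the 3 red apples and 5 green apples to get 8."
print(tokenizer.tokenize(step)[0])  # '<plan_add>'
```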
This method employs sublinear space $o(n)$ to\nstore three sketch matrices, alleviating the need for exact $K, V$ storage.\nNotably, our algorithm exhibits exceptional memory-efficient performance with\nsuper-long tokens. As the token length $n$ increases, our error guarantee\ndiminishes while the memory usage remains nearly constant. This unique\nattribute underscores the potential of our technique in efficiently handling\nLLMs in streaming applications.\n","authors":["Raghav Addanki","Chenyang Li","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2311.14652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03271v1","updated":"2024-02-05T18:28:44Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n Seeking in Large Language Models","summary":" In the face of uncertainty, the ability to seek information is of fundamental\nimportance. In many practical applications, such as medical diagnosis and\ntroubleshooting, the information needed to solve the task is not initially\ngiven, and has to be actively sought by asking follow-up questions (for\nexample, a doctor asking a patient for more details about their symptoms). In\nthis work, we introduce Uncertainty of Thoughts (UoT), an algorithm to augment\nlarge language models with the ability to actively seek information by asking\neffective questions. UoT combines 1) an uncertainty-aware simulation approach\nwhich enables the model to simulate possible future scenarios and how likely\nthey are to occur, 2) uncertainty-based rewards motivated by information gain\nwhich incentivizes the model to seek information, and 3) a reward propagation\nscheme to select the optimal question to ask in a way that maximizes the\nexpected reward. In experiments on medical diagnosis, troubleshooting and the\n'20 Questions' game, UoT achieves an average performance improvement of 57.8%\nin the rate of successful task completion across multiple LLMs compared with\ndirect prompting, and also improves efficiency (i.e., the number of questions\nneeded to complete the task).\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2402.03269v1","updated":"2024-02-05T18:27:27Z","published":"2024-02-05T18:27:27Z","title":"ISPA: Inter-Species Phonetic Alphabet for Transcribing Animal Sounds","summary":" Traditionally, bioacoustics has relied on spectrograms and continuous,\nper-frame audio representations for the analysis of animal sounds, also serving\nas input to machine learning models. Meanwhile, the International Phonetic\nAlphabet (IPA) system has provided an interpretable, language-independent\nmethod for transcribing human speech sounds. In this paper, we introduce ISPA\n(Inter-Species Phonetic Alphabet), a precise, concise, and interpretable system\ndesigned for transcribing animal sounds into text. We compare acoustics-based\nand feature-based methods for transcribing and classifying animal sounds,\ndemonstrating their comparable performance with baseline methods utilizing\ncontinuous, dense audio representations. 
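The uncertainty-based rewards in the UoT entry above are motivated by information gain. A toy rendering of that idea for a single yes/no question, under the assumption of an explicit hypothesis set with known answer likelihoods (the paper instead scores LLM-simulated futures):

```python
import math

def entropy(p):
    return -sum(x * math.log2(x) for x in p if x > 0)

def expected_info_gain(prior, p_yes_given_h):
    """Expected entropy reduction from one yes/no question.
    prior[i] = P(hypothesis i); p_yes_given_h[i] = P(answer "yes" | i)."""
    p_yes = sum(p * l for p, l in zip(prior, p_yes_given_h))
    post_yes = [p * l / p_yes for p, l in zip(prior, p_yes_given_h)]
    post_no = [p * (1 - l) / (1 - p_yes) for p, l in zip(prior, p_yes_given_h)]
    return entropy(prior) - (p_yes * entropy(post_yes)
                             + (1 - p_yes) * entropy(post_no))

# Three candidate diagnoses; the question mostly separates the first from the rest.
print(expected_info_gain([0.5, 0.3, 0.2], [0.9, 0.1, 0.1]))  # ~0.53 bits
```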
By representing animal sounds with\ntext, we effectively treat them as a \"foreign language,\" and we show that\nestablished human language ML paradigms and models, such as language models,\ncan be successfully applied to improve performance.\n","authors":["Masato Hagiwara","Marius Miron","Jen-Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2402.03269v1.pdf","comment":"Accepted at XAI-AI Workshop (IEEEXplore track) @ ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.03268v1","updated":"2024-02-05T18:25:51Z","published":"2024-02-05T18:25:51Z","title":"Understanding the Reasoning Ability of Language Models From the\n Perspective of Reasoning Paths Aggregation","summary":" Pre-trained language models (LMs) are able to perform complex reasoning\nwithout explicit fine-tuning. To understand how pre-training with a next-token\nprediction objective contributes to the emergence of such reasoning capability,\nwe propose that we can view an LM as deriving new conclusions by aggregating\nindirect reasoning paths seen at pre-training time. We found this perspective\neffective in two important cases of reasoning: logic reasoning with knowledge\ngraphs (KGs) and math reasoning with math word problems (MWPs). More\nspecifically, we formalize the reasoning paths as random walk paths on the\nknowledge/reasoning graphs. Analyses of learned LM distributions suggest that a\nweighted sum of relevant random walk path probabilities is a reasonable way to\nexplain how LMs reason. Experiments and analysis on multiple KG and MWP\ndatasets reveal the effect of training on random walk paths and suggest that\naugmenting unlabeled random walk reasoning paths can improve real-world\nmulti-step reasoning performance.\n","authors":["Xinyi Wang","Alfonso Amayuelas","Kexun Zhang","Liangming Pan","Wenhu Chen","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00024v2","updated":"2024-02-05T18:24:51Z","published":"2024-01-05T18:31:34Z","title":"Comparative Analysis of LLaMA and ChatGPT Embeddings for Molecule\n Embedding","summary":" Purpose: Large Language Models (LLMs) like ChatGPT and LLaMA are increasingly\nrecognized for their potential in the field of cheminformatics, particularly in\ninterpreting Simplified Molecular Input Line Entry System (SMILES), a standard\nmethod for representing chemical structures. These LLMs can decode SMILES\nstrings into vector representations, providing a novel approach to\nunderstanding chemical graphs.\n Methods: We investigate the performance of ChatGPT and LLaMA in embedding\nSMILES strings. Our evaluation focuses on two key applications: molecular\nproperty (MP) prediction and drug-drug interaction (DDI) prediction, both\nessential in drug development and healthcare.\n Results: We find that SMILES embeddings generated using LLaMA outperform\nthose from ChatGPT in both MP and DDI prediction tasks. Notably, LLaMA-based\nSMILES embeddings show results comparable to existing methods in both\nprediction tasks.\n Conclusion: The application of LLMs in cheminformatics, particularly in\nutilizing SMILES embeddings, shows significant promise for advancing drug\ndevelopment. This includes improving the prediction of chemical properties and\nfacilitating the drug discovery process. 
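The "reasoning paths aggregation" entry above views LM reasoning as a weighted sum of random-walk path probabilities over a knowledge graph. A toy version under the simplifying assumption of a uniform, unweighted walk (a learned LM distribution would weight edges differently):

```python
from collections import defaultdict

def random_walk_distribution(graph, start, steps):
    """P(ending at each node) after a uniform random walk of `steps` hops,
    i.e. an aggregate over all length-`steps` paths from `start`."""
    probs = {start: 1.0}
    for _ in range(steps):
        nxt = defaultdict(float)
        for node, p in probs.items():
            neighbors = graph.get(node, [])
            for nb in neighbors:
                nxt[nb] += p / len(neighbors)
        probs = dict(nxt)
    return probs

# Tiny knowledge graph; aggregate all 2-hop paths from "socrates".
kg = {"socrates": ["human"], "human": ["mortal", "animal"]}
print(random_walk_distribution(kg, "socrates", 2))  # {'mortal': 0.5, 'animal': 0.5}
```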
GitHub:\nhttps://github.com/sshaghayeghs/LLaMA-VS-ChatGPT\n","authors":["Shaghayegh Sadeghi","Alan Bui","Ali Forooghi","Jianguo Lu","Alioune Ngom"],"pdf_url":"https://arxiv.org/pdf/2402.00024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17256v2","updated":"2024-02-05T18:19:46Z","published":"2024-01-30T18:48:37Z","title":"Weak-to-Strong Jailbreaking on Large Language Models","summary":" Large language models (LLMs) are vulnerable to jailbreak attacks - resulting\nin harmful, unethical, or biased text generations. However, existing\njailbreaking methods are computationally costly. In this paper, we propose the\nweak-to-strong jailbreaking attack, an efficient method to attack aligned LLMs\nto produce harmful text. Our key intuition is based on the observation that\njailbroken and aligned models only differ in their initial decoding\ndistributions. The weak-to-strong attack's key technical insight is using two\nsmaller models (a safe and an unsafe one) to adversarially modify a\nsignificantly larger safe model's decoding probabilities. We evaluate the\nweak-to-strong attack on 5 diverse LLMs from 3 organizations. The results show\nour method can increase the misalignment rate to over 99% on two datasets with\njust one forward pass per example. Our study exposes an urgent safety issue\nthat needs to be addressed when aligning LLMs. As an initial attempt, we\npropose a defense strategy to protect against such attacks, but creating more\nadvanced defenses remains challenging. The code for replicating the method is\navailable at https://github.com/XuandongZhao/weak-to-strong\n","authors":["Xuandong Zhao","Xianjun Yang","Tianyu Pang","Chao Du","Lei Li","Yu-Xiang Wang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.17256v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03244v1","updated":"2024-02-05T17:59:00Z","published":"2024-02-05T17:59:00Z","title":"Skill Set Optimization: Reinforcing Language Model Behavior via\n Transferable Skills","summary":" Large language models (LLMs) have recently been used for sequential decision\nmaking in interactive environments. However, leveraging environment reward\nsignals for continual LLM actor improvement is not straightforward. We propose\nSkill Set Optimization (SSO) for improving LLM actor performance through\nconstructing and refining sets of transferable skills. SSO constructs skills by\nextracting common subtrajectories with high rewards and generating subgoals and\ninstructions to represent each skill. These skills are provided to the LLM\nactor in-context to reinforce behaviors with high rewards. Then, SSO further\nrefines the skill set by pruning skills that do not continue to result in high\nrewards. We evaluate our method in the classic videogame NetHack and the text\nenvironment ScienceWorld to demonstrate SSO's ability to optimize a set of\nskills and perform in-context policy improvement. 
SSO outperforms baselines by\n40% in our custom NetHack task and outperforms the previous state-of-the-art in\nScienceWorld by 35%.\n","authors":["Kolby Nottingham","Bodhisattwa Prasad Majumder","Bhavana Dalvi Mishra","Sameer Singh","Peter Clark","Roy Fox"],"pdf_url":"https://arxiv.org/pdf/2402.03244v1.pdf","comment":"8 pages, preprint"},{"id":"http://arxiv.org/abs/2402.03242v1","updated":"2024-02-05T17:57:26Z","published":"2024-02-05T17:57:26Z","title":"JOBSKAPE: A Framework for Generating Synthetic Job Postings to Enhance\n Skill Matching","summary":" Recent approaches in skill matching, employing synthetic training data for\nclassification or similarity model training, have shown promising results,\nreducing the need for time-consuming and expensive annotations. However,\nprevious synthetic datasets have limitations, such as featuring only one skill\nper sentence and generally comprising short sentences. In this paper, we\nintroduce JobSkape, a framework to generate synthetic data that tackles these\nlimitations, specifically designed to enhance skill-to-taxonomy matching.\nWithin this framework, we create SkillSkape, a comprehensive open-source\nsynthetic dataset of job postings tailored for skill-matching tasks. We\nintroduce several offline metrics that show that our dataset resembles\nreal-world data. Additionally, we present a multi-step pipeline for skill\nextraction and matching tasks using large language models (LLMs), benchmarking\nagainst known supervised methodologies. We show that the downstream\nevaluation results on real-world data can beat baselines, underscoring the\nframework's efficacy and adaptability.\n","authors":["Antoine Magron","Anna Dai","Mike Zhang","Syrielle Montariol","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2402.03242v1.pdf","comment":"Published at NLP4HR 2024 (EACL Workshop)"},{"id":"http://arxiv.org/abs/2304.11082v5","updated":"2024-02-05T17:57:06Z","published":"2023-04-19T17:50:09Z","title":"Fundamental Limitations of Alignment in Large Language Models","summary":" An important aspect in developing language models that interact with humans\nis aligning their behavior to be useful and unharmful for their human users.\nThis is usually achieved by tuning the model in a way that enhances desired\nbehaviors and inhibits undesired ones, a process referred to as alignment. In\nthis paper, we propose a theoretical approach called Behavior Expectation\nBounds (BEB) which allows us to formally investigate several inherent\ncharacteristics and limitations of alignment in large language models.\nImportantly, we prove that within the limits of this framework, for any\nbehavior that has a finite probability of being exhibited by the model, there\nexist prompts that can trigger the model into outputting this behavior, with\nprobability that increases with the length of the prompt. This implies that any\nalignment process that attenuates an undesired behavior but does not remove it\naltogether is not safe against adversarial prompting attacks. Furthermore, our\nframework hints at the mechanism by which leading alignment approaches such as\nreinforcement learning from human feedback make the LLM prone to being prompted\ninto the undesired behaviors. This theoretical result is being experimentally\ndemonstrated at large scale by the so-called contemporary \"chatGPT jailbreaks\",\nwhere adversarial users trick the LLM into breaking its alignment guardrails by\ntriggering it into acting as a malicious persona.
Our results expose\nfundamental limitations in alignment of LLMs and bring to the forefront the\nneed to devise reliable mechanisms for ensuring AI safety.\n","authors":["Yotam Wolf","Noam Wies","Oshri Avnery","Yoav Levine","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2304.11082v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12585v3","updated":"2024-02-05T17:49:40Z","published":"2024-01-23T09:33:31Z","title":"SLANG: New Concept Comprehension of Large Language Models","summary":" The dynamic nature of language, particularly evident in the realm of slang\nand memes on the Internet, poses serious challenges to the adaptability of\nlarge language models (LLMs). Traditionally anchored to static datasets, these\nmodels often struggle to keep up with the rapid linguistic evolution\ncharacteristic of online communities. This research aims to bridge this gap by\nenhancing LLMs' comprehension of the evolving new concepts on the Internet,\nwithout the high cost of continual retraining. In pursuit of this goal, we\npropose a new benchmark $\\textbf{SLANG}$, which can autonomously integrate\nnovel data to keep the dataset up-to-date, to assess LLMs' capability in\ncomprehending emerging concepts, and an approach $\\textbf{FOCUS}$, which uses\ncausal inference to help LLMs understand new phrases and their colloquial\ncontext. Our benchmark and approach involve digesting real-world instances of\nlinguistic shifts, serving as contextual beacons, to form more precise and\ncontextually relevant connections between newly emerging expressions and their\nmeanings. The empirical analysis shows that our causal inference-based approach\noutperforms the traditional models in terms of precision and relevance in the\ncomprehension of Internet slang and memes.\n","authors":["Lingrui Mei","Shenghua Liu","Yiwei Wang","Baolong Bi","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.12585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03223v1","updated":"2024-02-05T17:36:19Z","published":"2024-02-05T17:36:19Z","title":"English Prompts are Better for NLI-based Zero-Shot Emotion\n Classification than Target-Language Prompts","summary":" Emotion classification in text is a challenging and subjective task, due to\nthe involved cognitive inference processes that are required to interpret a\ntextual stimulus. In addition, the set of emotion categories is highly\ndomain-specific. For instance, literature analysis might require the use of\naesthetic emotions (e.g., finding something beautiful), and social media\nanalysis could benefit from fine-grained sets (e.g., separating anger from\nannoyance) in contrast to basic emotion categories. This renders the task an\ninteresting field for zero-shot classifications, in which the label set is not\nknown at model development time. Unfortunately, most resources for emotion\nanalysis are English, and therefore, most studies on emotion analysis have been\nperformed in English, including those that involve prompting language models\nfor text labels. This leaves us with a research gap that we address in this\npaper: In which language should we prompt for emotion labels on non-English\ntexts? This is particularly of interest when we have access to a multilingual\nlarge language model, because we could request labels with English prompts even\nfor non-English data.
Our experiments with natural language inference-based\nlanguage models show that it is consistently better to use English prompts even\nif the data is in a different language.\n","authors":["Patrick Barreiß","Roman Klinger","Jeremy Barnes"],"pdf_url":"https://arxiv.org/pdf/2402.03223v1.pdf","comment":"submitted to the PromptEng workshop at The Web Conf"},{"id":"http://arxiv.org/abs/2401.07964v2","updated":"2024-02-05T17:34:17Z","published":"2024-01-15T21:06:20Z","title":"AI-as-exploration: Navigating intelligence space","summary":" Artificial Intelligence is a field that lives many lives, and the term has\ncome to encompass a motley collection of scientific and commercial endeavours.\nIn this paper, I articulate the contours of a rather neglected but central\nscientific role that AI has to play, which I dub `AI-as-exploration'. The basic\nthrust of AI-as-exploration is that of creating and studying systems that can\nreveal candidate building blocks of intelligence that may differ from the forms\nof human and animal intelligence we are familiar with. In other words, I\nsuggest that AI is one of the best tools we have for exploring intelligence\nspace, namely the space of possible intelligent systems. I illustrate the value\nof AI-as-exploration by focusing on a specific case study, i.e., recent work on\nthe capacity to combine novel and invented concepts in humans and Large\nLanguage Models. I show that the latter, despite showing human-level accuracy\nin such a task, most probably solve it in ways radically different from those\nhypothesised for humans, but no less relevant to intelligence research.\n","authors":["Dimitri Coelho Mollo"],"pdf_url":"https://arxiv.org/pdf/2401.07964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03221v1","updated":"2024-02-05T17:33:22Z","published":"2024-02-05T17:33:22Z","title":"\"Define Your Terms\" : Enhancing Efficient Offensive Speech\n Classification with Definition","summary":" The propagation of offensive content through social media channels has\ngarnered the attention of the research community. Multiple works have proposed\nvarious semantically related yet subtly distinct categories of offensive\nspeech. In this work, we explore meta-learning approaches to leverage the\ndiversity of offensive speech corpora to enhance their reliable and efficient\ndetection. We propose a joint embedding architecture that incorporates the\ninput's label and definition for classification via Prototypical Network. Our\nmodel achieves at least 75% of the maximal F1-score while using less than 10%\nof the available training data across 4 datasets. Our experimental findings\nalso provide a case study of training strategies valuable to combat resource\nscarcity.\n","authors":["Huy Nghiem","Umang Gupta","Fred Morstatter"],"pdf_url":"https://arxiv.org/pdf/2402.03221v1.pdf","comment":"Accepted to Main Conference, EACL 2024"},{"id":"http://arxiv.org/abs/2305.14195v3","updated":"2024-02-05T17:28:07Z","published":"2023-05-23T16:15:24Z","title":"HumBEL: A Human-in-the-Loop Approach for Evaluating Demographic Factors\n of Language Models in Human-Machine Conversations","summary":" While demographic factors like age and gender change the way people talk, and\nin particular, the way people talk to machines, there is little investigation\ninto how large pre-trained language models (LMs) can adapt to these changes. To\nremedy this gap, we consider how demographic factors in LM language skills can\nbe measured to determine compatibility with a target demographic.
We suggest\nclinical techniques from Speech Language Pathology, which has norms for\nacquisition of language skills in humans. We conduct evaluation with a domain\nexpert (i.e., a clinically licensed speech language pathologist), and also\npropose automated techniques to complement clinical evaluation at scale.\nEmpirically, we focus on age, finding LM capability varies widely depending on\ntask: GPT-3.5 mimics the ability of humans ranging from age 6-15 at tasks\nrequiring inference, and simultaneously, outperforms a typical 21-year-old at\nmemorization. GPT-3.5 also has trouble with social language use, exhibiting\nless than 50% of the tested pragmatic skills. Findings affirm the importance of\nconsidering demographic alignment and conversational goals when using LMs as\npublic-facing tools. Code, data, and a package will be available.\n","authors":["Anthony Sicilia","Jennifer C. Gates","Malihe Alikhani"],"pdf_url":"https://arxiv.org/pdf/2305.14195v3.pdf","comment":"17 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2402.03216v1","updated":"2024-02-05T17:26:49Z","published":"2024-02-05T17:26:49Z","title":"BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity\n Text Embeddings Through Self-Knowledge Distillation","summary":" In this paper, we present a new embedding model, called M3-Embedding, which\nis distinguished for its versatility in Multi-Linguality, Multi-Functionality,\nand Multi-Granularity. It can support more than 100 working languages, leading\nto new state-of-the-art performances on multi-lingual and cross-lingual\nretrieval tasks. It can simultaneously perform the three common retrieval\nfunctionalities of an embedding model: dense retrieval, multi-vector retrieval,\nand sparse retrieval, which provides a unified model foundation for real-world\nIR applications. It is able to process inputs of different granularities,\nspanning from short sentences to long documents of up to 8192 tokens. The\neffective training of M3-Embedding involves the following technical\ncontributions. We propose a novel self-knowledge distillation approach, where\nthe relevance scores from different retrieval functionalities can be integrated\nas the teacher signal to enhance the training quality. We also optimize the\nbatching strategy, enabling a large batch size and high training throughput to\nensure the discriminativeness of embeddings. To the best of our knowledge,\nM3-Embedding is the first embedding model which realizes such a strong\nversatility. The model and code will be publicly available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Jianlv Chen","Shitao Xiao","Peitian Zhang","Kun Luo","Defu Lian","Zheng Liu"],"pdf_url":"https://arxiv.org/pdf/2402.03216v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.15222v2","updated":"2024-02-05T17:13:41Z","published":"2024-01-26T22:19:31Z","title":"Transfer Learning for the Prediction of Entity Modifiers in Clinical\n Text: Application to Opioid Use Disorder Case Detection","summary":" Background: The semantics of entities extracted from a clinical text can be\ndramatically altered by modifiers, including entity negation, uncertainty,\nconditionality, severity, and subject.
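M3-Embedding's self-knowledge distillation, as described above, integrates the relevance scores of the three retrieval functionalities into a single teacher signal. A rough sketch, assuming a plain score sum followed by a softmax over candidates (the paper's exact weighting and loss may differ):

```python
import numpy as np

def teacher_distribution(dense, sparse, multivec):
    """Fuse per-candidate relevance scores from the three retrieval modes
    into one teacher distribution over candidate passages."""
    s = np.asarray(dense) + np.asarray(sparse) + np.asarray(multivec)
    e = np.exp(s - s.max())          # numerically stable softmax
    return e / e.sum()

# Scores for three candidate passages from each retrieval functionality.
print(teacher_distribution([5.1, 2.0, 0.3], [3.2, 2.9, 0.1], [4.8, 1.5, 0.2]))
```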
Existing models for determining\nmodifiers of clinical entities involve regular expressions or feature weights\nthat are trained independently for each modifier.\n Methods: We develop and evaluate a multi-task transformer architecture\nwhere modifiers are learned and predicted jointly using the publicly available\nSemEval 2015 Task 14 corpus and a new Opioid Use Disorder (OUD) data set that\ncontains modifiers shared with SemEval as well as novel modifiers specific for\nOUD. We evaluate the effectiveness of our multi-task learning approach versus\npreviously published systems and assess the feasibility of transfer learning\nfor clinical entity modifiers when only a portion of clinical modifiers are\nshared.\n Results: Our approach achieved state-of-the-art results on the ShARe corpus\nfrom SemEval 2015 Task 14, showing an increase of 1.1% on weighted accuracy,\n1.7% on unweighted accuracy, and 10% on micro F1 scores.\n Conclusions: We show that learned weights from our shared model can be\neffectively transferred to a new partially matched data set, validating the use\nof transfer learning for clinical text modifiers.\n","authors":["Abdullateef I. Almudaifer","Whitney Covington","JaMor Hairston","Zachary Deitch","Ankit Anand","Caleb M. Carroll","Estera Crisan","William Bradford","Lauren Walter","Eaton Ellen","Sue S. Feldman","John D. Osborne"],"pdf_url":"https://arxiv.org/pdf/2401.15222v2.pdf","comment":"18 pages, 2 figures, 6 tables. To be submitted to the Journal of\n Biomedical Semantics"},{"id":"http://arxiv.org/abs/2402.03191v1","updated":"2024-02-05T16:57:24Z","published":"2024-02-05T16:57:24Z","title":"Isotropy, Clusters, and Classifiers","summary":" Whether embedding spaces use all their dimensions equally, i.e., whether they\nare isotropic, has been a recent subject of discussion. Evidence has been\naccrued both for and against enforcing isotropy in embedding spaces. In the\npresent paper, we stress that isotropy imposes requirements on the embedding\nspace that are not compatible with the presence of clusters -- which also\nnegatively impacts linear classification objectives. We demonstrate this fact\nempirically and use it to shed light on previous results from the literature.\n","authors":["Timothee Mickus","Stig-Arne Grönroos","Joseph Attieh"],"pdf_url":"https://arxiv.org/pdf/2402.03191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03190v1","updated":"2024-02-05T16:56:11Z","published":"2024-02-05T16:56:11Z","title":"Unified Hallucination Detection for Multimodal Large Language Models","summary":" Despite significant strides in multimodal tasks, Multimodal Large Language\nModels (MLLMs) are plagued by the critical issue of hallucination. The reliable\ndetection of such hallucinations in MLLMs has, therefore, become a vital aspect\nof model evaluation and the safeguarding of practical application deployment.\nPrior research in this domain has been constrained by a narrow focus on\nsingular tasks, an inadequate range of hallucination categories addressed, and\na lack of detailed granularity. In response to these challenges, our work\nexpands the investigative horizons of hallucination detection. We present a\nnovel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate\nthe evaluation of advancements in hallucination detection methods.\nAdditionally, we unveil a novel unified multimodal hallucination detection\nframework, UNIHD, which leverages a suite of auxiliary tools to validate the\noccurrence of hallucinations robustly.
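One common quick proxy for the (an)isotropy debated in the "Isotropy, Clusters, and Classifiers" entry above is the mean pairwise cosine similarity of the embeddings. A sketch of that measure (one of several in the literature, not necessarily the paper's):

```python
import numpy as np

def anisotropy(X):
    """Mean pairwise cosine similarity: near 0 for an isotropic cloud,
    near 1 when vectors crowd around one dominant direction."""
    Xn = X / np.linalg.norm(X, axis=1, keepdims=True)
    sims = Xn @ Xn.T
    n = X.shape[0]
    return (sims.sum() - n) / (n * (n - 1))   # exclude diagonal self-similarities

rng = np.random.default_rng(0)
iso = rng.normal(size=(500, 64))              # roughly isotropic cloud
shifted = iso + 10.0                          # one shared dominant direction
print(anisotropy(iso), anisotropy(shifted))   # ~0.0 vs ~1.0
```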
We demonstrate the effectiveness of\nUNIHD through meticulous evaluation and comprehensive analysis. We also provide\nstrategic insights on the application of specific tools for addressing various\ncategories of hallucinations.\n","authors":["Xiang Chen","Chenxi Wang","Yida Xue","Ningyu Zhang","Xiaoyan Yang","Qiang Li","Yue Shen","Jinjie Gu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03190v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2402.03177v1","updated":"2024-02-05T16:44:17Z","published":"2024-02-05T16:44:17Z","title":"CIDAR: Culturally Relevant Instruction Dataset For Arabic","summary":" Instruction tuning has emerged as a prominent methodology for teaching Large\nLanguage Models (LLMs) to follow instructions. However, current instruction\ndatasets predominantly cater to English or are derived from English-dominated\nLLMs, resulting in inherent biases toward Western culture. This bias\nsignificantly impacts the linguistic structures of non-English languages such\nas Arabic, which has a distinct grammar reflective of the diverse cultures\nacross the Arab region. This paper addresses this limitation by introducing\nCIDAR: https://hf.co/datasets/arbml/CIDAR, the first open Arabic\ninstruction-tuning dataset culturally-aligned by human reviewers. CIDAR\ncontains 10,000 instruction and output pairs that represent the Arab region. We\ndiscuss the cultural relevance of CIDAR via the analysis and comparison to\nother models fine-tuned on other datasets. Our experiments show that CIDAR can\nhelp enrich research efforts in aligning LLMs with the Arabic culture. All the\ncode is available at https://github.com/ARBML/CIDAR.\n","authors":["Zaid Alyafeai","Khalid Almubarak","Ahmed Ashraf","Deema Alnuhait","Saied Alshahrani","Gubran A. Q. Abdulrahman","Gamil Ahmed","Qais Gawah","Zead Saleh","Mustafa Ghaleb","Yousef Ali","Maged S. Al-Shaibani"],"pdf_url":"https://arxiv.org/pdf/2402.03177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14736v2","updated":"2024-02-05T16:41:10Z","published":"2023-11-21T19:12:18Z","title":"Data Diversity Matters for Robust Instruction Tuning","summary":" Recent works have shown that by curating high quality and diverse instruction\ntuning datasets, we can significantly improve instruction-following\ncapabilities. However, creating such datasets is difficult and most works rely\non manual curation or proprietary language models. Automatic data curation is\ndifficult as it is still not clear how we can define diversity for instruction\ntuning, how diversity and quality depend on one another, and how we can optimize\ndataset quality and diversity. To resolve these issues, we propose a new\nalgorithm, Quality-Diversity Instruction Tuning (QDIT). QDIT provides a simple\nmethod to simultaneously control dataset diversity and quality, allowing us to\nconduct an in-depth study on the effect of diversity and quality on instruction\ntuning performance. From this study we draw two key insights: (1) there is a\nnatural tradeoff between data diversity and quality and (2) increasing data\ndiversity significantly improves the worst case instruction following\nperformance, therefore improving robustness.
We validate the performance of\nQDIT on several large scale instruction tuning datasets, where we find it can\nsubstantially improve worst and average case performance compared to\nquality-driven data selection.\n","authors":["Alexander Bukharin","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.14736v2.pdf","comment":"22 pages, 18 figures"},{"id":"http://arxiv.org/abs/2402.03173v1","updated":"2024-02-05T16:41:02Z","published":"2024-02-05T16:41:02Z","title":"Multi: Multimodal Understanding Leaderboard with Text and Images","summary":" Rapid progress in multimodal large language models (MLLMs) highlights the\nneed to introduce challenging yet realistic benchmarks to the academic\ncommunity. Existing benchmarks primarily focus on simple natural image\nunderstanding, but Multi emerges as a cutting-edge benchmark for MLLMs,\noffering a comprehensive dataset for evaluating MLLMs against understanding\ncomplex figures and tables, and scientific questions. This benchmark,\nreflecting current realistic examination styles, provides multimodal inputs and\nrequires responses that are either precise or open-ended, similar to real-life\nschool tests. It challenges MLLMs with a variety of tasks, ranging from formula\nderivation to image detail analysis, and cross-modality reasoning. Multi\nincludes over 18,000 questions, with a focus on science-based QA in diverse\nformats. We also introduce Multi-Elite, a 500-question subset for testing the\nextremities of MLLMs, and Multi-Extend, which enhances In-Context Learning\nresearch with more than 4,500 knowledge pieces. Our evaluation indicates\nsignificant potential for MLLM advancement, with GPT-4V achieving a 63.7%\naccuracy rate on Multi, in contrast to other MLLMs scoring between 31.3% and\n53.7%. Multi serves not only as a robust evaluation platform but also paves the\nway for the development of expert-level AI.\n","authors":["Zichen Zhu","Yang Xu","Lu Chen","Jingkai Yang","Yichuan Ma","Yiming Sun","Hailin Wen","Jiaqi Liu","Jinyu Cai","Yingzi Ma","Situo Zhang","Zihan Zhao","Liangtai Sun","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2402.03173v1.pdf","comment":"Details and access are available at:\n https://OpenDFM.github.io/MULTI-Benchmark/"},{"id":"http://arxiv.org/abs/2402.03172v1","updated":"2024-02-05T16:40:23Z","published":"2024-02-05T16:40:23Z","title":"Accurate and Well-Calibrated ICD Code Assignment Through Attention Over\n Diverse Label Embeddings","summary":" Although the International Classification of Diseases (ICD) has been adopted\nworldwide, manually assigning ICD codes to clinical text is time-consuming,\nerror-prone, and expensive, motivating the development of automated approaches.\nThis paper describes a novel approach for automated ICD coding, combining\nseveral ideas from previous related work. We specifically employ a strong\nTransformer-based model as a text encoder and, to handle lengthy clinical\nnarratives, we explored either (a) adapting the base encoder model into a\nLongformer, or (b) dividing the text into chunks and processing each chunk\nindependently. The representations produced by the encoder are combined with a\nlabel embedding mechanism that explores diverse ICD code synonyms. Experiments\nwith different splits of the MIMIC-III dataset show that the proposed approach\noutperforms the current state-of-the-art models in ICD coding, with the label\nembeddings significantly contributing to the good performance. 
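The label-embedding mechanism in the ICD-coding entry above lets every ICD code attend over the encoded clinical narrative and be scored against its own summary vector. A compact sketch of that style of label-wise attention (a generic formulation under our own assumptions, not the paper's exact architecture):

```python
import numpy as np

def label_attention_logits(H, E):
    """H: (n_tokens, d) encoder outputs; E: (n_labels, d) label embeddings."""
    scores = E @ H.T                                     # (labels, tokens)
    A = np.exp(scores - scores.max(axis=1, keepdims=True))
    A /= A.sum(axis=1, keepdims=True)                    # softmax over tokens
    V = A @ H                                            # per-label document vectors
    return (V * E).sum(axis=1)                           # one logit per ICD code

rng = np.random.default_rng(1)
H = rng.normal(size=(120, 32))   # 120 encoded tokens of one clinical note
E = rng.normal(size=(8, 32))     # 8 candidate ICD codes
print(label_attention_logits(H, E).shape)  # (8,)
```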
Our approach\nalso leads to properly calibrated classification results, which can effectively\ninform downstream tasks such as quantification.\n","authors":["Gonçalo Gomes","Isabel Coutinho","Bruno Martins"],"pdf_url":"https://arxiv.org/pdf/2402.03172v1.pdf","comment":"Accepted to EACL2024"},{"id":"http://arxiv.org/abs/2402.03171v1","updated":"2024-02-05T16:39:15Z","published":"2024-02-05T16:39:15Z","title":"Homograph Attacks on Maghreb Sentiment Analyzers","summary":" We examine the impact of homograph attacks on the Sentiment Analysis (SA)\ntask of different Arabic dialects from the Maghreb North-African countries.\nHomograph attacks result in a 65.3% decrease in transformer classification\nperformance, from an F1-score of 0.95 to 0.33, when data is written in \"Arabizi\".\nThe goal of this study is to highlight LLMs' weaknesses and to prioritize\nethical and responsible Machine Learning.\n","authors":["Fatima Zahra Qachfar","Rakesh M. Verma"],"pdf_url":"https://arxiv.org/pdf/2402.03171v1.pdf","comment":"NAML, North Africans in Machine Learning, NeurIPS, Neural Information\n Processing Systems"},{"id":"http://arxiv.org/abs/2310.15393v2","updated":"2024-02-05T16:33:05Z","published":"2023-10-23T22:51:58Z","title":"DoGE: Domain Reweighting with Generalization Estimation","summary":" The coverage and composition of the pretraining data significantly impact\nthe generalization ability of Large Language Models (LLMs). Despite their\nimportance, recent LLMs still rely on heuristics and trial and error to\nincrease or reduce the influence of data-domains. We propose DOmain reweighting\nwith Generalization Estimation (DoGE), which optimizes the probability of\nsampling from each domain (domain weights) in a principled way. Our approach is\na two-stage process consisting of (i) training a proxy model to obtain domain\nweights using a bi-level optimization algorithm; (ii) training a larger base\nmodel by sampling training domains according to the learned domain weights. In\nour experiments, we extensively show how DoGE improves the generalization of\nthe base model to any target data mixture. On the SlimPajama dataset, our base\nmodel gets better perplexity and few-shot reasoning accuracies across $6$ tasks\ncompared to baseline methods. Moreover, aiming to generalize to out-of-domain\ntarget tasks, which are unseen in the pretraining corpus (OOD domain), DoGE can\neffectively identify inter-domain dependencies, and consistently achieves\nbetter test perplexity on the target domain.\n","authors":["Simin Fan","Matteo Pagliardini","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2310.15393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03163v1","updated":"2024-02-05T16:31:03Z","published":"2024-02-05T16:31:03Z","title":"Linguistic features for sentence difficulty prediction in ABSA","summary":" One of the challenges of natural language understanding is to deal with the\nsubjectivity of sentences, which may express opinions and emotions that add\nlayers of complexity and nuance. Sentiment analysis is a field that aims to\nextract and analyze these subjective elements from text, and it can be applied\nat different levels of granularity, such as document, paragraph, sentence, or\naspect. Aspect-based sentiment analysis is a well-studied topic with many\navailable data sets and models. However, there is no clear definition of what\nmakes a sentence difficult for aspect-based sentiment analysis.
In this paper,\nwe explore this question by conducting an experiment with three data sets:\n\"Laptops\", \"Restaurants\", and \"MTSC\" (Multi-Target-dependent Sentiment\nClassification), and a merged version of these three datasets. We study the\nimpact of domain diversity and syntactic diversity on difficulty. We use a\ncombination of classifiers to identify the most difficult sentences and analyze\ntheir characteristics. We employ two ways of defining sentence difficulty. The\nfirst one is binary and labels a sentence as difficult if the classifiers fail\nto correctly predict the sentiment polarity. The second one is a six-level\nscale based on how many of the top five best-performing classifiers can\ncorrectly predict the sentiment polarity. We also define 9 linguistic features\nthat, combined, aim at estimating the difficulty at sentence level.\n","authors":["Adrian-Gabriel Chifu","Sébastien Fournier"],"pdf_url":"https://arxiv.org/pdf/2402.03163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03161v1","updated":"2024-02-05T16:30:49Z","published":"2024-02-05T16:30:49Z","title":"Video-LaVIT: Unified Video-Language Pre-training with Decoupled\n Visual-Motional Tokenization","summary":" In light of recent advances in multimodal Large Language Models (LLMs), there\nis increasing attention to scaling them from image-text data to more\ninformative real-world videos. Compared to static images, video poses unique\nchallenges for effective large-scale pre-training due to the modeling of its\nspatiotemporal dynamics. In this paper, we address such limitations in\nvideo-language pre-training with an efficient video decomposition that\nrepresents each video as keyframes and temporal motions. These are then adapted\nto an LLM using well-designed tokenizers that discretize visual and temporal\ninformation as a few tokens, thus enabling unified generative pre-training of\nvideos, images, and text. At inference, the generated tokens from the LLM are\ncarefully recovered to the original continuous pixel space to create various\nvideo content. Our proposed framework is both capable of comprehending and\ngenerating image and video content, as demonstrated by its competitive\nperformance across 13 multimodal benchmarks in image and video understanding\nand generation. Our code and models will be available at\nhttps://video-lavit.github.io.\n","authors":["Yang Jin","Zhicheng Sun","Kun Xu","Kun Xu","Liwei Chen","Hao Jiang","Quzhe Huang","Chengru Song","Yuliang Liu","Di Zhang","Yang Song","Kun Gai","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2402.03161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12038v2","updated":"2024-02-05T16:15:38Z","published":"2023-08-23T09:55:41Z","title":"Large Multilingual Models Pivot Zero-Shot Multimodal Learning across\n Languages","summary":" Recently there has been a significant surge in multimodal learning in terms\nof both image-to-text and text-to-image generation. However, the success is\ntypically limited to English, leaving other languages largely behind. Building\na competitive counterpart in other languages is highly challenging due to the\nlow-resource nature of non-English multimodal data (i.e., lack of large-scale,\nhigh-quality image-text data). In this work, we propose MPM, an effective\ntraining paradigm for training large multimodal models in non-English\nlanguages. MPM demonstrates that Multilingual language models can Pivot\nzero-shot Multimodal learning across languages. 
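The six-level difficulty scale in the ABSA entry above is straightforward to operationalize: count how many of the top five classifiers predict the sentence's polarity correctly. A sketch under that reading (the level semantics are our own assumption):

```python
def difficulty_level(top5_predictions, gold):
    """0 = all five classifiers correct (easy) ... 5 = none correct (hard)."""
    correct = sum(1 for pred in top5_predictions if pred == gold)
    return 5 - correct

# One sentence with gold polarity "negative", judged by five classifiers.
votes = ["negative", "positive", "negative", "neutral", "negative"]
print(difficulty_level(votes, "negative"))  # 2 -> moderately difficult
```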
Specifically, based on a strong\nmultilingual large language model, multimodal models pretrained on English-only\nimage-text data can well generalize to other languages in a (quasi)-zero-shot\nmanner, even surpassing models trained on image-text data in native languages.\nTaking Chinese as a practice of MPM, we build large multimodal models VisCPM in\nimage-to-text and text-to-image generation, which achieve state-of-the-art\n(open-source) performance in Chinese. To facilitate future research, we\nopen-source codes and model weights at https://github.com/OpenBMB/VisCPM.git.\n","authors":["Jinyi Hu","Yuan Yao","Chongyi Wang","Shan Wang","Yinxu Pan","Qianyu Chen","Tianyu Yu","Hanghao Wu","Yue Zhao","Haoye Zhang","Xu Han","Yankai Lin","Jiao Xue","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12038v2.pdf","comment":"https://github.com/OpenBMB/VisCPM.git"},{"id":"http://arxiv.org/abs/2308.12420v2","updated":"2024-02-05T16:06:14Z","published":"2023-08-23T20:42:32Z","title":"Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature","summary":" As Distributed Ledger Technologies (DLTs) rapidly evolve, their impacts\nextend beyond technology, influencing environmental and societal aspects. This\nevolution has increased publications, making manual literature analysis\nincreasingly challenging. We address this with a Natural Language Processing\n(NLP)-based systematic literature review method to explore the intersection of\nDistributed Ledger Technology (DLT) with its Environmental, Social, and\nGovernance (ESG) aspects. Our approach involves building and refining a\ndirected citation network from 107 seed papers to a corpus of 24,539\npublications and fine-tuning a transformer-based language model for Named\nEntity Recognition (NER) on DLT and ESG domains. Applying this model, we\ndistilled the corpus to 505 key publications, enabling an inaugural literature\nreview and temporal graph analysis of DLT's evolution in ESG contexts. Our\ncontributions include an adaptable and scalable NLP-driven systematic\nliterature review methodology and a unique NER dataset of 54,808 entities,\ntailored for DLT and ESG research. Our inaugural literature review demonstrates\ntheir applicability and effectiveness in analyzing DLT's evolution and impacts,\nproving invaluable for stakeholders in the DLT domain.\n","authors":["Walter Hernandez","Kamil Tylinski","Alastair Moore","Niall Roche","Nikhil Vadgama","Horst Treiblmaier","Jiangbo Shangguan","Paolo Tasca","Jiahua Xu"],"pdf_url":"https://arxiv.org/pdf/2308.12420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03137v1","updated":"2024-02-05T16:05:32Z","published":"2024-02-05T16:05:32Z","title":"Sociolinguistically Informed Interpretability: A Case Study on Hinglish\n Emotion Classification","summary":" Emotion classification is a challenging task in NLP due to the inherent\nidiosyncratic and subjective nature of linguistic expression, especially with\ncode-mixed data. Pre-trained language models (PLMs) have achieved high\nperformance for many tasks and languages, but it remains to be seen whether\nthese models learn and are robust to the differences in emotional expression\nacross languages. Sociolinguistic studies have shown that Hinglish speakers\nswitch to Hindi when expressing negative emotions and to English when\nexpressing positive emotions. 
To understand if language models can learn these\nassociations, we study the effect of language on emotion prediction across 3\nPLMs on a Hinglish emotion classification dataset. Using LIME and token-level\nlanguage ID, we find that models do learn these associations between language\nchoice and emotional expression. Moreover, having code-mixed data present in\nthe pre-training can augment that learning when task-specific data is scarce.\nWe also conclude from the misclassifications that the models may overgeneralise\nthis heuristic to other infrequent examples where this sociolinguistic\nphenomenon does not apply.\n","authors":["Kushal Tatariya","Heather Lent","Johannes Bjerva","Miryam de Lhoneux"],"pdf_url":"https://arxiv.org/pdf/2402.03137v1.pdf","comment":"5 pages, Accepted to SIGTYP 2024 @ EACL"},{"id":"http://arxiv.org/abs/2402.03131v1","updated":"2024-02-05T15:57:32Z","published":"2024-02-05T15:57:32Z","title":"Constrained Decoding for Cross-lingual Label Projection","summary":" Zero-shot cross-lingual transfer utilizing multilingual LLMs has become a\npopular learning paradigm for low-resource languages with no labeled training\ndata. However, for NLP tasks that involve fine-grained predictions on words and\nphrases, the performance of zero-shot cross-lingual transfer learning lags far\nbehind supervised fine-tuning methods. Therefore, it is common to exploit\ntranslation and label projection to further improve the performance by (1)\ntranslating training data that is available in a high-resource language (e.g.,\nEnglish) together with the gold labels into low-resource languages, and/or (2)\ntranslating test data in low-resource languages to a high-resource language to\nrun inference on, then projecting the predicted span-level labels back onto the\noriginal test data. However, state-of-the-art marker-based label projection\nmethods suffer from translation quality degradation due to the extra label\nmarkers injected in the input to the translation model. In this work, we\nexplore a new direction that leverages constrained decoding for label\nprojection to overcome the aforementioned issues. Our new method not only can\npreserve the quality of translated texts but also has the versatility of being\napplicable to both translating training and translating test data strategies.\nThis versatility is crucial as our experiments reveal that translating test\ndata can lead to a considerable boost in performance compared to translating\nonly training data. We evaluate on two cross-lingual transfer tasks, namely\nNamed Entity Recognition and Event Argument Extraction, spanning 20 languages.\nThe results demonstrate that our approach outperforms the state-of-the-art\nmarker-based method by a large margin and also shows better performance than\nother label projection methods that rely on external word alignment.\n","authors":["Duong Minh Le","Yang Chen","Alan Ritter","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2402.03131v1.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2402.03099v1","updated":"2024-02-05T15:28:43Z","published":"2024-02-05T15:28:43Z","title":"Intent-based Prompt Calibration: Enhancing prompt optimization with\n synthetic boundary cases","summary":" Prompt engineering is a challenging and important task due to the high\nsensitivity of Large Language Models (LLMs) to the given prompt and the\ninherent ambiguity of a textual task instruction. Automatic prompt engineering\nis essential to achieve optimized performance from LLMs.
Recent studies have\ndemonstrated the capabilities of LLMs to automatically conduct prompt\nengineering by employing a meta-prompt that incorporates the outcomes of the\nlast trials and proposes an improved prompt. However, this requires a\nhigh-quality benchmark to compare different prompts, which is difficult and\nexpensive to acquire in many real-world use cases. In this work, we introduce a\nnew method for automatic prompt engineering, using a calibration process that\niteratively refines the prompt to the user intent. During the optimization\nprocess, the system jointly generates synthetic data of boundary use cases and\noptimizes the prompt according to the generated dataset. We demonstrate the\neffectiveness of our method with respect to strong proprietary models on\nreal-world tasks such as moderation and generation. Our method outperforms\nstate-of-the-art methods with a limited number of annotated samples.\nFurthermore, we validate the advantages of each one of the system's key\ncomponents. Our system is built in a modular way, facilitating easy adaptation\nto other tasks. The code is available\n$\\href{https://github.com/Eladlev/AutoPrompt}{here}$.\n","authors":["Elad Levi","Eli Brosh","Matan Friedmann"],"pdf_url":"https://arxiv.org/pdf/2402.03099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04076v4","updated":"2024-02-05T15:12:06Z","published":"2023-11-07T15:40:43Z","title":"Do LLMs exhibit human-like response biases? A case study in survey\n design","summary":" As large language models (LLMs) become more capable, there is growing\nexcitement about the possibility of using LLMs as proxies for humans in\nreal-world tasks where subjective labels are desired, such as in surveys and\nopinion polling. One widely-cited barrier to the adoption of LLMs is their\nsensitivity to prompt wording - but interestingly, humans also display\nsensitivities to instruction changes in the form of response biases. As such,\nwe argue that if LLMs are going to be used to approximate human opinions, it is\nnecessary to investigate the extent to which LLMs also reflect human response\nbiases, if at all. In this work, we use survey design as a case study, where\nhuman response biases caused by permutations in wordings of \"prompts\" have been\nextensively studied. Drawing from prior work in social psychology, we design a\ndataset and propose a framework to evaluate whether LLMs exhibit human-like\nresponse biases in survey questionnaires. Our comprehensive evaluation of nine\nmodels shows that popular open and commercial LLMs generally fail to reflect\nhuman-like behavior. These inconsistencies tend to be more prominent in models\nthat have been instruction fine-tuned. Furthermore, even if a model shows a\nsignificant change in the same direction as humans, we find that perturbations\nthat are not meant to elicit significant changes in humans may also result in a\nsimilar change. These results highlight the potential pitfalls of using LLMs to\nsubstitute humans in parts of the annotation pipeline, and further underscore\nthe importance of finer-grained characterizations of model behavior. 
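The intent-based calibration process described above alternates between generating synthetic boundary cases and refining the prompt with a meta-prompt. A skeletal sketch of that loop; `call_llm` and `score_prompt` are stand-in stubs, not the AutoPrompt components:

```python
def call_llm(instruction: str) -> str:
    return "stub response to: " + instruction        # placeholder LLM call

def score_prompt(prompt: str, boundary_cases: str) -> float:
    return len(set(prompt.split())) / 50.0           # placeholder quality metric

def calibrate(prompt: str, iterations: int = 3) -> str:
    history = []
    for _ in range(iterations):
        cases = call_llm(f"Generate ambiguous boundary cases for: {prompt}")
        history.append((prompt, score_prompt(prompt, cases)))
        prompt = call_llm(f"Trial history: {history}. "
                          "Propose a prompt closer to the user intent.")
    return max(history, key=lambda t: t[1])[0]       # best-scoring prompt seen

print(calibrate("Classify whether a comment violates the moderation policy."))
```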
Our code,\ndataset, and collected samples are available at\nhttps://github.com/lindiatjuatja/BiasMonkey\n","authors":["Lindia Tjuatja","Valerie Chen","Sherry Tongshuang Wu","Ameet Talwalkar","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2311.04076v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03067v1","updated":"2024-02-05T14:59:29Z","published":"2024-02-05T14:59:29Z","title":"Multilingual transformer and BERTopic for short text topic modeling: The\n case of Serbian","summary":" This paper presents the results of the first application of BERTopic, a\nstate-of-the-art topic modeling technique, to short text written in a\nmorphologically rich language. We applied BERTopic with three multilingual\nembedding models on two levels of text preprocessing (partial and full) to\nevaluate its performance on partially preprocessed short text in Serbian. We\nalso compared it to LDA and NMF on fully preprocessed text. The experiments\nwere conducted on a dataset of tweets expressing hesitancy toward COVID-19\nvaccination. Our results show that with adequate parameter setting, BERTopic\ncan yield informative topics even when applied to partially preprocessed short\ntext. When the same parameters are applied in both preprocessing scenarios,\nthe performance drop on partially preprocessed text is minimal. Compared to LDA\nand NMF, judging by the keywords, BERTopic offers more informative topics and\ngives novel insights when the number of topics is not limited. The findings of\nthis paper can be significant for researchers working with other\nmorphologically rich low-resource languages and short text.\n","authors":["Darija Medvecki","Bojana Bašaragin","Adela Ljajić","Nikola Milošević"],"pdf_url":"https://arxiv.org/pdf/2402.03067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04215v2","updated":"2024-02-05T14:55:19Z","published":"2023-08-08T12:27:20Z","title":"Hybrid Retrieval-Augmented Generation for Real-time Composition\n Assistance","summary":" Retrieval augmentation enhances performance of traditional language models by\nincorporating additional context. However, the computational demands for\nretrieval augmented large language models (LLMs) pose a challenge when applying\nthem to real-time tasks, such as composition assistance. To address this\nlimitation, we propose the Hybrid Retrieval-Augmented Generation (HybridRAG)\nframework, a novel approach that efficiently combines a cloud-based LLM with a\nsmaller, client-side, language model through retrieval augmented memory. This\nintegration enables the client model to generate effective responses,\nbenefiting from the LLM's capabilities and contextual information.\nAdditionally, through an asynchronous memory update mechanism, the client model\ncan deliver real-time completions swiftly to user inputs without the need to\nwait for responses from the cloud.
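For reference, the BERTopic setup described in the Serbian study above takes only a few lines with the library's public API; the embedding model, documents, and parameters below are illustrative stand-ins rather than the study's actual configuration:

```python
# pip install bertopic
from bertopic import BERTopic

# Placeholder short texts; the study used Serbian COVID-19 vaccination tweets.
docs = ["sample short text one", "sample short text two", "another sample"] * 50

topic_model = BERTopic(
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",  # assumed multilingual model
    min_topic_size=10,     # a key parameter to tune for short text
    nr_topics=None,        # leave the number of topics unrestricted
)
topics, probs = topic_model.fit_transform(docs)
print(topic_model.get_topic_info().head())
```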
Our experiments on five benchmark datasets\ndemonstrate that HybridRAG significantly improves utility over client-only\nmodels while maintaining low latency.\n","authors":["Menglin Xia","Xuchao Zhang","Camille Couturier","Guoqing Zheng","Saravan Rajmohan","Victor Ruhle"],"pdf_url":"https://arxiv.org/pdf/2308.04215v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16332v2","updated":"2024-02-05T14:53:13Z","published":"2024-01-29T17:38:14Z","title":"Tradeoffs Between Alignment and Helpfulness in Language Models","summary":" Language model alignment has become an important component of AI safety,\nallowing safe interactions between humans and language models, by enhancing\ndesired behaviors and inhibiting undesired ones. It is often done by tuning the\nmodel or inserting preset aligning prompts. Recently, representation\nengineering, a method which alters the model's behavior via changing its\nrepresentations post-training, was shown to be effective in aligning LLMs (Zou\net al., 2023a). Representation engineering yields gains in alignment oriented\ntasks such as resistance to adversarial attacks and reduction of social biases,\nbut was also shown to cause a decrease in the ability of the model to perform\nbasic tasks. In this paper we study the tradeoff between the increase in\nalignment and decrease in helpfulness of the model. We propose a theoretical\nframework which provides bounds for these two quantities, and demonstrate their\nrelevance empirically. Interestingly, we find that while the helpfulness\ngenerally decreases, it does so quadratically with the norm of the\nrepresentation engineering vector, while the alignment increases linearly with\nit, indicating a regime in which it is efficient to use representation\nengineering. We validate our findings empirically, and chart the boundaries to\nthe usefulness of representation engineering for alignment.\n","authors":["Yotam Wolf","Noam Wies","Dorin Shteyman","Binyamin Rothberg","Yoav Levine","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2401.16332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13861v2","updated":"2024-02-05T14:41:35Z","published":"2023-04-26T23:09:02Z","title":"The Parrot Dilemma: Human-Labeled vs. LLM-augmented Data in\n Classification Tasks","summary":" In the realm of Computational Social Science (CSS), practitioners often\nnavigate complex, low-resource domains and face the costly and time-intensive\nchallenges of acquiring and annotating data. We aim to establish a set of\nguidelines to address such challenges, comparing the use of human-labeled data\nwith synthetically generated data from GPT-4 and Llama-2 in ten distinct CSS\nclassification tasks of varying complexity. Additionally, we examine the impact\nof training data sizes on performance. Our findings reveal that models trained\non human-labeled data consistently exhibit superior or comparable performance\ncompared to their synthetically augmented counterparts. Nevertheless, synthetic\naugmentation proves beneficial, particularly in improving performance on rare\nclasses within multi-class tasks. Furthermore, we leverage GPT-4 and Llama-2\nfor zero-shot classification and find that, while they generally display strong\nperformance, they often fall short when compared to specialized classifiers\ntrained on moderately sized training sets.\n","authors":["Anders Giovanni Møller","Jacob Aarup Dalsgaard","Arianna Pera","Luca Maria Aiello"],"pdf_url":"https://arxiv.org/pdf/2304.13861v2.pdf","comment":"Accepted at EACL 2024. 
14 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2402.03053v1","updated":"2024-02-05T14:36:51Z","published":"2024-02-05T14:36:51Z","title":"Multi-Lingual Malaysian Embedding: Leveraging Large Language Models for\n Semantic Representations","summary":" In this work, we present a comprehensive exploration of finetuning Malaysian\nlanguage models, specifically Llama2 and Mistral, on embedding tasks involving\nnegative and positive pairs. We release two distinct models tailored for\nSemantic Similarity and Retrieval-Augmented Generation (RAG).\n For Semantic Similarity, our 600 million parameter Llama2 model outperforms\nOpenAI text-embedding-ada-002 across all recall@k metrics for b.cari.com.my,\nc.cari.com.my, Malay news, and Malaysian Twitter test sets.\n In the realm of RAG models, our approach proves competitive with OpenAI\ntext-embedding-ada-002 in the Malaysian context. Notably, our 2 billion\nparameter Llama2 model achieves superior Recall@5, Recall@10 for the \"Melayu\"\nkeyword research papers dataset and excels in Recall@3, Recall@5, and Recall@10\nfor the lom.agc.gov.my dataset.\n These findings underscore the effectiveness of our finetuning strategy and\nhighlight the performance gains in both Semantic Similarity and RAG tasks.\n All models are released at\nhttps://huggingface.co/collections/mesolitica/malaysian-embedding-6523612bfe5881ad35f81b99\n","authors":["Husein Zolkepli","Aisyah Razak","Kamarul Adha","Ariff Nazhan"],"pdf_url":"https://arxiv.org/pdf/2402.03053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03050v1","updated":"2024-02-05T14:34:14Z","published":"2024-02-05T14:34:14Z","title":"A Comprehensive Study of the Current State-of-the-Art in Nepali\n Automatic Speech Recognition Systems","summary":" In this paper, we examine the research conducted in the field of Nepali\nAutomatic Speech Recognition (ASR). The primary objective of this survey is to\nconduct a comprehensive review of the works on Nepali Automatic Speech\nRecognition Systems completed to date, explore the different datasets used,\nexamine the technology utilized, and take account of the obstacles encountered\nin implementing the Nepali ASR system. In tandem with the global trend of\never-increasing research on speech recognition, the number of Nepalese\nASR-related projects is also growing. Nevertheless, the investigation of\nlanguage and acoustic models of the Nepali language has not received adequate\nattention compared to languages that possess ample resources. In this context,\nwe provide a framework as well as directions for future\ninvestigations.\n","authors":["Rupak Raj Ghimire","Bal Krishna Bal","Prakash Poudyal"],"pdf_url":"https://arxiv.org/pdf/2402.03050v1.pdf","comment":"Accepted in International Conference on Technologies for Computer,\n Electrical, Electronics & Communication (ICT-CEEL 2023)"},{"id":"http://arxiv.org/abs/2402.03049v1","updated":"2024-02-05T14:33:56Z","published":"2024-02-05T14:33:56Z","title":"EasyInstruct: An Easy-to-use Instruction Processing Framework for Large\n Language Models","summary":" In recent years, instruction tuning has gained increasing attention and\nemerged as a crucial technique to enhance the capabilities of Large Language\nModels (LLMs). To construct high-quality instruction datasets, many instruction\nprocessing approaches have been proposed, aiming to achieve a delicate balance\nbetween data quantity and data quality. 
Nevertheless, due to inconsistencies\nthat persist among various instruction processing methods, there is no standard\nopen-source instruction processing implementation framework available for the\ncommunity, which hinders practitioners from further developing and advancing\nthe field. To facilitate instruction processing research and development, we\npresent EasyInstruct, an easy-to-use instruction processing framework for LLMs,\nwhich modularizes instruction generation, selection, and prompting, while also\nconsidering their combination and interaction. EasyInstruct is publicly\nreleased and actively maintained at https://github.com/zjunlp/EasyInstruct,\nalong with a running demo App at\nhttps://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for\nbroader research centered on instruction data.\n","authors":["Yixin Ou","Ningyu Zhang","Honghao Gui","Ziwen Xu","Shuofei Qiao","Zhen Bi","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03049v1.pdf","comment":"Ongoing work; the project website is at\n https://zjunlp.github.io/project/EasyInstruct, code is at\n https://github.com/zjunlp/EasyInstruct, demo is at\n https://huggingface.co/spaces/zjunlp/EasyInstruct"},{"id":"http://arxiv.org/abs/2402.03043v1","updated":"2024-02-05T14:29:54Z","published":"2024-02-05T14:29:54Z","title":"SIDU-TXT: An XAI Algorithm for NLP with a Holistic Assessment Approach","summary":" Explainable AI (XAI) aids in deciphering 'black-box' models. While several\nmethods have been proposed and evaluated primarily in the image domain, the\nexploration of explainability in the text domain remains a growing research\narea. In this paper, we delve into the applicability of XAI methods for the\ntext domain. In this context, the 'Similarity Difference and Uniqueness' (SIDU)\nXAI method, recognized for its superior capability in localizing entire salient\nregions in image-based classification, is extended to textual data. The\nextended method, SIDU-TXT, utilizes feature activation maps from 'black-box'\nmodels to generate heatmaps at a granular, word-based level, thereby providing\nexplanations that highlight contextually significant textual elements crucial\nfor model predictions. Given the absence of a unified standard for assessing\nXAI methods, this study applies a holistic, three-tiered evaluation framework:\nFunctionally-Grounded, Human-Grounded and Application-Grounded, to assess the\neffectiveness of the proposed SIDU-TXT across various experiments. We find\nthat, in the sentiment analysis task on a movie review dataset, SIDU-TXT excels\nin both functionally and human-grounded evaluations, demonstrating superior\nperformance through quantitative and qualitative analyses compared to\nbenchmarks like Grad-CAM and LIME. In the application-grounded evaluation\nwithin the sensitive and complex legal domain of asylum decision-making,\nSIDU-TXT and Grad-CAM demonstrate comparable performances, each with its own\nset of strengths and weaknesses. However, both methods fall short of entirely\nfulfilling the sophisticated criteria of expert expectations, highlighting the\nimperative need for additional research in XAI methods suitable for such\ndomains.\n","authors":["Mohammad N. S. Jahromi","Satya. M. Muddamsetty","Asta Sofie Stage Jarlner","Anna Murphy Høgenhaug","Thomas Gammeltoft-Hansen","Thomas B. 
Moeslund"],"pdf_url":"https://arxiv.org/pdf/2402.03043v1.pdf","comment":"Preprint submitted to Elsevier on Jan 5th, 2024"},{"id":"http://arxiv.org/abs/2402.03038v1","updated":"2024-02-05T14:23:43Z","published":"2024-02-05T14:23:43Z","title":"Automatic Combination of Sample Selection Strategies for Few-Shot\n Learning","summary":" In few-shot learning, such as meta-learning, few-shot fine-tuning or\nin-context learning, the limited number of samples used to train a model has a\nsignificant impact on the overall success. Although a large number of sample\nselection strategies exist, their impact on the performance of few-shot\nlearning is not extensively known, as most of them have so far been evaluated\nonly in typical supervised settings. In this paper, we thoroughly investigate\nthe impact of 20 sample selection strategies on the performance of 5 few-shot\nlearning approaches over 8 image and 6 text datasets. In addition, we propose a\nnew method for automatic combination of sample selection strategies (ACSESS)\nthat leverages the strengths and complementary information of the individual\nstrategies. The experimental results show that our method consistently\noutperforms the individual selection strategies, as well as the recently\nproposed method for selecting support examples for in-context learning. We also\nshow a strong modality, dataset and approach dependence for the majority of\nstrategies as well as their dependence on the number of shots - demonstrating\nthat sample selection strategies play a significant role at lower numbers of\nshots, but regress to random selection at higher numbers of\nshots.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2402.03038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03009v1","updated":"2024-02-05T13:47:53Z","published":"2024-02-05T13:47:53Z","title":"UniMem: Towards a Unified View of Long-Context Large Language Models","summary":" Long-context processing is a critical ability that constrains the\napplicability of large language models. Although there exist various methods\ndevoted to enhancing the long-context processing ability of large language\nmodels (LLMs), they are developed in an isolated manner and lack systematic\nanalysis and integration of their strengths, hindering further developments. In\nthis paper, we introduce UniMem, a unified framework that reformulates existing\nlong-context methods from the view of memory augmentation of LLMs. UniMem is\ncharacterized by four key dimensions: Memory Management, Memory Writing, Memory\nReading, and Memory Injection, providing a systematic theory for understanding\nvarious long-context methods. We reformulate 16 existing methods based on\nUniMem and analyze four representative methods (Transformer-XL, Memorizing\nTransformer, RMT, and Longformer) in their equivalent UniMem forms to reveal\ntheir design principles and strengths. 
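The four UniMem dimensions suggest a common interface for long-context methods. A minimal Python sketch of that framing follows; the class and method names here are hypothetical illustrations, not the paper's actual API.

```python
# Hypothetical sketch of UniMem's four dimensions as a single interface.
# All names and the FIFO policy are illustrative assumptions.
from abc import ABC, abstractmethod
from typing import Any, List

class LongContextMemory(ABC):
    @abstractmethod
    def write(self, hidden_states: Any) -> None:
        """Memory Writing: decide what enters the memory."""

    @abstractmethod
    def read(self, query: Any) -> Any:
        """Memory Reading: retrieve entries relevant to the current query."""

    @abstractmethod
    def inject(self, layer_output: Any, retrieved: Any) -> Any:
        """Memory Injection: merge retrieved content into the forward pass."""

    def manage(self, store: List[Any], max_entries: int) -> List[Any]:
        """Memory Management: here, a simple FIFO eviction policy."""
        return store[-max_entries:]
```

Under such a framing, methods like Transformer-XL and Memorizing Transformer would differ mainly in how write and read are instantiated.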
Based on these analyses, we propose UniMix, an\ninnovative approach that integrates the strengths of these algorithms.\nExperimental results show that UniMix achieves superior performance in handling\nlong contexts with significantly lower perplexity than baselines.\n","authors":["Junjie Fang","Likai Tang","Hongzhe Bi","Yujia Qin","Si Sun","Zhenyu Li","Haolun Li","Yongjian Li","Xin Cong","Yukun Yan","Xiaodong Shi","Sen Song","Yankai Lin","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.03009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02992v1","updated":"2024-02-05T13:31:28Z","published":"2024-02-05T13:31:28Z","title":"Decoding-time Realignment of Language Models","summary":" Aligning language models with human preferences is crucial for reducing\nerrors and biases in these models. Alignment techniques, such as reinforcement\nlearning from human feedback (RLHF), are typically cast as optimizing a\ntradeoff between human preference rewards and a proximity regularization term\nthat encourages staying close to the unaligned model. Selecting an appropriate\nlevel of regularization is critical: insufficient regularization can lead to\nreduced model capabilities due to reward hacking, whereas excessive\nregularization hinders alignment. Traditional methods for finding the optimal\nregularization level require retraining multiple models with varying\nregularization strengths. This process, however, is resource-intensive,\nespecially for large models. To address this challenge, we propose\ndecoding-time realignment (DeRa), a simple method to explore and evaluate\ndifferent regularization strengths in aligned models without retraining. DeRa\nenables control over the degree of alignment, allowing users to smoothly\ntransition between unaligned and aligned models. It also enhances the\nefficiency of hyperparameter tuning by enabling the identification of effective\nregularization strengths using a validation dataset.\n","authors":["Tianlin Liu","Shangmin Guo","Leonardo Bianco","Daniele Calandriello","Quentin Berthet","Felipe Llinares","Jessica Hoffmann","Lucas Dixon","Michal Valko","Mathieu Blondel"],"pdf_url":"https://arxiv.org/pdf/2402.02992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01391v2","updated":"2024-02-05T13:28:23Z","published":"2024-02-02T13:14:31Z","title":"StepCoder: Improve Code Generation with Reinforcement Learning from\n Compiler Feedback","summary":" The advancement of large language models (LLMs) has significantly propelled\nthe field of code generation. Previous work integrated reinforcement learning\n(RL) with compiler feedback for exploring the output space of LLMs to enhance\ncode generation quality. However, the lengthy code generated by LLMs in\nresponse to complex human requirements makes RL exploration a challenge. Also,\nsince the unit tests may not cover the complicated code, optimizing LLMs by\nusing these unexecuted code snippets is ineffective. To tackle these\nchallenges, we introduce StepCoder, a novel RL framework for code generation,\nconsisting of two main components: CCCS addresses the exploration challenge by\nbreaking the long-sequence code generation task into a Curriculum of Code\nCompletion Subtasks, while FGO only optimizes the model by masking the\nunexecuted code segments to provide Fine-Grained Optimization. In addition, we\nconstruct the APPS+ dataset for RL training, which is manually verified to\nensure the correctness of unit tests. 
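As a rough illustration of the decoding-time realignment (DeRa) idea above: if the realigned model is treated as an interpolation between the logits of the unaligned (reference) and aligned models, a single strength parameter can be swept at decoding time without any retraining. The sketch below works under that assumption; the function names are hypothetical.

```python
# Sketch: explore alignment strengths at decoding time by mixing the logits
# of a reference (unaligned) model and an aligned model.
# lam = 0 recovers the reference model, lam = 1 the aligned model.
import numpy as np

def dera_next_token_logits(ref_logits, aligned_logits, lam):
    return (1.0 - lam) * ref_logits + lam * aligned_logits

def sample_token(logits, rng):
    probs = np.exp(logits - logits.max())  # stable softmax
    probs /= probs.sum()
    return int(rng.choice(len(probs), p=probs))

rng = np.random.default_rng(0)
ref = rng.normal(size=32)       # stand-in vocabulary logits
aligned = rng.normal(size=32)
token = sample_token(dera_next_token_logits(ref, aligned, lam=0.5), rng)
```

Sweeping lam on a validation set is what would let one identify an effective regularization strength without retraining multiple models.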
Experimental results show\nthat our method improves the ability to explore the output space and\noutperforms state-of-the-art approaches in corresponding benchmarks. Our\ndataset APPS+ and StepCoder are available online.\n","authors":["Shihan Dou","Yan Liu","Haoxiang Jia","Limao Xiong","Enyu Zhou","Wei Shen","Junjie Shan","Caishuang Huang","Xiao Wang","Xiaoran Fan","Zhiheng Xi","Yuhao Zhou","Tao Ji","Rui Zheng","Qi Zhang","Xuanjing Huang","Tao Gui"],"pdf_url":"https://arxiv.org/pdf/2402.01391v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.02987v1","updated":"2024-02-05T13:18:42Z","published":"2024-02-05T13:18:42Z","title":"Conversation Reconstruction Attack Against GPT Models","summary":" In recent times, significant advancements have been made in the field of\nlarge language models (LLMs), represented by GPT series models. To optimize\ntask execution, users often engage in multi-round conversations with GPT models\nhosted in cloud environments. These multi-round conversations, potentially\nreplete with private information, require transmission and storage within the\ncloud. However, this operational paradigm introduces additional attack\nsurfaces. In this paper, we first introduce a specific Conversation\nReconstruction Attack targeting GPT models. Our introduced Conversation\nReconstruction Attack is composed of two steps: hijacking a session and\nreconstructing the conversations. Subsequently, we offer an exhaustive\nevaluation of the privacy risks inherent in conversations when GPT models are\nsubjected to the proposed attack. However, GPT-4 demonstrates certain\nrobustness to the proposed attacks. We then introduce two advanced attacks\naimed at better reconstructing previous conversations, specifically the UNR\nattack and the PBU attack. Our experimental findings indicate that the PBU\nattack yields substantial performance across all models, achieving semantic\nsimilarity scores exceeding 0.60, while the UNR attack is effective solely on\nGPT-3.5. Our results reveal the concern about privacy risks associated with\nconversations involving GPT models and aim to draw the community's attention to\nprevent the potential misuse of these models' remarkable capabilities. We will\nresponsibly disclose our findings to the suppliers of related large language\nmodels.\n","authors":["Junjie Chu","Zeyang Sha","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02987v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.02975v1","updated":"2024-02-05T12:56:22Z","published":"2024-02-05T12:56:22Z","title":"Putting Context in Context: the Impact of Discussion Structure on Text\n Classification","summary":" Current text classification approaches usually focus on the content to be\nclassified. Contextual aspects (both linguistic and extra-linguistic) are\nusually neglected, even in tasks based on online discussions. Still in many\ncases the multi-party and multi-turn nature of the context from which these\nelements are selected can be fruitfully exploited. In this work, we propose a\nseries of experiments on a large dataset for stance detection in English, in\nwhich we evaluate the contribution of different types of contextual\ninformation, i.e. linguistic, structural and temporal, by feeding them as\nnatural language input into a transformer-based model. We also experiment with\ndifferent amounts of training data and analyse the topology of local discussion\nnetworks in a privacy-compliant way. 
Results show that structural information\ncan be highly beneficial to text classification but only under certain\ncircumstances (e.g. depending on the amount of training data and on discussion\nchain complexity). Indeed, we show that contextual information on smaller\ndatasets from other classification tasks does not yield significant\nimprovements. Our framework, based on local discussion networks, allows the\nintegration of structural information, while minimising user profiling, thus\npreserving their privacy.\n","authors":["Nicolò Penzo","Antonio Longa","Bruno Lepri","Sara Tonelli","Marco Guerini"],"pdf_url":"https://arxiv.org/pdf/2402.02975v1.pdf","comment":"Accepted to EACL 2024 main conference"},{"id":"http://arxiv.org/abs/2311.12664v2","updated":"2024-02-05T12:50:23Z","published":"2023-11-21T15:14:54Z","title":"The DURel Annotation Tool: Human and Computational Measurement of\n Semantic Proximity, Sense Clusters and Semantic Change","summary":" We present the DURel tool that implements the annotation of semantic\nproximity between uses of words into an online, open source interface. The tool\nsupports standardized human annotation as well as computational annotation,\nbuilding on recent advances with Word-in-Context models. Annotator judgments\nare clustered with automatic graph clustering techniques and visualized for\nanalysis. This makes it possible to measure word senses with simple and\nintuitive micro-task judgments between use pairs, requiring minimal preparation\neffort. The tool offers additional functionalities to compare the agreement\nbetween annotators to guarantee the inter-subjectivity of the obtained\njudgments and to calculate summary statistics giving insights into sense\nfrequency distributions, semantic variation or changes of senses over\ntime.\n","authors":["Dominik Schlechtweg","Shafqat Mumtaz Virk","Pauline Sander","Emma Sköldberg","Lukas Theuer Linke","Tuo Zhang","Nina Tahmasebi","Jonas Kuhn","Sabine Schulte im Walde"],"pdf_url":"https://arxiv.org/pdf/2311.12664v2.pdf","comment":"EACL Demo, 7 pages"},{"id":"http://arxiv.org/abs/2402.01376v2","updated":"2024-02-05T12:42:52Z","published":"2024-02-02T13:00:38Z","title":"LoTR: Low Tensor Rank Weight Adaptation","summary":" In this paper we generalize and extend the idea of low-rank adaptation (LoRA)\nof large language models (LLMs) based on the Transformer architecture. Widely\nused LoRA-like methods of fine-tuning LLMs are based on matrix factorization of\nthe gradient update. We introduce LoTR, a novel approach for\nparameter-efficient fine-tuning of LLMs which represents a gradient update to\nparameters in the form of a tensor decomposition. The low-rank adapter for each\nlayer is constructed as a product of three matrices, and the tensor structure\narises from sharing the left and right multipliers of this product among\nlayers. 
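The shared-factor construction just described can be sketched in a few lines of Python; the shapes, initialization, and names below are illustrative assumptions, not the paper's implementation.

```python
# Sketch of a LoTR-style update: each layer's weight delta is a product of
# three matrices, with the left (A) and right (B) factors shared across
# layers and only a small r x r core kept per layer.
import numpy as np

d, r, n_layers = 768, 8, 12
rng = np.random.default_rng(0)
A = 0.02 * rng.normal(size=(d, r))                   # shared left factor
B = 0.02 * rng.normal(size=(r, d))                   # shared right factor
cores = [np.zeros((r, r)) for _ in range(n_layers)]  # per-layer cores

def delta_weight(layer: int) -> np.ndarray:
    """Low-rank update for one layer: a (d, d) matrix from A @ core @ B."""
    return A @ cores[layer] @ B
```

Under this parameterization the trainable budget is 2dr shared plus r^2 per layer, versus roughly 2dr per layer for a plain LoRA adapter, which is where the parameter savings for deep models would come from.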
Simultaneous compression of a\nsequence of layers with a low-rank tensor representation allows LoTR to achieve\neven better parameter efficiency than LoRA, especially for deep models.\nMoreover, the core tensor does not depend on the original weight dimension and\ncan be made arbitrarily small, which allows for extremely cheap and fast\ndownstream fine-tuning.\n","authors":["Daniel Bershatsky","Daria Cherniuk","Talgat Daulbaev","Aleksandr Mikhalev","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2402.01376v2.pdf","comment":"Submitted; missing author and sections were added;"},{"id":"http://arxiv.org/abs/2309.07445v2","updated":"2024-02-05T12:41:09Z","published":"2023-09-14T05:56:49Z","title":"SIB-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic\n Classification in 200+ Languages and Dialects","summary":" Despite the progress we have recorded in the last few years in multilingual\nnatural language processing, evaluation is typically limited to a small set of\nlanguages with available datasets, which excludes a large number of\nlow-resource languages. In this paper, we created SIB-200 -- a large-scale\nopen-sourced benchmark dataset for topic classification in 200 languages and\ndialects to address the lack of evaluation datasets for Natural Language\nUnderstanding (NLU). For many of the languages covered in SIB-200, this is the\nfirst publicly available evaluation dataset for NLU. The dataset is based on\nthe Flores-200 machine translation corpus. We annotated the English portion of\nthe dataset and extended the sentence-level annotation to the remaining 203\nlanguages covered in the corpus. Despite the simplicity of this task, our\nevaluation in the fully-supervised setting, the cross-lingual transfer setting\nand the large language model prompting setting shows that there is still a\nlarge gap between the performance of high-resource and low-resource languages\nwhen multilingual evaluation is scaled to numerous world languages. We found\nthat languages unseen during the pre-training of multilingual language models,\nunder-represented language families (like Nilotic and Atlantic-Congo), and\nlanguages from the regions of Africa, Americas, Oceania and South East Asia\noften have the lowest performance on our topic classification dataset. We hope\nour dataset will encourage a more inclusive evaluation of multilingual language\nmodels on a more diverse set of languages. https://github.com/dadelani/sib-200\n","authors":["David Ifeoluwa Adelani","Hannah Liu","Xiaoyu Shen","Nikita Vassilyev","Jesujoba O. Alabi","Yanke Mao","Haonan Gao","Annie En-Shiun Lee"],"pdf_url":"https://arxiv.org/pdf/2309.07445v2.pdf","comment":"Accepted to EACL 2024 (main conference)"},{"id":"http://arxiv.org/abs/2402.02926v1","updated":"2024-02-05T11:47:36Z","published":"2024-02-05T11:47:36Z","title":"Automated Cognate Detection as a Supervised Link Prediction Task with\n Cognate Transformer","summary":" Identification of cognates across related languages is one of the primary\nproblems in historical linguistics. Automated cognate identification is helpful\nfor several downstream tasks including identifying sound correspondences,\nproto-language reconstruction, phylogenetic classification, etc. Previous\nstate-of-the-art methods for cognate identification are mostly based on\ndistributions of phonemes computed across multilingual wordlists and make\nlittle use of the cognacy labels that define links among cognate clusters. 
In\nthis paper, we present a transformer-based architecture inspired by\ncomputational biology for the task of automated cognate detection. Beyond a\ncertain amount of supervision, this method performs better than the existing\nmethods, and shows steady improvement with further increases in supervision,\nthereby proving the efficacy of utilizing the labeled information. We also\ndemonstrate that accepting multiple sequence alignments as input and having an\nend-to-end architecture with a link prediction head saves much computation time\nwhile simultaneously yielding superior performance.\n","authors":["V. S. D. S. Mahesh Akavarapu","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2402.02926v1.pdf","comment":"Accepted to EACL-2024 main conference"},{"id":"http://arxiv.org/abs/2402.02915v1","updated":"2024-02-05T11:32:13Z","published":"2024-02-05T11:32:13Z","title":"A Computational Model for the Assessment of Mutual Intelligibility Among\n Closely Related Languages","summary":" Closely related languages show linguistic similarities that allow speakers of\none language to understand speakers of another language without having actively\nlearned it. Mutual intelligibility varies in degree and is typically tested in\npsycholinguistic experiments. To study mutual intelligibility computationally,\nwe propose a computer-assisted method using the Linear Discriminative Learner,\na computational model developed to approximate the cognitive processes by which\nhumans learn languages, which we expand with multilingual semantic vectors and\nmultilingual sound classes. We test the model on cognate data from German,\nDutch, and English, three closely related Germanic languages. We find that our\nmodel's comprehension accuracy depends on 1) the automatic trimming of\ninflections and 2) the language pair for which comprehension is tested. Our\nmultilingual modelling approach not only offers new methodological findings for\nautomatic testing of mutual intelligibility across languages but also extends\nthe use of Linear Discriminative Learning to multilingual\nsettings.\n","authors":["Jessica Nieder","Johann-Mattis List"],"pdf_url":"https://arxiv.org/pdf/2402.02915v1.pdf","comment":"To appear in: Proceedings of the 6th Workshop on Research in\n Computational Linguistic Typology and Multilingual NLP (SIGTYP 2024)"},{"id":"http://arxiv.org/abs/2212.10923v3","updated":"2024-02-05T11:19:24Z","published":"2022-12-21T11:12:14Z","title":"Language Models as Inductive Reasoners","summary":" Inductive reasoning is a core component of human intelligence. In past\nresearch on inductive reasoning within computer science, formal language has\nbeen used as the representation of knowledge (facts and rules, more\nspecifically). However, formal language can cause systematic problems for\ninductive reasoning, such as the inability to handle raw input like natural\nlanguage, sensitivity to mislabeled data, and incapacity to handle ambiguous\ninput. To this end, we propose a new paradigm (task) for inductive reasoning,\nwhich is to induce natural language rules from natural language facts, and\ncreate a dataset termed DEER containing 1.2k rule-fact pairs for the task,\nwhere rules and facts are written in natural language. New automatic metrics\nare also proposed and analysed for the evaluation of this task. With DEER, we\ninvestigate a modern approach for inductive reasoning where we use natural\nlanguage as representation for knowledge instead of formal language and use\npretrained language models as ''reasoners''. 
Moreover, we provide the first comprehensive analysis of how well pretrained\nlanguage models can induce natural language rules from natural language facts.\nWe also propose a new framework, drawing insights from philosophy literature,\nfor this task, which we show in the experiment section surpasses baselines in\nboth automatic and human evaluations. We discuss our future perspectives for\ninductive reasoning in Section 7. Dataset and code are available at\nhttps://github.com/ZonglinY/Inductive_Reasoning.\n","authors":["Zonglin Yang","Li Dong","Xinya Du","Hao Cheng","Erik Cambria","Xiaodong Liu","Jianfeng Gao","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2212.10923v3.pdf","comment":"Accepted by EACL 2024"},{"id":"http://arxiv.org/abs/2311.01256v2","updated":"2024-02-05T11:13:59Z","published":"2023-11-02T14:16:48Z","title":"An energy-based comparative analysis of common approaches to text\n classification in the Legal domain","summary":" Most Machine Learning research evaluates the best solutions in terms of\nperformance. However, in the race for the best performing model, many important\naspects are often overlooked when, on the contrary, they should be carefully\nconsidered. In fact, sometimes the gaps in performance between different\napproaches are negligible, whereas factors such as production costs, energy\nconsumption, and carbon footprint must be taken into consideration. Large\nLanguage Models (LLMs) are extensively adopted to address NLP problems in\nacademia and industry. In this work, we present a detailed quantitative\ncomparison of LLM and traditional approaches (e.g. SVM) on the LexGLUE\nbenchmark, which takes into account both performance (standard indices) and\nalternative metrics such as timing, power consumption and cost, in a word: the\ncarbon footprint. In our analysis, we considered the prototyping phase (model\nselection by training-validation-test iterations) and in-production phases\nseparately, since they follow different implementation procedures and also\nrequire different resources. The results indicate that very often, the simplest\nalgorithms achieve performance very close to that of large LLMs but with very\nlow power consumption and lower resource demands. The results obtained could\nencourage companies to include additional evaluations in the choice of Machine\nLearning (ML) solutions.\n","authors":["Sinan Gultekin","Achille Globo","Andrea Zugarini","Marco Ernandes","Leonardo Rigutini"],"pdf_url":"https://arxiv.org/pdf/2311.01256v2.pdf","comment":"Presented at The 4th International Conference on NLP & Text Mining\n (NLTM 2024), January 27-28 2024, Copenhagen, Denmark - 12 pages, 1 figure, 7\n tables"},{"id":"http://arxiv.org/abs/2402.02896v1","updated":"2024-02-05T11:05:20Z","published":"2024-02-05T11:05:20Z","title":"LLM Agents in Interaction: Measuring Personality Consistency and\n Linguistic Alignment in Interacting Populations of Large Language Models","summary":" While both agent interaction and personalisation are vibrant topics in\nresearch on large language models (LLMs), there has been limited focus on the\neffect of language interaction on the behaviour of persona-conditioned LLM\nagents. Such an endeavour is important to ensure that agents remain consistent\nto their assigned traits yet are able to engage in open, naturalistic\ndialogues. In our experiments, we condition GPT-3.5 on personality profiles\nthrough prompting and create a two-group population of LLM agents using a\nsimple variability-inducing sampling algorithm. 
We then administer personality\ntests and submit the agents to a collaborative writing task, finding that\ndifferent profiles exhibit different degrees of personality consistency and\nlinguistic alignment to their conversational partners. Our study seeks to lay\nthe groundwork for better understanding of dialogue-based interaction between\nLLMs and highlights the need for new approaches to crafting robust, more\nhuman-like LLM personas for interactive environments.\n","authors":["Ivar Frisch","Mario Giulianelli"],"pdf_url":"https://arxiv.org/pdf/2402.02896v1.pdf","comment":"To appear in Proceedings of the 1st Personalization of Generative AI\n Workshop, EACL 2024"},{"id":"http://arxiv.org/abs/2401.17072v2","updated":"2024-02-05T10:53:21Z","published":"2024-01-30T14:52:50Z","title":"SemScore: Automated Evaluation of Instruction-Tuned LLMs based on\n Semantic Textual Similarity","summary":" Instruction-tuned Large Language Models (LLMs) have recently showcased\nremarkable advancements in their ability to generate fitting responses to\nnatural language instructions. However, many current works rely on manual\nevaluation to judge the quality of generated responses. Since such manual\nevaluation is time-consuming, it does not easily scale to the evaluation of\nmultiple models and model variants. In this short paper, we propose a\nstraightforward but remarkably effective evaluation metric called SemScore, in\nwhich we directly compare model outputs to gold target responses using semantic\ntextual similarity (STS). We conduct a comparative evaluation of the model\noutputs of 12 prominent instruction-tuned LLMs using 8 widely-used evaluation\nmetrics for text generation. We find that our proposed SemScore metric\noutperforms all other, in many cases more complex, evaluation metrics in terms\nof correlation to human evaluation. These findings indicate the utility of our\nproposed metric for the evaluation of instruction-tuned LLMs.\n","authors":["Ansar Aynetdinov","Alan Akbik"],"pdf_url":"https://arxiv.org/pdf/2401.17072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02883v1","updated":"2024-02-05T10:49:05Z","published":"2024-02-05T10:49:05Z","title":"Approximate Attributions for Off-the-Shelf Siamese Transformers","summary":" Siamese encoders such as sentence transformers are among the least understood\ndeep models. Established attribution methods cannot tackle this model class\nsince it compares two inputs rather than processing a single one. To address\nthis gap, we have recently proposed an attribution method specifically for\nSiamese encoders (M\\\"oller et al., 2023). However, it requires models to be\nadjusted and fine-tuned and therefore cannot be directly applied to\noff-the-shelf models. In this work, we reassess these restrictions and propose\n(i) a model with exact attribution ability that retains the original model's\npredictive performance and (ii) a way to compute approximate attributions for\noff-the-shelf models. We extensively compare approximate and exact attributions\nand use them to analyze the models' attendance to different linguistic aspects.\nWe gain insights into which syntactic roles Siamese transformers attend to,\nconfirm that they mostly ignore negation, explore how they judge semantically\nopposite adjectives, and find that they exhibit lexical bias.\n","authors":["Lucas Möller","Dmitry Nikolaev","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2402.02883v1.pdf","comment":"Accepted for EACL 2024, St. 
Julian's, Malta"},{"id":"http://arxiv.org/abs/2402.02872v1","updated":"2024-02-05T10:39:32Z","published":"2024-02-05T10:39:32Z","title":"How do Large Language Models Learn In-Context? Query and Key Matrices of\n In-Context Heads are Two Towers for Metric Learning","summary":" We explore the mechanism of in-context learning and propose a hypothesis\nusing a locate-and-project method. In shallow layers, the features of\ndemonstrations are merged into their corresponding labels, and the features of\nthe input text are aggregated into the last token. In deep layers, in-context\nheads make great contributions. In each in-context head, the value-output\nmatrix extracts the labels' features. Query and key matrices compute the\nattention weights between the input text and each demonstration. The larger the\nattention weight is, the more label information is transferred into the last\ntoken for predicting the next word. Query and key matrices can be regarded as\ntwo towers for learning the similarity metric between the input text and each\ndemonstration. Based on this hypothesis, we explain why imbalanced labels and\ndemonstration order affect predictions. We conduct experiments on GPT2 large,\nLlama 7B, 13B and 30B. The results support our analysis. Overall, our study\nprovides a new method and a reasonable hypothesis for understanding the\nmechanism of in-context learning. Our code will be released on\nGitHub.\n","authors":["Zeping Yu","Sophia Ananiadou"],"pdf_url":"https://arxiv.org/pdf/2402.02872v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2310.12531v3","updated":"2024-02-05T10:28:43Z","published":"2023-10-19T07:11:48Z","title":"ICU: Conquering Language Barriers in Vision-and-Language Modeling by\n Dividing the Tasks into Image Captioning and Language Understanding","summary":" Most multilingual vision-and-language (V&L) research aims to accomplish\nmultilingual and multimodal capabilities within one model. However, the\nscarcity of multilingual captions for images has hindered development. To\novercome this obstacle, we propose ICU, Image Caption Understanding, which\ndivides a V&L task into two stages: a V&L model performs image captioning in\nEnglish, and a multilingual language model (mLM), in turn, takes the caption as\nthe alt text and performs cross-lingual language understanding. The burden of\nmultilingual processing is lifted off the V&L model and placed on the mLM.\nSince multilingual text data is of relatively higher abundance and quality, ICU\ncan help V&L models overcome language barriers. In experiments on two tasks\nacross 9 languages in the IGLUE benchmark, we show that ICU can achieve new\nstate-of-the-art results for five languages, and comparable results for the\nrest.\n","authors":["Guojun Wu"],"pdf_url":"https://arxiv.org/pdf/2310.12531v3.pdf","comment":"EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2402.02864v1","updated":"2024-02-05T10:24:40Z","published":"2024-02-05T10:24:40Z","title":"EEVEE: An Easy Annotation Tool for Natural Language Processing","summary":" Annotation tools are the starting point for creating Natural Language\nProcessing (NLP) datasets. There is a wide variety of tools available; setting\nup these tools is however a hindrance. We propose EEVEE, an annotation tool\nfocused on simplicity, efficiency, and ease of use. It can run directly in the\nbrowser (no setup required) and uses tab-separated files (as opposed to\ncharacter offsets or task-specific formats) for annotation. 
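A toy numerical sketch of the two-towers hypothesis described above: the query/key projections score the similarity between the input text and each demonstration, and the resulting attention weights gate how much value-projected label information reaches the last token. All matrices below are random stand-ins, not trained weights.

```python
# Toy sketch: attention weights from the query/key "towers" decide how much
# of each demonstration label's (value-projected) features reach the last token.
import numpy as np

d, n_demos = 64, 4
rng = np.random.default_rng(1)
W_q, W_k, W_v = (rng.normal(size=(d, d)) for _ in range(3))
x_last = rng.normal(size=d)                  # aggregated input-text features
demo_labels = rng.normal(size=(n_demos, d))  # label-position features

q = W_q @ x_last
k = demo_labels @ W_k.T
scores = k @ q / np.sqrt(d)
weights = np.exp(scores - scores.max())
weights /= weights.sum()                     # attention over demonstrations
transferred = weights @ (demo_labels @ W_v.T)  # label info sent to last token
```

An imbalanced label set would skew these weights toward the over-represented label, which is consistent with the hypothesis's explanation of why imbalanced demonstrations affect predictions.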
It allows for\nannotation of multiple tasks on a single dataset and supports four task-types:\nsequence labeling, span labeling, text classification and seq2seq.\n","authors":["Axel Sorensen","Siyao Peng","Barbara Plank","Rob van der Goot"],"pdf_url":"https://arxiv.org/pdf/2402.02864v1.pdf","comment":"6 pages; accepted to The Linguistic Annotation Workshop (LAW) at EACL\n 2024"},{"id":"http://arxiv.org/abs/2308.15154v2","updated":"2024-02-05T10:11:37Z","published":"2023-08-29T09:35:23Z","title":"The Anatomy of Conspirators: Unveiling Traits using a Comprehensive\n Twitter Dataset","summary":" The discourse around conspiracy theories is currently thriving amidst the\nrampant misinformation in online environments. Research in this field has been\nfocused on detecting conspiracy theories on social media, often relying on\nlimited datasets. In this study, we present a novel methodology for\nconstructing a Twitter dataset that encompasses accounts engaged in\nconspiracy-related activities throughout the year 2022. Our approach centers on\ndata collection that is independent of specific conspiracy theories and\ninformation operations. Additionally, our dataset includes a control group\ncomprising randomly selected users who can be fairly compared to the\nindividuals involved in conspiracy activities. This comprehensive collection\neffort yielded a total of 15K accounts and 37M tweets extracted from their\ntimelines. We conduct a comparative analysis of the two groups across three\ndimensions: topics, profiles, and behavioral characteristics. The results\nindicate that conspiracy and control users exhibit similarity in terms of their\nprofile metadata characteristics. However, they diverge significantly in terms\nof behavior and activity, particularly regarding the discussed topics, the\nterminology used, and their stance on trending subjects. In addition, we find\nno significant disparity in the presence of bot users between the two groups.\nFinally, we develop a classifier to identify conspiracy users using features\nborrowed from bot, troll and linguistic literature. The results demonstrate a\nhigh accuracy level (with an F1 score of 0.94), enabling us to uncover the most\ndiscriminating features associated with conspiracy-related accounts.\n","authors":["Margherita Gambini","Serena Tardelli","Maurizio Tesconi"],"pdf_url":"https://arxiv.org/pdf/2308.15154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02844v1","updated":"2024-02-05T09:57:15Z","published":"2024-02-05T09:57:15Z","title":"Comparing Knowledge Sources for Open-Domain Scientific Claim\n Verification","summary":" The increasing rate at which scientific knowledge is discovered and health\nclaims shared online has highlighted the importance of developing efficient\nfact-checking systems for scientific claims. The usual setting for this task in\nthe literature assumes that the documents containing the evidence for claims\nare already provided and annotated or contained in a limited corpus. This\nrenders the systems unrealistic for real-world settings where knowledge sources\nwith potentially millions of documents need to be queried to find relevant\nevidence. In this paper, we perform an array of experiments to test the\nperformance of open-domain claim verification systems. We test the final\nverdict prediction of systems on four datasets of biomedical and health claims\nin different settings. 
While keeping the pipeline's evidence selection and\nverdict prediction parts constant, document retrieval is performed over three\ncommon knowledge sources (PubMed, Wikipedia, Google) using two different\ninformation retrieval techniques. We show that PubMed works better with\nspecialized biomedical claims, while Wikipedia is more suited for everyday\nhealth concerns. Likewise, BM25 excels in retrieval precision, while semantic\nsearch excels in recall of relevant evidence. We discuss the results, outline\nfrequent retrieval patterns and challenges, and provide promising future\ndirections.\n","authors":["Juraj Vladika","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2402.02844v1.pdf","comment":"Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2402.02837v1","updated":"2024-02-05T09:48:07Z","published":"2024-02-05T09:48:07Z","title":"With a Little Help from my (Linguistic) Friends: Topic Segmentation of\n Multi-party Casual Conversations","summary":" Topics play an important role in the global organisation of a conversation as\nwhat is currently discussed constrains the possible contributions of the\nparticipants. Understanding the way topics are organised in interaction would\nprovide insight on the structure of dialogue beyond the sequence of utterances.\nHowever, studying this high-level structure is a complex task that we try to\napproach by first segmenting dialogues into smaller topically coherent sets of\nutterances. Understanding the interactions between these segments would then\nenable us to propose a model of topic organisation at a dialogue level. In this\npaper we work with open-domain conversations and try to reach a level of\naccuracy comparable to recent machine-learning-based topic segmentation models,\nbut with a formal approach. The features we identify as meaningful for this\ntask help us better understand the topical structure of a\nconversation.\n","authors":["Amandine Decker","Maxime Amblard"],"pdf_url":"https://arxiv.org/pdf/2402.02837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05591v2","updated":"2024-02-05T09:10:15Z","published":"2023-07-10T17:59:21Z","title":"Linear Alignment of Vision-language Models for Image Captioning","summary":" Recently, vision-language models like CLIP have advanced the state of the art\nin a variety of multi-modal tasks including image captioning and caption\nevaluation. Many approaches adapt CLIP-style models to a downstream task by\ntraining a mapping network between CLIP and a language model. This is costly as\nit usually involves calculating gradients for large models. We propose a more\nefficient training protocol that fits a linear mapping between image and text\nembeddings of CLIP via a closed-form solution. This bypasses the need for\ngradient computation and results in a lightweight captioning method called\nReCap, which can be trained up to 1000 times faster than existing lightweight\nmethods. Moreover, we propose two new learning-based image-captioning metrics\nthat build on CLIP score along with our linear mapping. Furthermore, we combine\nReCap with our new metrics to design an iterative datastore-augmentation loop\n(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k,\nVizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art\nlightweight methods on established metrics while outperforming them on our new\nmetrics, which are better aligned with human ratings on Flickr8k-Expert and\nFlickr8k-Crowdflower. 
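Because the ReCap training step described above is a closed-form linear fit between CLIP's image and text embedding spaces, it can be sketched directly; the ridge term and the function signature are assumptions of this sketch, not the paper's exact solver.

```python
# Sketch: fit a linear map W from image embeddings to text embeddings in
# closed form (regularized least squares), with no gradient computation.
import numpy as np

def fit_linear_map(img_emb: np.ndarray, txt_emb: np.ndarray,
                   ridge: float = 1e-3) -> np.ndarray:
    """Solve argmin_W ||img_emb @ W - txt_emb||^2 + ridge * ||W||^2."""
    d = img_emb.shape[1]
    gram = img_emb.T @ img_emb + ridge * np.eye(d)
    return np.linalg.solve(gram, img_emb.T @ txt_emb)

# Placeholder data: 1000 paired image/text embeddings of dimension 512.
I = np.random.randn(1000, 512)
T = np.random.randn(1000, 512)
W = fit_linear_map(I, T)          # then: projected = new_image_emb @ W
```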
Finally, we demonstrate that ReCap transfers well to\nother domains and that our DAL leads to a performance boost.\n","authors":["Fabian Paischer","Markus Hofmarcher","Sepp Hochreiter","Thomas Adler"],"pdf_url":"https://arxiv.org/pdf/2307.05591v2.pdf","comment":"8 pages (+ references and appendix)"},{"id":"http://arxiv.org/abs/2402.02807v1","updated":"2024-02-05T08:35:33Z","published":"2024-02-05T08:35:33Z","title":"Are Sounds Sound for Phylogenetic Reconstruction?","summary":" In traditional studies on language evolution, scholars often emphasize the\nimportance of sound laws and sound correspondences for phylogenetic inference\nof language family trees. However, to date, computational approaches have\ntypically not taken this potential into account. Most computational studies\nstill rely on lexical cognates as major data source for phylogenetic\nreconstruction in linguistics, although there do exist a few studies in which\nauthors praise the benefits of comparing words at the level of sound sequences.\nBuilding on (a) ten diverse datasets from different language families, and (b)\nstate-of-the-art methods for automated cognate and sound correspondence\ndetection, we test, for the first time, the performance of sound-based versus\ncognate-based approaches to phylogenetic reconstruction. Our results show that\nphylogenies reconstructed from lexical cognates are topologically closer, by\napproximately one third with respect to the generalized quartet distance on\naverage, to the gold standard phylogenies than phylogenies reconstructed from\nsound correspondences.\n","authors":["Luise Häuser","Gerhard Jäger","Taraka Rama","Johann-Mattis List","Alexandros Stamatakis"],"pdf_url":"https://arxiv.org/pdf/2402.02807v1.pdf","comment":"Paper accepted for SIGTYP (2024): H\\\"auser, Luise; J\\\"ager, Gerhard;\n List, Johann-Mattis; Rama, Taraka; and Stamatakis, Alexandros (2024): Are\n sounds sound for phylogenetic reconstruction? In: Proceedings of the 6th\n Workshop on Research in Computational Linguistic Typology and Multilingual\n NLP (SIGTYP 2024)"},{"id":"http://arxiv.org/abs/2402.02805v1","updated":"2024-02-05T08:26:33Z","published":"2024-02-05T08:26:33Z","title":"Graph-enhanced Large Language Models in Asynchronous Plan Reasoning","summary":" Reasoning about asynchronous plans is challenging since it requires\nsequential and parallel planning to optimize time costs. Can large language\nmodels (LLMs) succeed at this task? Here, we present the first large-scale\nstudy investigating this question. We find that a representative set of closed\nand open-source LLMs, including GPT-4 and LLaMA-2, behave poorly when not\nsupplied with illustrations about the task-solving process in our benchmark\nAsyncHow. We propose a novel technique called Plan Like a Graph (PLaG) that\ncombines graphs with natural language prompts and achieves state-of-the-art\nresults. We show that although PLaG can boost model performance, LLMs still\nsuffer from drastic degradation when task complexity increases, highlighting\nthe limits of utilizing LLMs for simulating digital devices. We see our study\nas an exciting step towards using LLMs as efficient autonomous agents.\n","authors":["Fangru Lin","Emanuele La Malfa","Valentin Hofmann","Elle Michelle Yang","Anthony Cohn","Janet B. 
Pierrehumbert"],"pdf_url":"https://arxiv.org/pdf/2402.02805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02801v1","updated":"2024-02-05T08:19:56Z","published":"2024-02-05T08:19:56Z","title":"KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language\n Models","summary":" The lottery ticket hypothesis posits the existence of ``winning tickets''\nwithin a randomly initialized neural network. Do winning tickets exist for LLMs\nin fine-tuning scenarios? How can we find such winning tickets? In this paper,\nwe propose KS-Lottery, a method to identify a small subset of LLM parameters\nhighly effective in multilingual fine-tuning. Our key idea is to use the\nKolmogorov-Smirnov Test to analyze the distribution shift of parameters before\nand after fine-tuning. We further theoretically prove that KS-Lottery can find\nthe certified winning tickets in the embedding layer; fine-tuning on the found\nparameters is guaranteed to perform as well as full fine-tuning. Comparing\nKS-Lottery with other parameter-efficient tuning algorithms on translation\ntasks, the experimental results show that KS-Lottery finds a much smaller set\nof parameters for fine-tuning while achieving performance comparable to full\nfine-tuning of the LLM. Surprisingly, we find that fine-tuning the embeddings\nof just 18 tokens in LLaMA suffices to match the full fine-tuning translation\nperformance. Code and model will be released to the\npublic.\n","authors":["Fei Yuan","Chang Ma","Shuai Yuan","Qiushi Sun","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2402.02801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10172v3","updated":"2024-02-05T08:10:52Z","published":"2023-07-19T17:57:53Z","title":"DialogStudio: Towards Richest and Most Diverse Unified Dataset\n Collection for Conversational AI","summary":" Despite advancements in conversational AI, language models encounter\nchallenges in handling diverse conversational tasks, and existing dialogue\ndataset collections often lack diversity and comprehensiveness. To tackle these\nissues, we introduce DialogStudio: the largest and most diverse collection of\ndialogue datasets, unified under a consistent format while preserving their\noriginal information. Our collection encompasses data from open-domain\ndialogues, task-oriented dialogues, natural language understanding,\nconversational recommendation, dialogue summarization, and knowledge-grounded\ndialogues, making it an incredibly rich and diverse resource for dialogue\nresearch and model training. To further enhance the utility of DialogStudio, we\nidentify the licenses for each dataset, design external knowledge and\ndomain-aware prompts for selected dialogues to facilitate instruction-aware\nfine-tuning. Furthermore, we develop conversational AI models using the dataset\ncollection, and our experiments in both zero-shot and few-shot learning\nscenarios demonstrate the superiority of DialogStudio. To improve transparency\nand support dataset and task-based research, as well as language model\npre-training, all datasets, licenses, codes, and models associated with\nDialogStudio are made publicly\naccessible\\footnote{\\url{https://github.com/salesforce/DialogStudio}}.\n","authors":["Jianguo Zhang","Kun Qian","Zhiwei Liu","Shelby Heinecke","Rui Meng","Ye Liu","Zhou Yu","Huan Wang","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2307.10172v3.pdf","comment":"17 pages, accepted by EACL 2024 Findings as a long paper. 
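A minimal sketch of the selection criterion KS-Lottery describes above, scoring each parameter group by the Kolmogorov-Smirnov distance between its values before and after fine-tuning; the dictionary format and the top-k cut are illustrative assumptions.

```python
# Sketch: rank parameter groups by KS distance between their pre- and
# post-fine-tuning value distributions, then keep the most-shifted groups.
import numpy as np
from scipy.stats import ks_2samp

def ks_shift_scores(before: dict, after: dict) -> dict:
    """before/after map parameter names to numpy arrays of equal shape."""
    return {name: ks_2samp(before[name].ravel(), after[name].ravel()).statistic
            for name in before}

def winning_tickets(scores: dict, k: int) -> list:
    """Return the names of the k parameter groups with the largest shift."""
    return sorted(scores, key=scores.get, reverse=True)[:k]
```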
All\n datasets, licenses, codes, and models are available at\n https://github.com/salesforce/DialogStudio"},{"id":"http://arxiv.org/abs/2402.02791v1","updated":"2024-02-05T07:59:38Z","published":"2024-02-05T07:59:38Z","title":"Rethinking Optimization and Architecture for Tiny Language Models","summary":" The power of large language models (LLMs) has been demonstrated through\nnumerous data and computing resources. However, the application of language\nmodels on mobile devices is facing huge challenges in computation and memory\ncosts; that is, tiny language models with high performance are urgently\nrequired. Limited by the highly complex training process, there are many\ndetails for optimizing language models that are seldom studied carefully. In\nthis study, based on a tiny language model with 1B parameters, we carefully\ndesign a series of empirical studies to analyze the effect of each component.\nThree perspectives are mainly discussed, i.e., neural architecture, parameter\ninitialization, and optimization strategy. Several design formulas are\nempirically proven to be especially effective for tiny language models,\nincluding tokenizer compression, architecture tweaking, parameter inheritance\nand multiple-round training. Then we train PanGu-$\\pi$-1B Pro and\nPanGu-$\\pi$-1.5B Pro on 1.6T multilingual corpora, following the established\nformulas. Experimental results demonstrate that the improved optimization and\narchitecture yield a notable average improvement of 8.87 on benchmark\nevaluation sets for PanGu-$\\pi$-1B Pro. Besides, PanGu-$\\pi$-1.5B Pro surpasses\na range of SOTA models with larger model sizes, validating its superior\nperformance. The code will be released soon\n(https://github.com/YuchuanTian/RethinkTinyLM).\n","authors":["Yehui Tang","Fangcheng Liu","Yunsheng Ni","Yuchuan Tian","Zheyuan Bai","Yi-Qi Hu","Sichao Liu","Shangling Jui","Kai Han","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02782v1","updated":"2024-02-05T07:33:25Z","published":"2024-02-05T07:33:25Z","title":"From Partial to Strictly Incremental Constituent Parsing","summary":" We study incremental constituent parsers to assess their capacity to output\ntrees based on prefix representations alone. Guided by strictly left-to-right\ngenerative language models and tree-decoding modules, we build parsers that\nadhere to a strong definition of incrementality across languages. This builds\nupon work that asserted incrementality, but that mostly only enforced it on\neither the encoder or the decoder. Finally, we conduct an analysis against\nnon-incremental and partially incremental models.\n","authors":["Ana Ezquerro","Carlos Gómez-Rodríguez","David Vilares"],"pdf_url":"https://arxiv.org/pdf/2402.02782v1.pdf","comment":"Accepted at EACL 2024"},{"id":"http://arxiv.org/abs/2402.02781v1","updated":"2024-02-05T07:30:32Z","published":"2024-02-05T07:30:32Z","title":"Dual Knowledge Distillation for Efficient Sound Event Detection","summary":" Sound event detection (SED) is essential for recognizing specific sounds and\ntheir temporal locations within acoustic signals. This becomes particularly\nchallenging for on-device applications, where computational resources are\nlimited. 
To address this issue, we introduce a novel framework referred to as\ndual knowledge distillation for developing efficient SED systems. Our proposed\ndual knowledge distillation commences with temporal-averaging knowledge\ndistillation (TAKD), utilizing a mean student model derived from the temporal\naveraging of the student model's parameters. This allows the student model to\nindirectly learn from a pre-trained teacher model, ensuring a stable knowledge\ndistillation. Subsequently, we introduce embedding-enhanced feature\ndistillation (EEFD), which involves incorporating an embedding distillation\nlayer within the student model to bolster contextual learning. On the DCASE\n2023 Task 4A public evaluation dataset, our proposed SED system with dual\nknowledge distillation, having merely one-third of the baseline model's\nparameters, demonstrates superior performance in terms of PSDS1 and PSDS2. This\nhighlights the importance of the proposed dual knowledge distillation for\ncompact SED systems, which can be ideal for edge devices.\n","authors":["Yang Xiao","Rohan Kumar Das"],"pdf_url":"https://arxiv.org/pdf/2402.02781v1.pdf","comment":"Accepted to ICASSP 2024 (Deep Neural Network Model Compression\n Workshop)"},{"id":"http://arxiv.org/abs/2305.14982v2","updated":"2024-02-05T07:27:18Z","published":"2023-05-24T10:16:16Z","title":"LAraBench: Benchmarking Arabic AI with Large Language Models","summary":" Recent advancements in Large Language Models (LLMs) have significantly\ninfluenced the landscape of language and speech research. Despite this\nprogress, these models lack specific benchmarking against state-of-the-art\n(SOTA) models tailored to particular languages and tasks. LAraBench addresses\nthis gap for Arabic Natural Language Processing (NLP) and Speech Processing\ntasks, including sequence tagging and content classification across different\ndomains. We utilized models such as GPT-3.5-turbo, GPT-4, BLOOMZ,\nJais-13b-chat, Whisper, and USM, employing zero and few-shot learning\ntechniques to tackle 33 distinct tasks across 61 publicly available datasets.\nThis involved 98 experimental setups, encompassing ~296K data points, ~46 hours\nof speech, and 30 sentences for Text-to-Speech (TTS). This effort resulted in\n330+ sets of experiments. Our analysis focused on measuring the performance gap\nbetween SOTA models and LLMs. The overarching trend observed was that SOTA\nmodels generally outperformed LLMs in zero-shot learning, with a few\nexceptions. Notably, larger computational models with few-shot learning\ntechniques managed to reduce these performance gaps. 
Our findings provide\nvaluable insights into the applicability of LLMs for Arabic NLP and speech\nprocessing tasks.\n","authors":["Ahmed Abdelali","Hamdy Mubarak","Shammur Absar Chowdhury","Maram Hasanain","Basel Mousi","Sabri Boughorbel","Yassine El Kheir","Daniel Izham","Fahim Dalvi","Majd Hawasly","Nizi Nazar","Yousseif Elshahawy","Ahmed Ali","Nadir Durrani","Natasa Milic-Frayling","Firoj Alam"],"pdf_url":"https://arxiv.org/pdf/2305.14982v2.pdf","comment":"Foundation Models, Large Language Models, Arabic NLP, Arabic Speech,\n Arabic AI, GPT3.5 Evaluation, USM Evaluation, Whisper Evaluation, GPT-4,\n BLOOMZ, Jais13b"},{"id":"http://arxiv.org/abs/2402.02764v1","updated":"2024-02-05T06:52:53Z","published":"2024-02-05T06:52:53Z","title":"List-aware Reranking-Truncation Joint Model for Search and\n Retrieval-augmented Generation","summary":" The results of information retrieval (IR) are usually presented in the form\nof a ranked list of candidate documents, such as web search for humans and\nretrieval-augmented generation for large language models (LLMs). List-aware\nretrieval aims to capture the list-level contextual features to return a better\nlist, mainly including reranking and truncation. Reranking finely re-scores the\ndocuments in the list. Truncation dynamically determines the cut-off point of\nthe ranked list to achieve the trade-off between overall relevance and avoiding\nmisinformation from irrelevant documents. Previous studies treat them as two\nseparate tasks and model them separately. However, the separation is not\noptimal. First, it is hard to share the contextual information of the ranking\nlist between the two tasks. Second, the separate pipeline usually meets the\nerror accumulation problem, where the small error from the reranking stage can\nlargely affect the truncation stage. To solve these problems, we propose a\nReranking-Truncation joint model (GenRT) that can perform the two tasks\nconcurrently. GenRT integrates reranking and truncation via generative paradigm\nbased on encoder-decoder architecture. We also design the novel loss functions\nfor joint optimization to make the model learn both tasks. Sharing parameters\nby the joint model is conducive to making full use of the common modeling\ninformation of the two tasks. Besides, the two tasks are performed concurrently\nand co-optimized to solve the error accumulation problem between separate\nstages. Experiments on public learning-to-rank benchmarks and open-domain Q\\&A\ntasks show that our method achieves SOTA performance on both reranking and\ntruncation tasks for web search and retrieval-augmented LLMs.\n","authors":["Shicheng Xu","Liang Pang","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.02764v1.pdf","comment":"Accepted by WWW 2024"},{"id":"http://arxiv.org/abs/2402.01622v2","updated":"2024-02-05T06:48:01Z","published":"2024-02-02T18:39:51Z","title":"TravelPlanner: A Benchmark for Real-World Planning with Language Agents","summary":" Planning has been part of the core pursuit for artificial intelligence since\nits conception, but earlier AI agents mostly focused on constrained settings\nbecause many of the cognitive substrates necessary for human-level planning\nhave been lacking. Recently, language agents powered by large language models\n(LLMs) have shown interesting capabilities such as tool use and reasoning. Are\nthese language agents capable of planning in more complex settings that are out\nof the reach of prior AI agents? 
To advance this investigation, we propose\nTravelPlanner, a new planning benchmark that focuses on travel planning, a\ncommon real-world planning scenario. It provides a rich sandbox environment,\nvarious tools for accessing nearly four million data records, and 1,225\nmeticulously curated planning intents and reference plans. Comprehensive\nevaluations show that the current language agents are not yet capable of\nhandling such complex planning tasks; even GPT-4 only achieves a success rate of\n0.6%. Language agents struggle to stay on task, use the right tools to collect\ninformation, or keep track of multiple constraints. However, we note that the\nmere possibility for language agents to tackle such a complex problem is in\nitself non-trivial progress. TravelPlanner provides a challenging yet\nmeaningful testbed for future language agents.\n","authors":["Jian Xie","Kai Zhang","Jiangjie Chen","Tinghui Zhu","Renze Lou","Yuandong Tian","Yanghua Xiao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2402.01622v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2305.13954v3","updated":"2024-02-05T06:42:38Z","published":"2023-05-23T11:30:43Z","title":"Robust Prompt Optimization for Large Language Models Against\n Distribution Shifts","summary":" Large Language Models (LLMs) have demonstrated significant ability in various\nNatural Language Processing tasks. However, their effectiveness is highly\ndependent on the phrasing of the task prompt, leading to research on automatic\nprompt optimization using labeled task data. We reveal that these prompt\noptimization techniques are vulnerable to distribution shifts such as\nsubpopulation shifts, which are common for LLMs in real-world scenarios such as\ncustomer review analysis. In this light, we propose a new problem of robust\nprompt optimization for LLMs against distribution shifts, which requires that a\nprompt optimized over the labeled source group simultaneously generalize to\nan unlabeled target group. To solve this problem, we propose the Generalized Prompt\nOptimization framework, which incorporates the unlabeled data from the target\ngroup into prompt optimization. Extensive experimental results demonstrate the\neffectiveness of the proposed framework with significant performance\nimprovement on the target group and comparable performance on the source group.\n","authors":["Moxin Li","Wenjie Wang","Fuli Feng","Yixin Cao","Jizhi Zhang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2305.13954v3.pdf","comment":"EMNLP 2023 Main"},{"id":"http://arxiv.org/abs/2402.02750v1","updated":"2024-02-05T06:06:47Z","published":"2024-02-05T06:06:47Z","title":"KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache","summary":" Efficiently serving large language models (LLMs) requires batching many\nrequests together to reduce the cost per request. Yet, the key-value (KV)\ncache, which stores attention keys and values to avoid re-computations,\nsignificantly increases memory demands and becomes the new bottleneck in speed\nand memory usage. This memory demand increases with larger batch sizes and\nlonger context lengths. Additionally, the inference speed is limited by the\nsize of the KV cache, as the GPU's SRAM must load the entire KV cache from the main\nGPU memory for each token generated, causing the computational core to be idle\nduring this process.
A straightforward and effective solution to reduce KV\ncache size is quantization, which decreases the total bytes taken by the KV cache.\nHowever, there is a lack of in-depth studies that explore the element\ndistribution of the KV cache to understand the difficulty and limitations of KV cache\nquantization. To fill the gap, we conducted a comprehensive study on the\nelement distribution in the KV cache of popular LLMs. Our findings indicate that\nthe key cache should be quantized per-channel, i.e., group elements along the\nchannel dimension and quantize them together. In contrast, the value cache\nshould be quantized per-token. From this analysis, we developed a tuning-free\n2bit KV cache quantization algorithm, named KIVI. With the hardware-friendly\nimplementation, KIVI can enable Llama (Llama-2), Falcon, and Mistral models to\nmaintain almost the same quality while using $\\mathbf{2.6\\times}$ less peak\nmemory usage (including the model weight). This reduction in memory usage\nenables up to $\\mathbf{4\\times}$ larger batch sizes, bringing\n$\\mathbf{2.35\\times \\sim 3.47\\times}$ higher throughput on real LLM inference\nworkloads. The source code is available at https://github.com/jy-yuan/KIVI.\n","authors":["Zirui Liu","Jiayi Yuan","Hongye Jin","Shaochen Zhong","Zhaozhuo Xu","Vladimir Braverman","Beidi Chen","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2402.02750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01152v2","updated":"2024-02-05T05:45:59Z","published":"2024-02-02T05:38:59Z","title":"AccentFold: A Journey through African Accents for Zero-Shot ASR\n Adaptation to Target Accents","summary":" Despite advancements in speech recognition, accented speech remains\nchallenging. While previous approaches have focused on modeling techniques or\ncreating accented speech datasets, gathering sufficient data for the multitude\nof accents, particularly in the African context, remains impractical due to\ntheir sheer diversity and associated budget constraints. To address these\nchallenges, we propose AccentFold, a method that exploits spatial relationships\nbetween learned accent embeddings to improve downstream Automatic Speech\nRecognition (ASR). Our exploratory analysis of speech embeddings representing\n100+ African accents reveals interesting spatial accent relationships\nhighlighting geographic and genealogical similarities, capturing consistent\nphonological and morphological regularities, all learned empirically from\nspeech. Furthermore, we discover accent relationships previously\nuncharacterized in the Ethnologue. Through empirical evaluation, we demonstrate\nthe effectiveness of AccentFold by showing that, for out-of-distribution (OOD)\naccents, sampling accent subsets for training based on AccentFold information\noutperforms strong baselines with a relative WER improvement of 4.6%. AccentFold\npresents a promising approach for improving ASR performance on accented speech,\nparticularly in the context of African accents, where data scarcity and budget\nconstraints pose significant challenges.
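The per-channel vs. per-token asymmetric quantization that KIVI's analysis motivates can be sketched as follows. This is a simplified illustration of the two grouping directions only; the paper's tuning-free 2-bit algorithm and hardware-friendly implementation are more involved, and the tensor shapes below are assumptions:

```python
# Hedged sketch of asymmetric low-bit KV-cache quantization: keys quantized
# per-channel, values per-token, as the analysis above suggests.
import torch

def asym_quant(x: torch.Tensor, dim: int, bits: int = 2):
    """Asymmetric quantization of x; groups along `dim` share scale/zero-point."""
    qmax = 2 ** bits - 1
    xmin = x.amin(dim=dim, keepdim=True)
    xmax = x.amax(dim=dim, keepdim=True)
    scale = (xmax - xmin).clamp(min=1e-8) / qmax
    q = torch.clamp(torch.round((x - xmin) / scale), 0, qmax)
    return q, scale, xmin

def dequant(q, scale, xmin):
    return q * scale + xmin

# KV cache tensors shaped (tokens, channels):
keys = torch.randn(128, 64)
values = torch.randn(128, 64)
# Per-channel for keys: statistics shared along the token axis (dim=0).
k_q, k_s, k_m = asym_quant(keys, dim=0)
# Per-token for values: statistics shared along the channel axis (dim=1).
v_q, v_s, v_m = asym_quant(values, dim=1)
print((dequant(k_q, k_s, k_m) - keys).abs().mean())  # 2-bit reconstruction error
```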
Our findings emphasize the potential\nof leveraging linguistic relationships to improve zero-shot ASR adaptation to\ntarget accents.\n","authors":["Abraham Toluwase Owodunni","Aditya Yadavalli","Chris Chinenye Emezue","Tobi Olatunji","Clinton C Mbataku"],"pdf_url":"https://arxiv.org/pdf/2402.01152v2.pdf","comment":"Accepted to EACL Findings 2024"},{"id":"http://arxiv.org/abs/2306.07629v3","updated":"2024-02-05T05:42:32Z","published":"2023-06-13T08:57:54Z","title":"SqueezeLLM: Dense-and-Sparse Quantization","summary":" Generative Large Language Models (LLMs) have demonstrated remarkable results\nfor a wide range of tasks. However, deploying these models for inference has\nbeen a significant challenge due to their unprecedented resource requirements.\nThis has forced existing deployment frameworks to use multi-GPU inference\npipelines, which are often complex and costly, or to use smaller and less\nperformant models. In this work, we demonstrate that the main bottleneck for\ngenerative inference with LLMs is memory bandwidth, rather than compute,\nspecifically for single batch inference. While quantization has emerged as a\npromising solution by representing model weights with reduced precision,\nprevious efforts have often resulted in notable performance degradation. To\naddress this, we introduce SqueezeLLM, a post-training quantization framework\nthat not only enables lossless compression to ultra-low precisions of up to\n3-bit, but also achieves higher quantization performance under the same memory\nconstraint. Our framework incorporates two novel ideas: (i) sensitivity-based\nnon-uniform quantization, which searches for the optimal bit precision\nassignment based on second-order information; and (ii) the Dense-and-Sparse\ndecomposition that stores outliers and sensitive weight values in an efficient\nsparse format. When applied to the LLaMA models, our 3-bit quantization\nsignificantly reduces the perplexity gap from the FP16 baseline by up to 2.1x\nas compared to the state-of-the-art methods with the same memory requirement.\nFurthermore, when deployed on an A6000 GPU, our quantized models achieve up to\n2.3x speedup compared to the baseline. Our code is open-sourced and available\nonline.\n","authors":["Sehoon Kim","Coleman Hooper","Amir Gholami","Zhen Dong","Xiuyu Li","Sheng Shen","Michael W. Mahoney","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2306.07629v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01155v2","updated":"2024-02-05T03:42:51Z","published":"2024-02-02T05:48:39Z","title":"CABINET: Content Relevance based Noise Reduction for Table Question\n Answering","summary":" Table understanding capability of Large Language Models (LLMs) has been\nextensively studied through the task of question-answering (QA) over tables.\nTypically, only a small part of the whole table is relevant to derive the\nanswer for a given question. The irrelevant parts act as noise and are\ndistracting information, resulting in sub-optimal performance due to the\nvulnerability of LLMs to noise. To mitigate this, we propose CABINET (Content\nRelevAnce-Based NoIse ReductioN for TablE QuesTion-Answering) - a framework to\nenable LLMs to focus on relevant tabular data by suppressing extraneous\ninformation. CABINET comprises an Unsupervised Relevance Scorer (URS), trained\ndifferentially with the QA LLM, that weighs the table content based on its\nrelevance to the input question before feeding it to the question-answering LLM\n(QA LLM). 
To further aid the relevance scorer, CABINET employs a weakly\nsupervised module that generates a parsing statement describing the criteria of\nrows and columns relevant to the question and highlights the content of\ncorresponding table cells. CABINET significantly outperforms various tabular\nLLM baselines, as well as GPT3-based in-context learning methods, is more\nrobust to noise, maintains outperformance on tables of varying sizes, and\nestablishes new SoTA performance on WikiTQ, FeTaQA, and WikiSQL datasets. We\nrelease our code and datasets at https://github.com/Sohanpatnaik106/CABINET_QA.\n","authors":["Sohan Patnaik","Heril Changwal","Milan Aggarwal","Sumit Bhatia","Yaman Kumar","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2402.01155v2.pdf","comment":"Accepted at ICLR 2024 (spotlight)"},{"id":"http://arxiv.org/abs/2402.02695v1","updated":"2024-02-05T03:15:26Z","published":"2024-02-05T03:15:26Z","title":"Exploiting Class Probabilities for Black-box Sentence-level Attacks","summary":" Sentence-level attacks craft adversarial sentences that are synonymous with\ncorrectly-classified sentences but are misclassified by text classifiers.\nUnder the black-box setting, classifiers are only accessible through their\nfeedback to queried inputs, which is predominantly available in the form of\nclass probabilities. Even though utilizing class probabilities results in\nstronger attacks, existing attacks use either no feedback or only the class\nlabels, owing to the challenges of exploiting class probabilities at the\nsentence level. Overcoming these challenges, we develop a novel algorithm that uses class\nprobabilities for black-box sentence-level attacks, investigate the\neffectiveness of using class probabilities on the attack's success, and examine\nwhether it is worthwhile and practical for black-box sentence-level attacks to\nuse class probabilities. We conduct extensive evaluations of the\nproposed attack, comparing it with baselines across various classifiers and\nbenchmark datasets.\n","authors":["Raha Moraffah","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2402.02695v1.pdf","comment":"EACL 2024 Findings"},{"id":"http://arxiv.org/abs/2311.16203v3","updated":"2024-02-05T02:46:11Z","published":"2023-11-27T08:52:10Z","title":"ChatTraffic: Text-to-Traffic Generation via Diffusion Model","summary":" Traffic prediction is one of the most significant foundations in Intelligent\nTransportation Systems (ITS). Traditional traffic prediction methods rely only\non historical traffic data to predict traffic trends and face two main\nchallenges: (1) insensitivity to unusual events, and (2) limited performance in\nlong-term prediction. In this work, we explore how generative models combined\nwith text describing the traffic system can be applied for traffic generation,\nand name the task Text-to-Traffic Generation (TTG). The key challenge of the\nTTG task is how to associate text with the spatial structure of the road\nnetwork and traffic data for generating traffic situations. To this end, we\npropose ChatTraffic, the first diffusion model for text-to-traffic generation.\nTo guarantee the consistency between synthetic and real data, we augment a\ndiffusion model with the Graph Convolutional Network (GCN) to extract spatial\ncorrelations of traffic data. In addition, we construct a large dataset\ncontaining text-traffic pairs for the TTG task. We benchmarked our model\nqualitatively and quantitatively on the released dataset.
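The noise-suppression idea behind CABINET's relevance scorer, described above, can be approximated with off-the-shelf components: score each serialized table row against the question and keep only the top-scoring rows before prompting the QA LLM. The cross-encoder below is a stand-in assumption; CABINET's actual scorer is trained differentially with the QA LLM:

```python
# Illustrative table pruning in the spirit of content-relevance-based noise
# reduction: rank rows by question relevance and drop the rest.
from sentence_transformers import CrossEncoder

scorer = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # stand-in scorer

def prune_table(question: str, rows: list[str], keep: int = 3) -> list[str]:
    scores = scorer.predict([(question, row) for row in rows])
    ranked = sorted(zip(scores, rows), key=lambda p: p[0], reverse=True)
    return [row for _, row in ranked[:keep]]

rows = ["year: 2019 | medals: 12", "year: 2020 | medals: 7", "coach: A. Smith"]
print(prune_table("How many medals were won in 2020?", rows, keep=1))
```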
The experimental\nresults indicate that ChatTraffic can generate realistic traffic situations\nfrom the text. Our code and dataset are available at\nhttps://github.com/ChyaZhang/ChatTraffic.\n","authors":["Chengyang Zhang","Yong Zhang","Qitan Shao","Bo Li","Yisheng Lv","Xinglin Piao","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2311.16203v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04534v2","updated":"2024-02-05T02:42:57Z","published":"2023-11-08T08:45:14Z","title":"Loss Masking Is Not Needed in Decoder-only Transformer for\n Discrete-token-based ASR","summary":" Recently, unified speech-text models, such as SpeechGPT, VioLA, and\nAudioPaLM, have achieved remarkable performance on various speech tasks. These\nmodels discretize speech signals into tokens (speech discretization) and use a\nshared vocabulary for both text and speech tokens. Then they train a single\ndecoder-only Transformer on a mixture of speech tasks. However, these models\nrely on the Loss Masking strategy for the ASR task, which ignores the\ndependency among speech tokens. In this paper, we propose to model speech\ntokens in an autoregressive way, similar to text. We find that applying the\nconventional cross-entropy loss on input speech tokens does not consistently\nimprove the ASR performance over the Loss Masking approach. To address this\nissue, we propose a novel approach denoted Smoothed Label Distillation (SLD),\nwhich applies a KL divergence loss with smoothed labels on speech tokens. Our\nexperiments show that SLD effectively models speech tokens and outperforms Loss\nMasking for decoder-only Transformers in ASR tasks with different speech\ndiscretization methods. The source code can be found here:\nhttps://github.com/alibaba-damo-academy/SpokenNLP/tree/main/sld\n","authors":["Qian Chen","Wen Wang","Qinglin Zhang","Siqi Zheng","Shiliang Zhang","Chong Deng","Yukun Ma","Hai Yu","Jiaqing Liu","Chong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.04534v2.pdf","comment":"5 pages, accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.02680v1","updated":"2024-02-05T02:32:09Z","published":"2024-02-05T02:32:09Z","title":"Large Language Models are Geographically Biased","summary":" Large Language Models (LLMs) inherently carry the biases contained in their\ntraining corpora, which can lead to the perpetuation of societal harm. As the\nimpact of these foundation models grows, understanding and evaluating their\nbiases becomes crucial to achieving fairness and accuracy. We propose to study\nwhat LLMs know about the world we live in through the lens of geography. This\napproach is particularly powerful as there is ground truth for the numerous\naspects of human life that are meaningfully projected onto geographic space\nsuch as culture, race, language, politics, and religion. We show various\nproblematic geographic biases, which we define as systemic errors in geospatial\npredictions. Initially, we demonstrate that LLMs are capable of making accurate\nzero-shot geospatial predictions in the form of ratings that show strong\nmonotonic correlation with ground truth (Spearman's $\\rho$ of up to 0.89). We\nthen show that LLMs exhibit common biases across a range of objective and\nsubjective topics. In particular, LLMs are clearly biased against locations\nwith lower socioeconomic conditions (e.g. most of Africa) on a variety of\nsensitive subjective topics such as attractiveness, morality, and intelligence\n(Spearman's $\\rho$ of up to 0.70). 
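A smoothed-label KL loss of the kind SLD applies to speech tokens might look like the following sketch; the smoothing value is an assumed hyperparameter, and the construction of the smoothed target distribution is a standard recipe rather than the paper's exact formulation:

```python
# Sketch of a KL-divergence loss with smoothed labels over speech tokens,
# illustrating the Smoothed Label Distillation idea described above.
import torch
import torch.nn.functional as F

def smoothed_label_kl(logits: torch.Tensor, targets: torch.Tensor,
                      smoothing: float = 0.1) -> torch.Tensor:
    """logits: (batch, vocab); targets: (batch,) integer speech-token ids."""
    vocab = logits.size(-1)
    with torch.no_grad():
        # Smoothed target: 1 - smoothing on the gold token, rest spread evenly.
        soft = torch.full_like(logits, smoothing / (vocab - 1))
        soft.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
    return F.kl_div(F.log_softmax(logits, dim=-1), soft, reduction="batchmean")

loss = smoothed_label_kl(torch.randn(4, 1024), torch.randint(0, 1024, (4,)))
print(loss)
```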
Finally, we introduce a bias score to\nquantify this and find that there is significant variation in the magnitude of\nbias across existing LLMs.\n","authors":["Rohin Manvi","Samar Khanna","Marshall Burke","David Lobell","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2402.02680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09308v2","updated":"2024-02-05T02:21:59Z","published":"2023-11-15T19:02:40Z","title":"Divergences between Language Models and Human Brains","summary":" Do machines and humans process language in similar ways? Recent research has\nhinted in the affirmative, finding that brain signals can be effectively\npredicted using the internal representations of language models (LMs). Although\nsuch results are thought to reflect shared computational principles between LMs\nand human brains, there are also clear differences in how LMs and humans\nrepresent and use language. In this work, we systematically explore the\ndivergences between human and machine language processing by examining the\ndifferences between LM representations and human brain responses to language as\nmeasured by Magnetoencephalography (MEG) across two datasets in which subjects\nread and listened to narrative stories. Using a data-driven approach, we\nidentify two domains that are not captured well by LMs: social/emotional\nintelligence and physical commonsense. We then validate these domains with\nhuman behavioral experiments and show that fine-tuning LMs on these domains can\nimprove their alignment with human brain responses.\n","authors":["Yuchen Zhou","Emmy Liu","Graham Neubig","Michael J. Tarr","Leila Wehbe"],"pdf_url":"https://arxiv.org/pdf/2311.09308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18751v2","updated":"2024-02-05T01:13:52Z","published":"2023-11-30T17:50:47Z","title":"Exposing Limitations of Language Model Agents in Sequential-Task\n Compositions on the Web","summary":" Language model agents (LMAs) have recently emerged as a promising paradigm for\nmulti-step decision-making tasks, often outperforming humans and other\nreinforcement learning agents. Despite the promise, their performance on\nreal-world applications that often involve combinations of tasks is still\nunderexplored. In this work, we introduce a new benchmark, called CompWoB -- 50\nnew compositional web automation tasks reflecting more realistic assumptions.\nWe show that while existing prompted LMAs (gpt-3.5-turbo or gpt-4) achieve\n94.0% average success rate on base tasks, their performance degrades to 24.9%\nsuccess rate on compositional tasks. On the other hand, transferred LMAs\n(finetuned only on base tasks) show a smaller generalization gap, dropping from\n85.4% to 54.8%. By balancing data distribution across tasks, we train a new\nmodel, HTML-T5++, that surpasses human-level performance (95.2%) on MiniWoB,\nand achieves the best zero-shot performance on CompWoB (61.5%). While these\nresults highlight the promise of small-scale finetuned and transferred models for task\ncompositionality, their performance further degrades under instruction\ncompositions that change the combinational order.
In contrast to the\nrecent remarkable success of LMAs, our benchmark and detailed analysis emphasize\nthe necessity of building LMAs that are robust and generalizable to task\ncompositionality for real-world deployment.\n","authors":["Hiroki Furuta","Yutaka Matsuo","Aleksandra Faust","Izzeddin Gur"],"pdf_url":"https://arxiv.org/pdf/2311.18751v2.pdf","comment":"Code:\n https://github.com/google-research/google-research/tree/master/compositional_rl/compwob"},{"id":"http://arxiv.org/abs/2402.02658v1","updated":"2024-02-05T00:57:51Z","published":"2024-02-05T00:57:51Z","title":"Multi-step Problem Solving Through a Verifier: An Empirical Analysis on\n Model-induced Process Supervision","summary":" Process supervision, using a trained verifier to evaluate the intermediate\nsteps generated by the reasoner, has demonstrated significant improvements in\nmulti-step problem solving. In this paper, to avoid expensive human annotation\neffort on the verifier training data, we introduce Model-induced Process\nSupervision (MiPS), a novel method for automating data curation. MiPS annotates\nan intermediate step by sampling completions of this solution through the\nreasoning model and obtaining an accuracy defined as the proportion of correct\ncompletions. Errors in the reasoner would cause MiPS to underestimate the\naccuracy of intermediate steps; therefore, we suggest and empirically show that\nverification focusing on the verifier's high predicted scores should be\npreferred over focusing on its low predicted scores, contrary to prior work. Our\napproach significantly improves the performance of PaLM 2 on math and coding\ntasks (accuracy +0.67% on GSM8K, +4.16% on MATH, +0.92% on MBPP compared with\nan output supervision trained verifier). Additionally, our study demonstrates\nthat the verifier exhibits strong generalization ability across different\nreasoning models.\n","authors":["Zihan Wang","Yunxuan Li","Yuexin Wu","Liangchen Luo","Le Hou","Hongkun Yu","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2402.02658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02656v1","updated":"2024-02-05T00:56:30Z","published":"2024-02-05T00:56:30Z","title":"RACER: An LLM-powered Methodology for Scalable Analysis of\n Semi-structured Mental Health Interviews","summary":" Semi-structured interviews (SSIs) are a commonly employed data-collection\nmethod in healthcare research, offering in-depth qualitative insights into\nsubject experiences. Despite their value, the manual analysis of SSIs is\nnotoriously time-consuming and labor-intensive, in part due to the difficulty\nof extracting and categorizing emotional responses, and challenges in scaling\nhuman evaluation for large populations. In this study, we develop RACER, a\nLarge Language Model (LLM)-based, expert-guided automated pipeline that\nefficiently converts raw interview transcripts into insightful domain-relevant\nthemes and sub-themes. We used RACER to analyze SSIs conducted with 93\nhealthcare professionals and trainees to assess the broad personal and\nprofessional mental health impacts of the COVID-19 crisis. RACER achieves\nmoderately high agreement with two human evaluators (72%), which approaches the\nhuman inter-rater agreement (77%). Interestingly, LLMs and humans struggle with\nsimilar content involving nuanced emotional, ambivalent/dialectical, and\npsychological statements.
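The MiPS annotation rule, which estimates a step's accuracy as the proportion of sampled completions that reach the correct final answer, reduces to a small Monte-Carlo loop. In this sketch, `sample_completion` and `is_correct` are hypothetical stubs standing in for the reasoning model and the answer checker:

```python
# Sketch of Monte-Carlo step annotation in the spirit of MiPS: a step's score
# is the fraction of rollouts from that step that end correctly.
import random

def mips_step_score(prefix_steps: list[str], n_samples: int = 16) -> float:
    correct = 0
    for _ in range(n_samples):
        completion = sample_completion(prefix_steps)   # reasoner rollout (stub)
        correct += int(is_correct(completion))         # answer check (stub)
    return correct / n_samples

# Stubs so the sketch runs standalone; replace with real model/checker calls.
def sample_completion(prefix): return random.choice(["42", "41"])
def is_correct(answer): return answer == "42"

print(mips_step_score(["step 1: ..."]))
```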
Our study highlights the opportunities and challenges\nin using LLMs to improve research efficiency and opens new avenues for scalable\nanalysis of SSIs in healthcare research.\n","authors":["Satpreet Harcharan Singh","Kevin Jiang","Kanchan Bhasin","Ashutosh Sabharwal","Nidal Moukaddam","Ankit B Patel"],"pdf_url":"https://arxiv.org/pdf/2402.02656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02655v1","updated":"2024-02-05T00:54:40Z","published":"2024-02-05T00:54:40Z","title":"VlogQA: Task, Dataset, and Baseline Models for Vietnamese Spoken-Based\n Machine Reading Comprehension","summary":" This paper presents the development process of a Vietnamese spoken language\ncorpus for machine reading comprehension (MRC) tasks and provides insights into\nthe challenges and opportunities associated with using real-world data for\nmachine reading comprehension tasks. The existing MRC corpora in Vietnamese\nmainly focus on formal written documents such as Wikipedia articles, online\nnewspapers, or textbooks. In contrast, VlogQA consists of 10,076\nquestion-answer pairs based on 1,230 transcript documents sourced from YouTube\n-- an extensive source of user-uploaded content, covering the topics of food\nand travel. By capturing the spoken language of native Vietnamese speakers in\nnatural settings, an area largely overlooked in Vietnamese research, the\ncorpus provides a valuable resource for future research in reading\ncomprehension tasks for the Vietnamese language. Regarding performance\nevaluation, our deep-learning models achieved the highest F1 score of 75.34% on\nthe test set, indicating significant progress in machine reading comprehension\nfor Vietnamese spoken language data. In terms of EM, the highest score we\nachieved is 53.97%, which reflects the challenge in processing spoken-based\ncontent and highlights the need for further improvement.\n","authors":["Thinh Phuoc Ngo","Khoa Tran Anh Dang","Son T. Luu","Kiet Van Nguyen","Ngan Luu-Thuy Nguyen"],"pdf_url":"https://arxiv.org/pdf/2402.02655v1.pdf","comment":"Accepted as main conference paper at EACL 2024"},{"id":"http://arxiv.org/abs/2402.02648v1","updated":"2024-02-05T00:44:28Z","published":"2024-02-05T00:44:28Z","title":"Chain-of-Feedback: Mitigating the Effects of Inconsistency in Responses","summary":" Large Language Models (LLMs) frequently struggle with knowledge-intensive\nquestions, often responding inconsistently by providing different outputs despite\nbeing given the same input. The response quality worsens when the user expresses a\nfirm opposing stance, which causes the LLM to adjust its response even when the\ninitial response was correct. These behaviors decrease the reliability and validity of\nthe responses provided by these models. In this paper, we attempt to 1) raise\nawareness of the inherent risks that follow from overly relying on AI agents\nlike ChatGPT by showing how Chain-of-Feedback (CoF) triggers LLMs to deviate\nmore from the actual answer and 2) suggest a novel prompting method, Recursive\nChain of Feedback (R-CoF), which we are studying further. The CoF system\ntakes in an open-ended multi-step question. Then, we repeatedly provide\nmeaningless feedback requesting another attempt. Our preliminary experiments\nshow that such feedback only decreases the quality of the response.
On the\nother hand, to mitigate the effects of the aforementioned inconsistencies, we\npresent a novel method of recursively revising the initial incorrect reasoning\nprovided by the LLM by repetitively breaking down each incorrect step into\nsmaller individual problems.\n","authors":["Jinwoo Ahn"],"pdf_url":"https://arxiv.org/pdf/2402.02648v1.pdf","comment":"Still Ongoing Work"},{"id":"http://arxiv.org/abs/2311.02945v2","updated":"2024-02-05T00:14:40Z","published":"2023-11-06T08:26:14Z","title":"PhoGPT: Generative Pre-training for Vietnamese","summary":" We open-source a state-of-the-art 4B-parameter generative model series for\nVietnamese, which includes the base pre-trained monolingual model PhoGPT-4B and\nits chat variant, PhoGPT-4B-Chat. The base model, PhoGPT-4B, with exactly 3.7B\nparameters, is pre-trained from scratch on a Vietnamese corpus of 102B tokens,\nwith an 8192 context length, employing a vocabulary of 20480 token types. The\nchat variant, PhoGPT-4B-Chat, is the modeling output obtained by fine-tuning\nPhoGPT-4B on a dataset of 70K instructional prompts and their responses, along\nwith an additional 290K conversations. We demonstrate its strong performance\ncompared to previous closed-source and open-source 7B-parameter models. Our\nPhoGPT models are available at: https://github.com/VinAIResearch/PhoGPT\n","authors":["Dat Quoc Nguyen","Linh The Nguyen","Chi Tran","Dung Ngoc Nguyen","Dinh Phung","Hung Bui"],"pdf_url":"https://arxiv.org/pdf/2311.02945v2.pdf","comment":"PhoGPT-4B Technical Report - 5 pages"},{"id":"http://arxiv.org/abs/2309.04146v2","updated":"2024-02-05T23:28:15Z","published":"2023-09-08T06:23:25Z","title":"NESTLE: a No-Code Tool for Statistical Analysis of Legal Corpus","summary":" The statistical analysis of large scale legal corpus can provide valuable\nlegal insights. For such analysis one needs to (1) select a subset of the\ncorpus using document retrieval tools, (2) structure text using information\nextraction (IE) systems, and (3) visualize the data for the statistical\nanalysis. Each process demands either specialized tools or programming skills\nwhereas no comprehensive unified \"no-code\" tools have been available. Here we\nprovide NESTLE, a no-code tool for large-scale statistical analysis of legal\ncorpus. Powered by a Large Language Model (LLM) and the internal custom\nend-to-end IE system, NESTLE can extract any type of information that has not\nbeen predefined in the IE system opening up the possibility of unlimited\ncustomizable statistical analysis of the corpus without writing a single line\nof code. We validate our system on 15 Korean precedent IE tasks and 3 legal\ntext classification tasks from LexGLUE. The comprehensive experiments reveal\nNESTLE can achieve GPT-4 comparable performance by training the internal IE\nmodule with 4 human-labeled, and 192 LLM-labeled examples.\n","authors":["Kyoungyeon Cho","Seungkum Han","Young Rok Choi","Wonseok Hwang"],"pdf_url":"https://arxiv.org/pdf/2309.04146v2.pdf","comment":"EACL 2024 System Demonstration Track"},{"id":"http://arxiv.org/abs/2402.03563v1","updated":"2024-02-05T22:22:49Z","published":"2024-02-05T22:22:49Z","title":"Distinguishing the Knowable from the Unknowable with Language Models","summary":" We study the feasibility of identifying epistemic uncertainty (reflecting a\nlack of knowledge), as opposed to aleatoric uncertainty (reflecting entropy in\nthe underlying distribution), in the outputs of large language models (LLMs)\nover free-form text. 
In the absence of ground-truth probabilities, we explore a\nsetting where, in order to (approximately) disentangle a given LLM's\nuncertainty, a significantly larger model stands in as a proxy for the ground\ntruth. We show that small linear probes trained on the embeddings of frozen,\npretrained models accurately predict when larger models will be more confident\nat the token level and that probes trained on one text domain generalize to\nothers. Going further, we propose a fully unsupervised method that achieves\nnon-trivial accuracy on the same task. Taken together, we interpret these\nresults as evidence that LLMs naturally contain internal representations of\ndifferent types of uncertainty that could potentially be leveraged to devise\nmore informative indicators of model confidence in diverse practical settings.\n","authors":["Gustaf Ahdritz","Tian Qin","Nikhil Vyas","Boaz Barak","Benjamin L. Edelman"],"pdf_url":"https://arxiv.org/pdf/2402.03563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03561v1","updated":"2024-02-05T22:20:19Z","published":"2024-02-05T22:20:19Z","title":"VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language\n Navigation","summary":" Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate\nthrough realistic 3D outdoor environments based on natural language\ninstructions. The performance of existing VLN methods is limited by\ninsufficient diversity in navigation environments and limited training data. To\naddress these issues, we propose VLN-Video, which utilizes the diverse outdoor\nenvironments present in driving videos in multiple cities in the U.S. augmented\nwith automatically generated navigation instructions and actions to improve\noutdoor VLN performance. VLN-Video combines the best of intuitive classical\napproaches and modern deep learning techniques, using template infilling to\ngenerate grounded navigation instructions, combined with an image rotation\nsimilarity-based navigation action predictor to obtain VLN style data from\ndriving videos for pretraining deep learning VLN models. We pre-train the model\non the Touchdown dataset and our video-augmented dataset created from driving\nvideos with three proxy tasks: Masked Language Modeling, Instruction and\nTrajectory Matching, and Next Action Prediction, so as to learn\ntemporally-aware and visually-aligned instruction representations. The learned\ninstruction representation is adapted to the state-of-the-art navigator when\nfine-tuning on the Touchdown dataset. Empirical results demonstrate that\nVLN-Video significantly outperforms previous state-of-the-art models by 2.1% in\ntask completion rate, achieving a new state-of-the-art on the Touchdown\ndataset.\n","authors":["Jialu Li","Aishwarya Padmakumar","Gaurav Sukhatme","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2402.03561v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2304.10750v2","updated":"2024-02-05T21:25:33Z","published":"2023-04-21T05:37:59Z","title":"Improving Grounded Language Understanding in a Collaborative Environment\n by Interacting with Agents Through Help Feedback","summary":" Many approaches to Natural Language Processing (NLP) tasks often treat them\nas single-step problems, where an agent receives an instruction, executes it,\nand is evaluated based on the final outcome. However, human language is\ninherently interactive, as evidenced by the back-and-forth nature of human\nconversations. 
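The probing setup described above can be reproduced in miniature: fit a linear probe on frozen small-model embeddings to predict, per token, whether the larger proxy model is confident. The synthetic `X` and `y` below are stand-ins for real token embeddings and confidence labels, which the sketch does not compute:

```python
# Minimal sketch of a linear probe over frozen embeddings, in the spirit of the
# setup above. With real features the probe should beat chance; the random
# placeholders here will not.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 768))    # frozen small-model token embeddings (stub)
y = rng.integers(0, 2, size=10_000)   # 1 = large model confident at this token

probe = LogisticRegression(max_iter=1000).fit(X[:8000], y[:8000])
print("held-out accuracy:", probe.score(X[8000:], y[8000:]))
```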
In light of this, we posit that human-AI collaboration should\nalso be interactive, with humans monitoring the work of AI agents and providing\nfeedback that the agent can understand and utilize. Further, the AI agent\nshould be able to detect when it needs additional information and proactively\nask for help. Enabling this scenario would lead to more natural, efficient, and\nengaging human-AI collaborations.\n In this work, we explore these directions using the challenging task defined\nby the IGLU competition, an interactive grounded language understanding task in\na MineCraft-like world. We explore multiple types of help players can give to\nthe AI to guide it and analyze the impact of this help on the AI's behavior,\nfinding that it results in performance improvements.\n","authors":["Nikhil Mehta","Milagro Teruel","Patricio Figueroa Sanz","Xin Deng","Ahmed Hassan Awadallah","Julia Kiseleva"],"pdf_url":"https://arxiv.org/pdf/2304.10750v2.pdf","comment":"Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2402.03519v1","updated":"2024-02-05T21:05:35Z","published":"2024-02-05T21:05:35Z","title":"Resolving Transcription Ambiguity in Spanish: A Hybrid Acoustic-Lexical\n System for Punctuation Restoration","summary":" Punctuation restoration is a crucial step after Automatic Speech Recognition\n(ASR) to enhance transcript readability and facilitate subsequent NLP tasks.\nNevertheless, conventional lexical-based approaches are inadequate for\nsolving the punctuation restoration task in Spanish, where ambiguity can often\nbe found between unpunctuated declaratives and questions. In this study, we\npropose a novel hybrid acoustic-lexical punctuation restoration system for\nSpanish transcription, which consolidates acoustic and lexical signals through\na modular process. Our experimental results show that the proposed system can\neffectively improve the F1 score for question marks and overall punctuation\nrestoration on both public and internal Spanish conversational datasets.\nAdditionally, benchmark comparison against LLMs (Large Language Models)\nindicates the superiority of our approach in accuracy, reliability, and latency.\nFurthermore, we demonstrate that the Word Error Rate (WER) of the ASR module\nalso benefits from our proposed system.\n","authors":["Xiliang Zhu","Chia-Tien Chang","Shayna Gardiner","David Rossouw","Jonas Robertson"],"pdf_url":"https://arxiv.org/pdf/2402.03519v1.pdf","comment":"Accepted to UnImplicit workshop at EACL 2024"},{"id":"http://arxiv.org/abs/2402.03509v1","updated":"2024-02-05T20:51:11Z","published":"2024-02-05T20:51:11Z","title":"Evaluating the Factuality of Zero-shot Summarizers Across Varied Domains","summary":" Recent work has shown that large language models (LLMs) are capable of\ngenerating summaries zero-shot (i.e., without explicit supervision) that, under\nhuman assessment, are often comparable or even preferred to manually composed\nreference summaries. However, this prior work has focused almost exclusively\non evaluating news article summarization. How do zero-shot summarizers perform\nin other (potentially more specialized) domains? In this work we evaluate\nzero-shot generated summaries across specialized domains including biomedical\narticles and legal bills (in addition to standard news benchmarks for\nreference). We focus especially on the factuality of outputs. We acquire\nannotations from domain experts to identify inconsistencies in summaries and\nsystematically categorize these errors.
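One way to picture the acoustic-lexical consolidation for the Spanish declarative/question ambiguity described above is a toy fusion rule: defer to an acoustic cue (e.g., final pitch rise) only when the lexical model is ambiguous. Both classifiers and the margin threshold below are hypothetical placeholders, not the paper's modular system:

```python
# Toy illustration of fusing lexical punctuation probabilities with an
# acoustic question cue; all inputs and thresholds are assumptions.
def restore_final_punct(lexical_probs: dict[str, float],
                        pitch_rise_prob: float,
                        margin: float = 0.2) -> str:
    period, question = lexical_probs["."], lexical_probs["?"]
    if abs(period - question) < margin:          # lexically ambiguous sentence
        return "?" if pitch_rise_prob > 0.5 else "."
    return max(lexical_probs, key=lexical_probs.get)

print(restore_final_punct({".": 0.48, "?": 0.45, ",": 0.07}, pitch_rise_prob=0.9))
```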
We analyze whether the prevalence of a\ngiven domain in the pretraining corpus affects the extractiveness and faithfulness\nof generated summaries of articles in this domain. We release all collected\nannotations to facilitate additional research toward measuring and realizing\nfactually accurate summarization, beyond news articles. The dataset can be\ndownloaded from https://github.com/sanjanaramprasad/zero_shot_faceval_domains\n","authors":["Sanjana Ramprasad","Kundan Krishna","Zachary C Lipton","Byron C Wallace"],"pdf_url":"https://arxiv.org/pdf/2402.03509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03507v1","updated":"2024-02-05T20:48:57Z","published":"2024-02-05T20:48:57Z","title":"Neural networks for abstraction and reasoning: Towards broad\n generalization in machines","summary":" For half a century, artificial intelligence research has attempted to\nreproduce the human qualities of abstraction and reasoning - creating computer\nsystems that can learn new concepts from a minimal set of examples, in settings\nwhere humans find this easy. While specific neural networks are able to solve\nan impressive range of problems, broad generalization to situations outside\ntheir training data has proved elusive. In this work, we look at several novel\napproaches for solving the Abstraction & Reasoning Corpus (ARC), a dataset of\nabstract visual reasoning tasks introduced to test algorithms on broad\ngeneralization. Despite three international competitions with $100,000 in\nprizes, the best algorithms still fail to solve a majority of ARC tasks and\nrely on complex hand-crafted rules, without using machine learning at all. We\nrevisit whether recent advances in neural networks allow progress on this task.\n First, we adapt the DreamCoder neurosymbolic reasoning solver to ARC.\nDreamCoder automatically writes programs in a bespoke domain-specific language\nto perform reasoning, using a neural network to mimic human intuition. We\npresent the Perceptual Abstraction and Reasoning Language (PeARL),\nwhich allows DreamCoder to solve ARC tasks, and propose a new recognition model\nthat allows us to significantly improve on the previous best implementation. We\nalso propose a new encoding and augmentation scheme that allows large language\nmodels (LLMs) to solve ARC tasks, and find that the largest models can solve\nsome ARC tasks. LLMs are able to solve a different group of problems than\nstate-of-the-art solvers, and provide an interesting way to complement other\napproaches. We perform an ensemble analysis, combining models to achieve better\nresults than any system alone. Finally, we publish the arckit Python library to\nmake future research on ARC easier.\n","authors":["Mikel Bober-Irizar","Soumya Banerjee"],"pdf_url":"https://arxiv.org/pdf/2402.03507v1.pdf","comment":"32 pages main text, 17 pages"},{"id":"http://arxiv.org/abs/2402.03501v1","updated":"2024-02-05T20:34:32Z","published":"2024-02-05T20:34:32Z","title":"An Inpainting-Infused Pipeline for Attire and Background Replacement","summary":" In recent years, groundbreaking advancements in Generative Artificial\nIntelligence (GenAI) have triggered a transformative paradigm shift,\nsignificantly influencing various domains. In this work, we specifically\nexplore an integrated approach, leveraging advanced techniques in GenAI and\ncomputer vision, with an emphasis on image manipulation.
The methodology unfolds through\nseveral stages, including depth estimation, the creation of inpaint masks based\non depth information, the generation and replacement of backgrounds utilizing\nStable Diffusion in conjunction with Latent Consistency Models (LCMs), and the\nsubsequent replacement of clothes and application of aesthetic changes through\nan inpainting pipeline. Experiments conducted in this study underscore the\nmethodology's efficacy, highlighting its potential to produce visually\ncaptivating content. The convergence of these advanced techniques allows users\nto input photographs of individuals and manipulate them to modify clothing and\nbackground based on specific prompts without manually provided inpainting masks,\neffectively placing the subjects within the vast landscape of creative\nimagination.\n","authors":["Felipe Rodrigues Perche-Mahlow","André Felipe-Zanella","William Alberto Cruz-Castañeda","Marcellus Amadeus"],"pdf_url":"https://arxiv.org/pdf/2402.03501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14403v4","updated":"2024-02-05T20:17:14Z","published":"2023-10-22T20:28:33Z","title":"O3D: Offline Data-driven Discovery and Distillation for Sequential\n Decision-Making with Large Language Models","summary":" Recent advancements in large language models (LLMs) have exhibited promising\nperformance in solving sequential decision-making problems. By imitating\nfew-shot examples provided in the prompts (i.e., in-context learning), an LLM\nagent can interact with an external environment and complete given tasks\nwithout additional training. However, such few-shot examples are often\ninsufficient to generate high-quality solutions for complex and long-horizon\ntasks, while the limited context length cannot consume larger-scale\ndemonstrations with long interaction horizons. To this end, we propose an\noffline learning framework that utilizes offline data at scale (e.g., logs of\nhuman interactions) to improve LLM-powered policies without finetuning. The\nproposed method O3D (Offline Data-driven Discovery and Distillation)\nautomatically discovers reusable skills and distills generalizable knowledge\nacross multiple tasks based on offline interaction data, advancing the\ncapability of solving downstream tasks. Empirical results under two interactive\ndecision-making benchmarks (ALFWorld and WebShop) verify that O3D can notably\nenhance the decision-making capabilities of LLMs through the offline discovery\nand distillation process, and consistently outperform baselines across various\nLLMs.\n","authors":["Yuchen Xiao","Yanchao Sun","Mengda Xu","Udari Madhushani","Jared Vann","Deepeka Garg","Sumitra Ganesh"],"pdf_url":"https://arxiv.org/pdf/2310.14403v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15812v3","updated":"2024-02-05T19:59:46Z","published":"2023-08-30T07:35:32Z","title":"Peering Through Preferences: Unraveling Feedback Acquisition for\n Aligning Large Language Models","summary":" Aligning large language models (LLMs) with human values and intents\ncritically involves the use of human or AI feedback. While dense feedback\nannotations are expensive to acquire and integrate, sparse feedback presents a\nstructural design choice between ratings (e.g., score Response A on a scale of\n1-7) and rankings (e.g., is Response A better than Response B?). In this work,\nwe analyze the effect of this design choice for the alignment and evaluation of\nLLMs.
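A hedged sketch of the background-replacement stage using an off-the-shelf Stable Diffusion inpainting pipeline follows. The paper additionally derives masks from estimated depth and accelerates sampling with LCMs, both omitted here; the model id, file names, and prompt are assumptions:

```python
# Illustrative background replacement via diffusers' inpainting pipeline.
# Requires a CUDA GPU; inputs are placeholder file names.
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

init_image = Image.open("person.png").convert("RGB")   # input photograph
mask_image = Image.open("mask.png").convert("RGB")     # white = region to repaint
result = pipe(prompt="a sunlit mountain meadow",
              image=init_image, mask_image=mask_image).images[0]
result.save("replaced_background.png")
```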
We uncover an inconsistency problem wherein the preferences inferred from\nratings and rankings significantly disagree, with a disagreement rate of 60% for both human and AI\nannotators. Our subsequent analysis identifies various facets of annotator\nbiases that explain this phenomenon, such as human annotators rating denser\nresponses higher while preferring accuracy during pairwise judgments. To our\nsurprise, we observe that the choice of feedback protocol also has a\nsignificant effect on the evaluation of aligned LLMs. In particular, we find\nthat LLMs that leverage rankings data for alignment (say model X) are preferred\nover those that leverage ratings data (say model Y), with a rank-based\nevaluation protocol (is X/Y's response better than the reference response?) but not\nwith a rating-based evaluation protocol (score X/Y's response on a scale\nof 1-7). Our findings thus shed light on critical gaps in methods for\nevaluating the real-world utility of language models and their strong\ndependence on the feedback protocol used for alignment. Our code and data are\navailable at https://github.com/Hritikbansal/sparse_feedback.\n","authors":["Hritik Bansal","John Dang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2308.15812v3.pdf","comment":"31 pages, Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2402.03485v1","updated":"2024-02-05T19:56:56Z","published":"2024-02-05T19:56:56Z","title":"Attention Meets Post-hoc Interpretability: A Mathematical Perspective","summary":" Attention-based architectures, in particular transformers, are at the heart\nof a technological revolution. Interestingly, in addition to helping obtain\nstate-of-the-art results on a wide range of applications, the attention\nmechanism intrinsically provides meaningful insights on the internal behavior\nof the model. Can these insights be used as explanations? Debate rages on. In\nthis paper, we mathematically study a simple attention-based architecture and\npinpoint the differences between post-hoc and attention-based explanations. We\nshow that they provide quite different results, and that, despite their\nlimitations, post-hoc methods are capable of capturing more useful insights\nthan merely examining the attention weights.\n","authors":["Gianluigi Lopardo","Frederic Precioso","Damien Garreau"],"pdf_url":"https://arxiv.org/pdf/2402.03485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03484v1","updated":"2024-02-05T19:56:27Z","published":"2024-02-05T19:56:27Z","title":"Harnessing PubMed User Query Logs for Post Hoc Explanations of\n Recommended Similar Articles","summary":" Searching for a related article based on a reference article is an integral\npart of scientific research. PubMed, like many academic search engines, has a\n\"similar articles\" feature that recommends articles relevant to the current\narticle viewed by a user. Explaining recommended items can be of great utility\nto users, particularly in the literature search process. With more than a\nmillion biomedical papers being published each year, explaining the recommended\nsimilar articles would facilitate researchers and clinicians in searching for\nrelated articles. Nonetheless, the majority of current literature\nrecommendation systems lack explanations for their suggestions. We employ a\npost hoc approach to explaining recommendations by identifying relevant tokens\nin the titles of similar articles. Our major contribution is building PubCLogs\nby repurposing 5.6 million pairs of coclicked articles from PubMed's user query\nlogs.
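The ratings-vs-rankings inconsistency measurement reduces to checking whether the preference induced by a pair of ratings matches the directly annotated pairwise ranking. A toy version on synthetic annotations (the data and field names are invented for illustration):

```python
# Back-of-the-envelope consistency check between rating-induced preferences
# and direct pairwise rankings, mirroring the analysis described above.
def consistent(rating_a: int, rating_b: int, ranked_pref: str) -> bool:
    if rating_a == rating_b:
        return ranked_pref == "tie"
    induced = "A" if rating_a > rating_b else "B"
    return induced == ranked_pref

annotations = [
    {"rating_a": 6, "rating_b": 4, "ranked_pref": "A"},   # consistent
    {"rating_a": 5, "rating_b": 5, "ranked_pref": "B"},   # inconsistent
    {"rating_a": 3, "rating_b": 6, "ranked_pref": "A"},   # inconsistent
]
rate = sum(consistent(**a) for a in annotations) / len(annotations)
print(f"consistency: {rate:.0%}")
```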
Using our PubCLogs dataset, we train Highlight Similar Article Title\n(HSAT), a transformer-based model designed to select the most relevant parts of\nthe title of a similar article, based on the title and abstract of a seed\narticle. HSAT demonstrates strong performance in our empirical evaluations,\nachieving an F1 score of 91.72 percent on the PubCLogs test set, considerably\noutperforming several baselines including BM25 (70.62), MPNet (67.11), MedCPT\n(62.22), GPT-3.5 (46.00), and GPT-4 (64.89). Additional evaluations on a\nseparate, manually annotated test set further verify HSAT's performance.\nMoreover, participants in our user study indicate a preference for HSAT, due to\nits superior balance between conciseness and comprehensiveness. Our study\nsuggests that repurposing user query logs of academic search engines can be a\npromising way to train state-of-the-art models for explaining literature\nrecommendation.\n","authors":["Ashley Shin","Qiao Jin","James Anibal","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2402.03484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03483v1","updated":"2024-02-05T19:55:06Z","published":"2024-02-05T19:55:06Z","title":"SWAG: Storytelling With Action Guidance","summary":" Automated long-form story generation typically employs long-context large\nlanguage models (LLMs) for one-shot creation, which can produce cohesive but\nnot necessarily engaging content. We introduce Storytelling With Action\nGuidance (SWAG), a novel approach to storytelling with LLMs. Our approach\nreduces story writing to a search problem through a two-model feedback loop:\none LLM generates story content, and another auxiliary LLM is used to choose\nthe next best \"action\" to steer the story's future direction. Our results show\nthat SWAG can substantially outperform previous end-to-end story generation\ntechniques when evaluated by GPT-4 and through human evaluation, and our SWAG\npipeline using only open-source models surpasses GPT-3.5-Turbo.\n","authors":["Zeeshan Patel","Karim El-Refai","Jonathan Pei","Tianle Li"],"pdf_url":"https://arxiv.org/pdf/2402.03483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03477v1","updated":"2024-02-05T19:39:07Z","published":"2024-02-05T19:39:07Z","title":"Arabic Synonym BERT-based Adversarial Examples for Text Classification","summary":" Text classification systems have been proven vulnerable to adversarial text\nexamples, modified versions of the original text examples that often go\nunnoticed by human eyes, yet can force text classification models to alter\ntheir classification. Research quantifying the impact of\nadversarial text attacks has often been applied only to models trained in English.\nIn this paper, we introduce the first word-level study of adversarial attacks\nin Arabic. Specifically, we craft a synonym (word-level) attack using a Masked\nLanguage Modeling (MLM) task with a BERT model in a black-box setting to assess\nthe robustness of state-of-the-art text classification models to\nadversarial attacks in Arabic. To evaluate the grammatical and semantic\nsimilarities of the newly produced adversarial examples using our synonym\nBERT-based attack, we invite four human evaluators to assess and compare the\nproduced adversarial examples with their original examples. We also study the\ntransferability of these newly produced Arabic adversarial examples to various\nmodels and investigate the effectiveness of defense mechanisms against these\nadversarial examples on the BERT models.
We find that fine-tuned BERT models\nwere more susceptible to our synonym attacks than the other deep neural\nnetwork (DNN) models we trained, such as WordCNN and WordLSTM. We also find that\nfine-tuned BERT models were more susceptible to transferred attacks. Lastly, we\nfind that fine-tuned BERT models successfully regain at least 2% in\naccuracy after adversarial training is applied as an initial defense mechanism.\n","authors":["Norah Alshahrani","Saied Alshahrani","Esma Wali","Jeanna Matthews"],"pdf_url":"https://arxiv.org/pdf/2402.03477v1.pdf","comment":"This paper is accepted at The 18th Conference of the European Chapter\n of the Association for Computational Linguistics (Student Research Workshop),\n March 17-22, 2024"},{"id":"http://arxiv.org/abs/2309.07683v2","updated":"2024-02-05T19:01:55Z","published":"2023-09-14T12:58:30Z","title":"Assessing the nature of large language models: A caution against\n anthropocentrism","summary":" Generative AI models have garnered a large amount of public attention and\nspeculation with the release of OpenAI's chatbot, ChatGPT. At least two opinion\ncamps exist: one excited about the possibilities these models offer for fundamental\nchanges to human tasks, and another highly concerned about the power these models\nseem to have. To address these concerns, we assessed several LLMs, primarily\nGPT-3.5, using standard, normed, and validated cognitive and personality\nmeasures. For this seedling project, we developed a battery of tests that\nallowed us to estimate the boundaries of some of these models' capabilities, how\nstable those capabilities are over a short period of time, and how they compare\nto humans. Our results indicate that LLMs are unlikely to have developed\nsentience, although their ability to respond to personality inventories is\ninteresting. GPT-3.5 did display large variability in both cognitive and\npersonality measures over repeated observations, which would not be expected if it\nhad a human-like personality. Variability notwithstanding, LLMs display what in\na human would be considered poor mental health, including low self-esteem,\nmarked dissociation from reality, and in some cases narcissism and psychopathy,\ndespite upbeat and helpful responses.\n","authors":["Ann Speed"],"pdf_url":"https://arxiv.org/pdf/2309.07683v2.pdf","comment":"31 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03435v1","updated":"2024-02-05T19:00:02Z","published":"2024-02-05T19:00:02Z","title":"Psychological Assessments with Large Language Models: A Privacy-Focused\n and Cost-Effective Approach","summary":" This study explores the use of Large Language Models (LLMs) to analyze text\ncomments from Reddit users, aiming to achieve two primary objectives: firstly,\nto pinpoint critical excerpts that support a predefined psychological\nassessment of suicidal risk; and secondly, to summarize the material to\nsubstantiate the preassigned suicidal risk level. The work is circumscribed to\nthe use of \"open-source\" LLMs that can be run locally, thereby enhancing data\nprivacy. Furthermore, it prioritizes models with low computational\nrequirements, making it accessible to both individuals and institutions\noperating on limited computing budgets. The implemented strategy only relies on\na carefully crafted prompt and a grammar to guide the LLM's text completion.\nDespite its simplicity, the evaluation metrics show outstanding results, making\nit a valuable privacy-focused and cost-effective approach.
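Stripped of the Arabic-specific details and the black-box query loop, the MLM-based synonym substitution at the heart of such attacks can be sketched as below. An English BERT is used purely for illustration, and candidate filtering and semantic-similarity checks are omitted:

```python
# Generic sketch of masked-LM synonym candidate generation: mask one word and
# take high-probability fill-ins as replacement candidates.
from transformers import pipeline

unmasker = pipeline("fill-mask", model="bert-base-uncased")

def candidate_substitutions(sentence: str, word: str, top_k: int = 5) -> list[str]:
    masked = sentence.replace(word, unmasker.tokenizer.mask_token, 1)
    return [r["token_str"] for r in unmasker(masked, top_k=top_k)
            if r["token_str"].lower() != word.lower()]

print(candidate_substitutions("the movie was absolutely wonderful", "wonderful"))
```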
This work is part of\nthe Computational Linguistics and Clinical Psychology (CLPsych) 2024 shared\ntask.\n","authors":["Sergi Blanco-Cuaresma"],"pdf_url":"https://arxiv.org/pdf/2402.03435v1.pdf","comment":"Accepted to the Workshop on Computational Linguistics and Clinical\n Psychology (CLPsych) at EACL 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.03312v1","updated":"2024-02-05T18:59:52Z","published":"2024-02-05T18:59:52Z","title":"Test-Time Adaptation for Depth Completion","summary":" It is common to observe performance degradation when transferring models\ntrained on some (source) datasets to target testing data due to a domain gap\nbetween them. Existing methods for bridging this gap, such as domain adaptation\n(DA), may require the source data on which the model was trained (often not\navailable), while others, i.e., source-free DA, require many passes through the\ntesting data. We propose an online test-time adaptation method for depth\ncompletion, the task of inferring a dense depth map from a single image and\nassociated sparse depth map, that closes the performance gap in a single pass.\nWe first present a study on how the domain shift in each data modality affects\nmodel performance. Based on our observations that the sparse depth modality\nexhibits a much smaller covariate shift than the image, we design an embedding\nmodule trained in the source domain that preserves a mapping from features\nencoding only sparse depth to those encoding image and sparse depth. During\ntest time, sparse depth features are projected using this map as a proxy for\nsource domain features and are used as guidance to train a set of auxiliary\nparameters (i.e., adaptation layer) to align image and sparse depth features\nfrom the target test domain to that of the source domain. We evaluate our\nmethod on indoor and outdoor scenarios and show that it improves over baselines\nby an average of 21.1%.\n","authors":["Hyoungseob Park","Anjali Gupta","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2402.03312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03311v1","updated":"2024-02-05T18:59:41Z","published":"2024-02-05T18:59:41Z","title":"HASSOD: Hierarchical Adaptive Self-Supervised Object Detection","summary":" The human visual perception system demonstrates exceptional capabilities in\nlearning without explicit supervision and understanding the part-to-whole\ncomposition of objects. Drawing inspiration from these two abilities, we\npropose Hierarchical Adaptive Self-Supervised Object Detection (HASSOD), a\nnovel approach that learns to detect objects and understand their compositions\nwithout human supervision. HASSOD employs a hierarchical adaptive clustering\nstrategy to group regions into object masks based on self-supervised visual\nrepresentations, adaptively determining the number of objects per image.\nFurthermore, HASSOD identifies the hierarchical levels of objects in terms of\ncomposition, by analyzing coverage relations between masks and constructing\ntree structures. This additional self-supervised learning task leads to\nimproved detection performance and enhanced interpretability. 
Lastly, we\nabandon the inefficient multi-round self-training process utilized in prior\nmethods and instead adapt the Mean Teacher framework from semi-supervised\nlearning, which leads to a smoother and more efficient training process.\nThrough extensive experiments on prevalent image datasets, we demonstrate the\nsuperiority of HASSOD over existing methods, thereby advancing the state of the\nart in self-supervised object detection. Notably, we improve Mask AR from 20.2\nto 22.5 on LVIS, and from 17.0 to 26.0 on SA-1B. Project page:\nhttps://HASSOD-NeurIPS23.github.io.\n","authors":["Shengcao Cao","Dhiraj Joshi","Liang-Yan Gui","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03311v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2402.03310v1","updated":"2024-02-05T18:59:36Z","published":"2024-02-05T18:59:36Z","title":"V-IRL: Grounding Virtual Intelligence in Real Life","summary":" There is a sensory gulf between the Earth that humans inhabit and the digital\nrealms in which modern AI agents are created. To develop AI agents that can\nsense, think, and act as flexibly as humans in real-world settings, it is\nimperative to bridge the realism gap between the digital and physical worlds.\nHow can we embody agents in an environment as rich and diverse as the one we\ninhabit, without the constraints imposed by real hardware and control? Towards\nthis end, we introduce V-IRL: a platform that enables agents to scalably\ninteract with the real world in a virtual yet realistic environment. Our\nplatform serves as a playground for developing agents that can accomplish\nvarious practical tasks and as a vast testbed for measuring progress in\ncapabilities spanning perception, decision-making, and interaction with\nreal-world data across the entire globe.\n","authors":["Jihan Yang","Runyu Ding","Ellis Brown","Xiaojuan Qi","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2402.03310v1.pdf","comment":"Project page: https://virl-platform.github.io"},{"id":"http://arxiv.org/abs/2402.03309v1","updated":"2024-02-05T18:59:31Z","published":"2024-02-05T18:59:31Z","title":"AONeuS: A Neural Rendering Framework for Acoustic-Optical Sensor Fusion","summary":" Underwater perception and 3D surface reconstruction are challenging problems\nwith broad applications in construction, security, marine archaeology, and\nenvironmental monitoring. Treacherous operating conditions, fragile\nsurroundings, and limited navigation control often dictate that submersibles\nrestrict their range of motion and, thus, the baseline over which they can\ncapture measurements. In the context of 3D scene reconstruction, it is\nwell-known that smaller baselines make reconstruction more challenging. Our\nwork develops a physics-based multimodal acoustic-optical neural surface\nreconstruction framework (AONeuS) capable of effectively integrating\nhigh-resolution RGB measurements with low-resolution depth-resolved imaging\nsonar measurements. By fusing these complementary modalities, our framework can\nreconstruct accurate high-resolution 3D surfaces from measurements captured\nover heavily-restricted baselines. Through extensive simulations and in-lab\nexperiments, we demonstrate that AONeuS dramatically outperforms recent\nRGB-only and sonar-only inverse-differentiable-rendering--based surface\nreconstruction methods. 
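HASSOD's adoption of the Mean Teacher framework (mentioned above) rests on a simple mechanism: the teacher is an exponential moving average (EMA) of the student rather than a separately trained network. Below is a minimal sketch of that update; the momentum value, module choice, and parameter names are illustrative assumptions, not details taken from the paper.

```python
import copy
import torch

@torch.no_grad()
def ema_update(teacher: torch.nn.Module, student: torch.nn.Module,
               momentum: float = 0.999) -> None:
    # The teacher is never updated by gradient descent; it tracks a
    # smoothed (EMA) copy of the student's weights.
    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
        t_param.mul_(momentum).add_(s_param, alpha=1.0 - momentum)

# Usage: the teacher starts as a frozen copy of the student and is
# refreshed once per training step (momentum 0.999 is an assumed value).
student = torch.nn.Linear(16, 4)
teacher = copy.deepcopy(student)
for p in teacher.parameters():
    p.requires_grad_(False)
ema_update(teacher, student)
```

Because the teacher changes slowly, its pseudo-labels are smoother than the student's raw predictions, which is what makes this single-pass scheme a plausible replacement for multi-round self-training.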
A website visualizing the results of our paper is\nlocated at this address: https://aoneus.github.io/\n","authors":["Mohamad Qadri","Kevin Zhang","Akshay Hinduja","Michael Kaess","Adithya Pediredla","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2402.03309v1.pdf","comment":"First two authors contributed equally. Paper website:\n https://aoneus.github.io/"},{"id":"http://arxiv.org/abs/2402.03307v1","updated":"2024-02-05T18:59:04Z","published":"2024-02-05T18:59:04Z","title":"4D Gaussian Splatting: Towards Efficient Novel View Synthesis for\n Dynamic Scenes","summary":" We consider the problem of novel view synthesis (NVS) for dynamic scenes.\nRecent neural approaches have accomplished exceptional NVS results for static\n3D scenes, but extensions to 4D time-varying scenes remain non-trivial. Prior\nefforts often encode dynamics by learning a canonical space plus implicit or\nexplicit deformation fields, which struggle in challenging scenarios like\nsudden movements or capturing high-fidelity renderings. In this paper, we\nintroduce 4D Gaussian Splatting (4DGS), a novel method that represents dynamic\nscenes with anisotropic 4D XYZT Gaussians, inspired by the success of 3D\nGaussian Splatting in static scenes. We model dynamics at each timestamp by\ntemporally slicing the 4D Gaussians, which naturally compose dynamic 3D\nGaussians and can be seamlessly projected into images. As an explicit\nspatial-temporal representation, 4DGS demonstrates powerful capabilities for\nmodeling complicated dynamics and fine details, especially for scenes with\nabrupt motions. We further implement our temporal slicing and splatting\ntechniques in a highly optimized CUDA acceleration framework, achieving\nreal-time inference rendering speeds of up to 277 FPS on an RTX 3090 GPU and\n583 FPS on an RTX 4090 GPU. Rigorous evaluations on scenes with diverse motions\nshowcase the superior efficiency and effectiveness of 4DGS, which consistently\noutperforms existing methods both quantitatively and qualitatively.\n","authors":["Yuanxing Duan","Fangyin Wei","Qiyu Dai","Yuhang He","Wenzheng Chen","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03305v1","updated":"2024-02-05T18:58:38Z","published":"2024-02-05T18:58:38Z","title":"Do Diffusion Models Learn Semantically Meaningful and Efficient\n Representations?","summary":" Diffusion models are capable of impressive feats of image generation with\nuncommon juxtapositions such as astronauts riding horses on the moon with\nproperly placed shadows. These outputs indicate the ability to perform\ncompositional generalization, but how do the models do so? We perform\ncontrolled experiments on conditional DDPMs learning to generate 2D spherical\nGaussian bumps centered at specified $x$- and $y$-positions. Our results show\nthat the emergence of semantically meaningful latent representations is key to\nachieving high performance. En route to successful performance over the course\nof learning, the model traverses three distinct phases of latent representations: (phase A)\nno latent structure, (phase B) a 2D manifold of disordered states, and (phase\nC) a 2D ordered manifold. Corresponding to each of these phases, we identify\nqualitatively different generation behaviors: 1) multiple bumps are generated,\n2) one bump is generated but at inaccurate $x$ and $y$ locations, 3) a bump is\ngenerated at the correct $x$ and $y$ location.
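To make the controlled setup above concrete, here is a minimal sketch of rendering the kind of training target the study describes: a 2D spherical Gaussian bump centered at a specified $(x, y)$ position. The image size and bump width are assumptions; the paper's exact data pipeline is not reproduced here.

```python
import numpy as np

def gaussian_bump(x0: float, y0: float, size: int = 32,
                  sigma: float = 2.0) -> np.ndarray:
    # Render one 2D spherical (isotropic) Gaussian bump centered at
    # (x0, y0) on a size x size pixel grid.
    ys, xs = np.mgrid[0:size, 0:size]
    return np.exp(-((xs - x0) ** 2 + (ys - y0) ** 2) / (2.0 * sigma ** 2))

# One target image conditioned on (x0=10, y0=20); the peak value is 1
# at the bump center and decays isotropically around it.
img = gaussian_bump(10.0, 20.0)
assert img.shape == (32, 32) and np.isclose(img[20, 10], 1.0)
```

Conditioning a DDPM on $(x_0, y_0)$ and asking it to reproduce images like this is the task whose latent phases the study probes.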
Furthermore, we show that even\nunder imbalanced datasets where features ($x$- versus $y$-positions) are\nrepresented with skewed frequencies, the learning process for $x$ and $y$ is\ncoupled rather than factorized, demonstrating that simple vanilla-flavored\ndiffusion models cannot learn efficient representations in which localization\nin $x$ and $y$ is factorized into separate 1D tasks. These findings suggest\nthe need for future work to find inductive biases that will push generative\nmodels to discover and exploit factorizable independent structures in their\ninputs, which will be required to vault these models into more data-efficient\nregimes.\n","authors":["Qiyao Liang","Ziming Liu","Ila Fiete"],"pdf_url":"https://arxiv.org/pdf/2402.03305v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.03302v1","updated":"2024-02-05T18:58:11Z","published":"2024-02-05T18:58:11Z","title":"Swin-UMamba: Mamba-based UNet with ImageNet-based pretraining","summary":" Accurate medical image segmentation demands the integration of multi-scale\ninformation, spanning from local features to global dependencies. However, it\nis challenging for existing methods to model long-range global information,\nwhere convolutional neural networks (CNNs) are constrained by their local\nreceptive fields, and vision transformers (ViTs) suffer from the high quadratic\ncomplexity of their attention mechanism. Recently, Mamba-based models have\ngained great attention for their impressive ability in long sequence modeling.\nSeveral studies have demonstrated that these models can outperform popular\nvision models in various tasks, offering higher accuracy, lower memory\nconsumption, and less computational burden. However, existing Mamba-based\nmodels are mostly trained from scratch and do not explore the power of\npretraining, which has been proven to be quite effective for data-efficient\nmedical image analysis. This paper introduces a novel Mamba-based model,\nSwin-UMamba, designed specifically for medical image segmentation tasks,\nleveraging the advantages of ImageNet-based pretraining. Our experimental\nresults reveal the vital role of ImageNet-based training in enhancing the\nperformance of Mamba-based models. Swin-UMamba demonstrates superior\nperformance by a large margin compared to CNNs, ViTs, and the latest Mamba-based\nmodels. Notably, on the AbdomenMRI, Endoscopy, and Microscopy datasets, Swin-UMamba\noutperforms its closest counterpart U-Mamba by an average score of 3.58%. The\ncode and models of Swin-UMamba are publicly available at:\nhttps://github.com/JiarunLiu/Swin-UMamba\n","authors":["Jiarun Liu","Hao Yang","Hong-Yu Zhou","Yan Xi","Lequan Yu","Yizhou Yu","Yong Liang","Guangming Shi","Shaoting Zhang","Hairong Zheng","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03302v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2402.03299v1","updated":"2024-02-05T18:54:43Z","published":"2024-02-05T18:54:43Z","title":"GUARD: Role-playing to Generate Natural-language Jailbreakings to Test\n Guideline Adherence of Large Language Models","summary":" The discovery of \"jailbreaks\" that bypass the safety filters of Large Language\nModels (LLMs) and elicit harmful responses has encouraged the community to implement\nsafety measures. One major safety measure is to proactively test the LLMs with\njailbreaks prior to their release. Such testing requires a method\nthat can generate jailbreaks efficiently and at scale.
In this paper, we\nfollow a novel yet intuitive strategy to generate jailbreaks in the style of\nhuman generation. We propose a role-playing system that assigns four\ndifferent roles to the user LLMs to collaborate on new jailbreaks. Furthermore,\nwe collect existing jailbreaks and split them into different independent\ncharacteristics by clustering frequency and semantic patterns sentence by\nsentence. We organize these characteristics into a knowledge graph, making them\nmore accessible and easier to retrieve. Our system of different roles\nleverages this knowledge graph to generate new jailbreaks, which have proved\neffective in inducing LLMs to generate unethical or guideline-violating\nresponses. In addition, we pioneer a setting in our system that\nautomatically follows government-issued guidelines to generate jailbreaks to\ntest whether LLMs follow the guidelines accordingly. We refer to our system as\nGUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have\nempirically validated the effectiveness of GUARD on three cutting-edge\nopen-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a\nwidely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the\nrealm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing\nGUARD's versatility and contributing valuable insights for the development of\nsafer, more reliable LLM-based applications across diverse modalities.\n","authors":["Haibo Jin","Ruoxi Chen","Andy Zhou","Jinyin Chen","Yang Zhang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03299v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2402.03292v1","updated":"2024-02-05T18:50:27Z","published":"2024-02-05T18:50:27Z","title":"Zero-shot Object-Level OOD Detection with Context-Aware Inpainting","summary":" Machine learning algorithms are increasingly provided as black-box cloud\nservices or pre-trained models, without access to their training data. This\nmotivates the problem of zero-shot out-of-distribution (OOD) detection.\nConcretely, we aim to detect OOD objects that do not belong to the classifier's\nlabel set but are erroneously classified as in-distribution (ID) objects. Our\napproach, RONIN, uses an off-the-shelf diffusion model to replace detected\nobjects with inpainting. RONIN conditions the inpainting process with the\npredicted ID label, drawing the input object closer to the in-distribution\ndomain. As a result, the reconstructed object is very close to the original in\nthe ID cases and far in the OOD cases, allowing RONIN to effectively\ndistinguish ID and OOD samples. Through extensive experiments, we\ndemonstrate that RONIN achieves competitive results compared to previous\napproaches across several datasets, both in zero-shot and non-zero-shot\nsettings.\n","authors":["Quang-Huy Nguyen","Jin Peng Zhou","Zhenzhen Liu","Khanh-Huyen Bui","Kilian Q. Weinberger","Dung D. Le"],"pdf_url":"https://arxiv.org/pdf/2402.03292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03290v1","updated":"2024-02-05T18:49:17Z","published":"2024-02-05T18:49:17Z","title":"InstanceDiffusion: Instance-level Control for Image Generation","summary":" Text-to-image diffusion models produce high-quality images but do not offer\ncontrol over individual instances in the image.
We introduce InstanceDiffusion\nthat adds precise instance-level control to text-to-image diffusion models.\nInstanceDiffusion supports free-form language conditions per instance and\nallows flexible ways to specify instance locations such as simple single\npoints, scribbles, bounding boxes or intricate instance segmentation masks, and\ncombinations thereof. We propose three major changes to text-to-image models\nthat enable precise instance-level control. Our UniFusion block enables\ninstance-level conditions for text-to-image models, the ScaleU block improves\nimage fidelity, and our Multi-instance Sampler improves generations for\nmultiple instances. InstanceDiffusion significantly surpasses specialized\nstate-of-the-art models for each location condition. Notably, on the COCO\ndataset, we outperform previous state-of-the-art by 20.4% AP$_{50}^\\text{box}$\nfor box inputs, and 25.4% IoU for mask inputs.\n","authors":["Xudong Wang","Trevor Darrell","Sai Saketh Rambhatla","Rohit Girdhar","Ishan Misra"],"pdf_url":"https://arxiv.org/pdf/2402.03290v1.pdf","comment":"Preprint; Project page:\n https://people.eecs.berkeley.edu/~xdwang/projects/InstDiff/"},{"id":"http://arxiv.org/abs/2402.03286v1","updated":"2024-02-05T18:42:34Z","published":"2024-02-05T18:42:34Z","title":"Training-Free Consistent Text-to-Image Generation","summary":" Text-to-image models offer a new level of creative flexibility by allowing\nusers to guide the image generation process through natural language. However,\nusing these models to consistently portray the same subject across diverse\nprompts remains challenging. Existing approaches fine-tune the model to teach\nit new words that describe specific user-provided subjects or add image\nconditioning to the model. These methods require lengthy per-subject\noptimization or large-scale pre-training. Moreover, they struggle to align\ngenerated images with text prompts and face difficulties in portraying multiple\nsubjects. Here, we present ConsiStory, a training-free approach that enables\nconsistent subject generation by sharing the internal activations of the\npretrained model. We introduce a subject-driven shared attention block and\ncorrespondence-based feature injection to promote subject consistency between\nimages. Additionally, we develop strategies to encourage layout diversity while\nmaintaining subject consistency. We compare ConsiStory to a range of baselines,\nand demonstrate state-of-the-art performance on subject consistency and text\nalignment, without requiring a single optimization step. Finally, ConsiStory\ncan naturally extend to multi-subject scenarios, and even enable training-free\npersonalization for common objects.\n","authors":["Yoad Tewel","Omri Kaduri","Rinon Gal","Yoni Kasten","Lior Wolf","Gal Chechik","Yuval Atzmon"],"pdf_url":"https://arxiv.org/pdf/2402.03286v1.pdf","comment":"Project page is in https://consistory-paper.github.io"},{"id":"http://arxiv.org/abs/2402.03283v1","updated":"2024-02-05T18:39:04Z","published":"2024-02-05T18:39:04Z","title":"Towards a Flexible Scale-out Framework for Efficient Visual Data Query\n Processing","summary":" There is growing interest in visual data management systems that support\nqueries with specialized operations ranging from resizing an image to running\ncomplex machine learning models. With a plethora of such operations, the basic\nneed to receive query responses in minimal time takes a hit, especially when\nthe client desires to run multiple such operations in a single query. 
Existing\nsystems provide an ad-hoc approach where different solutions are clubbed\ntogether to provide an end-to-end visual data management system. Unlike such\nsolutions, the Visual Data Management System (VDMS) natively executes queries\nwith multiple operations, thus providing an end-to-end solution. However, a\nfixed subset of native operations and a synchronous threading architecture\nlimit its generality and scalability.\n In this paper, we develop VDMS-Async that adds the capability to run\nuser-defined operations with VDMS and execute operations within a query on a\nremote server. VDMS-Async utilizes an event-driven architecture to create an\nefficient pipeline for executing operations within a query. Our experiments\nhave shown that VDMS-Async reduces the query execution time by 2-3X compared to\nexisting state-of-the-art systems. Further, remote operations coupled with an\nevent-driven architecture enable VDMS-Async to scale query execution time\nlinearly with the addition of every new remote server. We demonstrate a 64X\nreduction in query execution time when adding 64 remote servers.\n","authors":["Rohit Verma","Arun Raghunath"],"pdf_url":"https://arxiv.org/pdf/2402.03283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03251v1","updated":"2024-02-05T18:09:33Z","published":"2024-02-05T18:09:33Z","title":"CLIP Can Understand Depth","summary":" Recent studies on generalizing CLIP for monocular depth estimation reveal\nthat CLIP pre-trained on web-crawled data is inefficient for deriving proper\nsimilarities between image patches and depth-related prompts. In this paper, we\nadapt CLIP for meaningful quality of monocular depth estimation with dense\nprediction, without fine-tuning its original vision-language alignment. By\njointly training a compact deconvolutional decoder with a tiny learnable\nembedding matrix named mirror, as a static prompt for its text encoder, CLIP is\nenabled to understand depth. With this approach, our model exhibits impressive\nperformance matching several previous state-of-the-art vision-only models on\nthe NYU Depth v2 and KITTI datasets, outperforming every CLIP-based depth\nestimation model by a large margin. Experiments on temporal depth consistency\nand spatial continuity demonstrate that the prior knowledge of CLIP can be\neffectively refined by our proposed framework. Furthermore, an ablation study\non mirror proves that the resulting model estimates depth utilizing knowledge\nnot only from the image encoder but also the text encoder despite not being given\nany human-written prompt. This research demonstrates that through\nminimal adjustments, the prior knowledge of vision-language foundation models,\nsuch as CLIP, can be generalized even to domains where learning during\npretraining is challenging. We facilitate future work focused on methods to\nadjust suboptimal prior knowledge of vision-language models using non-human\nlanguage prompts, achieving performance on par with task-specific\nstate-of-the-art methodologies.\n","authors":["Dunam Kim","Seokju Lee"],"pdf_url":"https://arxiv.org/pdf/2402.03251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03246v1","updated":"2024-02-05T18:03:53Z","published":"2024-02-05T18:03:53Z","title":"SGS-SLAM: Semantic Gaussian Splatting For Neural Dense SLAM","summary":" Semantic understanding plays a crucial role in Dense Simultaneous\nLocalization and Mapping (SLAM), facilitating comprehensive scene\ninterpretation.
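The event-driven design described above for VDMS-Async can be illustrated with a toy asyncio pipeline in which each query is a chain of awaitable operations and many queries progress concurrently. All names and operation signatures below are hypothetical stand-ins for illustration, not the VDMS-Async API.

```python
import asyncio

async def run_query(ops, payload):
    # Run a chain of per-query operations without blocking the event loop.
    # ops: list of async callables (e.g. a native resize, then a
    # user-defined operation hosted on a remote server).
    for op in ops:
        payload = await op(payload)
    return payload

async def resize(img):           # stand-in for a native operation
    await asyncio.sleep(0.01)    # simulated work
    return f"resized({img})"

async def remote_classify(img):  # stand-in for a remote user-defined op
    await asyncio.sleep(0.05)    # simulated network + compute latency
    return f"label({img})"

async def main():
    # Many queries progress concurrently; while one query awaits its
    # remote operation, others keep executing, which is the point of
    # an event-driven pipeline.
    results = await asyncio.gather(
        *(run_query([resize, remote_classify], f"img{i}") for i in range(3))
    )
    print(results)

asyncio.run(main())
```

Adding more remote workers increases how many awaited operations can be in flight at once, which is consistent with the linear scaling behavior the paper reports.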
Recent advancements that integrate Gaussian Splatting into SLAM\nsystems have demonstrated its effectiveness in generating high-quality\nrenderings through the use of explicit 3D Gaussian representations. Building on\nthis progress, we propose SGS-SLAM, the first semantic dense visual SLAM system\ngrounded in 3D Gaussians, which provides precise 3D semantic segmentation\nalongside high-fidelity reconstructions. Specifically, we propose to employ\nmulti-channel optimization during the mapping process, integrating appearance,\ngeometric, and semantic constraints with key-frame optimization to enhance\nreconstruction quality. Extensive experiments demonstrate that SGS-SLAM\ndelivers state-of-the-art performance in camera pose estimation, map\nreconstruction, and semantic segmentation, outperforming existing methods\nwhile preserving real-time rendering ability.\n","authors":["Mingrui Li","Shuhong Liu","Heng Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.03246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03241v1","updated":"2024-02-05T17:56:41Z","published":"2024-02-05T17:56:41Z","title":"FROSTER: Frozen CLIP Is A Strong Teacher for Open-Vocabulary Action\n Recognition","summary":" In this paper, we introduce FROSTER, an effective framework for\nopen-vocabulary action recognition. The CLIP model has achieved remarkable\nsuccess in a range of image-based tasks, benefiting from its strong\ngeneralization capability stemming from pretraining on massive image-text pairs.\nHowever, applying CLIP directly to the open-vocabulary action recognition task\nis challenging due to the absence of temporal information in CLIP's\npretraining. Further, fine-tuning CLIP on action recognition datasets may lead\nto overfitting and hinder its generalizability, resulting in unsatisfactory\nresults when dealing with unseen actions.\n To address these issues, FROSTER employs a residual feature distillation\napproach to ensure that CLIP retains its generalization capability while\neffectively adapting to the action recognition task. Specifically, the residual\nfeature distillation treats the frozen CLIP model as a teacher to maintain the\ngeneralizability exhibited by the original CLIP and supervises the feature\nlearning for the extraction of video-specific features to bridge the gap\nbetween images and videos. Meanwhile, it uses a residual sub-network for\nfeature distillation to reach a balance between the two distinct objectives of\nlearning generalizable and video-specific features.\n We extensively evaluate FROSTER on open-vocabulary action recognition\nbenchmarks under both base-to-novel and cross-dataset settings. FROSTER\nconsistently achieves state-of-the-art performance on all datasets across the\nboard. Project page: https://visual-ai.github.io/froster.\n","authors":["Xiaohu Huang","Hao Zhou","Kun Yao","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2402.03241v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.03235v1","updated":"2024-02-05T17:52:58Z","published":"2024-02-05T17:52:58Z","title":"ActiveAnno3D -- An Active Learning Framework for Multi-Modal 3D Object\n Detection","summary":" The curation of large-scale datasets is still costly and requires considerable time\nand resources. Data is often manually labeled, and the challenge of creating\nhigh-quality datasets remains. In this work, we fill the research gap using\nactive learning for multi-modal 3D object detection.
We propose ActiveAnno3D,\nan active learning framework to select data samples for labeling that are of\nmaximum informativeness for training. We explore various continuous training\nmethods and integrate the most efficient method regarding computational demand\nand detection performance. Furthermore, we perform extensive experiments and\nablation studies with BEVFusion and PV-RCNN on the nuScenes and TUM Traffic\nIntersection dataset. We show that we can achieve almost the same performance\nwith PV-RCNN and the entropy-based query strategy when using only half of the\ntraining data (77.25 mAP compared to 83.50 mAP) of the TUM Traffic Intersection\ndataset. BEVFusion achieved an mAP of 64.31 when using half of the training\ndata and 75.0 mAP when using the complete nuScenes dataset. We integrate our\nactive learning framework into the proAnno labeling tool to enable AI-assisted\ndata selection and labeling and minimize the labeling costs. Finally, we\nprovide code, weights, and visualization results on our website:\nhttps://active3d-framework.github.io/active3d-framework.\n","authors":["Ahmed Ghita","Bjørk Antoniussen","Walter Zimmer","Ross Greer","Christian Creß","Andreas Møgelmose","Mohan M. Trivedi","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2402.03235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03230v1","updated":"2024-02-05T17:43:02Z","published":"2024-02-05T17:43:02Z","title":"CT-based Anatomical Segmentation for Thoracic Surgical Planning: A\n Benchmark Study for 3D U-shaped Deep Learning Models","summary":" Recent rising interests in patient-specific thoracic surgical planning and\nsimulation require efficient and robust creation of digital anatomical models\nfrom automatic medical image segmentation algorithms. Deep learning (DL) is now\nstate-of-the-art in various radiological tasks, and U-shaped DL models have\nparticularly excelled in medical image segmentation since the inception of the\n2D UNet. To date, many variants of U-shaped models have been proposed by the\nintegration of different attention mechanisms and network configurations.\nLeveraging the recent development of large multi-label databases, systematic\nbenchmark studies for these models can provide valuable insights for clinical\ndeployment and future model designs, but such studies are still rare. We\nconduct the first benchmark study for variants of 3D U-shaped models (3DUNet,\nSTUNet, AttentionUNet, SwinUNETR, FocalSegNet, and a novel 3D SwinUnet with\nfour variants) with a focus on CT-based anatomical segmentation for thoracic\nsurgery. Our study systematically examines the impact of different attention\nmechanisms, number of resolution stages, and network configurations on\nsegmentation accuracy and computational complexity. To allow cross-reference\nwith other recent benchmarking studies, we also included a performance\nassessment of the BTCV abdominal structural segmentation. 
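The entropy-based query strategy named above can be sketched as a simple selection rule: rank unlabeled samples by the entropy of the model's predictive distribution and query the most uncertain ones for labeling. Treating scores as per-sample classification softmax outputs is a simplification assumed here, not the paper's full detection pipeline.

```python
import numpy as np

def entropy_query(probs: np.ndarray, k: int) -> np.ndarray:
    # Select the k most uncertain samples by predictive entropy.
    # probs: (n_samples, n_classes) softmax outputs of the current model.
    # Returns indices of the k highest-entropy samples, i.e. those
    # deemed most informative to label next.
    eps = 1e-12  # avoids log(0)
    entropy = -np.sum(probs * np.log(probs + eps), axis=1)
    return np.argsort(entropy)[-k:][::-1]

# Toy pool of 4 predictions; the near-uniform ones are queried first.
pool = np.array([[0.9, 0.1], [0.5, 0.5], [0.8, 0.2], [0.6, 0.4]])
print(entropy_query(pool, k=2))  # -> [1 3]
```

Repeating this select-label-retrain loop is what lets roughly half of the training data approach full-data performance in the reported experiments.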
With the STUNet\nranking at the top, our study demonstrated the value of CNN-based U-shaped\nmodels for the investigated tasks and the benefit of residual blocks in network\nconfiguration designs to boost segmentation performance.\n","authors":["Arash Harirpoush","Amirhossein Rasoulian","Marta Kersten-Oertel","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.03230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03227v1","updated":"2024-02-05T17:38:49Z","published":"2024-02-05T17:38:49Z","title":"IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of\n brain MR images","summary":" In MRI studies, the aggregation of imaging data from multiple acquisition\nsites enhances sample size but may introduce site-related variabilities that\nhinder consistency in subsequent analyses. Deep learning methods for image\ntranslation have emerged as a solution for harmonizing MR images across sites.\nIn this study, we introduce IGUANe (Image Generation with Unified Adversarial\nNetworks), an original 3D model that leverages the strengths of domain\ntranslation and straightforward application of style transfer methods for\nmulticenter brain MR image harmonization. IGUANe extends CycleGAN architecture\nby integrating an arbitrary number of domains for training through a\nmany-to-one strategy. During inference, the model can be applied to any image,\neven from an unknown acquisition site, making it a universal generator for\nharmonization. Trained on a dataset comprising T1-weighted images from 11\ndifferent scanners, IGUANe was evaluated on data from unseen sites. The\nassessments included the transformation of MR images with traveling subjects,\nthe preservation of pairwise distances between MR images within domains, the\nevolution of volumetric patterns related to age and Alzheimer$^\\prime$s disease\n(AD), and the performance in age regression and patient classification tasks.\nComparisons with other harmonization and normalization methods suggest that\nIGUANe better preserves individual information in MR images and is more\nsuitable for maintaining and reinforcing variabilities related to age and AD.\nFuture studies may further assess IGUANe in other multicenter contexts, either\nusing the same model or retraining it for applications to different image\nmodalities.\n","authors":["Vincent Roca","Grégory Kuchcinski","Jean-Pierre Pruvo","Dorian Manouvriez","Renaud Lopes"],"pdf_url":"https://arxiv.org/pdf/2402.03227v1.pdf","comment":"23 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.03214v1","updated":"2024-02-05T17:25:04Z","published":"2024-02-05T17:25:04Z","title":"Organic or Diffused: Can We Distinguish Human Art from AI-generated\n Images?","summary":" The advent of generative AI images has completely disrupted the art world.\nIdentifying AI generated images from human art is a challenging problem whose\nimpact is growing over time. The failure to address this problem allows bad\nactors to defraud individuals paying a premium for human art, and companies\nwhose stated policies forbid AI imagery. This is also critical for AI model\ntrainers, who need to filter training data to avoid potential model collapse.\nThere are several different approaches to distinguishing human art from AI\nimages, including classifiers trained by supervised learning, research tools\ntargeting diffusion models, and identification by professional artists using\ntheir knowledge of artistic techniques. 
In this paper, we seek to understand\nhow well these approaches can perform against today's modern generative models\nin both benign and adversarial settings. We curate real human art across 7\nstyles, generate matching images from 5 generative models, and apply 8\ndetectors (5 automated detectors and 3 different human groups including 180\ncrowdworkers, 4000+ professional artists, and 13 expert artists experienced at\ndetecting AI). Both Hive and expert artists do very well, but make mistakes in\ndifferent ways (Hive is weaker against adversarial perturbations while Expert\nartists produce higher false positives). We believe these weaknesses will\nremain as models continue to evolve, and use our data to demonstrate why a\ncombined team of human and automated detectors provides the best combination of\naccuracy and robustness.\n","authors":["Anna Yoo Jeong Ha","Josephine Passananti","Ronik Bhaskar","Shawn Shan","Reid Southen","Haitao Zheng","Ben Y. Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.03214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03614v3","updated":"2024-02-05T17:15:26Z","published":"2023-05-05T15:20:27Z","title":"Denoising-Diffusion Alignment for Continuous Sign Language Recognition","summary":" As a key to social good, continuous sign language recognition (CSLR) aims to\npromote active and accessible communication for the hearing impaired. Current\nCSLR research adopts a cross-modality alignment scheme to learn the mapping\nrelationship between \"video clip-textual gloss\". However, this local alignment\nmethod, especially with weak data annotation, ignores the contextual\ninformation of modalities and directly reduces the generalization of visual\nfeatures. To this end, we propose a novel Denoising-Diffusion global Alignment\nscheme (DDA), which focuses on modeling the mapping of the \"entire video-gloss\nsequence\". DDA consists of a partial noising process strategy and a\ndenoising-diffusion autoencoder. The former is used to achieve efficient\nguidance of the text modality to the visual modality; the latter learns the\nglobal alignment information of the two modalities in a denoising manner. Our\nDDA confirms the feasibility of diffusion models for visual representation\nlearning in CSLR. Experiments on three public benchmarks demonstrate that our\nmethod achieves state-of-the-art performances. Furthermore, the proposed method\ncan be a plug-and-play optimization to generalize other CSLR methods.\n","authors":["Leming Guo","Wanli Xue","Ze Kang","Yuxi Zhou","Tiantian Yuan","Zan Gao","Shengyong Chen"],"pdf_url":"https://arxiv.org/pdf/2305.03614v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03188v1","updated":"2024-02-05T16:53:54Z","published":"2024-02-05T16:53:54Z","title":"Towards mitigating uncann(eye)ness in face swaps via gaze-centric loss\n terms","summary":" Advances in face swapping have enabled the automatic generation of highly\nrealistic faces. Yet face swaps are perceived differently than when looking at\nreal faces, with key differences in viewer behavior surrounding the eyes. Face\nswapping algorithms generally place no emphasis on the eyes, relying on pixel\nor feature matching losses that consider the entire face to guide the training\nprocess. We further investigate viewer perception of face swaps, focusing our\nanalysis on the presence of an uncanny valley effect. 
We additionally propose a\nnovel loss equation for the training of face swapping models, leveraging a\npretrained gaze estimation network to directly improve representation of the\neyes. We confirm that viewed face swaps do elicit uncanny responses from\nviewers. Our proposed improvements significantly reduce viewing angle errors\nbetween face swaps and their source material. Our method additionally reduces\nthe prevalence of the eyes as a deciding factor when viewers perform deepfake\ndetection tasks. Our findings have implications for face swapping for special\neffects, as digital avatars, as privacy mechanisms, and more; negative\nresponses from users could limit effectiveness in said applications. Our gaze\nimprovements are a first step towards alleviating negative viewer perceptions\nvia a targeted approach.\n","authors":["Ethan Wilson","Frederick Shic","Sophie Jörg","Eakta Jain"],"pdf_url":"https://arxiv.org/pdf/2402.03188v1.pdf","comment":"Accepted to Computers and Graphics Special Issue: Eye Gaze\n Visualization, Interaction, Synthesis, and Analysis"},{"id":"http://arxiv.org/abs/2402.03173v1","updated":"2024-02-05T16:41:02Z","published":"2024-02-05T16:41:02Z","title":"Multi: Multimodal Understanding Leaderboard with Text and Images","summary":" Rapid progress in multimodal large language models (MLLMs) highlights the\nneed to introduce challenging yet realistic benchmarks to the academic\ncommunity. Existing benchmarks primarily focus on simple natural image\nunderstanding, but Multi emerges as a cutting-edge benchmark for MLLMs,\noffering a comprehensive dataset for evaluating MLLMs on understanding\ncomplex figures and tables, and scientific questions. This benchmark,\nreflecting current realistic examination styles, provides multimodal inputs and\nrequires responses that are either precise or open-ended, similar to real-life\nschool tests. It challenges MLLMs with a variety of tasks, ranging from formula\nderivation to image detail analysis, and cross-modality reasoning. Multi\nincludes over 18,000 questions, with a focus on science-based QA in diverse\nformats. We also introduce Multi-Elite, a 500-question subset for testing the\nextremities of MLLMs, and Multi-Extend, which enhances In-Context Learning\nresearch with more than 4,500 knowledge pieces. Our evaluation indicates\nsignificant potential for MLLM advancement, with GPT-4V achieving a 63.7%\naccuracy rate on Multi, in contrast to other MLLMs scoring between 31.3% and\n53.7%. Multi serves not only as a robust evaluation platform but also paves the\nway for the development of expert-level AI.\n","authors":["Zichen Zhu","Yang Xu","Lu Chen","Jingkai Yang","Yichuan Ma","Yiming Sun","Hailin Wen","Jiaqi Liu","Jinyu Cai","Yingzi Ma","Situo Zhang","Zihan Zhao","Liangtai Sun","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2402.03173v1.pdf","comment":"Details and access are available at:\n https://OpenDFM.github.io/MULTI-Benchmark/"},{"id":"http://arxiv.org/abs/2401.12945v2","updated":"2024-02-05T16:36:30Z","published":"2024-01-23T18:05:25Z","title":"Lumiere: A Space-Time Diffusion Model for Video Generation","summary":" We introduce Lumiere -- a text-to-video diffusion model designed for\nsynthesizing videos that portray realistic, diverse and coherent motion -- a\npivotal challenge in video synthesis. To this end, we introduce a Space-Time\nU-Net architecture that generates the entire temporal duration of the video at\nonce, through a single pass in the model.
This is in contrast to existing video\nmodels which synthesize distant keyframes followed by temporal super-resolution\n-- an approach that inherently makes global temporal consistency difficult to\nachieve. By deploying both spatial and (importantly) temporal down- and\nup-sampling and leveraging a pre-trained text-to-image diffusion model, our\nmodel learns to directly generate a full-frame-rate, low-resolution video by\nprocessing it in multiple space-time scales. We demonstrate state-of-the-art\ntext-to-video generation results, and show that our design easily facilitates a\nwide range of content creation tasks and video editing applications, including\nimage-to-video, video inpainting, and stylized generation.\n","authors":["Omer Bar-Tal","Hila Chefer","Omer Tov","Charles Herrmann","Roni Paiss","Shiran Zada","Ariel Ephrat","Junhwa Hur","Guanghui Liu","Amit Raj","Yuanzhen Li","Michael Rubinstein","Tomer Michaeli","Oliver Wang","Deqing Sun","Tali Dekel","Inbar Mosseri"],"pdf_url":"https://arxiv.org/pdf/2401.12945v2.pdf","comment":"Webpage: https://lumiere-video.github.io/ | Video:\n https://www.youtube.com/watch?v=wxLr02Dz2Sc"},{"id":"http://arxiv.org/abs/2402.03166v1","updated":"2024-02-05T16:35:29Z","published":"2024-02-05T16:35:29Z","title":"RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein\n Segmentation and Classification","summary":" The caliber and configuration of retinal blood vessels serve as important\nbiomarkers for various diseases and medical conditions. A thorough analysis of\nthe retinal vasculature requires the segmentation of blood vessels and their\nclassification into arteries and veins, which is typically performed on color\nfundus images obtained by retinography, a widely used imaging technique.\nNonetheless, manually performing these tasks is labor-intensive and prone to\nhuman error. Various automated methods have been proposed to address this\nproblem. However, the current state of the art in artery/vein segmentation and\nclassification faces challenges due to manifest classification errors that\naffect the topological consistency of segmentation maps. This study presents an\ninnovative end-to-end framework, RRWNet, designed to recursively refine\nsemantic segmentation maps and correct manifest classification errors. The\nframework consists of a fully convolutional neural network with a Base\nsubnetwork that generates base segmentation maps from input images, and a\nRecursive Refinement subnetwork that iteratively and recursively improves these\nmaps. Evaluation on public datasets demonstrates the state-of-the-art\nperformance of the proposed method, yielding more topologically consistent\nsegmentation maps with fewer manifest classification errors than existing\napproaches. In addition, the Recursive Refinement module proves effective in\npost-processing segmentation maps from other methods, automatically correcting\nclassification errors and improving topological consistency.
The model code,\nweights, and predictions are publicly available at\nhttps://github.com/j-morano/rrwnet.\n","authors":["José Morano","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2402.03166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09944v2","updated":"2024-02-05T16:33:03Z","published":"2023-09-18T17:04:04Z","title":"DiffusionWorldViewer: Exposing and Broadening the Worldview Reflected by\n Generative Text-to-Image Models","summary":" Generative text-to-image (TTI) models produce high-quality images from short\ntextual descriptions and are widely used in academic and creative domains. Like\nhumans, TTI models have a worldview, a conception of the world learned from\ntheir training data and task that influences the images they generate for a\ngiven prompt. However, the worldviews of TTI models are often hidden from\nusers, making it challenging for users to build intuition about TTI outputs,\nand they are often misaligned with users' worldviews, resulting in output\nimages that do not match user expectations. In response, we introduce\nDiffusionWorldViewer, an interactive interface that exposes a TTI model's\nworldview across output demographics and provides editing tools for aligning\noutput images with user perspectives. In a user study with 18 diverse TTI\nusers, we find that DiffusionWorldViewer helps users represent their varied\nviewpoints in generated images and challenge the limited worldview reflected in\ncurrent TTI models.\n","authors":["Zoe De Simone","Angie Boggust","Arvind Satyanarayan","Ashia Wilson"],"pdf_url":"https://arxiv.org/pdf/2309.09944v2.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.03162v1","updated":"2024-02-05T16:30:57Z","published":"2024-02-05T16:30:57Z","title":"Direct-a-Video: Customized Video Generation with User-Directed Camera\n Movement and Object Motion","summary":" Recent text-to-video diffusion models have achieved impressive progress. In\npractice, users often desire the ability to control object motion and camera\nmovement independently for customized video creation. However, current methods\nlack the focus on separately controlling object motion and camera movement in a\ndecoupled manner, which limits the controllability and flexibility of\ntext-to-video models. In this paper, we introduce Direct-a-Video, a system that\nallows users to independently specify motions for one or multiple objects\nand/or camera movements, as if directing a video. We propose a simple yet\neffective strategy for the decoupled control of object motion and camera\nmovement. Object motion is controlled through spatial cross-attention\nmodulation using the model's inherent priors, requiring no additional\noptimization. For camera movement, we introduce new temporal cross-attention\nlayers to interpret quantitative camera movement parameters. We further employ\nan augmentation-based approach to train these layers in a self-supervised\nmanner on a small-scale dataset, eliminating the need for explicit motion\nannotation. Both components operate independently, allowing individual or\ncombined control, and can generalize to open-domain scenarios. 
Extensive\nexperiments demonstrate the superiority and effectiveness of our method.\nProject page: https://direct-a-video.github.io/.\n","authors":["Shiyuan Yang","Liang Hou","Haibin Huang","Chongyang Ma","Pengfei Wan","Di Zhang","Xiaodong Chen","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2402.03162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03161v1","updated":"2024-02-05T16:30:49Z","published":"2024-02-05T16:30:49Z","title":"Video-LaVIT: Unified Video-Language Pre-training with Decoupled\n Visual-Motional Tokenization","summary":" In light of recent advances in multimodal Large Language Models (LLMs), there\nis increasing attention to scaling them from image-text data to more\ninformative real-world videos. Compared to static images, video poses unique\nchallenges for effective large-scale pre-training due to the modeling of its\nspatiotemporal dynamics. In this paper, we address such limitations in\nvideo-language pre-training with an efficient video decomposition that\nrepresents each video as keyframes and temporal motions. These are then adapted\nto an LLM using well-designed tokenizers that discretize visual and temporal\ninformation as a few tokens, thus enabling unified generative pre-training of\nvideos, images, and text. At inference, the generated tokens from the LLM are\ncarefully recovered to the original continuous pixel space to create various\nvideo content. Our proposed framework is both capable of comprehending and\ngenerating image and video content, as demonstrated by its competitive\nperformance across 13 multimodal benchmarks in image and video understanding\nand generation. Our code and models will be available at\nhttps://video-lavit.github.io.\n","authors":["Yang Jin","Zhicheng Sun","Kun Xu","Kun Xu","Liwei Chen","Hao Jiang","Quzhe Huang","Chengru Song","Yuliang Liu","Di Zhang","Yang Song","Kun Gai","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2402.03161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05453v2","updated":"2024-02-05T16:27:48Z","published":"2023-10-09T06:57:55Z","title":"Memory-Assisted Sub-Prototype Mining for Universal Domain Adaptation","summary":" Universal domain adaptation aims to align the classes and reduce the feature\ngap between the same category of the source and target domains. The target\nprivate category is set as the unknown class during the adaptation process, as\nit is not included in the source domain. However, most existing methods\noverlook the intra-class structure within a category, especially in cases where\nthere exists significant concept shift between the samples belonging to the\nsame category. When samples with large concept shift are forced to be pushed\ntogether, it may negatively affect the adaptation performance. Moreover, from\nthe interpretability aspect, it is unreasonable to align visual features with\nsignificant differences, such as fighter jets and civil aircraft, into the same\ncategory. Unfortunately, due to such semantic ambiguity and annotation cost,\ncategories are not always classified in detail, making it difficult for the\nmodel to perform precise adaptation. To address these issues, we propose a\nnovel Memory-Assisted Sub-Prototype Mining (MemSPM) method that can learn the\ndifferences between samples belonging to the same category and mine sub-classes\nwhen there exists significant concept shift between them. 
By doing so, our\nmodel learns a more reasonable feature space that enhances the transferability\nand reflects the inherent differences among samples annotated as the same\ncategory. We evaluate the effectiveness of our MemSPM method over multiple\nscenarios, including UniDA, OSDA, and PDA. Our method achieves state-of-the-art\nperformance on four benchmarks in most cases.\n","authors":["Yuxiang Lai","Yi Zhou","Xinghong Liu","Tao Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.05453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12038v2","updated":"2024-02-05T16:15:38Z","published":"2023-08-23T09:55:41Z","title":"Large Multilingual Models Pivot Zero-Shot Multimodal Learning across\n Languages","summary":" Recently there has been a significant surge in multimodal learning in terms\nof both image-to-text and text-to-image generation. However, the success is\ntypically limited to English, leaving other languages largely behind. Building\na competitive counterpart in other languages is highly challenging due to the\nlow-resource nature of non-English multimodal data (i.e., lack of large-scale,\nhigh-quality image-text data). In this work, we propose MPM, an effective\ntraining paradigm for training large multimodal models in non-English\nlanguages. MPM demonstrates that Multilingual language models can Pivot\nzero-shot Multimodal learning across languages. Specifically, based on a strong\nmultilingual large language model, multimodal models pretrained on English-only\nimage-text data can well generalize to other languages in a (quasi)-zero-shot\nmanner, even surpassing models trained on image-text data in native languages.\nTaking Chinese as a practice of MPM, we build large multimodal models VisCPM in\nimage-to-text and text-to-image generation, which achieve state-of-the-art\n(open-source) performance in Chinese. To facilitate future research, we\nopen-source codes and model weights at https://github.com/OpenBMB/VisCPM.git.\n","authors":["Jinyi Hu","Yuan Yao","Chongyi Wang","Shan Wang","Yinxu Pan","Qianyu Chen","Tianyu Yu","Hanghao Wu","Yue Zhao","Haoye Zhang","Xu Han","Yankai Lin","Jiao Xue","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12038v2.pdf","comment":"https://github.com/OpenBMB/VisCPM.git"},{"id":"http://arxiv.org/abs/2402.03135v1","updated":"2024-02-05T16:01:15Z","published":"2024-02-05T16:01:15Z","title":"GPU-Accelerated 3D Polygon Visibility Volumes for Synergistic Perception\n and Navigation","summary":" UAV missions often require specific geometric constraints to be satisfied\nbetween ground locations and the vehicle location. Such requirements are\ntypical for contexts where line-of-sight must be maintained between the vehicle\nlocation and the ground control location and are also important in surveillance\napplications where the UAV wishes to be able to sense, e.g., with a camera\nsensor, a specific region within a complex geometric environment. This problem\nis further complicated when the ground location is generalized to a convex 2D\npolygonal region. This article describes the theory and implementation of a\nsystem which can quickly calculate the 3D volume that encloses all 3D\ncoordinates from which a 2D convex planar region can be entirely viewed;\nreferred to as a visibility volume. The proposed approach computes visibility\nvolumes using a combination of depth map computation using GPU-acceleration and\ngeometric boolean operations. 
Solutions to this problem require complex 3D\ngeometric analysis techniques that must execute using arbitrary precision\narithmetic on a collection of discontinuous and non-analytic surfaces.\nPost-processing steps incorporate navigational constraints to further restrict\nthe enclosed coordinates to include both visibility and navigation constraints.\nIntegration of sensing visibility constraints with navigational constraints\nyields a range of navigable space where a vehicle will satisfy both perceptual\nsensing and navigational needs of the mission. This algorithm then provides a\nsynergistic perception and navigation sensitive solution yielding a volume of\ncoordinates in 3D that satisfy both the mission path and sensing needs.\n","authors":["Andrew Willis","Collin Hague","Artur Wolek","Kevin Brink"],"pdf_url":"https://arxiv.org/pdf/2402.03135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03124v1","updated":"2024-02-05T15:51:34Z","published":"2024-02-05T15:51:34Z","title":"Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks","summary":" Gradient inversion attacks aim to reconstruct local training data from\nintermediate gradients exposed in the federated learning framework. Despite\nsuccessful attacks, all previous methods, starting from reconstructing a single\ndata point and then relaxing the single-image limit to batch level, are only\ntested under hard label constraints. Even for single-image reconstruction, we\nstill lack an analysis-based algorithm to recover augmented soft labels. In\nthis work, we change the focus from enlarging batchsize to investigating the\nhard label constraints, considering a more realistic circumstance where label\nsmoothing and mixup techniques are used in the training process. In particular,\nwe are the first to initiate a novel algorithm to simultaneously recover the\nground-truth augmented label and the input feature of the last fully-connected\nlayer from single-input gradients, and provide a necessary condition for any\nanalytical-based label recovery methods. Extensive experiments testify to the\nlabel recovery accuracy, as well as the benefits to the following image\nreconstruction. We believe soft labels in classification tasks are worth\nfurther attention in gradient inversion attacks.\n","authors":["Yanbo Wang","Jian Liang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2402.03124v1.pdf","comment":"ICLR2024 poster The prior submission version had a bug in the image\n reconstruction implementation, which has been corrected without harm to the\n main conclusions"},{"id":"http://arxiv.org/abs/2402.03119v1","updated":"2024-02-05T15:47:54Z","published":"2024-02-05T15:47:54Z","title":"Good Teachers Explain: Explanation-Enhanced Knowledge Distillation","summary":" Knowledge Distillation (KD) has proven effective for compressing large\nteacher models into smaller student models. While it is well known that student\nmodels can achieve similar accuracies as the teachers, it has also been shown\nthat they nonetheless often do not learn the same function. It is, however,\noften highly desirable that the student's and teacher's functions share similar\nproperties such as basing the prediction on the same input features, as this\nensures that students learn the 'right features' from the teachers. In this\nwork, we explore whether this can be achieved by not only optimizing the\nclassic KD loss but also the similarity of the explanations generated by the\nteacher and the student. 
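The analytical label recovery discussed above builds on a standard identity: for softmax cross-entropy with a (possibly smoothed) soft label y, the gradient of the loss with respect to the logits is p - y, so the label is exposed by the last layer's gradients once the prediction p is known. The snippet below is a numerical check of that identity only, under an assumed single-input setting; it is not the paper's full recovery algorithm.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(5, requires_grad=True)
# A smoothed ("soft") target over 5 classes, e.g. from label smoothing
# with eps = 0.1 on true class 2; it sums to 1.
target = torch.full((5,), 0.1 / 4)
target[2] = 0.9

# Soft-label cross-entropy: L = -sum_i y_i * log softmax(z)_i
loss = -(target * F.log_softmax(logits, dim=0)).sum()
loss.backward()

p = F.softmax(logits.detach(), dim=0)
# Identity: dL/dz = p - y, hence the soft label is y = p - dL/dz.
recovered = p - logits.grad
print(torch.allclose(recovered, target, atol=1e-6))  # True
```

Since the last fully-connected layer's weight gradient is the outer product of (p - y) with the layer's input feature, the same gradients also constrain that input feature, which is the other quantity the abstract says is recovered.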
Despite the idea being simple and intuitive, we find\nthat our proposed 'explanation-enhanced' KD (e$^2$KD) (1) consistently provides\nlarge gains in terms of accuracy and student-teacher agreement, (2) ensures\nthat the student learns from the teacher to be right for the right reasons and\nto give similar explanations, and (3) is robust with respect to the model\narchitectures, the amount of training data, and even works with 'approximate',\npre-computed explanations.\n","authors":["Amin Parchami-Araghi","Moritz Böhle","Sukrut Rao","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2402.03119v1.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2401.15022v2","updated":"2024-02-05T15:36:44Z","published":"2024-01-26T17:29:01Z","title":"Applications of artificial intelligence in the analysis of\n histopathology images of gliomas: a review","summary":" In recent years, the diagnosis of gliomas has become increasingly complex.\nAnalysis of glioma histopathology images using artificial intelligence (AI)\noffers new opportunities to support diagnosis and outcome prediction. To give\nan overview of the current state of research, this review examines 70 publicly\navailable research studies that have proposed AI-based methods for whole-slide\nhistopathology images of human gliomas, covering the diagnostic tasks of\nsubtyping (16/70), grading (23/70), molecular marker prediction (13/70), and\nsurvival prediction (27/70). All studies were reviewed with regard to\nmethodological aspects as well as clinical applicability. It was found that the\nfocus of current research is the assessment of hematoxylin and eosin-stained\ntissue sections of adult-type diffuse gliomas. The majority of studies (49/70)\nare based on the publicly available glioblastoma and low-grade glioma datasets\nfrom The Cancer Genome Atlas (TCGA) and only a few studies employed other\ndatasets in isolation (10/70) or in addition to the TCGA datasets (11/70).\nCurrent approaches mostly rely on convolutional neural networks (53/70) for\nanalyzing tissue at 20x magnification (30/70). A new field of research is the\nintegration of clinical data, omics data, or magnetic resonance imaging\n(27/70). So far, AI-based methods have achieved promising results, but are not\nyet used in real clinical settings. Future work should focus on the independent\nvalidation of methods on larger, multi-site datasets with high-quality and\nup-to-date clinical and molecular pathology annotations to demonstrate routine\napplicability.\n","authors":["Jan-Philipp Redlich","Friedrich Feuerhake","Joachim Weis","Nadine S. Schaadt","Sarah Teuber-Hanselmann","Christoph Buck","Sabine Luttmann","Andrea Eberle","Stefan Nikolin","Arno Appenzeller","Andreas Portmann","André Homeyer"],"pdf_url":"https://arxiv.org/pdf/2401.15022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03095v1","updated":"2024-02-05T15:25:40Z","published":"2024-02-05T15:25:40Z","title":"Transcending Adversarial Perturbations: Manifold-Aided Adversarial\n Examples with Legitimate Semantics","summary":" Deep neural networks were significantly vulnerable to adversarial examples\nmanipulated by malicious tiny perturbations. Although most conventional\nadversarial attacks ensured the visual imperceptibility between adversarial\nexamples and corresponding raw images by minimizing their geometric distance,\nthese constraints on geometric distance led to limited attack transferability,\ninferior visual quality, and human-imperceptible interpretability. 
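A schematic of the 'explanation-enhanced' objective described for e$^2$KD: the classic temperature-scaled KD loss plus a term rewarding agreement between teacher and student explanations. The explanation format (per-sample saliency maps such as Grad-CAM), the weight lam, and the temperature are assumptions for illustration, not values from the paper.

```python
import torch
import torch.nn.functional as F

def e2kd_loss(student_logits, teacher_logits, student_expl, teacher_expl,
              tau: float = 2.0, lam: float = 1.0):
    # Classic KD term: match softened teacher and student distributions.
    kd = F.kl_div(
        F.log_softmax(student_logits / tau, dim=1),
        F.softmax(teacher_logits / tau, dim=1),
        reduction="batchmean",
    ) * tau ** 2
    # Explanation term: reward cosine agreement between per-sample
    # saliency maps of shape (B, H, W), flattened per sample.
    sim = F.cosine_similarity(
        student_expl.flatten(1), teacher_expl.flatten(1), dim=1
    ).mean()
    return kd + lam * (1.0 - sim)  # minimizing this maximizes agreement
```

Optimizing the similarity term alongside the logit-matching term is what pushes the student to be right for the same input features as the teacher, rather than merely producing the same outputs.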
In this\npaper, we proposed a supervised semantic-transformation generative model to\ngenerate adversarial examples with real and legitimate semantics, wherein an\nunrestricted adversarial manifold containing continuous semantic variations was\nconstructed for the first time to realize a legitimate transition from\nnon-adversarial examples to adversarial ones. Comprehensive experiments on\nMNIST and industrial defect datasets showed that our adversarial examples not\nonly exhibited better visual quality but also achieved superior attack\ntransferability and more effective explanations for model vulnerabilities,\nindicating their great potential as generic adversarial examples. The code and\npre-trained models were available at https://github.com/shuaili1027/MAELS.git.\n","authors":["Shuai Li","Xiaoyu Jiang","Xiaoguang Ma"],"pdf_url":"https://arxiv.org/pdf/2402.03095v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03094v1","updated":"2024-02-05T15:25:32Z","published":"2024-02-05T15:25:32Z","title":"Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object\n Detector","summary":" This paper addresses the challenge of cross-domain few-shot object detection\n(CD-FSOD), aiming to develop an accurate object detector for novel domains with\nminimal labeled examples. While transformer-based open-set detectors e.g.,\nDE-ViT~\\cite{zhang2023detect} have excelled in both open-vocabulary object\ndetection and traditional few-shot object detection, detecting categories\nbeyond those seen during training, we thus naturally raise two key questions:\n1) can such open-set detection methods easily generalize to CD-FSOD? 2) If no,\nhow to enhance the results of open-set methods when faced with significant\ndomain gaps? To address the first question, we introduce several metrics to\nquantify domain variances and establish a new CD-FSOD benchmark with diverse\ndomain metric values. Some State-Of-The-Art (SOTA) open-set object detection\nmethods are evaluated on this benchmark, with evident performance degradation\nobserved across out-of-domain datasets. This indicates the failure of adopting\nopen-set detectors directly for CD-FSOD. Sequentially, to overcome the\nperformance degradation issue and also to answer the second proposed question,\nwe endeavor to enhance the vanilla DE-ViT. With several novel components\nincluding finetuning, a learnable prototype module, and a lightweight attention\nmodule, we present an improved Cross-Domain Vision Transformer for CD-FSOD\n(CD-ViTO). Experiments show that our CD-ViTO achieves impressive results on\nboth out-of-domain and in-domain target datasets, establishing new SOTAs for\nboth CD-FSOD and FSOD. All the datasets, codes, and models will be released to\nthe community.\n","authors":["Yuqian Fu","Yu Wang","Yixuan Pan","Lian Huai","Xingyu Qiu","Zeyu Shangguan","Tong Liu","Lingjie Kong","Yanwei Fu","Luc Van Gool","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.03094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03093v1","updated":"2024-02-05T15:24:13Z","published":"2024-02-05T15:24:13Z","title":"AI-Enhanced Virtual Reality in Medicine: A Comprehensive Survey","summary":" With the rapid advance of computer graphics and artificial intelligence\ntechnologies, the ways we interact with the world have undergone a\ntransformative shift. 
Virtual Reality (VR) technology, aided by artificial\nintelligence (AI), has emerged as a dominant interaction medium in multiple\napplication areas, thanks to its advantage of providing users with immersive\nexperiences. Among those applications, medicine is considered one of the most\npromising areas. In this paper, we present a comprehensive examination of the\nburgeoning field of AI-enhanced VR applications in medical care and services.\nBy introducing a systematic taxonomy, we meticulously classify the pertinent\ntechniques and applications into three well-defined categories based on\ndifferent phases of medical diagnosis and treatment: Visualization Enhancement,\nVR-related Medical Data Processing, and VR-assisted Intervention. This\ncategorization enables a structured exploration of the diverse roles that\nAI-powered VR plays in the medical domain, providing a framework for a more\ncomprehensive understanding and evaluation of these technologies. To the best\nof our knowledge, this is the first systematic survey of AI-powered VR systems in\nmedical settings, laying a foundation for future research in this\ninterdisciplinary domain.\n","authors":["Yixuan Wu","Kaiyuan Hu","Danny Z. Chen","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2402.03093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03082v1","updated":"2024-02-05T15:13:20Z","published":"2024-02-05T15:13:20Z","title":"Visual Text Meets Low-level Vision: A Comprehensive Survey on Visual\n Text Processing","summary":" Visual text, a pivotal element in both document and scene images, speaks\nvolumes and attracts significant attention in the computer vision domain.\nBeyond visual text detection and recognition, the field of visual text\nprocessing has experienced a surge in research, driven by the advent of\nfundamental generative models. However, challenges persist due to the unique\nproperties and features that distinguish text from general objects. Effectively\nleveraging these unique textual characteristics is crucial in visual text\nprocessing, as observed in our study. In this survey, we present a\ncomprehensive, multi-perspective analysis of recent advancements in this field.\nInitially, we introduce a hierarchical taxonomy encompassing areas ranging from\ntext image enhancement and restoration to text image manipulation, followed by\ndifferent learning paradigms. Subsequently, we conduct an in-depth discussion\nof how specific textual features such as structure, stroke, semantics, style,\nand spatial context are seamlessly integrated into various tasks. Furthermore,\nwe explore available public datasets and benchmark the reviewed methods on\nseveral widely-used datasets. Finally, we identify principal challenges and\npotential avenues for future research. Our aim is to establish this survey as a\nfundamental resource, fostering continued exploration and innovation in the\ndynamic area of visual text processing.\n","authors":["Yan Shu","Weichao Zeng","Zhenhang Li","Fangmin Zhao","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.03082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05738v2","updated":"2024-02-05T15:01:31Z","published":"2024-01-11T08:40:35Z","title":"LKCA: Large Kernel Convolutional Attention","summary":" We revisit the relationship between attention mechanisms and large kernel\nConvNets in visual transformers and propose a new spatial attention named Large\nKernel Convolutional Attention (LKCA). It simplifies the attention operation by\nreplacing it with a single large kernel convolution. 
LKCA combines the\nadvantages of convolutional neural networks and visual transformers, possessing\na large receptive field, locality, and parameter sharing. We explained the\nsuperiority of LKCA from both convolution and attention perspectives, providing\nequivalent code implementations for each view. Experiments confirm that LKCA\nimplemented from both the convolutional and attention perspectives exhibit\nequivalent performance. We extensively experimented with the LKCA variant of\nViT in both classification and segmentation tasks. The experiments demonstrated\nthat LKCA exhibits competitive performance in visual tasks. Our code will be\nmade publicly available at https://github.com/CatworldLee/LKCA.\n","authors":["Chenghao Li","Boheng Zeng","Yi Lu","Pengbo Shi","Qingzi Chen","Jirui Liu","Lingyun Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.05738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03047v1","updated":"2024-02-05T14:32:57Z","published":"2024-02-05T14:32:57Z","title":"PFDM: Parser-Free Virtual Try-on via Diffusion Model","summary":" Virtual try-on can significantly improve the garment shopping experiences in\nboth online and in-store scenarios, attracting broad interest in computer\nvision. However, to achieve high-fidelity try-on performance, most\nstate-of-the-art methods still rely on accurate segmentation masks, which are\noften produced by near-perfect parsers or manual labeling. To overcome the\nbottleneck, we propose a parser-free virtual try-on method based on the\ndiffusion model (PFDM). Given two images, PFDM can \"wear\" garments on the\ntarget person seamlessly by implicitly warping without any other information.\nTo learn the model effectively, we synthesize many pseudo-images and construct\nsample pairs by wearing various garments on persons. Supervised by the\nlarge-scale expanded dataset, we fuse the person and garment features using a\nproposed Garment Fusion Attention (GFA) mechanism. Experiments demonstrate that\nour proposed PFDM can successfully handle complex cases, synthesize\nhigh-fidelity images, and outperform both state-of-the-art parser-free and\nparser-based models.\n","authors":["Yunfang Niu","Dong Yi","Lingxiang Wu","Zhiwei Liu","Pengxiang Cai","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03047v1.pdf","comment":"Accepted by IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.16822v2","updated":"2024-02-05T14:24:59Z","published":"2024-01-30T08:57:48Z","title":"EarthGPT: A Universal Multi-modal Large Language Model for Multi-sensor\n Image Comprehension in Remote Sensing Domain","summary":" Multi-modal large language models (MLLMs) have demonstrated remarkable\nsuccess in vision and visual-language tasks within the natural image domain.\nOwing to the significant diversities between the natural and remote sensing\n(RS) images, the development of MLLMs in the RS domain is still in the infant\nstage. To fill the gap, a pioneer MLLM named EarthGPT integrating various\nmulti-sensor RS interpretation tasks uniformly is proposed in this paper for\nuniversal RS image comprehension. In EarthGPT, three key techniques are\ndeveloped including a visual-enhanced perception mechanism, a cross-modal\nmutual comprehension approach, and a unified instruction tuning method for\nmulti-sensor multi-task in the RS domain. 
More importantly, a dataset named\nMMRS-1M featuring large-scale multi-sensor multi-modal RS instruction-following\nis constructed, comprising over 1M image-text pairs based on 34 existing\ndiverse RS datasets and including multi-sensor images such as optical,\nsynthetic aperture radar (SAR), and infrared. The MMRS-1M dataset addresses the\ndrawback of MLLMs on RS expert knowledge and stimulates the development of\nMLLMs in the RS domain. Extensive experiments are conducted, demonstrating the\nEarthGPT's superior performance in various RS visual interpretation tasks\ncompared with the other specialist models and MLLMs, proving the effectiveness\nof the proposed EarthGPT and offering a versatile paradigm for open-set\nreasoning tasks.\n","authors":["Wei Zhang","Miaoxin Cai","Tong Zhang","Yin Zhuang","Xuerui Mao"],"pdf_url":"https://arxiv.org/pdf/2401.16822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03040v1","updated":"2024-02-05T14:24:46Z","published":"2024-02-05T14:24:46Z","title":"InteractiveVideo: User-Centric Controllable Video Generation with\n Synergistic Multimodal Instructions","summary":" We introduce $\\textit{InteractiveVideo}$, a user-centric framework for video\ngeneration. Different from traditional generative approaches that operate based\non user-provided images or text, our framework is designed for dynamic\ninteraction, allowing users to instruct the generative model through various\nintuitive mechanisms during the whole generation process, e.g. text and image\nprompts, painting, drag-and-drop, etc. We propose a Synergistic Multimodal\nInstruction mechanism, designed to seamlessly integrate users' multimodal\ninstructions into generative models, thus facilitating a cooperative and\nresponsive interaction between user inputs and the generative process. This\napproach enables iterative and fine-grained refinement of the generation result\nthrough precise and effective user instructions. With\n$\\textit{InteractiveVideo}$, users are given the flexibility to meticulously\ntailor key aspects of a video. They can paint the reference image, edit\nsemantics, and adjust video motions until their requirements are fully met.\nCode, models, and demo are available at\nhttps://github.com/invictus717/InteractiveVideo\n","authors":["Yiyuan Zhang","Yuhao Kang","Zhixin Zhang","Xiaohan Ding","Sanyuan Zhao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2402.03040v1.pdf","comment":"Code, models, and demo are available at\n https://github.com/invictus717/InteractiveVideo"},{"id":"http://arxiv.org/abs/2307.01645v2","updated":"2024-02-05T14:14:06Z","published":"2023-07-04T10:57:52Z","title":"In-Domain Self-Supervised Learning Improves Remote Sensing Image Scene\n Classification","summary":" We investigate the utility of in-domain self-supervised pre-training of\nvision models in the analysis of remote sensing imagery. Self-supervised\nlearning (SSL) has emerged as a promising approach for remote sensing image\nclassification due to its ability to exploit large amounts of unlabeled data.\nUnlike traditional supervised learning, SSL aims to learn representations of\ndata without the need for explicit labels. This is achieved by formulating\nauxiliary tasks that can be used for pre-training models before fine-tuning\nthem on a given downstream task. 
A common approach in practice to SSL\npre-training is utilizing standard pre-training datasets, such as ImageNet.\nWhile relevant, such a general approach can have a sub-optimal influence on the\ndownstream performance of models, especially on tasks from challenging domains\nsuch as remote sensing. In this paper, we analyze the effectiveness of SSL\npre-training by employing the iBOT framework coupled with Vision transformers\ntrained on Million-AID, a large and unlabeled remote sensing dataset. We\npresent a comprehensive study of different self-supervised pre-training\nstrategies and evaluate their effect across 14 downstream datasets with diverse\nproperties. Our results demonstrate that leveraging large in-domain datasets\nfor self-supervised pre-training consistently leads to improved predictive\ndownstream performance, compared to the standard approaches found in practice.\n","authors":["Ivica Dimitrovski","Ivan Kitanovski","Nikola Simidjievski","Dragi Kocev"],"pdf_url":"https://arxiv.org/pdf/2307.01645v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01262v2","updated":"2024-02-05T14:14:02Z","published":"2024-02-02T09:33:07Z","title":"Cascaded Scaling Classifier: class incremental learning with probability\n scaling","summary":" Humans are capable of acquiring new knowledge and transferring learned\nknowledge into different domains, incurring only minor forgetting. The same\nability, called Continual Learning, is challenging to achieve when operating\nwith neural networks due to the forgetting affecting past learned tasks when\nlearning new ones. This forgetting can be mitigated by replaying stored samples\nfrom past tasks, but a large memory size may be needed for long sequences of\ntasks; moreover, this could lead to overfitting on saved samples. In this\npaper, we propose a novel regularisation approach and a novel incremental\nclassifier called, respectively, Margin Dampening and Cascaded Scaling\nClassifier. The first combines a soft constraint and a knowledge distillation\napproach to preserve past learned knowledge while allowing the model to learn\nnew patterns effectively. The latter is a gated incremental classifier, helping\nthe model modify past predictions without directly interfering with them. This\nis achieved by modifying the output of the model with auxiliary scaling\nfunctions. We empirically show that our approach performs well on multiple\nbenchmarks against well-established baselines, and we also study each component\nof our proposal and how the combinations of such components affect the final\nresults.\n","authors":["Jary Pomponi","Alessio Devoto","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2402.01262v2.pdf","comment":"Paper under review. The official code is available\n https://github.com/jaryP/Cascaded-Scaling-Classifier"},{"id":"http://arxiv.org/abs/2402.03019v1","updated":"2024-02-05T14:00:13Z","published":"2024-02-05T14:00:13Z","title":"Taylor Videos for Action Recognition","summary":" Effectively extracting motions from video is a critical and long-standing\nproblem for action recognition. This problem is very challenging because\nmotions (i) do not have an explicit form, (ii) have various concepts such as\ndisplacement, velocity, and acceleration, and (iii) often contain noise caused\nby unstable pixels. Addressing these challenges, we propose the Taylor video, a\nnew video format that highlights the dominant motions (e.g., a waving hand) in\neach of its frames, named Taylor frames. 
The Taylor video is named after the Taylor\nseries, which approximates a function at a given point using important terms.\nIn the scenario of videos, we define an implicit motion-extraction function\nwhich aims to extract motions from a video temporal block. In this block, using\nthe frames, the difference frames, and higher-order difference frames, we\nperform Taylor expansion to approximate this function at the starting frame. We\nshow that the summation of the higher-order terms in the Taylor series gives us\ndominant motion patterns, where static objects and small, unstable motions are\nremoved. Experimentally, we show that Taylor videos are effective inputs to\npopular architectures including 2D CNNs, 3D CNNs, and transformers. When used\nindividually, Taylor videos yield competitive action recognition accuracy\ncompared to RGB videos and optical flow. When fused with RGB or optical flow\nvideos, further accuracy improvement is achieved.\n","authors":["Lei Wang","Xiuyuan Yuan","Tom Gedeon","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.03019v1.pdf","comment":"Research report"},{"id":"http://arxiv.org/abs/2402.03003v1","updated":"2024-02-05T13:41:22Z","published":"2024-02-05T13:41:22Z","title":"[Citation needed] Data usage and citation practices in medical imaging\n conferences","summary":" Medical imaging papers often focus on methodology, but the quality of the\nalgorithms and the validity of the conclusions are highly dependent on the\ndatasets used. As creating datasets requires a lot of effort, researchers often\nuse publicly available datasets. There is, however, no adopted standard for\nciting the datasets used in scientific papers, leading to difficulty in\ntracking dataset usage. In this work, we present two open-source tools we\ncreated that could help with the detection of dataset usage, a pipeline\n\url{https://github.com/TheoSourget/Public_Medical_Datasets_References} using\nOpenAlex and full-text analysis, and PDF annotation software\n\url{https://github.com/TheoSourget/pdf_annotator} used in our study to\nmanually label the presence of datasets. We applied both tools in a study of\nthe usage of 20 publicly available medical datasets in papers from MICCAI and\nMIDL. We compute the proportion and the evolution between 2013 and 2023 of 3\ntypes of presence in a paper: cited, mentioned in the full text, or both cited\nand mentioned. Our findings demonstrate the concentration of the usage of a limited\nset of datasets. We also highlight different citing practices, making the\nautomation of tracking difficult.\n","authors":["Théo Sourget","Ahmet Akkoç","Stinna Winther","Christine Lyngbye Galsgaard","Amelia Jiménez-Sánchez","Dovile Juodelyte","Caroline Petitjean","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2402.03003v1.pdf","comment":"Submitted to MIDL conference"},{"id":"http://arxiv.org/abs/2312.08983v3","updated":"2024-02-05T13:39:03Z","published":"2023-12-14T14:29:52Z","title":"Interactive Humanoid: Online Full-Body Motion Reaction Synthesis with\n Social Affordance Canonicalization and Forecasting","summary":" We focus on the human-humanoid interaction task optionally with an object. We\npropose a new task named online full-body motion reaction synthesis, which\ngenerates humanoid reactions based on the human actor's motions. Previous\nwork focuses only on human interaction without objects and generates body\nreactions without hands. 
Moreover, they do not consider the task in an\nonline setting, where information beyond the current moment cannot be observed\nin practical situations. To support this task, we construct two\ndatasets named HHI and CoChair and propose a unified method. Specifically, we\npropose to construct a social affordance representation. We first select a\nsocial affordance carrier and use SE(3)-Equivariant Neural Networks to learn\nthe local frame for the carrier, then we canonicalize the social affordance.\nIn addition, we propose a social affordance forecasting scheme to enable the\nreactor to predict based on the imagined future. Experiments demonstrate that\nour approach can effectively generate high-quality reactions on HHI and\nCoChair. Furthermore, we also validate our method on existing human interaction\ndatasets Interhuman and Chi3D.\n","authors":["Yunze Liu","Changxi Chen","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2312.08983v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02996v1","updated":"2024-02-05T13:34:21Z","published":"2024-02-05T13:34:21Z","title":"Text-Guided Image Clustering","summary":" Image clustering divides a collection of images into meaningful groups,\ntypically interpreted post-hoc via human-given annotations. Those are usually\nin the form of text, raising the question of whether text can serve as an\nabstraction for image clustering. Current image clustering methods, however, neglect the use of\ngenerated textual descriptions. We, therefore, propose Text-Guided Image\nClustering, i.e., generating text using image captioning and visual\nquestion-answering (VQA) models and subsequently clustering the generated text.\nFurther, we introduce a novel approach to inject task- or domain knowledge for\nclustering by prompting VQA models. Across eight diverse image clustering\ndatasets, our results show that the obtained text representations often\noutperform image features. Additionally, we propose a counting-based cluster\nexplainability method. Our evaluations show that the derived keyword-based\nexplanations describe clusters better than the respective cluster accuracy\nsuggests. Overall, this research challenges traditional approaches and paves\nthe way for a paradigm shift in image clustering, using generated text.\n","authors":["Andreas Stephan","Lukas Miklautz","Kevin Sidak","Jan Philip Wahle","Bela Gipp","Claudia Plant","Benjamin Roth"],"pdf_url":"https://arxiv.org/pdf/2402.02996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00270v2","updated":"2024-02-05T13:21:37Z","published":"2023-07-01T08:38:18Z","title":"Real-time High-Resolution Neural Network with Semantic Guidance for\n Crack Segmentation","summary":" Deep learning plays an important role in crack segmentation, but most works\nutilize off-the-shelf or improved models that have not been specifically\ndeveloped for this task. High-resolution convolution neural networks that are\nsensitive to objects' location and detail help improve the performance of crack\nsegmentation, yet conflict with real-time detection. This paper describes\nHrSegNet, a high-resolution network with semantic guidance specifically\ndesigned for crack segmentation, which guarantees real-time inference speed\nwhile preserving crack details. After evaluation on the composite dataset\nCrackSeg9k and the scenario-specific datasets Asphalt3k and Concrete3k,\nHrSegNet obtains state-of-the-art segmentation performance and efficiencies\nthat far exceed those of the compared models. 
This approach demonstrates that\nthere is a trade-off between high-resolution modeling and real-time detection,\nwhich fosters the use of edge devices to analyze cracks in real-world\napplications.\n","authors":["Yongshang Li","Ronggui Ma","Han Liu","Gaoli Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.00270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02986v1","updated":"2024-02-05T13:16:38Z","published":"2024-02-05T13:16:38Z","title":"A Safety-Adapted Loss for Pedestrian Detection in Automated Driving","summary":" In safety-critical domains like automated driving (AD), errors by the object\ndetector may endanger pedestrians and other vulnerable road users (VRU). As\ncommon evaluation metrics are not an adequate safety indicator, recent works\nemploy approaches to identify safety-critical VRU and back-annotate the risk to\nthe object detector. However, those approaches do not consider the safety\nfactor in the deep neural network (DNN) training process. Thus,\nstate-of-the-art DNNs penalize all misdetections equally, irrespective of their\ncriticality. Subsequently, to mitigate the occurrence of critical failure\ncases, i.e., false negatives, a safety-aware training strategy might be\nrequired to enhance the detection performance for critical pedestrians. In this\npaper, we propose a novel safety-aware loss variation that leverages the\nestimated per-pedestrian criticality scores during training. We exploit the\nreachability set-based time-to-collision (TTC-RSB) metric from the motion\ndomain along with distance information to account for the worst-case threat\nquantifying the criticality. Our evaluation results using RetinaNet and FCOS on\nthe nuScenes dataset demonstrate that training the models with our safety-aware\nloss function mitigates the misdetection of critical pedestrians without\nsacrificing performance for the general case, i.e., pedestrians outside the\nsafety-critical zone.\n","authors":["Maria Lyssenko","Piyush Pimplikar","Maarten Bieshaar","Farzad Nozarian","Rudolph Triebel"],"pdf_url":"https://arxiv.org/pdf/2402.02986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02985v1","updated":"2024-02-05T13:16:12Z","published":"2024-02-05T13:16:12Z","title":"Unsupervised semantic segmentation of high-resolution UAV imagery for\n road scene parsing","summary":" Two challenges are presented when parsing road scenes in UAV images. First,\nthe high resolution of UAV images makes processing difficult. Second,\nsupervised deep learning methods require a large amount of manual annotations\nto train robust and accurate models. In this paper, an unsupervised road\nparsing framework that leverages recent advances in vision language models and\nfundamental computer vision models is introduced. Initially, a vision language\nmodel is employed to efficiently process ultra-large resolution UAV images to\nquickly detect road regions of interest in the images. Subsequently, the vision\nfoundation model SAM is utilized to generate masks for the road regions without\ncategory information. Following that, a self-supervised representation learning\nnetwork extracts feature representations from all masked regions. Finally, an\nunsupervised clustering algorithm is applied to cluster these feature\nrepresentations and assign IDs to each cluster. The masked regions are combined\nwith the corresponding IDs to generate initial pseudo-labels, which initiate an\niterative self-training process for regular semantic segmentation. 
The proposed\nmethod achieves an impressive 89.96% mIoU on the development dataset without\nrelying on any manual annotation. Particularly noteworthy is the extraordinary\nflexibility of the proposed method, which even goes beyond the limitations of\nhuman-defined categories and is able to acquire knowledge of new categories\nfrom the dataset itself.\n","authors":["Zihan Ma","Yongshang Li","Ronggui Ma","Chen Liang"],"pdf_url":"https://arxiv.org/pdf/2402.02985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.03482v2","updated":"2024-02-05T12:56:43Z","published":"2022-02-07T19:40:20Z","title":"Navigating Neural Space: Revisiting Concept Activation Vectors to\n Overcome Directional Divergence","summary":" With a growing interest in understanding neural network prediction\nstrategies, Concept Activation Vectors (CAVs) have emerged as a popular tool\nfor modeling human-understandable concepts in the latent space. Commonly, CAVs\nare computed by leveraging linear classifiers optimizing the separability of\nlatent representations of samples with and without a given concept. However, in\nthis paper we show that such a separability-oriented computation leads to\nsolutions, which may diverge from the actual goal of precisely modeling the\nconcept direction. This discrepancy can be attributed to the significant\ninfluence of distractor directions, i.e., signals unrelated to the concept,\nwhich are picked up by filters (i.e., weights) of linear models to optimize\nclass-separability. To address this, we introduce pattern-based CAVs, solely\nfocussing on concept signals, thereby providing more accurate concept\ndirections. We evaluate various CAV methods in terms of their alignment with\nthe true concept direction and their impact on CAV applications, including\nconcept sensitivity testing and model correction for shortcut behavior caused\nby data artifacts. We demonstrate the benefits of pattern-based CAVs using the\nPediatric Bone Age, ISIC2019, and FunnyBirds datasets with VGG, ResNet, and\nEfficientNet model architectures.\n","authors":["Frederik Pahde","Maximilian Dreyer","Leander Weber","Moritz Weckbecker","Christopher J. Anders","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2202.03482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02972v1","updated":"2024-02-05T12:50:30Z","published":"2024-02-05T12:50:30Z","title":"Retrieval-Augmented Score Distillation for Text-to-3D Generation","summary":" Text-to-3D generation has achieved significant success by incorporating\npowerful 2D diffusion models, but insufficient 3D prior knowledge also leads to\nthe inconsistency of 3D geometry. Recently, since large-scale multi-view\ndatasets have been released, fine-tuning the diffusion model on the multi-view\ndatasets becomes a mainstream to solve the 3D inconsistency problem. However,\nit has confronted with fundamental difficulties regarding the limited quality\nand diversity of 3D data, compared with 2D data. To sidestep these trade-offs,\nwe explore a retrieval-augmented approach tailored for score distillation,\ndubbed RetDream. We postulate that both expressiveness of 2D diffusion models\nand geometric consistency of 3D assets can be fully leveraged by employing the\nsemantically relevant assets directly within the optimization process. To this\nend, we introduce novel framework for retrieval-based quality enhancement in\ntext-to-3D generation. 
We leverage the retrieved asset to incorporate its\ngeometric prior in the variational objective and adapt the diffusion model's 2D\nprior toward view consistency, achieving drastic improvements in both geometry\nand fidelity of generated scenes. We conduct extensive experiments to\ndemonstrate that RetDream exhibits superior quality with increased geometric\nconsistency. Project page is available at https://ku-cvlab.github.io/RetDream/.\n","authors":["Junyoung Seo","Susung Hong","Wooseok Jang","Inès Hyeonsu Kim","Minseop Kwak","Doyup Lee","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2402.02972v1.pdf","comment":"Project Page: https://ku-cvlab.github.io/RetDream/"},{"id":"http://arxiv.org/abs/2402.02968v1","updated":"2024-02-05T12:47:09Z","published":"2024-02-05T12:47:09Z","title":"Delving into Multi-modal Multi-task Foundation Models for Road Scene\n Understanding: From Learning Paradigm Perspectives","summary":" Foundation models have indeed made a profound impact on various fields,\nemerging as pivotal components that significantly shape the capabilities of\nintelligent systems. In the context of intelligent vehicles, leveraging the\npower of foundation models has proven to be transformative, offering notable\nadvancements in visual understanding. Equipped with multi-modal and multi-task\nlearning capabilities, multi-modal multi-task visual understanding foundation\nmodels (MM-VUFMs) effectively process and fuse data from diverse modalities and\nsimultaneously handle various driving-related tasks with powerful adaptability,\ncontributing to a more holistic understanding of the surrounding scene. In this\nsurvey, we present a systematic analysis of MM-VUFMs specifically designed for\nroad scenes. Our objective is not only to provide a comprehensive overview of\ncommon practices, referring to task-specific models, unified multi-modal\nmodels, unified multi-task models, and foundation model prompting techniques,\nbut also to highlight their advanced capabilities in diverse learning\nparadigms. These paradigms include open-world understanding, efficient transfer\nfor road scenes, continual learning, interactive and generative capability.\nMoreover, we provide insights into key challenges and future trends, such as\nclosed-loop driving systems, interpretability, embodied driving agents, and\nworld models. To facilitate researchers in staying abreast of the latest\ndevelopments in MM-VUFMs for road scenes, we have established a continuously\nupdated repository at https://github.com/rolsheng/MM-VUFM4DS\n","authors":["Sheng Luo","Wei Chen","Wanxin Tian","Rui Liu","Luanxuan Hou","Xiubao Zhang","Haifeng Shen","Ruiqi Wu","Shuyi Geng","Yi Zhou","Ling Shao","Yi Yang","Bojun Gao","Qun Li","Guobin Wu"],"pdf_url":"https://arxiv.org/pdf/2402.02968v1.pdf","comment":"24 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2402.02963v1","updated":"2024-02-05T12:41:30Z","published":"2024-02-05T12:41:30Z","title":"One-class anomaly detection through color-to-thermal AI for building\n envelope inspection","summary":" We present a label-free method for detecting anomalies during thermographic\ninspection of building envelopes. It is based on the AI-driven prediction of\nthermal distributions from color images. Effectively the method performs as a\none-class classifier of the thermal image regions with high mismatch between\nthe predicted and actual thermal distributions. The algorithm can learn to\nidentify certain features as normal or anomalous by selecting the target sample\nused for training. 
We demonstrated this principle by training the algorithm\nwith data collected at different outdoor temperatures, which led to the\ndetection of thermal bridges. The method can be implemented to assist human\nprofessionals during routine building inspections or combined with mobile\nplatforms for automating the examination of large areas.\n","authors":["Polina Kurtser","Kailun Feng","Thomas Olofsson","Aitor De Andres"],"pdf_url":"https://arxiv.org/pdf/2402.02963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17326v4","updated":"2024-02-05T12:40:24Z","published":"2023-05-27T02:04:25Z","title":"Matrix Information Theory for Self-Supervised Learning","summary":" Contrastive learning often relies on comparing positive anchor samples with\nmultiple negative samples to perform Self-Supervised Learning (SSL). However,\nnon-contrastive approaches like BYOL, SimSiam, and Barlow Twins achieve SSL\nwithout explicit negative samples. In this paper, we introduce a unified matrix\ninformation-theoretic framework that explains many contrastive and\nnon-contrastive learning methods. We then propose a novel method, Matrix-SSL,\nbased on matrix information theory. Experimental results reveal that Matrix-SSL\nsignificantly outperforms state-of-the-art methods on the ImageNet dataset\nunder linear evaluation settings and on MS-COCO for transfer learning tasks.\nSpecifically, with 100 epochs of pre-training, our method outperforms\nSimCLR by 4.6%, and when performing transfer learning tasks on MS-COCO, our\nmethod outperforms previous SOTA methods such as MoCo v2 and BYOL by up to 3.3%\nwith only 400 epochs compared to 800 epochs of pre-training. Code available at\nhttps://github.com/yifanzhang-pro/Matrix-SSL.\n","authors":["Yifan Zhang","Zhiquan Tan","Jingqin Yang","Weiran Huang","Yang Yuan"],"pdf_url":"https://arxiv.org/pdf/2305.17326v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02956v1","updated":"2024-02-05T12:34:03Z","published":"2024-02-05T12:34:03Z","title":"AdaTreeFormer: Few Shot Domain Adaptation for Tree Counting from a\n Single High-Resolution Image","summary":" The process of estimating and counting tree density using only a single\naerial or satellite image is a difficult task in the fields of photogrammetry\nand remote sensing. However, it plays a crucial role in the management of\nforests. The huge variety of trees in varied topography severely hinders tree\ncounting models from performing well. The purpose of this paper is to propose a\nframework that is learnt from the source domain with sufficient labeled trees\nand is adapted to the target domain with only a limited number of labeled\ntrees. Our method, termed AdaTreeFormer, contains one shared encoder with a\nhierarchical feature extraction scheme to extract robust features from the\nsource and target domains. It also consists of three subnets: two for\nextracting self-domain attention maps from source and target domains\nrespectively and one for extracting cross-domain attention maps. For the\nlatter, an attention-to-adapt mechanism is introduced to distill relevant\ninformation from different domains while generating tree density maps; a\nhierarchical cross-domain feature alignment scheme is proposed that\nprogressively aligns the features from the source and target domains. We also\nadopt adversarial learning into the framework to further reduce the gap between\nsource and target domains. 
Our AdaTreeFormer is evaluated on six designed\ndomain adaptation tasks using three tree counting datasets, i.e., Jiangsu,\nYosemite, and London, and significantly outperforms state-of-the-art\nmethods.\n","authors":["Hamed Amini Amirkolaee","Miaojing Shi","Lianghua He","Mark Mulligan"],"pdf_url":"https://arxiv.org/pdf/2402.02956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11831v3","updated":"2024-02-05T12:25:43Z","published":"2023-03-21T13:19:51Z","title":"CLADE: Cycle Loss Augmented Degradation Enhancement for Unpaired\n Super-Resolution of Anisotropic Medical Images","summary":" Three-dimensional (3D) imaging is popular in medical applications; however,\nanisotropic 3D volumes with thick, low-spatial-resolution slices are often\nacquired to reduce scan times. Deep learning (DL) offers a solution to recover\nhigh-resolution features through super-resolution reconstruction (SRR).\nUnfortunately, paired training data is unavailable in many 3D medical\napplications and therefore we propose a novel unpaired approach: CLADE (Cycle\nLoss Augmented Degradation Enhancement). CLADE uses a modified CycleGAN\narchitecture with a cycle-consistent gradient mapping loss, to learn SRR of the\nlow-resolution dimension, from disjoint patches of the high-resolution plane\nwithin the anisotropic 3D volume data itself. We show the feasibility of CLADE\nin abdominal MRI and abdominal CT and demonstrate significant improvements in\nCLADE image quality over low-resolution volumes and state-of-the-art\nself-supervised SRR: SMORE (Synthetic Multi-Orientation Resolution\nEnhancement). Quantitative PIQUE (qualitative perception-based image quality\nevaluator) scores and quantitative edge sharpness (ES - calculated as the\nmaximum gradient of pixel intensities over a border of interest) showed\nsuperior performance for CLADE in both MRI and CT. Qualitatively, CLADE had the\nbest overall image quality and highest perceptual ES over the low-resolution\nvolumes and SMORE. This paper demonstrates the potential of using CLADE for\nsuper-resolution reconstruction of anisotropic 3D medical imaging data without\nthe need for paired 3D training data.\n","authors":["Michele Pascale","Vivek Muthurangu","Javier Montalt Tordera","Heather E Fitzke","Gauraang Bhatnagar","Stuart Taylor","Jennifer Steeden"],"pdf_url":"https://arxiv.org/pdf/2303.11831v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02946v1","updated":"2024-02-05T12:19:16Z","published":"2024-02-05T12:19:16Z","title":"HoughToRadon Transform: New Neural Network Layer for Features\n Improvement in Projection Space","summary":" In this paper, we introduce the HoughToRadon Transform layer, a novel layer\ndesigned to improve the speed of neural networks incorporated with Hough\nTransform to solve semantic image segmentation problems. By placing it after a\nHough Transform layer, \"inner\" convolutions receive modified feature maps with\nnew beneficial properties, such as a smaller area of processed images and\nparameter space linearity by angle and shift. These properties were not\npresent in the Hough Transform alone. Furthermore, the HoughToRadon Transform layer\nallows us to adjust the size of intermediate feature maps using two new\nparameters, thus allowing us to balance the speed and quality of the resulting\nneural network. 
Our experiments on the open MIDV-500 dataset show that this new\napproach leads to time savings in document segmentation tasks and achieves\nstate-of-the-art 97.7% accuracy, outperforming HoughEncoder with larger\ncomputational complexity.\n","authors":["Alexandra Zhabitskaya","Alexander Sheshkus","Vladimir L. Arlazarov"],"pdf_url":"https://arxiv.org/pdf/2402.02946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02941v1","updated":"2024-02-05T12:08:17Z","published":"2024-02-05T12:08:17Z","title":"Exploring the Synergies of Hybrid CNNs and ViTs Architectures for\n Computer Vision: A survey","summary":" The hybrid of Convolutional Neural Network (CNN) and Vision Transformers\n(ViT) architectures has emerged as a groundbreaking approach, pushing the\nboundaries of computer vision (CV). This comprehensive review provides a\nthorough examination of the literature on state-of-the-art hybrid CNN-ViT\narchitectures, exploring the synergies between these two approaches. The main\ncontent of this survey includes: (1) a background on the vanilla CNN and ViT,\n(2) systematic review of various taxonomic hybrid designs to explore the\nsynergy achieved through merging CNNs and ViTs models, (3) comparative analysis\nand application task-specific synergy between different hybrid architectures,\n(4) challenges and future directions for hybrid models, (5) lastly, the survey\nconcludes with a summary of key findings and recommendations. Through this\nexploration of hybrid CV architectures, the survey aims to serve as a guiding\nresource, fostering a deeper understanding of the intricate dynamics between\nCNNs and ViTs and their collective impact on shaping the future of CV\narchitectures.\n","authors":["Haruna Yunusa","Shiyin Qin","Abdulrahman Hamman Adama Chukkol","Abdulganiyu Abdu Yusuf","Isah Bello","Adamu Lawan"],"pdf_url":"https://arxiv.org/pdf/2402.02941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02936v1","updated":"2024-02-05T11:58:08Z","published":"2024-02-05T11:58:08Z","title":"Panoramic Image Inpainting With Gated Convolution And Contextual\n Reconstruction Loss","summary":" Deep learning-based methods have demonstrated encouraging results in tackling\nthe task of panoramic image inpainting. However, it is challenging for existing\nmethods to distinguish valid pixels from invalid pixels and find suitable\nreferences for corrupted areas, thus leading to artifacts in the inpainted\nresults. In response to these challenges, we propose a panoramic image\ninpainting framework that consists of a Face Generator, a Cube Generator, a\nside branch, and two discriminators. We use the Cubemap Projection (CMP) format\nas network input. The generator employs gated convolutions to distinguish valid\npixels from invalid ones, while a side branch is designed utilizing contextual\nreconstruction (CR) loss to guide the generators to find the most suitable\nreference patch for inpainting the missing region. The proposed method is\ncompared with state-of-the-art (SOTA) methods on SUN360 Street View dataset in\nterms of PSNR and SSIM. 
Experimental results and an ablation study demonstrate\nthat the proposed method outperforms SOTA both quantitatively and\nqualitatively.\n","authors":["Li Yu","Yanjun Gao","Farhad Pakdaman","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2402.02936v1.pdf","comment":"Copyright 2024 IEEE - to appear in IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.02928v1","updated":"2024-02-05T11:47:45Z","published":"2024-02-05T11:47:45Z","title":"Instance Segmentation XXL-CT Challenge of a Historic Airplane","summary":" Instance segmentation of compound objects in XXL-CT imagery poses a unique\nchallenge in non-destructive testing. This complexity arises from the lack of\nknown reference segmentation labels, limited applicable segmentation tools, as\nwell as partially degraded image quality. To assess recent advancements in the\nfield of machine learning-based image segmentation, the \"Instance Segmentation\nXXL-CT Challenge of a Historic Airplane\" was conducted. The challenge aimed to\nexplore automatic or interactive instance segmentation methods for an efficient\ndelineation of the different aircraft components, such as screws, rivets, metal\nsheets or pressure tubes. We report the organization and outcome of this\nchallenge and describe the capabilities and limitations of the submitted\nsegmentation methods.\n","authors":["Roland Gruber","Johann Christopher Engster","Markus Michen","Nele Blum","Maik Stille","Stefan Gerth","Thomas Wittenberg"],"pdf_url":"https://arxiv.org/pdf/2402.02928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17515v3","updated":"2024-02-05T11:45:39Z","published":"2023-11-29T10:38:42Z","title":"Fusion of Single and Integral Multispectral Aerial Images","summary":" An adequate fusion of the most significant salient information from multiple\ninput channels is essential for many aerial imaging tasks. While multispectral\nrecordings reveal features in various spectral ranges, synthetic aperture\nsensing makes occluded features visible. We present a first hybrid (model-\nand learning-based) architecture for fusing the most significant features from\nconventional aerial images with the ones from integral aerial images that are\nthe result of synthetic aperture sensing for removing occlusion. It combines\nthe environment's spatial references with features of unoccluded targets that\nwould normally be hidden by dense vegetation. Our method outperforms\nstate-of-the-art two-channel and multi-channel fusion approaches visually and\nquantitatively in common metrics, such as mutual information, visual\ninformation fidelity, and peak signal-to-noise ratio. The proposed model does\nnot require manually tuned parameters, can be extended to an arbitrary number\nand combinations of spectral channels, and is reconfigurable for addressing\ndifferent use cases. We demonstrate examples for search-and-rescue, wildfire\ndetection, and wildlife observation.\n","authors":["Mohamed Youssef","Oliver Bimber"],"pdf_url":"https://arxiv.org/pdf/2311.17515v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02922v1","updated":"2024-02-05T11:42:19Z","published":"2024-02-05T11:42:19Z","title":"Pixel-Wise Color Constancy via Smoothness Techniques in Multi-Illuminant\n Scenes","summary":" Most scenes are illuminated by several light sources, where the traditional\nassumption of uniform illumination is invalid. This issue is ignored in most\ncolor constancy methods, primarily due to the complex spatial impact of\nmultiple light sources on the image. 
Moreover, most existing multi-illuminant\nmethods fail to preserve the smooth change of illumination, which stems from\nspatial dependencies in natural images. Motivated by this, we propose a novel\nmulti-illuminant color constancy method, by learning pixel-wise illumination\nmaps caused by multiple light sources. The proposed method enforces smoothness\nwithin neighboring pixels, by regularizing the training with the total\nvariation loss. Moreover, a bilateral filter is further employed to enhance\nthe natural appearance of the estimated images, while preserving the edges.\nAdditionally, we propose a label-smoothing technique that enables the model to\ngeneralize well despite the uncertainties in ground truth. Quantitative and\nqualitative experiments demonstrate that the proposed method outperforms the\nstate-of-the-art.\n","authors":["Umut Cem Entok","Firas Laakom","Farhad Pakdaman","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2402.02922v1.pdf","comment":"Copyright 2024 IEEE - Submitted to IEEE ICIP 2024"},{"id":"http://arxiv.org/abs/2212.08639v2","updated":"2024-02-05T11:35:43Z","published":"2022-12-16T18:31:23Z","title":"An annotated instance segmentation XXL-CT data-set from a historic\n airplane","summary":" The Me 163 was a Second World War fighter airplane and a result of the German\nair force's secret developments. One of these airplanes is currently owned and\ndisplayed in the historic aircraft exhibition of the Deutsches Museum in\nMunich, Germany. To gain insights with respect to its history, design and state\nof preservation, a complete CT scan was obtained using an industrial\nXXL computed tomography scanner. Using the CT data from the Me 163, all its\ndetails can be visually examined at various levels, ranging from the complete\nhull down to single sprockets and rivets. However, while a trained human\nobserver can identify and interpret the volumetric data with all its parts and\nconnections, a virtual dissection of the airplane and all its different parts\nwould be quite desirable. This means that an instance\nsegmentation of all components and objects of interest into disjoint entities\nfrom the CT data is necessary. As currently no adequate computer-assisted\ntools for automated or semi-automated segmentation of such XXL airplane data\nare available, an interactive data annotation and object\nlabelling process has been established as a first step. So far, seven 512 x 512 x 512 voxel\nsub-volumes from the Me 163 airplane have been annotated and labelled, whose\nresults can potentially be used for various new applications in the field of\ndigital heritage, non-destructive testing, or machine-learning. 
This work\ndescribes the data acquisition process of the airplane using an industrial\nXXL-CT scanner, outlines the interactive segmentation and labelling scheme to\nannotate sub-volumes of the airplane's CT data, and describes and discusses various\nchallenges with respect to interpreting and handling the annotated and labelled\ndata.\n","authors":["Roland Gruber","Nils Reims","Andreas Hempfer","Stefan Gerth","Michael Böhnel","Theobald Fuchs","Michael Salamon","Thomas Wittenberg"],"pdf_url":"https://arxiv.org/pdf/2212.08639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14831v2","updated":"2024-02-05T11:28:56Z","published":"2024-01-26T12:59:26Z","title":"The Machine Vision Iceberg Explained: Advancing Dynamic Testing by\n Considering Holistic Environmental Circumstances","summary":" Are we heading for an iceberg with the current testing of machine vision?\nThis work delves into the landscape of Machine Vision (MV) testing, which is\nheavily required in Highly Automated Driving (HAD) systems. Utilizing the\nmetaphorical notion of navigating towards an iceberg, we discuss the potential\nshortcomings concealed within current testing strategies. We emphasize the\nurgent need for a deeper understanding of how to deal with the opaque functions\nof MV in development processes, as overlooked considerations can cost lives.\nOur main contribution is the hierarchical level model, which we call\nGranularity Grades. The model encourages a refined exploration of the\nmulti-scaled depths of understanding about the circumstances of environments in\nwhich MV is intended to operate. This model aims to provide a holistic overview\nof all entities that may impact MV functions, ranging from relations of\nindividual entities like object attributes to entire environmental scenes. The\napplication of our model delivers a structured exploration of entities in a\nspecific domain, their relationships, and the assignment of results of an MV-under-test,\nto construct an entity-relationship graph. By clustering patterns of\nrelations in the graph, general MV deficits can be identified. In summary, our work\ncontributes to a more nuanced and systematized identification of deficits of an\nMV test object in relation to holistic circumstances in HAD operating\ndomains.\n","authors":["Hubert Padusinski","Thilo Braun","Christian Steinhauser","Lennart Ries","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2401.14831v2.pdf","comment":"Submitted at IEEE IV 2024"},{"id":"http://arxiv.org/abs/2402.02906v1","updated":"2024-02-05T11:22:14Z","published":"2024-02-05T11:22:14Z","title":"ViewFusion: Learning Composable Diffusion Models for Novel View\n Synthesis","summary":" Deep learning is providing a wealth of new approaches to the old problem of\nnovel view synthesis, from Neural Radiance Field (NeRF) based approaches to\nend-to-end style architectures. Each approach offers specific strengths but\nalso comes with specific limitations in their applicability. This work\nintroduces ViewFusion, a state-of-the-art end-to-end generative approach to\nnovel view synthesis with unparalleled flexibility. ViewFusion consists of\nsimultaneously applying a diffusion denoising step to any number of input views\nof a scene, then combining the noise gradients obtained for each view with an\n(inferred) pixel-weighting mask, ensuring that for each region of the target\nscene only the most informative input views are taken into account. 
Our\napproach resolves several limitations of previous approaches by (1) being\ntrainable and generalizing across multiple scenes and object classes, (2)\nadaptively taking in a variable number of pose-free views at both train and\ntest time, (3) generating plausible views even in severely undetermined\nconditions (thanks to its generative nature) -- all while generating views of\nquality on par or even better than state-of-the-art methods. Limitations\ninclude not generating a 3D embedding of the scene, resulting in a relatively\nslow inference speed, and our method only being tested on the relatively small\ndataset NMR. Code is available.\n","authors":["Bernard Spiegl","Andrea Perin","Stéphane Deny","Alexander Ilin"],"pdf_url":"https://arxiv.org/pdf/2402.02906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06279v3","updated":"2024-02-05T11:09:00Z","published":"2023-02-13T11:34:17Z","title":"Sneaky Spikes: Uncovering Stealthy Backdoor Attacks in Spiking Neural\n Networks with Neuromorphic Data","summary":" Deep neural networks (DNNs) have demonstrated remarkable performance across\nvarious tasks, including image and speech recognition. However, maximizing the\neffectiveness of DNNs requires meticulous optimization of numerous\nhyperparameters and network parameters through training. Moreover,\nhigh-performance DNNs entail many parameters, which consume significant energy\nduring training. In order to overcome these challenges, researchers have turned\nto spiking neural networks (SNNs), which offer enhanced energy efficiency and\nbiologically plausible data processing capabilities, rendering them highly\nsuitable for sensory data tasks, particularly in neuromorphic data. Despite\ntheir advantages, SNNs, like DNNs, are susceptible to various threats,\nincluding adversarial examples and backdoor attacks. Yet, the field of SNNs\nstill needs to be explored in terms of understanding and countering these\nattacks.\n This paper delves into backdoor attacks in SNNs using neuromorphic datasets\nand diverse triggers. Specifically, we explore backdoor triggers within\nneuromorphic data that can manipulate their position and color, providing a\nbroader scope of possibilities than conventional triggers in domains like\nimages. We present various attack strategies, achieving an attack success rate\nof up to 100% while maintaining a negligible impact on clean accuracy.\nFurthermore, we assess these attacks' stealthiness, revealing that our most\npotent attacks possess significant stealth capabilities. Lastly, we adapt\nseveral state-of-the-art defenses from the image domain, evaluating their\nefficacy on neuromorphic data and uncovering instances where they fall short,\nleading to compromised performance.\n","authors":["Gorka Abad","Oguzhan Ersoy","Stjepan Picek","Aitor Urbieta"],"pdf_url":"https://arxiv.org/pdf/2302.06279v3.pdf","comment":"To appear in Network and Distributed System Security (NDSS) Symposium\n 2024"},{"id":"http://arxiv.org/abs/2402.02892v1","updated":"2024-02-05T11:00:14Z","published":"2024-02-05T11:00:14Z","title":"Motion-Aware Video Frame Interpolation","summary":" Video frame interpolation methodologies endeavor to create novel frames\nbetwixt extant ones, with the intent of augmenting the video's frame frequency.\nHowever, current methods are prone to image blurring and spurious artifacts in\nchallenging scenarios involving occlusions and discontinuous motion. 
Moreover,\nthey typically rely on optical flow estimation, which adds complexity to\nmodeling and computational costs. To address these issues, we introduce a\nMotion-Aware Video Frame Interpolation (MA-VFI) network, which directly\nestimates intermediate optical flow from consecutive frames by introducing a\nnovel hierarchical pyramid module. It not only extracts global semantic\nrelationships and spatial details from input frames with different receptive\nfields, enabling the model to capture intricate motion patterns, but also\neffectively reduces the required computational cost and complexity.\nSubsequently, a cross-scale motion structure is presented to estimate and\nrefine intermediate flow maps by the extracted features. This approach\nfacilitates the interplay between input frame features and flow maps during the\nframe interpolation process and markedly improves the precision of the\nintermediate flow estimates. Finally, a loss centered on the intermediate flow\nis designed to guide its prediction, further refining the precision of the\nestimated intermediate flow maps.\nExperiments illustrate that MA-VFI surpasses several representative VFI methods\nacross various datasets, and can enhance efficiency while maintaining\naccuracy.\n","authors":["Pengfei Han","Fuhua Zhang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2402.02892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02889v1","updated":"2024-02-05T10:57:48Z","published":"2024-02-05T10:57:48Z","title":"Exploring Federated Self-Supervised Learning for General Purpose Audio\n Understanding","summary":" The integration of Federated Learning (FL) and Self-supervised Learning (SSL)\noffers a unique and synergetic combination to exploit the audio data for\ngeneral-purpose audio understanding, without compromising user data privacy.\nHowever, few efforts have been made to investigate SSL models in the FL\nregime for general-purpose audio understanding, especially when the training\ndata is generated by large-scale heterogeneous audio sources. In this paper, we\nevaluate the performance of feature-matching and predictive audio-SSL\ntechniques when integrated into large-scale FL settings simulated with\nnon-independently identically distributed (non-iid) data. We propose a novel\nFederated SSL (F-SSL) framework, dubbed FASSL, that enables learning\nintermediate feature representations from large-scale decentralized\nheterogeneous clients holding unlabelled audio data. Our study has found that\naudio F-SSL approaches perform on par with the centralized audio-SSL approaches\non the audio-retrieval task. Extensive experiments demonstrate the\neffectiveness and significance of FASSL as it assists in obtaining the optimal\nglobal model for state-of-the-art FL aggregation methods.\n","authors":["Yasar Abbas Ur Rehman","Kin Wai Lau","Yuyang Xie","Lan Ma","Jiajun Shen"],"pdf_url":"https://arxiv.org/pdf/2402.02889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02887v1","updated":"2024-02-05T10:55:47Z","published":"2024-02-05T10:55:47Z","title":"Time-, Memory- and Parameter-Efficient Visual Adaptation","summary":" As foundation models become more popular, there is a growing need to\nefficiently finetune them for downstream tasks. Although numerous adaptation\nmethods have been proposed, they are designed to be efficient only in terms of\nhow many parameters are trained. 
However, they typically still require backpropagating gradients throughout\nthe model, meaning that their training-time and memory costs are not reduced as\nsignificantly. We propose an\nadaptation method which does not backpropagate gradients through the backbone.\nWe achieve this by designing a lightweight network in parallel that operates on\nfeatures from the frozen, pretrained backbone. As a result, our method is\nefficient not only in terms of parameters, but also in training-time and memory\nusage. Our approach achieves state-of-the-art accuracy-parameter trade-offs on\nthe popular VTAB benchmark, and we further show how we outperform prior works\nwith respect to training time and memory usage too. We further demonstrate the\ntraining efficiency and scalability of our method by adapting a vision\ntransformer backbone of 4 billion parameters for the computationally demanding\ntask of video classification, without any intricate model parallelism. Here,\nwith the same GPU and less training time, we outperform a prior adapter-based\nmethod that could only scale to a 1-billion-parameter backbone, as well as\nfully finetuning a smaller backbone.\n","authors":["Otniel-Bogdan Mercea","Alexey Gritsenko","Cordelia Schmid","Anurag Arnab"],"pdf_url":"https://arxiv.org/pdf/2402.02887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02886v1","updated":"2024-02-05T10:54:17Z","published":"2024-02-05T10:54:17Z","title":"Time-Distributed Backdoor Attacks on Federated Spiking Learning","summary":" This paper investigates the vulnerability of spiking neural networks (SNNs)\nand federated learning (FL) to backdoor attacks using neuromorphic data.\nDespite the efficiency of SNNs and the privacy advantages of FL, particularly\nin low-powered devices, we demonstrate that these systems are susceptible to\nsuch attacks. We first assess the viability of using FL with SNNs using\nneuromorphic data, showing its potential usage. Then, we evaluate the\ntransferability of known FL attack methods to SNNs, finding that these lead to\nsuboptimal attack performance. Therefore, we explore backdoor attacks involving\nsingle and multiple attackers to improve the attack performance. Our primary\ncontribution is developing a novel attack strategy tailored to SNNs and FL,\nwhich distributes the backdoor trigger temporally and across malicious devices,\nenhancing the attack's effectiveness and stealthiness. In the best case, we\nachieve a 100% attack success rate, 0.13 MSE, and 98.9 SSIM. Moreover, we adapt\nand evaluate an existing defense against backdoor attacks, revealing its\ninadequacy in protecting SNNs. This study underscores the need for robust\nsecurity measures in deploying SNNs and FL, particularly in the context of\nbackdoor attacks.\n","authors":["Gorka Abad","Stjepan Picek","Aitor Urbieta"],"pdf_url":"https://arxiv.org/pdf/2402.02886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02851v1","updated":"2024-02-05T10:06:24Z","published":"2024-02-05T10:06:24Z","title":"Enhancing Compositional Generalization via Compositional Feature\n Alignment","summary":" Real-world applications of machine learning models often confront data\ndistribution shifts, wherein discrepancies exist between the training and test\ndata distributions. In the common multi-domain multi-class setup, as the number\nof classes and domains scales up, it becomes infeasible to gather training data\nfor every domain-class combination. 
This challenge naturally leads to the quest\nfor models with Compositional Generalization (CG) ability, where models can\ngeneralize to unseen domain-class combinations. To delve into the CG challenge,\nwe develop CG-Bench, a suite of CG benchmarks derived from existing real-world\nimage datasets, and observe that the prevalent pretraining-finetuning paradigm\non foundational models, such as CLIP and DINOv2, struggles with the challenge.\nTo address this challenge, we propose Compositional Feature Alignment (CFA), a\nsimple two-stage finetuning technique that i) learns two orthogonal linear\nheads on a pretrained encoder with respect to class and domain labels, and ii)\nfine-tunes the encoder with the newly learned heads frozen. We theoretically and\nempirically justify that CFA encourages compositional feature learning of\npretrained models. We further conduct extensive experiments on CG-Bench for\nCLIP and DINOv2, two powerful pretrained vision foundation models. Experimental\nresults show that CFA outperforms common finetuning techniques in compositional\ngeneralization, corroborating CFA's efficacy in compositional feature learning.\n","authors":["Haoxiang Wang","Haozhe Si","Huajie Shao","Han Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02851v1.pdf","comment":"Code is released at\n https://github.com/Haoxiang-Wang/Compositional-Feature-Alignment"},{"id":"http://arxiv.org/abs/2402.02836v1","updated":"2024-02-05T09:45:38Z","published":"2024-02-05T09:45:38Z","title":"Perceptual Learned Image Compression via End-to-End JND-Based\n Optimization","summary":" Emerging Learned image Compression (LC) achieves significant improvements in\ncoding efficiency by end-to-end training of neural networks for compression. An\nimportant benefit of this approach over traditional codecs is that any\noptimization criteria can be directly applied to the encoder-decoder networks\nduring training. Perceptual optimization of LC to comply with the Human Visual\nSystem (HVS) is among such criteria, which has not been fully explored yet.\nThis paper addresses this gap by proposing a novel framework to integrate Just\nNoticeable Distortion (JND) principles into LC. Leveraging existing JND\ndatasets, three perceptual optimization methods are proposed to integrate JND\ninto the LC training process: (1) Pixel-Wise JND Loss (PWL) prioritizes\npixel-by-pixel fidelity in reproducing JND characteristics, (2) Image-Wise JND\nLoss (IWL) emphasizes overall imperceptible degradation levels, and (3)\nFeature-Wise JND Loss (FWL) aligns the reconstructed image features with\nperceptually significant features. Experimental evaluations demonstrate the\neffectiveness of JND integration, highlighting improvements in rate-distortion\nperformance and visual quality, compared to baseline methods. The proposed\nmethods add no extra complexity after training.\n","authors":["Farhad Pakdaman","Sanaz Nami","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2402.02836v1.pdf","comment":"Copyright 2024 IEEE - Submitted to IEEE ICIP 2024"},{"id":"http://arxiv.org/abs/2305.08514v2","updated":"2024-02-05T09:40:57Z","published":"2023-05-15T10:23:14Z","title":"Generative Adversarial Networks for Spatio-Spectral Compression of\n Hyperspectral Images","summary":" The development of deep learning-based models for the compression of\nhyperspectral images (HSIs) has recently attracted great attention in remote\nsensing due to the sharp growth of hyperspectral data archives. 
Most of the\nexisting models achieve either spectral or spatial compression, and do not\njointly consider the spatio-spectral redundancies present in HSIs. To address\nthis problem, in this paper we focus our attention on the High Fidelity\nCompression (HiFiC) model (which is proven to be highly effective for spatial\ncompression problems) and adapt it to perform spatio-spectral compression of\nHSIs. In detail, we introduce two new models: i) HiFiC using Squeeze and\nExcitation (SE) blocks (denoted as HiFiC$_{SE}$); and ii) HiFiC with 3D\nconvolutions (denoted as HiFiC$_{3D}$) in the framework of compression of HSIs.\nWe analyze the effectiveness of HiFiC$_{SE}$ and HiFiC$_{3D}$ in compressing\nthe spatio-spectral redundancies with channel attention and inter-dependency\nanalysis. Experimental results show the efficacy of the proposed models in\nperforming spatio-spectral compression, while reconstructing images at reduced\nbitrates with higher reconstruction quality. The code of the proposed models is\npublicly available at https://git.tu-berlin.de/rsim/HSI-SSC .\n","authors":["Akshara Preethy Byju","Martin Hermann Paul Fuchs","Alisa Walda","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2305.08514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01313v2","updated":"2024-02-05T09:27:35Z","published":"2024-02-02T11:07:27Z","title":"AutoGCN -- Towards Generic Human Activity Recognition with Neural\n Architecture Search","summary":" This paper introduces AutoGCN, a generic Neural Architecture Search (NAS)\nalgorithm for Human Activity Recognition (HAR) using Graph Convolution Networks\n(GCNs). HAR has gained attention due to advances in deep learning, increased\ndata availability, and enhanced computational capabilities. At the same time,\nGCNs have shown promising results in modeling relationships between body key\npoints in a skeletal graph. While domain experts often craft dataset-specific\nGCN-based methods, their applicability beyond this specific context is severely\nlimited. AutoGCN seeks to address this limitation by simultaneously searching\nfor the ideal hyperparameters and architecture combination within a versatile\nsearch space using a reinforcement controller while balancing optimal\nexploration and exploitation behavior with a knowledge reservoir during the\nsearch process. We conduct extensive experiments on two large-scale datasets\nfocused on skeleton-based action recognition to assess the proposed algorithm's\nperformance. Our experimental results underscore the effectiveness of AutoGCN\nin constructing optimal GCN architectures for HAR, outperforming conventional\nNAS and GCN methods, as well as random search. These findings highlight the\nsignificance of a diverse search space and an expressive input representation\nto enhance the network performance and generalizability.\n","authors":["Felix Tempel","Inga Strümke","Espen Alexander F. Ihlen"],"pdf_url":"https://arxiv.org/pdf/2402.01313v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.04310v3","updated":"2024-02-05T09:24:41Z","published":"2021-04-09T11:29:44Z","title":"Context-self contrastive pretraining for crop type semantic segmentation","summary":" In this paper, we propose a fully supervised pre-training scheme based on\ncontrastive learning particularly tailored to dense classification tasks. 
The\nproposed Context-Self Contrastive Loss (CSCL) learns an embedding space that\nmakes semantic boundaries pop up by using a similarity metric between every\nlocation in a training sample and its local context. For crop type semantic\nsegmentation from Satellite Image Time Series (SITS), we find performance at\nparcel boundaries to be a critical bottleneck and explain how CSCL tackles the\nunderlying cause of that problem, improving the state-of-the-art performance in\nthis task. Additionally, using images from the Sentinel-2 (S2) satellite\nmissions, we compile the largest, to our knowledge, SITS dataset densely\nannotated by crop type and parcel identities, which we make publicly available\ntogether with the data generation pipeline. Using that data, we find CSCL, even\nwith minimal pre-training, to improve all respective baselines and present a\nprocess for semantic segmentation at super-resolution for obtaining crop\nclasses at a more granular level. The code and instructions to download the\ndata can be found at https://github.com/michaeltrs/DeepSatModels.\n","authors":["Michail Tarasiou","Riza Alp Guler","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2104.04310v3.pdf","comment":"15 pages, 17 figures"},{"id":"http://arxiv.org/abs/2310.01812v3","updated":"2024-02-05T09:21:28Z","published":"2023-10-03T05:55:11Z","title":"PPT: Token Pruning and Pooling for Efficient Vision Transformers","summary":" Vision Transformers (ViTs) have emerged as powerful models in the field of\ncomputer vision, delivering superior performance across various vision tasks.\nHowever, the high computational complexity poses a significant barrier to their\npractical applications in real-world scenarios. Motivated by the fact that not\nall tokens contribute equally to the final predictions and fewer tokens bring\nless computational cost, reducing redundant tokens has become a prevailing\nparadigm for accelerating vision transformers. However, we argue that it is not\noptimal to either only reduce inattentive redundancy by token pruning, or only\nreduce duplicative redundancy by token merging. To this end, in this paper we\npropose a novel acceleration framework, namely token Pruning & Pooling\nTransformers (PPT), to adaptively tackle these two types of redundancy in\ndifferent layers. By heuristically integrating both token pruning and token\npooling techniques in ViTs without additional trainable parameters, PPT\neffectively reduces the model complexity while maintaining its predictive\naccuracy. For example, PPT reduces over 37% FLOPs and improves the throughput\nby over 45% for DeiT-S without any accuracy drop on the ImageNet dataset. The\ncode is available at https://github.com/xjwu1024/PPT and\nhttps://github.com/mindspore-lab/models/\n","authors":["Xinjian Wu","Fanhu Zeng","Xiudong Wang","Xinghao Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01812v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02826v1","updated":"2024-02-05T09:18:49Z","published":"2024-02-05T09:18:49Z","title":"SynthVision -- Harnessing Minimal Input for Maximal Output in Computer\n Vision Models using Synthetic Image data","summary":" Rapid development of disease detection computer vision models is vital in\nresponse to urgent medical crises like epidemics or events of bioterrorism.\nHowever, traditional data gathering methods are too slow for these scenarios,\nnecessitating innovative approaches to generate reliable models quickly from\nminimal data. 
We demonstrate our new approach by building a comprehensive\ncomputer vision model for detecting Human Papilloma Virus genital warts using\nonly synthetic data. In our study, we employed a two-phase experimental design\nusing diffusion models. In the first phase, diffusion models were utilized to\ngenerate a large number of diverse synthetic images from 10 HPV guide images,\nexplicitly focusing on accurately depicting genital warts. The second phase\ninvolved training and testing the vision model using this synthetic dataset.\nThis method aimed to assess the effectiveness of diffusion models in rapidly\ngenerating high-quality training data and the subsequent impact on the vision\nmodel's performance in medical image recognition. The study findings revealed\nsignificant insights into the performance of the vision model trained on\nsynthetic images generated through diffusion models. The vision model showed\nexceptional performance in accurately identifying cases of genital warts. It\nachieved an accuracy rate of 96%, underscoring its effectiveness in medical\nimage classification. For HPV cases, the model demonstrated a high precision of\n99% and a recall of 94%. In normal cases, the precision was 95%, with an\nimpressive recall of 99%. These metrics indicate the model's capability to\ncorrectly identify true positive cases and minimize false positives. The model\nachieved an F1 Score of 96% for HPV cases and 97% for normal cases. The high F1\nScore across both categories highlights the balanced nature of the model's\nprecision and recall, ensuring reliability and robustness in its predictions.\n","authors":["Yudara Kularathne","Prathapa Janitha","Sithira Ambepitiya","Thanveer Ahamed","Dinuka Wijesundara","Prarththanan Sothyrajah"],"pdf_url":"https://arxiv.org/pdf/2402.02826v1.pdf","comment":"12 pages 5 figures 1 table"},{"id":"http://arxiv.org/abs/2307.05591v2","updated":"2024-02-05T09:10:15Z","published":"2023-07-10T17:59:21Z","title":"Linear Alignment of Vision-language Models for Image Captioning","summary":" Recently, vision-language models like CLIP have advanced the state of the art\nin a variety of multi-modal tasks including image captioning and caption\nevaluation. Many approaches adapt CLIP-style models to a downstream task by\ntraining a mapping network between CLIP and a language model. This is costly as\nit usually involves calculating gradients for large models. We propose a more\nefficient training protocol that fits a linear mapping between image and text\nembeddings of CLIP via a closed-form solution. This bypasses the need for\ngradient computation and results in a lightweight captioning method called\nReCap, which can be trained up to 1000 times faster than existing lightweight\nmethods. Moreover, we propose two new learning-based image-captioning metrics\nthat build on CLIP score along with our linear mapping. Furthermore, we combine\nReCap with our new metrics to design an iterative datastore-augmentation loop\n(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k,\nVizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art\nlightweight methods on established metrics while outperforming them on our new\nmetrics, which are better aligned with human ratings on Flickr8k-Expert and\nFlickr8k-Crowdflower. 
Finally, we demonstrate that ReCap transfers well to\nother domains and that our DAL leads to a performance boost.\n","authors":["Fabian Paischer","Markus Hofmarcher","Sepp Hochreiter","Thomas Adler"],"pdf_url":"https://arxiv.org/pdf/2307.05591v2.pdf","comment":"8 pages (+ references and appendix)"},{"id":"http://arxiv.org/abs/2402.02811v1","updated":"2024-02-05T08:41:39Z","published":"2024-02-05T08:41:39Z","title":"Multi-scale fMRI time series analysis for understanding\n neurodegeneration in MCI","summary":" In this study, we present a technique that spans multi-scale views (a global\nscale examining the brain network level, and a local scale examining each\nindividual ROI that constitutes the network), applied to resting-state fMRI\nvolumes. Deep learning-based classification is utilized for understanding\nneurodegeneration. The novelty of the proposed approach lies in utilizing two\nextreme scales of analysis. One branch considers the entire network within a\ngraph-analysis framework. Concurrently, the second branch scrutinizes each ROI\nwithin a network independently, focusing on the evolution of dynamics. For each\nsubject, the graph-based approach employs partial correlation to profile the\nsubject in a single graph where each ROI is a node, providing insights into\ndifferences in levels of participation. In contrast, non-linear analysis\nemploys recurrence plots to profile a subject as a multichannel 2D image,\nrevealing distinctions in underlying dynamics. The proposed approach is\nemployed for classification of a cohort of 50 healthy control (HC) and 50 Mild\nCognitive Impairment (MCI) subjects, sourced from the ADNI dataset. Results\npoint to: (1) reduced activity in ROIs such as the PCC in MCI; (2) greater\nactivity in the occipital region in MCI, which is not seen in HC; and (3) when\nanalysed for dynamics, all ROIs in MCI show greater predictability in their\ntime series.\n","authors":["Ammu R.","Debanjali Bhattacharya","Ameiy Acharya","Ninad Aithal","Neelam Sinha"],"pdf_url":"https://arxiv.org/pdf/2402.02811v1.pdf","comment":"12 pages, 3 figures and 4 tables"},{"id":"http://arxiv.org/abs/2311.18373v2","updated":"2024-02-05T08:34:36Z","published":"2023-11-30T09:14:37Z","title":"A Survey on Deep Learning for Polyp Segmentation: Techniques, Challenges\n and Future Trends","summary":" Early detection and assessment of polyps play a crucial role in the\nprevention and treatment of colorectal cancer (CRC). Polyp segmentation\nprovides an effective solution to assist clinicians in accurately locating and\nsegmenting polyp regions. In the past, people often relied on manually\nextracted lower-level features such as color, texture, and shape, which often\nhad issues capturing global context and lacked robustness to complex scenarios.\nWith the advent of deep learning, more and more outstanding medical image\nsegmentation algorithms based on deep learning networks have emerged, making\nsignificant progress in this field. This paper provides a comprehensive review\nof polyp segmentation algorithms. We first review some traditional algorithms\nbased on manually extracted features and deep segmentation algorithms, then\ndetail benchmark datasets related to the topic. Specifically, we carry out a\ncomprehensive evaluation of recent deep learning models and results based on\npolyp sizes, considering the pain points of research topics and differences in\nnetwork structures. Finally, we discuss the challenges of polyp segmentation\nand future trends in this field. 
The models, benchmark datasets, and source\ncode links we collected are all published at\nhttps://github.com/taozh2017/Awesome-Polyp-Segmentation.\n","authors":["Jiaxin Mei","Tao Zhou","Kaiwen Huang","Yizhe Zhang","Yi Zhou","Ye Wu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2311.18373v2.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.02800v1","updated":"2024-02-05T08:18:47Z","published":"2024-02-05T08:18:47Z","title":"Extreme Two-View Geometry From Object Poses with Diffusion Models","summary":" Humans have an incredible ability to effortlessly perceive the viewpoint\ndifference between two images containing the same object, even when the\nviewpoint change is astonishingly vast with no co-visible regions in the\nimages. This remarkable skill, however, has proven to be a challenge for\nexisting camera pose estimation methods, which often fail when faced with large\nviewpoint differences due to the lack of overlapping local features for\nmatching. In this paper, we aim to effectively harness the power of object\npriors to accurately determine two-view geometry in the face of extreme\nviewpoint changes. In our method, we first mathematically transform the\nrelative camera pose estimation problem into an object pose estimation problem.\nThen, to estimate the object pose, we utilize the object priors learned from\nthe diffusion model Zero123 to synthesize novel-view images of the object. The\nnovel-view images are matched to determine the object pose and thus the\ntwo-view camera pose. In experiments, our method has demonstrated extraordinary\nrobustness and resilience to large viewpoint changes, consistently estimating\ntwo-view poses with exceptional generalization ability across both synthetic\nand real-world datasets. Code will be available at\nhttps://github.com/scy639/Extreme-Two-View-Geometry-From-Object-Poses-with-Diffusion-Models.\n","authors":["Yujing Sun","Caiyi Sun","Yuan Liu","Yuexin Ma","Siu Ming Yiu"],"pdf_url":"https://arxiv.org/pdf/2402.02800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15934v2","updated":"2024-02-05T08:10:36Z","published":"2024-01-29T07:44:09Z","title":"HICH Image/Text (HICH-IT): Comprehensive Text and Image Datasets for\n Hypertensive Intracerebral Hemorrhage Research","summary":" In this paper, we introduce a new dataset in the medical field of\nhypertensive intracerebral hemorrhage (HICH), called HICH-IT, which includes\nboth electronic medical records (EMRs) and head CT images. This dataset is\ndesigned to enhance the accuracy of artificial intelligence in the diagnosis\nand treatment of HICH. This dataset, built upon the foundation of standard text\nand image data, incorporates specific annotations within the EMRs, extracting\nkey content from the text information, and categorizes the annotation content\nof imaging data into four types: brain midline, hematoma, left cerebral\nventricle, and right cerebral ventricle. HICH-IT aims to be a foundational\ndataset for feature\nlearning in image segmentation tasks and named entity recognition. To further\nunderstand the dataset, we have trained deep learning algorithms to observe the\nperformance. The pretrained models have been released at both www.daip.club and\ngithub.com/Deep-AI-Application-DAIP. 
The dataset has been uploaded to\nhttps://github.com/CYBUS123456/HICH-IT-Datasets.\n Index Terms: HICH, deep learning, intraparenchymal hemorrhage, named entity\nrecognition, novel dataset\n","authors":["Jie Li","Yulong Xia","Tongxin Yang","Fenglin Cai","Miao Wei","Zhiwei Zhang","Li Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.15934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02797v1","updated":"2024-02-05T08:10:16Z","published":"2024-02-05T08:10:16Z","title":"Joint Attention-Guided Feature Fusion Network for Saliency Detection of\n Surface Defects","summary":" Surface defect inspection plays an important role in the process of\nindustrial manufacture and production. Though Convolutional Neural Network\n(CNN) based defect inspection methods have made huge leaps, they still confront\nmany challenges, such as defect scale variation, complex backgrounds, and low\ncontrast. To address these issues, we propose a joint\nattention-guided feature fusion network (JAFFNet) for saliency detection of\nsurface defects based on the encoder-decoder network. JAFFNet mainly\nincorporates a joint attention-guided feature fusion (JAFF) module into\ndecoding stages to adaptively fuse low-level and high-level features. The JAFF\nmodule learns to emphasize defect features and suppress background noise during\nfeature fusion, which is beneficial for detecting low-contrast defects. In\naddition, JAFFNet introduces a dense receptive field (DRF) module following the\nencoder to capture features with rich context information, which helps detect\ndefects of different scales. The JAFF module mainly utilizes a learned joint\nchannel-spatial attention map provided by high-level semantic features to guide\nfeature fusion. The attention map makes the model pay more attention to defect\nfeatures. The DRF module utilizes a sequence of multi-receptive-field (MRF)\nunits with each taking as inputs all the preceding MRF feature maps and the\noriginal input. The obtained DRF features capture rich context information with\na large range of receptive fields. Extensive experiments conducted on\nSD-saliency-900, Magnetic tile, and DAGM 2007 indicate that our method achieves\npromising performance in comparison with other state-of-the-art methods.\nMeanwhile, our method reaches a real-time defect detection speed of 66 FPS.\n","authors":["Xiaoheng Jiang","Feng Yan","Yang Lu","Ke Wang","Shuai Guo","Tianzhu Zhang","Yanwei Pang","Jianwei Niu","Mingliang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.02797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01595v2","updated":"2024-02-05T07:59:34Z","published":"2022-10-04T13:18:15Z","title":"FreDSNet: Joint Monocular Depth and Semantic Segmentation with Fast\n Fourier Convolutions","summary":" In this work we present FreDSNet, a deep learning solution which obtains\nsemantic 3D understanding of indoor environments from single panoramas.\nOmnidirectional images reveal task-specific advantages when addressing scene\nunderstanding problems due to the 360-degree contextual information about the\nentire environment they provide. However, the inherent characteristics of\nomnidirectional images add additional difficulties to obtaining accurate\ndetection and segmentation of objects or good depth estimation. To overcome\nthese problems, we exploit convolutions in the frequency domain, obtaining a\nwider receptive field in each convolutional layer. These convolutions allow us\nto leverage the whole contextual information from omnidirectional images. 
FreDSNet is\nthe first network that jointly provides monocular depth estimation and semantic\nsegmentation from a single panoramic image exploiting fast Fourier\nconvolutions. Our experiments show that FreDSNet performs similarly to\nspecific state-of-the-art methods for semantic segmentation and depth\nestimation. FreDSNet code is publicly available at\nhttps://github.com/Sbrunoberenguel/FreDSNet\n","authors":["Bruno Berenguel-Baeta","Jesus Bermudez-Cameo","Jose J. Guerrero"],"pdf_url":"https://arxiv.org/pdf/2210.01595v2.pdf","comment":"7 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2401.07402v3","updated":"2024-02-05T07:41:33Z","published":"2024-01-15T00:40:41Z","title":"Improved Implicit Neural Representation with Fourier Bases\n Reparameterized Training","summary":" Implicit Neural Representation (INR), a powerful representation paradigm, has\nrecently achieved success in various computer vision tasks. Due to the\nlow-frequency bias issue of vanilla multi-layer perceptron (MLP), existing\nmethods have investigated advanced techniques, such as positional encoding and\nperiodic activation function, to improve the accuracy of INR. In this paper, we\nconnect the network training bias with the reparameterization technique and\ntheoretically prove that weight reparameterization could provide a chance to\nalleviate the spectral bias of MLPs. Based on our theoretical analysis, we\npropose a Fourier reparameterization method which learns a coefficient matrix\nof fixed Fourier bases to compose the weights of the MLP. We evaluate the\nproposed Fourier reparameterization method on different INR tasks with various\nMLP architectures, including vanilla MLP, MLP with positional encoding and MLP\nwith advanced activation function, etc. The superior approximation results on\ndifferent MLP architectures clearly validate the advantage of our proposed\nmethod. Armed with our Fourier reparameterization method, better INRs with more\ntexture and fewer artifacts can be learned from the training data.\n","authors":["Kexuan Shi","Xingyu Zhou","Shuhang Gu"],"pdf_url":"https://arxiv.org/pdf/2401.07402v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11388v5","updated":"2024-02-05T06:55:07Z","published":"2022-10-20T16:27:54Z","title":"Physics-informed Deep Diffusion MRI Reconstruction with Synthetic Data:\n Break Training Data Bottleneck in Artificial Intelligence","summary":" Diffusion magnetic resonance imaging (MRI) is the only imaging modality for\nnon-invasive movement detection of in vivo water molecules, with significant\nclinical and research applications. Diffusion MRI (DWI) acquired by multi-shot\ntechniques can achieve higher resolution, better signal-to-noise ratio, and\nlower geometric distortion than single-shot, but suffers from inter-shot\nmotion-induced artifacts. These artifacts cannot be removed prospectively,\nleading to the absence of artifact-free training labels. Thus, the potential of\ndeep learning in multi-shot DWI reconstruction remains largely untapped. To\nbreak the training data bottleneck, here, we propose a Physics-Informed Deep\nDWI reconstruction method (PIDD) to synthesize high-quality paired training\ndata by leveraging the physical diffusion model (magnitude synthesis) and\ninter-shot motion-induced phase model (motion phase synthesis). The network is\ntrained only once with 100,000 synthetic samples, achieving encouraging results\non multiple realistic in vivo data reconstructions. 
Advantages over\nconventional methods include: (a) Better motion artifact suppression and\nreconstruction stability; (b) Outstanding generalization to multi-scenario\nreconstructions, including multi-resolution, multi-b-value,\nmulti-undersampling, multi-vendor, and multi-center; (c) Excellent clinical\nadaptability to patients, verified by seven experienced doctors\n(p<0.001). In conclusion, PIDD presents a novel deep learning framework by\nexploiting the power of MRI physics, providing a cost-effective and explainable\nway to break the data bottleneck in deep learning medical imaging.\n","authors":["Chen Qian","Yuncheng Gao","Mingyang Han","Zi Wang","Dan Ruan","Yu Shen","Yaping Wu","Yirong Zhou","Chengyan Wang","Boyu Jiang","Ran Tao","Zhigang Wu","Jiazheng Wang","Liuhong Zhu","Yi Guo","Taishan Kang","Jianzhong Lin","Tao Gong","Chen Yang","Guoqiang Fei","Meijin Lin","Di Guo","Jianjun Zhou","Meiyun Wang","Xiaobo Qu"],"pdf_url":"https://arxiv.org/pdf/2210.11388v5.pdf","comment":"23 pages, 16 figures"},{"id":"http://arxiv.org/abs/2402.02761v1","updated":"2024-02-05T06:37:09Z","published":"2024-02-05T06:37:09Z","title":"Transmission Line Detection Based on Improved Hough Transform","summary":" To address the challenges of low detection accuracy and high false positive\nrates of transmission lines in UAV (Unmanned Aerial Vehicle) images, we explore\ntheir linear features and spatial distribution. We introduce an enhanced\nstochastic Hough transform technique tailored for detecting transmission lines\nin complex backgrounds. By employing the Hessian matrix for initial\npreprocessing of transmission lines, and utilizing boundary search and pixel\nrow segmentation, our approach distinguishes transmission line areas from the\nbackground. We significantly reduce both false positives and missed detections,\nthereby improving the accuracy of transmission line identification. Experiments\ndemonstrate that our method not only processes images more rapidly, but also\nyields superior detection results compared to conventional and random Hough\ntransform methods.\n","authors":["Wei Song","Pei Li","Man Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02739v1","updated":"2024-02-05T05:46:31Z","published":"2024-02-05T05:46:31Z","title":"DisDet: Exploring Detectability of Backdoor Attack on Diffusion Models","summary":" In the exciting generative AI era, the diffusion model has emerged as a very\npowerful and widely adopted content generation and editing tool for various\ndata modalities, making the study of its potential security risks necessary\nand critical. Very recently, some pioneering works have shown the\nvulnerability of the diffusion model against backdoor attacks, calling for\nin-depth analysis and investigation of the security challenges of this popular\nand fundamental AI technique.\n In this paper, for the first time, we systematically explore the\ndetectability of the poisoned noise input for the backdoored diffusion models,\nan important performance metric yet little explored in the existing works.\nStarting from the perspective of a defender, we first analyze the properties of\nthe trigger pattern in the existing diffusion backdoor attacks, discovering the\nimportant role of distribution discrepancy in Trojan detection. Based on this\nfinding, we propose a low-cost trigger detection mechanism that can effectively\nidentify the poisoned input noise. 
We then take a further step to study the\nsame problem from the attack side, proposing a backdoor attack strategy that\ncan learn an unnoticeable trigger to evade our proposed detection scheme.\n Empirical evaluations across various diffusion models and datasets\ndemonstrate the effectiveness of the proposed trigger detection and\ndetection-evading attack strategy. For trigger detection, our distribution\ndiscrepancy-based solution can achieve a 100\% detection rate for the Trojan\ntriggers used in the existing works. For evading trigger detection, our\nproposed stealthy trigger design approach performs end-to-end learning to make\nthe distribution of poisoned noise input approach that of benign noise,\nenabling nearly 100\% detection pass rate with very high attack and benign\nperformance for the backdoored diffusion models.\n","authors":["Yang Sui","Huy Phan","Jinqi Xiao","Tianfang Zhang","Zijie Tang","Cong Shi","Yan Wang","Yingying Chen","Bo Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.02739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02738v1","updated":"2024-02-05T05:38:50Z","published":"2024-02-05T05:38:50Z","title":"Improving Robustness of LiDAR-Camera Fusion Model against Weather\n Corruption from Fusion Strategy Perspective","summary":" In recent years, LiDAR-camera fusion models have markedly advanced 3D object\ndetection tasks in autonomous driving. However, their robustness against common\nweather corruption such as fog, rain, snow, and sunlight in the intricate\nphysical world remains underexplored. In this paper, we evaluate the robustness\nof fusion models from the perspective of fusion strategies on the corrupted\ndataset. Based on the evaluation, we further propose a concise yet practical\nfusion strategy to enhance the robustness of the fusion models, namely,\nflexibly weighting the fusion of features from LiDAR and camera sources to\nadapt to varying weather scenarios. Experiments conducted on four types of\nfusion models, each\nwith two distinct lightweight implementations, confirm the broad applicability\nand effectiveness of the approach.\n","authors":["Yihao Huang","Kaiyuan Yu","Qing Guo","Felix Juefei-Xu","Xiaojun Jia","Tianlin Li","Geguang Pu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.02738v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2402.02736v1","updated":"2024-02-05T05:37:48Z","published":"2024-02-05T05:37:48Z","title":"Using Motion Cues to Supervise Single-Frame Body Pose and Shape\n Estimation in Low Data Regimes","summary":" When enough annotated training data is available, supervised deep-learning\nalgorithms excel at estimating human body pose and shape using a single camera.\nThe effects of too little such data being available can be mitigated by using\nother information sources, such as databases of body shapes, to learn priors.\nUnfortunately, such sources are not always available either. We show that, in\nsuch cases, easy-to-obtain unannotated videos can be used instead to provide\nthe required supervisory signals. Given a trained model using too little\nannotated data, we compute poses in consecutive frames along with the optical\nflow between them. We then enforce consistency between the image optical flow\nand the one that can be inferred from the change in pose from one frame to the\nnext. 
This provides enough additional supervision to effectively refine the\nnetwork weights and to perform on par with methods trained using far more\nannotated data.\n","authors":["Andrey Davydov","Alexey Sidnev","Artsiom Sanakoyeu","Yuhua Chen","Mathieu Salzmann","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2402.02736v1.pdf","comment":"21 pages; TMLR"},{"id":"http://arxiv.org/abs/2402.02734v1","updated":"2024-02-05T05:26:17Z","published":"2024-02-05T05:26:17Z","title":"InVA: Integrative Variational Autoencoder for Harmonization of\n Multi-modal Neuroimaging Data","summary":" There is a significant interest in exploring non-linear associations among\nmultiple images derived from diverse imaging modalities. While there is a\ngrowing literature on image-on-image regression to delineate predictive\ninference of an image based on multiple images, existing approaches have\nlimitations in efficiently borrowing information between multiple imaging\nmodalities in the prediction of an image. Building on the literature of\nVariational Auto Encoders (VAEs), this article proposes a novel approach,\nreferred to as the Integrative Variational Autoencoder (\texttt{InVA}) method,\nwhich borrows information from multiple images obtained from different sources\nto draw predictive inference of an image. The proposed approach captures\ncomplex non-linear associations between the outcome image and input images,\nwhile allowing rapid computation. Numerical results demonstrate substantial\nadvantages of \texttt{InVA} over VAEs, which typically do not allow borrowing\ninformation between input images. The proposed framework offers highly accurate\npredictive inferences for costly positron emission tomography (PET) from\nmultiple measures of cortical structure in human brain scans readily available\nfrom magnetic resonance imaging (MRI).\n","authors":["Bowen Lei","Rajarshi Guhaniyogi","Krishnendu Chandra","Aaron Scheffler","Bani Mallick"],"pdf_url":"https://arxiv.org/pdf/2402.02734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02733v1","updated":"2024-02-05T05:25:33Z","published":"2024-02-05T05:25:33Z","title":"ToonAging: Face Re-Aging upon Artistic Portrait Style Transfer","summary":" Face re-aging is a prominent field in computer vision and graphics, with\nsignificant applications in photorealistic domains such as movies, advertising,\nand live streaming. Recently, the need to apply face re-aging to\nnon-photorealistic (NPR) images, like comics, illustrations, and animations,\nhas emerged as an extension in various entertainment sectors. However, the\nabsence of a network capable of seamlessly editing the apparent age on NPR\nimages means that these tasks have been confined to a naive approach, applying\neach task sequentially. This often results in unpleasant artifacts and a loss\nof facial attributes due to domain discrepancies. In this paper, we introduce\na novel one-stage method for face re-aging combined with portrait style\ntransfer, executed in a single generative step. We leverage existing face\nre-aging and style transfer networks, both trained within the same\nphotorealistic (PR) domain. Our method uniquely fuses distinct latent vectors,\neach responsible for managing aging-related attributes and NPR appearance.\nAdopting an exemplar-based approach, our method offers greater flexibility\nthan domain-level fine-tuning approaches, which typically require separate\ntraining or fine-tuning for each domain. 
This effectively addresses the limitation of requiring paired datasets\nfor re-aging and domain-level, data-driven approaches for stylization. Our\nexperiments show that our model can effortlessly generate re-aged images while\nsimultaneously transferring the style of examples, maintaining both natural\nappearance and controllability.\n","authors":["Bumsoo Kim","Abdul Muqeet","Kyuchul Lee","Sanghyun Seo"],"pdf_url":"https://arxiv.org/pdf/2402.02733v1.pdf","comment":"8 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.17102v2","updated":"2024-02-05T05:04:53Z","published":"2023-09-29T10:01:50Z","title":"Guiding Instruction-based Image Editing via Multimodal Large Language\n Models","summary":" Instruction-based image editing improves the controllability and flexibility\nof image manipulation via natural commands without elaborate descriptions or\nregional masks. However, human instructions are sometimes too brief for current\nmethods to capture and follow. Multimodal large language models (MLLMs) show\npromising capabilities in cross-modal understanding and visual-aware response\ngeneration via LMs. We investigate how MLLMs facilitate edit instructions and\npresent MLLM-Guided Image Editing (MGIE). MGIE learns to derive expressive\ninstructions and provides explicit guidance. The editing model jointly captures\nthis visual imagination and performs manipulation through end-to-end training.\nWe evaluate various aspects of Photoshop-style modification, global photo\noptimization, and local editing. Extensive experimental results demonstrate\nthat expressive instructions are crucial to instruction-based image editing,\nand our MGIE can lead to a notable improvement in automatic metrics and human\nevaluation while maintaining competitive inference efficiency.\n","authors":["Tsu-Jui Fu","Wenze Hu","Xianzhi Du","William Yang Wang","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2309.17102v2.pdf","comment":"ICLR'24 (Spotlight) ; Project at https://mllm-ie.github.io ; Code at\n https://github.com/tsujuifu/pytorch_mgie"},{"id":"http://arxiv.org/abs/2307.10249v4","updated":"2024-02-05T05:02:39Z","published":"2023-07-17T07:22:25Z","title":"RCM-Fusion: Radar-Camera Multi-Level Fusion for 3D Object Detection","summary":" While LiDAR sensors have been successfully applied to 3D object detection,\nthe affordability of radar and camera sensors has led to a growing interest in\nfusing radars and cameras for 3D object detection. However, previous\nradar-camera fusion models were unable to fully utilize the potential of radar\ninformation. In this paper, we propose Radar-Camera Multi-level fusion\n(RCM-Fusion), which attempts to fuse both modalities at both feature and\ninstance levels. For feature-level fusion, we propose a Radar Guided BEV\nEncoder which transforms camera features into precise BEV representations using\nthe guidance of radar Bird's-Eye-View (BEV) features and combines the radar and\ncamera BEV features. For instance-level fusion, we propose a Radar Grid Point\nRefinement module that reduces localization error by accounting for the\ncharacteristics of the radar point clouds. The experiments conducted on the\npublic nuScenes dataset demonstrate that our proposed RCM-Fusion achieves\nstate-of-the-art performances among single frame-based radar-camera fusion\nmethods in the nuScenes 3D object detection benchmark. 
Code will be made\npublicly available.\n","authors":["Jisong Kim","Minjae Seong","Geonho Bang","Dongsuk Kum","Jun Won Choi"],"pdf_url":"https://arxiv.org/pdf/2307.10249v4.pdf","comment":"Accepted by IEEE International Conference on Robotics and Automation\n (ICRA 2024), 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.02729v1","updated":"2024-02-05T05:01:28Z","published":"2024-02-05T05:01:28Z","title":"Fast and Accurate Cooperative Radio Map Estimation Enabled by GAN","summary":" In the 6G era, real-time radio resource monitoring and management are\nurgently needed to support diverse wireless-empowered applications. This calls\nfor fast and accurate estimation of the distribution of the radio resources,\nwhich is usually represented by the spatial signal power strength over the\ngeographical environment, known as a radio map. In this paper, we present a\ncooperative radio map estimation (CRME) approach enabled by the generative\nadversarial network (GAN), called GAN-CRME, which features fast and accurate\nradio map estimation without the transmitters' information. The radio map is\ninferred by exploiting the interaction between distributed received signal\nstrength (RSS) measurements at mobile users and the geographical map using a\ndeep neural network estimator, resulting in low data-acquisition cost and\ncomputational complexity. Moreover, a GAN-based learning algorithm is proposed\nto boost the inference capability of the deep neural network estimator by\nexploiting the power of generative AI. Simulation results showcase that the\nproposed GAN-CRME is even capable of coarse error-correction when the\ngeographical map information is inaccurate.\n","authors":["Zezhong Zhang","Guangxu Zhu","Junting Chen","Shuguang Cui"],"pdf_url":"https://arxiv.org/pdf/2402.02729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02724v1","updated":"2024-02-05T04:45:24Z","published":"2024-02-05T04:45:24Z","title":"FDNet: Frequency Domain Denoising Network For Cell Segmentation in\n Astrocytes Derived From Induced Pluripotent Stem Cells","summary":" Artificially generated induced pluripotent stem cells (iPSCs) from somatic\ncells play an important role in disease modeling and drug screening for\nneurodegenerative diseases. Astrocytes differentiated from iPSCs are important\ntargets to investigate neuronal metabolism. The astrocyte differentiation\nprogress can be monitored through the variations in morphology observed in\nmicroscopy images at different differentiation stages, and then determined by\nmolecular biology techniques upon maturation. However, the astrocytes usually\n``perfectly'' blend into the background and some of them are covered by\ninterference information (e.g., dead cells, media sediments, and cell debris),\nwhich makes astrocytes difficult to observe. Due to the lack of annotated\ndatasets, the existing state-of-the-art deep learning approaches cannot be used\nto address this issue. In this paper, we introduce a new task named astrocyte\nsegmentation with a novel dataset, called IAI704, which contains 704 images and\ntheir corresponding pixel-level annotation masks. Moreover, a novel frequency\ndomain denoising network, named FDNet, is proposed for astrocyte segmentation.\nIn detail, our FDNet consists of a contextual information fusion module (CIF),\nan attention block (AB), and a Fourier transform block (FTB). CIF and AB fuse\nmulti-scale feature embeddings to localize the astrocytes. 
FTB transforms\nfeature embeddings into the frequency domain and applies a high-pass filter to\neliminate interference information. Experimental results demonstrate the\nsuperiority of our proposed FDNet over the state-of-the-art substitutes in\nastrocyte segmentation, providing insights for iPSC differentiation progress\nprediction.\n","authors":["Haoran Li","Jiahua Shi","Huaming Chen","Bo Du","Simon Maksour","Gabrielle Phillips","Mirella Dottori","Jun Shen"],"pdf_url":"https://arxiv.org/pdf/2402.02724v1.pdf","comment":"Accepted by The IEEE International Symposium on Biomedical Imaging\n (ISBI) 2024"},{"id":"http://arxiv.org/abs/2402.02705v1","updated":"2024-02-05T03:39:39Z","published":"2024-02-05T03:39:39Z","title":"Representation Surgery for Multi-Task Model Merging","summary":" Multi-task learning (MTL) compresses the information from multiple tasks into\na unified backbone to improve computational efficiency and generalization.\nRecent work directly merges multiple independently trained models to perform\nMTL instead of collecting their raw data for joint training, greatly expanding\nthe application scenarios of MTL. However, by visualizing the representation\ndistribution of existing model merging schemes, we find that the merged model\noften suffers from the dilemma of representation bias. That is, there is a\nsignificant discrepancy in the representation distribution between the merged\nand individual models, resulting in poor performance of merged MTL. In this\npaper, we propose a representation surgery solution called \"Surgery\" to reduce\nrepresentation bias in the merged model. Specifically, Surgery is a lightweight\ntask-specific module that takes the representation of the merged model as input\nand attempts to output the biases contained in the representation from the\nmerged model. We then design an unsupervised optimization objective that\nupdates the Surgery module by minimizing the distance between the merged\nmodel's representation and the individual model's representation. Extensive\nexperiments demonstrate significant MTL performance improvements when our\nSurgery module is applied to state-of-the-art (SOTA) model merging schemes.\n","authors":["Enneng Yang","Li Shen","Zhenyi Wang","Guibing Guo","Xiaojun Chen","Xingwei Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2402.02705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.14872v2","updated":"2024-02-05T03:32:49Z","published":"2021-03-27T10:08:41Z","title":"Deep Learning Techniques for In-Crop Weed Identification: A Review","summary":" Weeds are a significant threat to agricultural productivity and the\nenvironment. The increasing demand for sustainable agriculture has driven\ninnovations in accurate weed control technologies aimed at reducing the\nreliance on herbicides. With the great success of deep learning in various\nvision tasks, many promising image-based weed detection algorithms have been\ndeveloped. This paper reviews recent developments of deep learning techniques\nin the field of image-based weed detection. The review begins with an\nintroduction to the fundamentals of deep learning related to weed detection.\nNext, recent progress on deep weed detection is reviewed, with a discussion of\nthe research materials, including public weed datasets. 
Finally, the\nchallenges of developing practically deployable weed detection methods are\nsummarized, together with a discussion of the opportunities for future\nresearch. We hope that this review will provide a timely survey of the field\nand attract more researchers to address this inter-disciplinary research\nproblem.\n","authors":["Kun Hu","Zhiyong Wang","Guy Coleman","Asher Bender","Tingting Yao","Shan Zeng","Dezhen Song","Arnold Schumann","Michael Walsh"],"pdf_url":"https://arxiv.org/pdf/2103.14872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04369v2","updated":"2024-02-05T03:08:52Z","published":"2023-08-08T16:15:35Z","title":"SSTFormer: Bridging Spiking Neural Network and Memory Support\n Transformer for Frame-Event based Recognition","summary":" Event camera-based pattern recognition is a newly arising research topic in\nrecent years. Current researchers usually transform the event streams into\nimages, graphs, or voxels, and adopt deep neural networks for event-based\nclassification. Although good performance can be achieved on simple event\nrecognition datasets, the results may still be limited due to the\nfollowing two issues. Firstly, they adopt spatially sparse event streams for\nrecognition only, which may fail to capture the color and detailed texture\ninformation well. Secondly, they adopt either Spiking Neural Networks (SNN) for\nenergy-efficient recognition with suboptimal results, or Artificial Neural\nNetworks (ANN) for energy-intensive, high-performance recognition. However,\nfew of them consider achieving a balance between these two aspects. In this\npaper, we formally propose to recognize patterns by fusing RGB frames and event\nstreams simultaneously and propose a new RGB frame-event recognition framework\nto address the aforementioned issues. The proposed method contains four main\nmodules, i.e., memory support Transformer network for RGB frame encoding,\nspiking neural network for raw event stream encoding, multi-modal bottleneck\nfusion module for RGB-Event feature aggregation, and prediction head. Due to\nthe scarcity of RGB-Event based classification datasets, we also propose a\nlarge-scale PokerEvent dataset, which contains 114 classes and 27102\nframe-event pairs recorded using a DVS346 event camera. Extensive experiments\non two RGB-Event based classification datasets fully validate the\neffectiveness of our proposed framework. We hope this work will boost the\ndevelopment of pattern recognition by fusing RGB frames and event streams. Both\nour dataset and source code of this work will be released at\nhttps://github.com/Event-AHU/SSTFormer.\n","authors":["Xiao Wang","Zongzhen Wu","Yao Rong","Lin Zhu","Bo Jiang","Jin Tang","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.04369v2.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2402.01304v2","updated":"2024-02-05T03:04:58Z","published":"2024-02-02T10:48:43Z","title":"Phrase Grounding-based Style Transfer for Single-Domain Generalized\n Object Detection","summary":" Single-domain generalized object detection aims to enhance a model's\ngeneralizability to multiple unseen target domains using only data from a\nsingle source domain during training. This is a practical yet challenging task\nas it requires the model to address domain shift without incorporating target\ndomain data into training. In this paper, we propose a novel phrase\ngrounding-based style transfer (PGST) approach for the task. 
Specifically, we\nfirst define textual prompts to describe potential objects for each unseen\ntarget domain. Then, we leverage the grounded language-image pre-training\n(GLIP) model to learn the style of these target domains and achieve style\ntransfer from the source to the target domain. The style-transferred source\nvisual features are semantically rich and could be close to imaginary\ncounterparts in the target domain. Finally, we employ these style-transferred\nvisual features to fine-tune GLIP. By introducing imaginary counterparts, the\ndetector could be effectively generalized to unseen target domains using only a\nsingle source domain for training. Extensive experimental results on five\ndiverse weather driving benchmarks demonstrate our proposed approach achieves\nstate-of-the-art performance, even surpassing some domain adaptive methods that\nincorporate target domain images into the training process. The source code and\npre-trained models will be made available.\n","authors":["Hao Li","Wei Wang","Cong Wang","Zhigang Luo","Xinwang Liu","Kenli Li","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2402.01304v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.17451v2","updated":"2024-02-05T02:50:49Z","published":"2023-10-26T15:00:21Z","title":"Generating by Understanding: Neural Visual Generation with Logical\n Symbol Groundings","summary":" Despite the great success of neural visual generative models in recent years,\nintegrating them with strong symbolic reasoning systems remains a challenging\ntask. There are two levels of symbol grounding problems among the core\nchallenges: the first is symbol assignment, i.e. mapping latent factors of\nneural visual generators to semantically meaningful symbolic factors from the\nreasoning systems by learning from limited labeled data. The second is rule\nlearning, i.e. learning new rules that govern the generative process to enhance\nthe symbolic reasoning systems. To deal with these two problems, we propose a\nneurosymbolic learning approach, Abductive visual Generation (AbdGen), for\nintegrating logic programming systems with neural visual generative models\nbased on the abductive learning framework. To achieve reliable and efficient\nsymbol grounding, the quantized abduction method is introduced for generating\nabduction proposals by the nearest-neighbor lookup within semantic codebooks.\nTo achieve precise rule learning, the contrastive meta-abduction method is\nproposed to eliminate wrong rules with positive cases and avoid less\ninformative rules with negative cases simultaneously. Experimental results show\nthat compared to the baseline approaches, AbdGen requires significantly less\nlabeled data for symbol assignment. Furthermore, AbdGen can effectively learn\nunderlying logical generative rules from data, which is beyond the capability\nof existing approaches. 
The code is released at this link:\nhttps://github.com/candytalking/AbdGen.\n","authors":["Yifei Peng","Yu Jin","Zhexu Luo","Yao-Xiang Ding","Wang-Zhou Dai","Zhong Ren","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.17451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16043v2","updated":"2024-02-05T01:55:13Z","published":"2023-12-26T13:14:17Z","title":"An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced\n linear classification","summary":" This article presents a new polynomial parameterized sigmoid called SIGTRON,\nwhich is an extended asymmetric sigmoid with Perceptron, and its companion\nconvex model called SIGTRON-imbalanced classification (SIC) model that employs\na virtual SIGTRON-induced convex loss function. In contrast to the conventional\n$\\pi$-weighted cost-sensitive learning model, the SIC model does not have an\nexternal $\\pi$-weight on the loss function but has internal parameters in the\nvirtual SIGTRON-induced loss function. As a consequence, when the given\ntraining dataset is close to the well-balanced condition, we show that the\nproposed SIC model is more adaptive to variations of the dataset, such as the\ninconsistency of the scale-class-imbalance ratio between the training and test\ndatasets. This adaptation is achieved by creating a skewed hyperplane equation.\nAdditionally, we present a quasi-Newton optimization(L-BFGS) framework for the\nvirtual convex loss by developing an interval-based bisection line search.\nEmpirically, we have observed that the proposed approach outperforms\n$\\pi$-weighted convex focal loss and balanced classifier LIBLINEAR(logistic\nregression, SVM, and L2SVM) in terms of test classification accuracy with $51$\ntwo-class and $67$ multi-class datasets. In binary classification problems,\nwhere the scale-class-imbalance ratio of the training dataset is not\nsignificant but the inconsistency exists, a group of SIC models with the best\ntest accuracy for each dataset (TOP$1$) outperforms LIBSVM(C-SVC with RBF\nkernel), a well-known kernel-based classifier.\n","authors":["Hyenkyun Woo"],"pdf_url":"https://arxiv.org/pdf/2312.16043v2.pdf","comment":"24 pages, 9 figures, a typo is corrected"},{"id":"http://arxiv.org/abs/2304.14065v4","updated":"2024-02-05T01:29:35Z","published":"2023-04-27T09:52:35Z","title":"Lightweight, Pre-trained Transformers for Remote Sensing Timeseries","summary":" Machine learning methods for satellite data have a range of societally\nrelevant applications, but labels used to train models can be difficult or\nimpossible to acquire. Self-supervision is a natural solution in settings with\nlimited labeled data, but current self-supervised models for satellite data\nfail to take advantage of the characteristics of that data, including the\ntemporal dimension (which is critical for many applications, such as monitoring\ncrop growth) and availability of data from many complementary sensors (which\ncan significantly improve a model's predictive performance). We present Presto\n(the Pretrained Remote Sensing Transformer), a model pre-trained on remote\nsensing pixel-timeseries data. By designing Presto specifically for remote\nsensing data, we can create a significantly smaller but performant model.\nPresto excels at a wide variety of globally distributed remote sensing tasks\nand performs competitively with much larger models while requiring far less\ncompute. 
Presto can be used for transfer learning or as a feature extractor for\nsimple models, enabling efficient deployment at scale.\n","authors":["Gabriel Tseng","Ruben Cartuyvels","Ivan Zvonkov","Mirali Purohit","David Rolnick","Hannah Kerner"],"pdf_url":"https://arxiv.org/pdf/2304.14065v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02662v1","updated":"2024-02-05T01:14:07Z","published":"2024-02-05T01:14:07Z","title":"Image-Caption Encoding for Improving Zero-Shot Generalization","summary":" Recent advances in vision-language models have combined contrastive\napproaches with generative methods to achieve state-of-the-art (SOTA) on\ndownstream inference tasks like zero-shot image classification. However, a\npersistent issue of these models for image classification is their\nout-of-distribution (OOD) generalization capabilities. We first show that when\nan OOD data point is misclassified, the correct class can typically be found in\nthe Top-K predicted classes. In order to steer the model prediction toward the\ncorrect class within the top predicted classes, we propose the Image-Caption\nEncoding (ICE) method, a straightforward approach that directly enforces\nconsistency between the image-conditioned and caption-conditioned predictions\nat evaluation time only. Intuitively, we take advantage of unique properties of\nthe generated captions to guide our local search for the correct class label\nwithin the Top-K predicted classes. We show that our method can be easily\ncombined with other SOTA methods to enhance Top-1 OOD accuracies by 0.5% on\naverage and up to 3% on challenging datasets. Our code:\nhttps://github.com/Chris210634/ice\n","authors":["Eric Yang Yu","Christopher Liao","Sathvik Ravi","Theodoros Tsiligkaridis","Brian Kulis"],"pdf_url":"https://arxiv.org/pdf/2402.02662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02653v1","updated":"2024-02-05T00:52:50Z","published":"2024-02-05T00:52:50Z","title":"Learning with Mixture of Prototypes for Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection aims to detect testing samples far away\nfrom the in-distribution (ID) training data, which is crucial for the safe\ndeployment of machine learning models in the real world. Distance-based OOD\ndetection methods have emerged with enhanced deep representation learning. They\nidentify unseen OOD samples by measuring their distances from ID class\ncentroids or prototypes. However, existing approaches learn the representation\nrelying on oversimplified data assumptions, e.g., modeling ID data of each class\nwith one centroid class prototype or using loss functions not designed for OOD\ndetection, which overlook the natural diversities within the data. Naively\nenforcing data samples of each class to be compact around only one prototype\nleads to inadequate modeling of realistic data and limited performance. To\ntackle these issues, we propose PrototypicAl Learning with a Mixture of\nprototypes (PALM) which models each class with multiple prototypes to capture\nthe sample diversities, and learns more faithful and compact sample embeddings\nto enhance OOD detection. Our method automatically identifies and dynamically\nupdates prototypes, assigning each sample to a subset of prototypes via\nreciprocal neighbor soft assignment weights. 
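The ICE procedure described above is simple enough to sketch: restrict the search to the image-conditioned Top-K classes and fuse the image- and caption-conditioned predictions at evaluation time. The fusion weight `alpha` and the masking scheme below are assumptions for illustration; the authors' released code (linked above) is the authoritative version.

```python
import torch

def ice_style_predict(image_logits: torch.Tensor,
                      caption_logits: torch.Tensor,
                      k: int = 5, alpha: float = 0.5) -> torch.Tensor:
    # Candidate classes come from the image-conditioned Top-K.
    topk_idx = image_logits.topk(k, dim=-1).indices
    # Fuse the two prediction heads (alpha is an assumed weight).
    fused = alpha * image_logits + (1.0 - alpha) * caption_logits
    # Restrict the final decision to the Top-K candidates only.
    restricted = torch.full_like(fused, float("-inf"))
    restricted.scatter_(-1, topk_idx, fused.gather(-1, topk_idx))
    return restricted.argmax(dim=-1)
```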
PALM optimizes a maximum\nlikelihood estimation (MLE) loss to encourage the sample embeddings to be\ncompact around the associated prototypes, as well as a contrastive loss on all\nprototypes to enhance intra-class compactness and inter-class discrimination at\nthe prototype level. Moreover, the automatic estimation of prototypes enables\nour approach to be extended to the challenging OOD detection task with\nunlabelled ID data. Extensive experiments demonstrate the superiority of PALM,\nachieving state-of-the-art average AUROC performance of 93.82 on the\nchallenging CIFAR-100 benchmark. Code is available at\nhttps://github.com/jeff024/PALM.\n","authors":["Haodong Lu","Dong Gong","Shuo Wang","Jason Xue","Lina Yao","Kristen Moore"],"pdf_url":"https://arxiv.org/pdf/2402.02653v1.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2402.02649v1","updated":"2024-02-05T00:44:57Z","published":"2024-02-05T00:44:57Z","title":"Densely Decoded Networks with Adaptive Deep Supervision for Medical\n Image Segmentation","summary":" Medical image segmentation using deep neural networks has been highly\nsuccessful. However, the effectiveness of these networks is often limited by\ninadequate dense prediction and inability to extract robust features. To\nachieve refined dense prediction, we propose densely decoded networks (ddn), by\nselectively introducing 'crutch' network connections. Such 'crutch' connections\nin each upsampling stage of the network decoder (1) enhance target localization\nby incorporating high resolution features from the encoder, and (2) improve\nsegmentation by facilitating multi-stage contextual information flow. Further,\nwe present a training strategy based on adaptive deep supervision (ads), which\nexploits and adapts specific attributes of input dataset, for robust feature\nextraction. In particular, ads strategically locates and deploys auxiliary\nsupervision, by matching the average input object size with the layer-wise\neffective receptive fields (lerf) of a network, resulting in a class of ddns.\nSuch inclusion of 'companion objective' from a specific hidden layer, helps the\nmodel pay close attention to some distinct input-dependent features, which the\nnetwork might otherwise 'ignore' during training. Our new networks and training\nstrategy are validated on 4 diverse datasets of different modalities,\ndemonstrating their effectiveness.\n","authors":["Suraj Mishra"],"pdf_url":"https://arxiv.org/pdf/2402.02649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05779v2","updated":"2024-02-05T00:32:13Z","published":"2024-01-11T09:30:36Z","title":"EraseDiff: Erasing Data Influence in Diffusion Models","summary":" In this work, we introduce an unlearning algorithm for diffusion models. Our\nalgorithm equips a diffusion model with a mechanism to mitigate the concerns\nrelated to data memorization. To achieve this, we formulate the unlearning\nproblem as a constraint optimization problem, aiming to preserve the utility of\nthe diffusion model on the remaining data and scrub the information associated\nwith forgetting data by deviating the learnable generative process from the\nground-truth denoising procedure. To solve the resulting problem, we adopt a\nfirst-order method, having superior practical performance while being vigilant\nabout the diffusion process. 
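A rough sketch of the two PALM objectives described above, with one simplification flagged: a plain softmax assignment stands in for the paper's reciprocal-neighbor soft assignment, and the exact loss forms are assumptions rather than the released implementation.

```python
import torch
import torch.nn.functional as F

def palm_style_losses(z, prototypes, labels, tau: float = 0.1):
    """z: [B, D] embeddings; prototypes: [C, K, D], K prototypes per class;
    labels: [B]. Simplified forms; the paper's exact losses may differ."""
    z = F.normalize(z, dim=-1)
    protos = F.normalize(prototypes, dim=-1)
    # Soft assignment of each sample to its own class's K prototypes.
    class_protos = protos[labels]                            # [B, K, D]
    sims = torch.einsum("bd,bkd->bk", z, class_protos) / tau
    w = sims.softmax(dim=-1)                                 # assignment weights
    compactness = -(w * sims).sum(dim=-1).mean()             # MLE-style pull
    # Prototype-level contrast: same-class prototypes attract, others repel.
    C, K, D = protos.shape
    flat = protos.reshape(C * K, D)
    logits = (flat @ flat.t()) / tau
    proto_labels = torch.arange(C, device=z.device).repeat_interleave(K)
    pos = (proto_labels[:, None] == proto_labels[None, :]).float()
    pos.fill_diagonal_(0.0)
    log_prob = logits.log_softmax(dim=-1)
    contrast = -(pos * log_prob).sum(dim=-1) / pos.sum(dim=-1).clamp(min=1.0)
    return compactness, contrast.mean()
```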
Empirically, we demonstrate that our algorithm can\npreserve model utility, effectiveness, and efficiency while removing the\ninfluence of the forgotten data, across widely-used diffusion models and in both\nconditional and unconditional image generation scenarios.\n","authors":["Jing Wu","Trung Le","Munawar Hayat","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2401.05779v2.pdf","comment":"Diffusion Model, Machine Unlearning"},{"id":"http://arxiv.org/abs/2311.01441v2","updated":"2024-02-05T00:10:44Z","published":"2023-11-02T17:55:13Z","title":"Distilling Out-of-Distribution Robustness from Vision-Language\n Foundation Models","summary":" We propose a conceptually simple and lightweight framework for improving the\nrobustness of vision models through the combination of knowledge distillation\nand data augmentation. We address the conjecture that larger models do not make\nfor better teachers by showing strong gains in out-of-distribution robustness\nwhen distilling from pretrained foundation models. Following this finding, we\npropose Discrete Adversarial Distillation (DAD), which leverages a robust\nteacher to generate adversarial examples and a VQGAN to discretize them,\ncreating more informative samples than standard data augmentation techniques.\nWe provide a theoretical framework for the use of a robust teacher in the\nknowledge distillation with data augmentation setting and demonstrate strong\ngains in out-of-distribution robustness and clean accuracy across different\nstudent architectures. Notably, our method adds minor computational overhead\ncompared to similar techniques and can be easily combined with other data\naugmentations for further improvements.\n","authors":["Andy Zhou","Jindong Wang","Yu-Xiong Wang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.01441v2.pdf","comment":"Published in NeurIPS 2023"},{"id":"http://arxiv.org/abs/2402.03585v1","updated":"2024-02-05T23:30:37Z","published":"2024-02-05T23:30:37Z","title":"Decoder-Only Image Registration","summary":" In unsupervised medical image registration, the predominant approaches\ninvolve the utilization of an encoder-decoder network architecture, allowing for\nprecise prediction of dense, full-resolution displacement fields from given\npaired images. Despite its widespread use in the literature, we argue for the\nnecessity of making both the encoder and decoder learnable in such an\narchitecture. For this, we propose a novel network architecture, termed LessNet\nin this paper, which contains only a learnable decoder, while entirely omitting\nthe utilization of a learnable encoder. LessNet substitutes the learnable\nencoder with simple, handcrafted features, eliminating the need to learn\n(optimize) network parameters in the encoder altogether. Consequently, this\nleads to a compact, efficient, and decoder-only architecture for 3D medical\nimage registration. Evaluated on two publicly available brain MRI datasets, we\ndemonstrate that our decoder-only LessNet can effectively and efficiently learn\nboth dense displacement and diffeomorphic deformation fields in 3D.\nFurthermore, our decoder-only LessNet can achieve comparable registration\nperformance to state-of-the-art methods such as VoxelMorph and TransMorph,\nwhile requiring significantly fewer computational resources. 
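The DAD recipe above can be outlined as: the robust teacher crafts an adversarial example, a VQGAN discretizes it, and the student distills from the teacher on that sample. In the hedged sketch below, one-step FGSM stands in for the unspecified attack, the VQGAN step is left as a commented placeholder, and the budget and temperature are assumptions.

```python
import torch
import torch.nn.functional as F

def dad_style_step(student, teacher, x, y, eps: float = 2 / 255, T: float = 4.0):
    # 1) Adversarial example from the robust teacher (one-step FGSM here;
    #    the paper's attack and budget are not specified in the abstract).
    x = x.clone().requires_grad_(True)
    grad, = torch.autograd.grad(F.cross_entropy(teacher(x), y), x)
    x_adv = (x + eps * grad.sign()).clamp(0.0, 1.0).detach()
    # 2) Placeholder for the VQGAN discretization described above:
    #    x_adv = vqgan.decode(vqgan.encode(x_adv))
    # 3) Standard temperature-scaled distillation on the augmented sample.
    with torch.no_grad():
        t_logits = teacher(x_adv)
    s_logits = student(x_adv)
    return F.kl_div(F.log_softmax(s_logits / T, dim=-1),
                    F.softmax(t_logits / T, dim=-1),
                    reduction="batchmean") * (T * T)
```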
Our code and\npre-trained models are available at https://github.com/xi-jia/LessNet.\n","authors":["Xi Jia","Wenqi Lu","Xinxing Cheng","Jinming Duan"],"pdf_url":"https://arxiv.org/pdf/2402.03585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.18013v3","updated":"2024-02-05T22:46:06Z","published":"2023-03-31T12:38:08Z","title":"LaCViT: A Label-aware Contrastive Fine-tuning Framework for Vision\n Transformers","summary":" Vision Transformers (ViTs) have emerged as popular models in computer vision,\ndemonstrating state-of-the-art performance across various tasks. This success\ntypically follows a two-stage strategy involving pre-training on large-scale\ndatasets using self-supervised signals, such as masked random patches, followed\nby fine-tuning on task-specific labeled datasets with cross-entropy loss.\nHowever, this reliance on cross-entropy loss has been identified as a limiting\nfactor in ViTs, affecting their generalization and transferability to\ndownstream tasks. Addressing this critical challenge, we introduce a novel\nLabel-aware Contrastive Training framework, LaCViT, which significantly\nenhances the quality of embeddings in ViTs. LaCViT not only addresses the\nlimitations of cross-entropy loss but also facilitates more effective transfer\nlearning across diverse image classification tasks. Our comprehensive\nexperiments on eight standard image classification datasets reveal that LaCViT\nstatistically significantly enhances the performance of three evaluated ViTs by\nup to 10.78% under Top-1 Accuracy.\n","authors":["Zijun Long","Zaiqiao Meng","Gerardo Aragon Camarasa","Richard McCreadie"],"pdf_url":"https://arxiv.org/pdf/2303.18013v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01516v3","updated":"2024-02-05T22:43:45Z","published":"2023-09-04T10:48:29Z","title":"MultiWay-Adapter: Adapting large-scale multi-modal models for scalable\n image-text retrieval","summary":" As Multimodal Large Language Models (MLLMs) grow in size, adapting them to\nspecialized tasks becomes increasingly challenging due to high computational\nand memory demands. Indeed, traditional fine-tuning methods are costly, due to\nthe need for extensive, task-specific training. While efficient adaptation\nmethods exist that aim to reduce these costs, in practice they suffer from\nshallow inter-modal alignment, which severely hurts model effectiveness. To\ntackle these computational challenges and improve inter-modal alignment, we\nintroduce the MultiWay-Adapter (MWA), a novel framework featuring an 'Alignment\nEnhancer'. This enhancer deepens inter-modal alignment, enabling high\ntransferability with minimal tuning effort. Our experiments show that unlike\nprior efficient tuning approaches, MWA maintains model effectiveness, while\nreducing training time by up to 57%. MWA is also lightweight, increasing model\nsize by only 2-3% (in terms of parameters) for state-of-the-art foundation\nmodels like BEiT-3 Large. 
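The abstract frames LaCViT's fine-tuning objective as label-aware contrastive training; a generic supervised-contrastive (SupCon-style) loss of the kind it evokes looks as follows. This form is an assumption for illustration, not the paper's exact loss.

```python
import torch
import torch.nn.functional as F

def label_aware_contrastive_loss(feats: torch.Tensor,
                                 labels: torch.Tensor,
                                 tau: float = 0.07) -> torch.Tensor:
    z = F.normalize(feats, dim=-1)
    sim = (z @ z.t()) / tau
    n = z.size(0)
    self_mask = torch.eye(n, dtype=torch.bool, device=z.device)
    pos = ((labels[:, None] == labels[None, :]) & ~self_mask).float()  # same-label pairs
    sim = sim.masked_fill(self_mask, float("-inf"))  # drop self-similarity
    log_prob = sim.log_softmax(dim=-1)
    # Average log-probability of positives for each anchor that has any.
    loss = -(log_prob * pos).sum(dim=1) / pos.sum(dim=1).clamp(min=1.0)
    return loss.mean()
```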
These results demonstrate that MWA provides an\nefficient and effective adaptation method for MLLMs, significantly broadening\ntheir applicability.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa"],"pdf_url":"https://arxiv.org/pdf/2309.01516v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03561v1","updated":"2024-02-05T22:20:19Z","published":"2024-02-05T22:20:19Z","title":"VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language\n Navigation","summary":" Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate\nthrough realistic 3D outdoor environments based on natural language\ninstructions. The performance of existing VLN methods is limited by\ninsufficient diversity in navigation environments and limited training data. To\naddress these issues, we propose VLN-Video, which utilizes the diverse outdoor\nenvironments present in driving videos in multiple cities in the U.S. augmented\nwith automatically generated navigation instructions and actions to improve\noutdoor VLN performance. VLN-Video combines the best of intuitive classical\napproaches and modern deep learning techniques, using template infilling to\ngenerate grounded navigation instructions, combined with an image rotation\nsimilarity-based navigation action predictor to obtain VLN style data from\ndriving videos for pretraining deep learning VLN models. We pre-train the model\non the Touchdown dataset and our video-augmented dataset created from driving\nvideos with three proxy tasks: Masked Language Modeling, Instruction and\nTrajectory Matching, and Next Action Prediction, so as to learn\ntemporally-aware and visually-aligned instruction representations. The learned\ninstruction representation is adapted to the state-of-the-art navigator when\nfine-tuning on the Touchdown dataset. Empirical results demonstrate that\nVLN-Video significantly outperforms previous state-of-the-art models by 2.1% in\ntask completion rate, achieving a new state-of-the-art on the Touchdown\ndataset.\n","authors":["Jialu Li","Aishwarya Padmakumar","Gaurav Sukhatme","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2402.03561v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2402.03557v1","updated":"2024-02-05T22:15:55Z","published":"2024-02-05T22:15:55Z","title":"Robust Analysis of Multi-Task Learning on a Complex Vision System","summary":" Multi-task learning (MTL) has been widely studied in the past decade. In\nparticular, dozens of optimization algorithms have been proposed for different\nsettings. While each of them claimed improvement when applied to certain models\non certain datasets, there is still a lack of deep understanding of their\nperformance in complex real-world scenarios. We identify the gaps between\nresearch and application and make the following 4 contributions. (1) We\ncomprehensively evaluate a large set of existing MTL optimization algorithms on\nthe MetaGraspNet dataset designed for the robotic grasping task, which is complex\nand has high real-world application value, and identify the best-performing\nmethods. (2) We empirically compare the method performance when applied on\nfeature-level gradients versus parameter-level gradients over a large set of\nMTL optimization algorithms, and conclude that this feature-level gradient\nsurrogate is reasonable when a method-specific theoretical guarantee exists,\nbut it is not generalizable to all methods. 
(3) We provide insights on the problem of\ntask interference and show that the existing perspectives of gradient angles\nand relative gradient norms do not precisely reflect the challenges of MTL, as\nthe rankings of the methods based on these two indicators do not align well\nwith those based on the test-set performance. (4) We provide a novel view of\nthe task interference problem from the perspective of the latent space induced\nby the feature extractor and provide training monitoring results based on\nfeature disentanglement.\n","authors":["Dayou Mao","Yuhao Chen","Yifan Wu","Maximilian Gilles","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2402.03557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03553v1","updated":"2024-02-05T22:12:42Z","published":"2024-02-05T22:12:42Z","title":"One-shot Neural Face Reenactment via Finding Directions in GAN's Latent\n Space","summary":" In this paper, we present our framework for neural face/head reenactment\nwhose goal is to transfer the 3D head orientation and expression of a target\nface to a source face. Previous methods focus on learning embedding networks\nfor identity and head pose/expression disentanglement which proves to be a\nrather hard task, degrading the quality of the generated images. We take a\ndifferent approach, bypassing the training of such networks, by using\n(fine-tuned) pre-trained GANs which have been shown capable of producing\nhigh-quality facial images. Because GANs are characterized by weak\ncontrollability, the core of our approach is a method to discover which\ndirections in latent GAN space are responsible for controlling head pose and\nexpression variations. We present a simple pipeline to learn such directions\nwith the aid of a 3D shape model which, by construction, inherently captures\ndisentangled directions for head pose, identity, and expression. Moreover, we\nshow that by embedding real images in the GAN latent space, our method can be\nsuccessfully used for the reenactment of real-world faces. Our method features\nseveral favorable properties including using a single source image (one-shot)\nand enabling cross-person reenactment. Extensive qualitative and quantitative\nresults show that our approach typically produces reenacted faces of notably\nhigher quality than those produced by state-of-the-art methods for the standard\nbenchmarks of VoxCeleb1 & 2.\n","authors":["Stella Bounareli","Christos Tzelepis","Vasileios Argyriou","Ioannis Patras","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2402.03553v1.pdf","comment":"Preprint version, accepted for publication in International Journal\n of Computer Vision (IJCV)"},{"id":"http://arxiv.org/abs/2402.03549v1","updated":"2024-02-05T22:10:54Z","published":"2024-02-05T22:10:54Z","title":"AnaMoDiff: 2D Analogical Motion Diffusion via Disentangled Denoising","summary":" We present AnaMoDiff, a novel diffusion-based method for 2D motion analogies\nthat is applied to raw, unannotated videos of articulated characters. 
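Contribution (3) above examines gradient angles and relative gradient norms as task-interference indicators; computing both over the shared parameters is straightforward, as in this sketch (function and variable names are illustrative, not from the paper's code).

```python
import torch

def task_gradient_indicators(task_losses, shared_params):
    """Pairwise gradient cosines and relative norms over shared parameters.
    task_losses: list of scalar losses; shared_params: list of tensors."""
    flat_grads = []
    for loss in task_losses:
        grads = torch.autograd.grad(loss, shared_params, retain_graph=True)
        flat_grads.append(torch.cat([g.reshape(-1) for g in grads]))
    G = torch.stack(flat_grads)        # [num_tasks, num_shared_params]
    norms = G.norm(dim=1)
    cosines = (G @ G.t()) / (norms[:, None] * norms[None, :] + 1e-12)
    relative_norms = norms / (norms.max() + 1e-12)
    return cosines, relative_norms
```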
Our goal\nis to accurately transfer motions from a 2D driving video onto a source\ncharacter, with its identity, in terms of appearance and natural movement, well\npreserved, even when there may be significant discrepancies between the source\nand driving characters in their part proportions and movement speed and styles.\nOur diffusion model transfers the input motion via a latent optical flow (LOF)\nnetwork operating in a noised latent space, which is spatially aware, efficient\nto process compared to the original RGB videos, and artifact-resistant through\nthe diffusion denoising process even amid dense movements. To accomplish both\nmotion analogy and identity preservation, we train our denoising model in a\nfeature-disentangled manner, operating at two noise levels. While\nidentity-revealing features of the source are learned via conventional noise\ninjection, motion features are learned from LOF-warped videos by only injecting\nnoise with large values, with the stipulation that motion properties involving\npose and limbs are encoded by higher-level features. Experiments demonstrate\nthat our method achieves the best trade-off between motion analogy and identity\npreservation.\n","authors":["Maham Tanveer","Yizhi Wang","Ruiqi Wang","Nanxuan Zhao","Ali Mahdavi-Amiri","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.03549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03547v1","updated":"2024-02-05T22:06:27Z","published":"2024-02-05T22:06:27Z","title":"Improving Pediatric Low-Grade Neuroepithelial Tumors Molecular Subtype\n Identification Using a Novel AUROC Loss Function for Convolutional Neural\n Networks","summary":" Pediatric Low-Grade Neuroepithelial Tumors (PLGNT) are the most common\npediatric cancer type, accounting for 40% of brain tumors in children, and\nidentifying PLGNT molecular subtype is crucial for treatment planning. However,\nthe gold standard to determine the PLGNT subtype is biopsy, which can be\nimpractical or dangerous for patients. This research improves the performance\nof Convolutional Neural Networks (CNNs) in classifying PLGNT subtypes through\nMRI scans by introducing a loss function that specifically improves the model's\nArea Under the Receiver Operating Characteristic (ROC) Curve (AUROC), offering\na non-invasive diagnostic alternative. In this study, a retrospective dataset\nof 339 children with PLGNT (143 BRAF fusion, 71 with BRAF V600E mutation, and\n125 non-BRAF) was curated. We employed a CNN model with Monte Carlo random data\nsplitting. The baseline model was trained using binary cross entropy (BCE), and\nachieved an AUROC of 86.11% for differentiating BRAF fusion and BRAF V600E\nmutations, which was improved to 87.71% using our proposed AUROC loss function\n(p-value 0.045). With multiclass classification, the AUROC improved from 74.42%\nto 76.59% (p-value 0.0016).\n","authors":["Khashayar Namdar","Matthias W. Wagner","Cynthia Hawkins","Uri Tabori","Birgit B. Ertl-Wagner","Farzad Khalvati"],"pdf_url":"https://arxiv.org/pdf/2402.03547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03526v1","updated":"2024-02-05T21:28:47Z","published":"2024-02-05T21:28:47Z","title":"nnMamba: 3D Biomedical Image Segmentation, Classification and Landmark\n Detection with State Space Model","summary":" In the field of biomedical image analysis, the quest for architectures\ncapable of effectively capturing long-range dependencies is paramount,\nespecially when dealing with 3D image segmentation, classification, and\nlandmark detection. 
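The AUROC-targeted loss in the PLGNT abstract is not spelled out there; a common differentiable AUROC surrogate, which may or may not match the paper's choice, relaxes the pairwise ranking indicator with a sigmoid, as sketched below.

```python
import torch

def pairwise_auroc_surrogate(scores: torch.Tensor,
                             targets: torch.Tensor) -> torch.Tensor:
    """Differentiable AUROC surrogate (assumed form): penalize every
    positive-negative score pair ranked in the wrong order."""
    pos = scores[targets == 1]
    neg = scores[targets == 0]
    diff = pos[:, None] - neg[None, :]   # all positive-negative pairs
    return torch.sigmoid(-diff).mean()   # approximately 1 - (soft) AUROC
```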
Traditional Convolutional Neural Networks (CNNs) struggle\nwith the locality of their receptive fields, and Transformers have a heavy computational\nload when applied to high-dimensional medical images. In this paper, we\nintroduce nnMamba, a novel architecture that integrates the strengths of CNNs\nand the advanced long-range modeling capabilities of State Space Sequence\nModels (SSMs). nnMamba adds SSMs to the convolutional residual block to\nextract local features and model complex dependencies. For different tasks, we\nbuild different blocks to learn the features. Extensive experiments demonstrate\nnnMamba's superiority over state-of-the-art methods in a suite of challenging\ntasks, including 3D image segmentation, classification, and landmark detection.\nnnMamba emerges as a robust solution, offering both the local representation\nability of CNNs and the efficient global context processing of SSMs, setting a\nnew standard for long-range dependency modeling in medical image analysis. Code\nis available at https://github.com/lhaof/nnMamba\n","authors":["Haifan Gong","Luoyao Kang","Yitao Wang","Xiang Wan","Haofeng Li"],"pdf_url":"https://arxiv.org/pdf/2402.03526v1.pdf","comment":"7 pages, Code is available at https://github.com/lhaof/nnMamba"},{"id":"http://arxiv.org/abs/2304.07248v3","updated":"2024-02-05T21:11:40Z","published":"2023-04-14T16:53:06Z","title":"The University of California San Francisco, Brain Metastases\n Stereotactic Radiosurgery (UCSF-BMSR) MRI Dataset","summary":" The University of California San Francisco Brain Metastases Stereotactic\nRadiosurgery (UCSF-BMSR) dataset is a public, clinical, multimodal brain MRI\ndataset consisting of 560 brain MRIs from 412 patients with expert annotations\nof 5136 brain metastases. Data consists of registered and skull stripped T1\npost-contrast, T1 pre-contrast, FLAIR and subtraction (T1 pre-contrast - T1\npost-contrast) images and voxelwise segmentations of enhancing brain metastases\nin NIfTI format. The dataset also includes patient demographics, surgical\nstatus and primary cancer types. The UCSF-BMSR has been made publicly available\nin the hopes that researchers will use these data to push the boundaries of AI\napplications for brain metastases.\n","authors":["Jeffrey D. Rudie","Rachit Saluja","David A. Weiss","Pierre Nedelec","Evan Calabrese","John B. Colby","Benjamin Laguna","John Mongan","Steve Braunstein","Christopher P. Hess","Andreas M. Rauschecker","Leo P. Sugrue","Javier E. Villanueva-Meyer"],"pdf_url":"https://arxiv.org/pdf/2304.07248v3.pdf","comment":"15 pages, 2 tables, 2 figures"},{"id":"http://arxiv.org/abs/2402.03501v1","updated":"2024-02-05T20:34:32Z","published":"2024-02-05T20:34:32Z","title":"An Inpainting-Infused Pipeline for Attire and Background Replacement","summary":" In recent years, groundbreaking advancements in Generative Artificial\nIntelligence (GenAI) have triggered a transformative paradigm shift,\nsignificantly influencing various domains. In this work, we specifically\nexplore an integrated approach, leveraging advanced techniques in GenAI and\ncomputer vision emphasizing image manipulation. The methodology unfolds through\nseveral stages, including depth estimation, the creation of inpaint masks based\non depth information, the generation and replacement of backgrounds utilizing\nStable Diffusion in conjunction with Latent Consistency Models (LCMs), and the\nsubsequent replacement of clothes and application of aesthetic changes through\nan inpainting pipeline. 
Experiments conducted in this study underscore the\nmethodology's efficacy, highlighting its potential to produce visually\ncaptivating content. The convergence of these advanced techniques allows users\nto input photographs of individuals and manipulate them to modify clothing and\nbackground based on specific prompts without manually inputting inpainting masks,\neffectively placing the subjects within the vast landscape of creative\nimagination.\n","authors":["Felipe Rodrigues Perche-Mahlow","André Felipe-Zanella","William Alberto Cruz-Castañeda","Marcellus Amadeus"],"pdf_url":"https://arxiv.org/pdf/2402.03501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16355v2","updated":"2024-02-05T20:13:03Z","published":"2024-01-29T17:59:19Z","title":"PathMMU: A Massive Multimodal Expert-Level Benchmark for Understanding\n and Reasoning in Pathology","summary":" The emergence of large multimodal models has unlocked remarkable potential in\nAI, particularly in pathology. However, the lack of specialized, high-quality\nbenchmarks has impeded their development and precise evaluation. To address this, we\nintroduce PathMMU, the largest and highest-quality expert-validated pathology\nbenchmark for LMMs. It comprises 33,573 multimodal multi-choice questions and\n21,599 images from various sources, and an explanation for the correct answer\naccompanies each question. The construction of PathMMU capitalizes on the\nrobust capabilities of GPT-4V, utilizing approximately 30,000 gathered\nimage-caption pairs to generate Q\\&As. Significantly, to maximize PathMMU's\nauthority, we invite six pathologists to scrutinize each question under strict\nstandards in PathMMU's validation and test sets, while simultaneously setting\nan expert-level performance benchmark for PathMMU. We conduct extensive\nevaluations, including zero-shot assessments of 14 open-sourced and three\nclosed-sourced LMMs and their robustness to image corruption. We also fine-tune\nrepresentative LMMs to assess their adaptability to PathMMU. The empirical\nfindings indicate that advanced LMMs struggle with the challenging PathMMU\nbenchmark, with the top-performing LMM, GPT-4V, achieving only a 51.7\\%\nzero-shot performance, significantly lower than the 71.4\\% demonstrated by\nhuman pathologists. After fine-tuning, even open-sourced LMMs can surpass\nGPT-4V with a performance of over 60\\%, but still fall short of the expertise\nshown by pathologists. We hope that the PathMMU will offer valuable insights\nand foster the development of more specialized, next-generation LMMs for\npathology.\n","authors":["Yuxuan Sun","Hao Wu","Chenglu Zhu","Sunyi Zheng","Qizi Chen","Kai Zhang","Yunlong Zhang","Xiaoxiao Lan","Mengyue Zheng","Jingxiong Li","Xinheng Lyu","Tao Lin","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2401.16355v2.pdf","comment":"make source and method updates before resubmission"},{"id":"http://arxiv.org/abs/2402.03492v1","updated":"2024-02-05T20:08:53Z","published":"2024-02-05T20:08:53Z","title":"Beyond Strong labels: Weakly-supervised Learning Based on Gaussian\n Pseudo Labels for The Segmentation of Ellipse-like Vascular Structures in\n Non-contrast CTs","summary":" Deep-learning-based automated segmentation of vascular structures in\npreoperative CT scans contributes to computer-assisted diagnosis and\nintervention procedures in vascular diseases. While CT angiography (CTA) is the\ncommon standard, non-contrast CT imaging is significant as a contrast-risk-free\nalternative, avoiding complications associated with contrast agents. 
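Stage two of the attire/background pipeline above (inpaint masks derived from depth information) reduces, in the simplest reading, to thresholding a depth map. The depth convention (smaller values mean closer) and the quantile below are assumptions for illustration, not the paper's parameters.

```python
import numpy as np

def depth_to_background_mask(depth: np.ndarray,
                             fg_quantile: float = 0.35) -> np.ndarray:
    """Mark the farthest pixels as the region to repaint, assuming smaller
    depth values mean 'closer to the camera'."""
    threshold = np.quantile(depth, fg_quantile)
    return (depth > threshold).astype(np.uint8) * 255   # 255 = inpaint here
```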
However,\nthe challenges of labor-intensive labeling and high labeling variability due to\nthe ambiguity of vascular boundaries hinder conventional strong-label-based,\nfully-supervised learning in non-contrast CTs. This paper introduces a\nweakly-supervised framework using ellipses' topology in slices, including 1) an\nefficient annotation process based on predefined standards, 2) ellipse-fitting\nprocessing, 3) the generation of 2D Gaussian heatmaps serving as pseudo labels,\nand 4) a training process through a combination of voxel reconstruction loss and\ndistribution loss with the pseudo labels. We assess the effectiveness of the\nproposed method on one local and two public datasets comprising non-contrast CT\nscans, particularly focusing on the abdominal aorta. On the local dataset, our\nweakly-supervised learning approach based on pseudo labels outperforms\nstrong-label-based fully-supervised learning (by 1.54\\% in Dice score on average),\nreducing labeling time by around 82.0\\%. The efficiency in generating pseudo\nlabels allows the inclusion of label-agnostic external data in the training\nset, leading to an additional improvement in performance (2.74\\% in Dice score\non average) with a 66.3\\% reduction in labeling time, where the labeling time\nremains considerably less than that of strong labels. On the public dataset,\nthe pseudo labels achieve an overall improvement of 1.95\\% in Dice score for 2D\nmodels, and a reduction of 11.65 voxel spacings in Hausdorff distance for the 3D\nmodel.\n","authors":["Qixiang Ma","Antoine Łucas","Huazhong Shu","Adrien Kaladji","Pascal Haigron"],"pdf_url":"https://arxiv.org/pdf/2402.03492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02409v2","updated":"2024-02-05T19:56:54Z","published":"2023-12-05T00:48:31Z","title":"MGTR: Multi-Granular Transformer for Motion Prediction with LiDAR","summary":" Motion prediction has been an essential component of autonomous driving\nsystems since it handles highly uncertain and complex scenarios involving\nmoving agents of different types. In this paper, we propose a Multi-Granular\nTRansformer (MGTR) framework, an encoder-decoder network that exploits context\nfeatures in different granularities for different kinds of traffic agents. To\nfurther enhance MGTR's capabilities, we leverage LiDAR point cloud data by\nincorporating LiDAR semantic features from an off-the-shelf LiDAR feature\nextractor. We evaluate MGTR on the Waymo Open Dataset motion prediction benchmark\nand show that the proposed method achieved state-of-the-art performance,\nranking 1st on its leaderboard\n(https://waymo.com/open/challenges/2023/motion-prediction/).\n","authors":["Yiqian Gan","Hao Xiao","Yizhe Zhao","Ethan Zhang","Zhe Huang","Xin Ye","Lingting Ge"],"pdf_url":"https://arxiv.org/pdf/2312.02409v2.pdf","comment":"Accepted to ICRA 2024"},{"id":"http://arxiv.org/abs/2402.03478v1","updated":"2024-02-05T19:39:52Z","published":"2024-02-05T19:39:52Z","title":"Hyper-Diffusion: Estimating Epistemic and Aleatoric Uncertainty with a\n Single Model","summary":" Estimating and disentangling epistemic uncertainty (uncertainty that can be\nreduced with more training data) and aleatoric uncertainty (uncertainty that is\ninherent to the task at hand) is critically important when applying machine\nlearning (ML) to high-stakes applications such as medical imaging and weather\nforecasting. 
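Step 3 of the weakly-supervised framework above (2D Gaussian heatmaps as pseudo labels from fitted ellipses) can be sketched directly; the mapping from semi-axes to Gaussian sigmas below is an assumption, not necessarily the paper's choice.

```python
import numpy as np

def ellipse_gaussian_heatmap(h, w, cx, cy, a, b, theta):
    """Render a 2D anisotropic Gaussian pseudo label from a fitted ellipse
    with center (cx, cy), semi-axes a and b, and rotation theta (radians)."""
    ys, xs = np.mgrid[0:h, 0:w]
    x, y = xs - cx, ys - cy
    # Rotate image coordinates into the ellipse's own frame.
    xr = x * np.cos(theta) + y * np.sin(theta)
    yr = -x * np.sin(theta) + y * np.cos(theta)
    sx, sy = a / 2.0, b / 2.0   # assumed: sigma = semi-axis / 2
    return np.exp(-0.5 * ((xr / sx) ** 2 + (yr / sy) ** 2))
```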
Conditional diffusion models' breakthrough ability to accurately\nand efficiently sample from the posterior distribution of a dataset now makes\nuncertainty estimation conceptually straightforward: One need only train and\nsample from a large ensemble of diffusion models. Unfortunately, training such\nan ensemble becomes computationally intractable as the complexity of the model\narchitecture grows.\n In this work we introduce a new approach to ensembling, hyper-diffusion,\nwhich allows one to accurately estimate epistemic and aleatoric uncertainty\nwith a single model. Unlike existing Monte Carlo dropout based single-model\nensembling methods, hyper-diffusion offers the same prediction accuracy as\nmulti-model ensembles. We validate our approach on two distinct tasks: x-ray\ncomputed tomography (CT) reconstruction and weather temperature forecasting.\n","authors":["Matthew A. Chan","Maria J. Molina","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2402.03478v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.04655v3","updated":"2024-02-05T19:33:53Z","published":"2023-10-07T02:18:52Z","title":"VLATTACK: Multimodal Adversarial Attacks on Vision-Language Tasks via\n Pre-trained Models","summary":" Vision-Language (VL) pre-trained models have shown their superiority on many\nmultimodal tasks. However, the adversarial robustness of such models has not\nbeen fully explored. Existing approaches mainly focus on exploring the\nadversarial robustness under the white-box setting, which is unrealistic. In\nthis paper, we aim to investigate a new yet practical task to craft image and\ntext perturbations using pre-trained VL models to attack black-box fine-tuned\nmodels on different downstream tasks. Towards this end, we propose VLATTACK to\ngenerate adversarial samples by fusing perturbations of images and texts from\nboth single-modal and multimodal levels. At the single-modal level, we propose\na new block-wise similarity attack (BSA) strategy to learn image perturbations\nfor disrupting universal representations. Besides, we adopt an existing text\nattack strategy to generate text perturbations independent of the image-modal\nattack. At the multimodal level, we design a novel iterative cross-search\nattack (ICSA) method to update adversarial image-text pairs periodically,\nstarting with the outputs from the single-modal level. We conduct extensive\nexperiments to attack five widely-used VL pre-trained models for six tasks.\nExperimental results show that VLATTACK achieves the highest attack success\nrates on all tasks compared with state-of-the-art baselines, which reveals a\nblind spot in the deployment of pre-trained VL models. Source codes can be\nfound at https://github.com/ericyinyzy/VLAttack.\n","authors":["Ziyi Yin","Muchao Ye","Tianrong Zhang","Tianyu Du","Jinguo Zhu","Han Liu","Jinghui Chen","Ting Wang","Fenglong Ma"],"pdf_url":"https://arxiv.org/pdf/2310.04655v3.pdf","comment":"Accepted by NeurIPS 2023, 21 pages"},{"id":"http://arxiv.org/abs/2402.03473v1","updated":"2024-02-05T19:32:10Z","published":"2024-02-05T19:32:10Z","title":"Assessing the Efficacy of Invisible Watermarks in AI-Generated Medical\n Images","summary":" AI-generated medical images are gaining growing popularity due to their\npotential to address the data scarcity challenge in the real world. 
However,\nthe issue of accurate identification of these synthetic images, particularly\nwhen they exhibit remarkable realism comparable to their real counterparts, remains a concern.\nTo mitigate this challenge, image generators, such as DALLE and Imagen, have\nintegrated digital watermarks aimed at facilitating the discernment of\nsynthetic images' authenticity. These watermarks are embedded within the image\npixels and are invisible to the human eye while remaining detectable.\nNevertheless, a comprehensive investigation into the potential impact of these\ninvisible watermarks on the utility of synthetic medical images has been\nlacking. In this study, we propose the incorporation of invisible watermarks\ninto synthetic medical images and seek to evaluate their efficacy in the\ncontext of downstream classification tasks. Our goal is to pave the way for\ndiscussions on the viability of such watermarks in boosting the detectability\nof synthetic medical images, fortifying ethical standards, and safeguarding\nagainst data pollution and potential scams.\n","authors":["Xiaodan Xing","Huiyu Zhou","Yingying Fang","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2402.03473v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2402.03466v1","updated":"2024-02-05T19:21:52Z","published":"2024-02-05T19:21:52Z","title":"Physics-Encoded Graph Neural Networks for Deformation Prediction under\n Contact","summary":" In robotics, it's crucial to understand object deformation during tactile\ninteractions. A precise understanding of deformation can elevate robotic\nsimulations and have broad implications across different industries. We\nintroduce a method using Physics-Encoded Graph Neural Networks (GNNs) for such\npredictions. Similar to robotic grasping and manipulation scenarios, we focus\non modeling the dynamics between a rigid mesh contacting a deformable mesh\nunder external forces. Our approach represents both the soft body and the rigid\nbody within graph structures, where nodes hold the physical states of the\nmeshes. We also incorporate cross-attention mechanisms to capture the interplay\nbetween the objects. By jointly learning geometry and physics, our model\nreconstructs consistent and detailed deformations. We've made our code and\ndataset public to advance research in robotic simulation and grasping.\n","authors":["Mahdi Saleh","Michael Sommersperger","Nassir Navab","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2402.03466v1.pdf","comment":"Accepted at 2024 IEEE International Conference on Robotics and\n Automation (ICRA2024)"},{"id":"http://arxiv.org/abs/2402.03456v1","updated":"2024-02-05T19:09:33Z","published":"2024-02-05T19:09:33Z","title":"Constrained Multiview Representation for Self-supervised Contrastive\n Learning","summary":" Representation learning constitutes a pivotal cornerstone in contemporary\ndeep learning paradigms, offering a conduit to elucidate distinctive features\nwithin the latent space and interpret the deep models. Nevertheless, the\ninherent complexity of anatomical patterns and the random nature of lesion\ndistribution in medical image segmentation pose significant challenges to the\ndisentanglement of representations and the understanding of salient features.\nMethods guided by the maximization of mutual information, particularly within\nthe framework of contrastive learning, have demonstrated remarkable success and\nsuperiority in decoupling densely intertwined representations. 
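As a toy illustration of pixel-level invisible watermarking of the kind the abstract above discusses, a least-significant-bit scheme is the textbook example. Production watermarks (e.g., those used with DALLE or Imagen) are far more robust; this sketch is not their algorithm, only a demonstration of the "invisible to the eye, recoverable by a detector" idea.

```python
import numpy as np

def embed_lsb(img: np.ndarray, bits: np.ndarray) -> np.ndarray:
    """img: uint8 array; bits: 0/1 uint8 array with bits.size <= img.size.
    Overwrites the least-significant bits, which is imperceptible."""
    flat = img.reshape(-1).copy()
    flat[:bits.size] = (flat[:bits.size] & 0xFE) | bits
    return flat.reshape(img.shape)

def extract_lsb(img: np.ndarray, n: int) -> np.ndarray:
    # Recover the first n embedded bits.
    return img.reshape(-1)[:n] & 1
```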
However, the\neffectiveness of contrastive learning highly depends on the quality of the\npositive and negative sample pairs; unselected, averaged mutual information\namong multiple views would obstruct the learning strategy, so the selection of\nthe views is vital. In this work, we introduce a novel approach\npredicated on representation distance-based mutual information (MI)\nmaximization for measuring the significance of different views, aiming at\nconducting more efficient contrastive learning and representation\ndisentanglement. Additionally, we introduce an MI re-ranking strategy for\nrepresentation selection, benefiting both continuous MI estimation and\nrepresentation-significance distance measurement. Specifically, we harness\nmulti-view representations extracted from the frequency domain, re-evaluating\ntheir significance based on mutual information across varying frequencies,\nthereby facilitating a multifaceted contrastive learning approach to bolster\nsemantic comprehension. The statistical results under the five metrics\ndemonstrate that our proposed framework proficiently constrains the MI\nmaximization-driven representation selection and steers the multi-view\ncontrastive learning process.\n","authors":["Siyuan Dai","Kai Ye","Kun Zhao","Ge Cui","Haoteng Tang","Liang Zhan"],"pdf_url":"https://arxiv.org/pdf/2402.03456v1.pdf","comment":"11 pages, 9 figures, 2 algorithms"},{"id":"http://arxiv.org/abs/2402.03445v1","updated":"2024-02-05T19:00:45Z","published":"2024-02-05T19:00:45Z","title":"Denoising Diffusion via Image-Based Rendering","summary":" Generating 3D scenes is a challenging open problem, which requires\nsynthesizing plausible content that is fully consistent in 3D space. While\nrecent methods such as neural radiance fields excel at view synthesis and 3D\nreconstruction, they cannot synthesize plausible details in unobserved regions\nsince they lack a generative capability. Conversely, existing generative\nmethods are typically not capable of reconstructing detailed, large-scale\nscenes in the wild, as they use limited-capacity 3D scene representations,\nrequire aligned camera poses, or rely on additional regularizers. In this work,\nwe introduce the first diffusion model able to perform fast, detailed\nreconstruction and generation of real-world 3D scenes. To achieve this, we make\nthree contributions. First, we introduce a new neural scene representation,\nIB-planes, that can efficiently and accurately represent large 3D scenes,\ndynamically allocating more capacity as needed to capture details visible in\neach image. Second, we propose a denoising-diffusion framework to learn a prior\nover this novel 3D scene representation, using only 2D images without the need\nfor any additional supervision signal such as masks or depths. This supports 3D\nreconstruction and generation in a unified architecture. Third, we develop a\nprincipled approach to avoid trivial 3D solutions when integrating the\nimage-based rendering with the diffusion model, by dropping out representations\nof some images. We evaluate the model on several challenging datasets of real\nand synthetic images, and demonstrate superior results on generation, novel\nview synthesis and 3D reconstruction.\n","authors":["Titas Anciukevicius","Fabian Manhardt","Federico Tombari","Paul Henderson"],"pdf_url":"https://arxiv.org/pdf/2402.03445v1.pdf","comment":"Accepted at ICLR 2024. 
Project page:\n https://anciukevicius.github.io/generative-image-based-rendering"},{"id":"http://arxiv.org/abs/2402.03417v1","updated":"2024-02-05T18:53:54Z","published":"2024-02-05T18:53:54Z","title":"A Computer Vision Based Approach for Stalking Detection Using a\n CNN-LSTM-MLP Hybrid Fusion Model","summary":" Criminal and suspicious activity detection has become a popular research\ntopic in recent years. The rapid growth of computer vision technologies has had\na crucial impact on solving this issue. However, physical stalking detection is\nstill a less explored area despite the evolution of modern technology.\nNowadays, stalking in public places has become a common occurrence with women\nbeing the most affected. Stalking is a visible action that usually occurs\nbefore any crime begins, as the stalker follows, loiters around,\nand stares at the victim before committing any criminal activity such as\nassault, kidnapping, rape, and so on. Therefore, it has become a necessity to\ndetect stalking, as all of these criminal activities can be stopped in the first\nplace through stalking detection. In this research, we propose a novel deep\nlearning-based hybrid fusion model to detect potential stalkers from a single\nvideo with a minimal number of frames. We extract multiple relevant features,\nsuch as facial landmarks, head pose estimation, and relative distance, as\nnumerical values from video frames. This data is fed into a multilayer\nperceptron (MLP) to perform a classification task between a stalking and a\nnon-stalking scenario. Simultaneously, the video frames are fed into a\ncombination of convolutional and LSTM models to extract the spatio-temporal\nfeatures. We use a fusion of these numerical and spatio-temporal features to\nbuild a classifier to detect stalking incidents. Additionally, we introduce a\ndataset consisting of stalking and non-stalking videos gathered from various\nfeature films and television series, which is also used to train the model. The\nexperimental results show the efficiency and dynamism of our proposed stalker\ndetection system, achieving 89.58% testing accuracy, a significant\nimprovement compared to state-of-the-art approaches.\n","authors":["Murad Hasan","Shahriar Iqbal","Md. Billal Hossain Faisal","Md. Musnad Hossin Neloy","Md. Tonmoy Kabir","Md. Tanzim Reza","Md. Golam Rabiul Alam","Md Zia Uddin"],"pdf_url":"https://arxiv.org/pdf/2402.03417v1.pdf","comment":"Under review for publication in the PLOS ONE journal, 17 pages, 9\n figures"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2402.03277v1","updated":"2024-02-05T18:35:16Z","published":"2024-02-05T18:35:16Z","title":"Event-based Product Carousel Recommendation with Query-Click Graph","summary":" Many current recommender systems mainly focus on product-to-product\nand user-to-product recommendations, even during the time of events, rather\nthan modeling recommendations for the target event itself\n(e.g., festivals, seasonal activities, or social activities) and the\nmultiple aspects of the shopping demands for the target event. Product\nrecommendations for the multiple aspects of the target event are usually\ngenerated by human curators who manually identify the aspects and select a list\nof aspect-related products (i.e., product carousel) for each aspect as\nrecommendations. 
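The hybrid fusion architecture described in the stalking-detection abstract above (an MLP over per-frame numeric features fused with a CNN+LSTM over raw frames) can be sketched in PyTorch; all layer sizes and names here are assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

class HybridFusionNet(nn.Module):
    """Hypothetical CNN-LSTM-MLP fusion: numeric features (landmarks, head
    pose, relative distance) go through an MLP; frames go through a small
    CNN per frame, then an LSTM; both branches are concatenated."""
    def __init__(self, num_feats: int = 10, hidden: int = 64):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(num_feats, hidden), nn.ReLU(),
                                 nn.Linear(hidden, hidden))
        self.cnn = nn.Sequential(nn.Conv2d(3, 16, 3, stride=2), nn.ReLU(),
                                 nn.AdaptiveAvgPool2d(4), nn.Flatten())  # -> 256
        self.lstm = nn.LSTM(input_size=16 * 16, hidden_size=hidden,
                            batch_first=True)
        self.head = nn.Linear(2 * hidden, 2)  # stalking vs non-stalking

    def forward(self, frames: torch.Tensor, feats: torch.Tensor):
        # frames: [B, T, 3, H, W]; feats: [B, num_feats]
        B, T = frames.shape[:2]
        per_frame = self.cnn(frames.flatten(0, 1)).reshape(B, T, -1)
        _, (h, _) = self.lstm(per_frame)          # temporal summary
        return self.head(torch.cat([h[-1], self.mlp(feats)], dim=-1))
```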
However, building a recommender system with machine learning\nis non-trivial due to the lack of both the ground truth of event-related\naspects and the aspect-related products. To fill this gap, we define the novel\nproblem of event-based product carousel recommendation in e-commerce and\npropose an effective recommender system based on the query-click bipartite\ngraph. We apply the iterative clustering algorithm over the query-click\nbipartite graph and infer the event-related aspects by the clusters of queries.\nThe aspect-related recommendations are powered by the click-through rate of\nproducts regarding each aspect. We show through experiments that this approach\neffectively mines product carousels for the target event.\n","authors":["Luyi Ma","Nimesh Sinha","Parth Vajge","Jason HD Cho","Sushant Kumar","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2402.03277v1.pdf","comment":"7 pages, 2 figures, 2021 IEEE International Conference on Big Data\n (Big Data)"},{"id":"http://arxiv.org/abs/2402.03190v1","updated":"2024-02-05T16:56:11Z","published":"2024-02-05T16:56:11Z","title":"Unified Hallucination Detection for Multimodal Large Language Models","summary":" Despite significant strides in multimodal tasks, Multimodal Large Language\nModels (MLLMs) are plagued by the critical issue of hallucination. The reliable\ndetection of such hallucinations in MLLMs has, therefore, become a vital aspect\nof model evaluation and the safeguarding of practical application deployment.\nPrior research in this domain has been constrained by a narrow focus on\nsingular tasks, an inadequate range of hallucination categories addressed, and\na lack of detailed granularity. In response to these challenges, our work\nexpands the investigative horizons of hallucination detection. We present a\nnovel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate\nthe evaluation of advancements in hallucination detection methods.\nAdditionally, we unveil a novel unified multimodal hallucination detection\nframework, UNIHD, which leverages a suite of auxiliary tools to validate the\noccurrence of hallucinations robustly. We demonstrate the effectiveness of\nUNIHD through meticulous evaluation and comprehensive analysis. We also provide\nstrategic insights on the application of specific tools for addressing various\ncategories of hallucinations.\n","authors":["Xiang Chen","Chenxi Wang","Yida Xue","Ningyu Zhang","Xiaoyan Yang","Qiang Li","Yue Shen","Jinjie Gu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03190v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2402.03176v1","updated":"2024-02-05T16:43:53Z","published":"2024-02-05T16:43:53Z","title":"Comparison of Topic Modelling Approaches in the Banking Context","summary":" Topic modelling is a prominent task for automatic topic extraction in many\napplications such as sentiment analysis and recommendation systems. The\napproach is vital for service industries to monitor their customer discussions.\nThe use of traditional approaches such as Latent Dirichlet Allocation (LDA) for\ntopic discovery has shown great performance; however, these approaches are not\nconsistent in their results, as they suffer from data sparseness and an inability\nto model word order in a document. Thus, this study presents the use of\nKernel Principal Component Analysis (KernelPCA) and K-means Clustering in the\nBERTopic architecture. 
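The carousel pipeline above (cluster queries on the query-click bipartite graph into aspects, then rank products per aspect by click-through rate) suggests a straightforward sketch; the input formats below are assumptions for illustration.

```python
from collections import defaultdict

def build_carousels(query_to_aspect, click_log, top_n=5):
    """query_to_aspect: {query: aspect_id} from clustering the query-click
    bipartite graph; click_log: iterable of (query, product, clicks, views).
    Returns one CTR-ranked product carousel per inferred aspect."""
    stats = defaultdict(lambda: defaultdict(lambda: [0, 0]))  # aspect -> product -> [clicks, views]
    for query, product, clicks, views in click_log:
        aspect = query_to_aspect.get(query)
        if aspect is None:
            continue
        entry = stats[aspect][product]
        entry[0] += clicks
        entry[1] += views
    carousels = {}
    for aspect, products in stats.items():
        ranked = sorted(products,
                        key=lambda p: products[p][0] / max(products[p][1], 1),
                        reverse=True)
        carousels[aspect] = ranked[:top_n]
    return carousels
```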
We have prepared a new dataset using tweets from\ncustomers of Nigerian banks and we use this to compare the topic modelling\napproaches. Our findings showed that KernelPCA and K-means in the BERTopic\narchitecture produced coherent topics with a coherence score of 0.8463.\n","authors":["Bayode Ogunleye","Tonderai Maswera","Laurence Hirsch","Jotham Gaudoin","Teresa Brunsdon"],"pdf_url":"https://arxiv.org/pdf/2402.03176v1.pdf","comment":"14 pages, Journal of Applied Science"},{"id":"http://arxiv.org/abs/2402.03163v1","updated":"2024-02-05T16:31:03Z","published":"2024-02-05T16:31:03Z","title":"Linguistic features for sentence difficulty prediction in ABSA","summary":" One of the challenges of natural language understanding is to deal with the\nsubjectivity of sentences, which may express opinions and emotions that add\nlayers of complexity and nuance. Sentiment analysis is a field that aims to\nextract and analyze these subjective elements from text, and it can be applied\nat different levels of granularity, such as document, paragraph, sentence, or\naspect. Aspect-based sentiment analysis is a well-studied topic with many\navailable data sets and models. However, there is no clear definition of what\nmakes a sentence difficult for aspect-based sentiment analysis. In this paper,\nwe explore this question by conducting an experiment with three data sets:\n\"Laptops\", \"Restaurants\", and \"MTSC\" (Multi-Target-dependent Sentiment\nClassification), and a merged version of these three datasets. We study the\nimpact of domain diversity and syntactic diversity on difficulty. We use a\ncombination of classifiers to identify the most difficult sentences and analyze\ntheir characteristics. We employ two ways of defining sentence difficulty. The\nfirst one is binary and labels a sentence as difficult if the classifiers fail\nto correctly predict the sentiment polarity. The second one is a six-level\nscale based on how many of the top five best-performing classifiers can\ncorrectly predict the sentiment polarity. We also define 9 linguistic features\nthat, combined, aim at estimating the difficulty at sentence level.\n","authors":["Adrian-Gabriel Chifu","Sébastien Fournier"],"pdf_url":"https://arxiv.org/pdf/2402.03163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12420v2","updated":"2024-02-05T16:06:14Z","published":"2023-08-23T20:42:32Z","title":"Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature","summary":" As Distributed Ledger Technologies (DLTs) rapidly evolve, their impacts\nextend beyond technology, influencing environmental and societal aspects. This\nevolution has increased publications, making manual literature analysis\nincreasingly challenging. We address this with a Natural Language Processing\n(NLP)-based systematic literature review method to explore the intersection of\nDistributed Ledger Technology (DLT) with its Environmental, Social, and\nGovernance (ESG) aspects. Our approach involves building and refining a\ndirected citation network from 107 seed papers to a corpus of 24,539\npublications and fine-tuning a transformer-based language model for Named\nEntity Recognition (NER) on DLT and ESG domains. Applying this model, we\ndistilled the corpus to 505 key publications, enabling an inaugural literature\nreview and temporal graph analysis of DLT's evolution in ESG contexts. 
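The two difficulty definitions in the ABSA abstract above translate directly into scoring functions; which classifiers feed them, and how ties are handled, are assumptions left to the reader.

```python
def difficulty_six_level(top5_predictions, gold) -> int:
    """Six-level scale: 0 (all five best classifiers correct, easiest)
    up to 5 (none correct, hardest)."""
    correct = sum(1 for p in top5_predictions if p == gold)
    return 5 - correct

def is_difficult_binary(predictions, gold) -> bool:
    """Binary definition: difficult if the classifiers fail to recover the
    gold sentiment polarity (read here as: none of them predicts it)."""
    return all(p != gold for p in predictions)
```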
Our\ncontributions include an adaptable and scalable NLP-driven systematic\nliterature review methodology and a unique NER dataset of 54,808 entities,\ntailored for DLT and ESG research. Our inaugural literature review demonstrates\ntheir applicability and effectiveness in analyzing DLT's evolution and impacts,\nproving invaluable for stakeholders in the DLT domain.\n","authors":["Walter Hernandez","Kamil Tylinski","Alastair Moore","Niall Roche","Nikhil Vadgama","Horst Treiblmaier","Jiangbo Shangguan","Paolo Tasca","Jiahua Xu"],"pdf_url":"https://arxiv.org/pdf/2308.12420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03049v1","updated":"2024-02-05T14:33:56Z","published":"2024-02-05T14:33:56Z","title":"EasyInstruct: An Easy-to-use Instruction Processing Framework for Large\n Language Models","summary":" In recent years, instruction tuning has gained increasing attention and\nemerged as a crucial technique to enhance the capabilities of Large Language\nModels (LLMs). To construct high-quality instruction datasets, many instruction\nprocessing approaches have been proposed, aiming to achieve a delicate balance\nbetween data quantity and data quality. Nevertheless, due to inconsistencies\nthat persist among various instruction processing methods, there is no standard\nopen-source instruction processing implementation framework available for the\ncommunity, which hinders practitioners from further developing and advancing.\nTo facilitate instruction processing research and development, we present\nEasyInstruct, an easy-to-use instruction processing framework for LLMs, which\nmodularizes instruction generation, selection, and prompting, while also\nconsidering their combination and interaction. EasyInstruct is publicly\nreleased and actively maintained at https://github.com/zjunlp/EasyInstruct,\nalong with a running demo App at\nhttps://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for\nbroader research centered on instruction data.\n","authors":["Yixin Ou","Ningyu Zhang","Honghao Gui","Ziwen Xu","Shuofei Qiao","Zhen Bi","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03049v1.pdf","comment":"Ongoing work; the project website is at\n https://zjunlp.github.io/project/EasyInstruct, code is at\n https://github.com/zjunlp/EasyInstruct, demo is at\n https://huggingface.co/spaces/zjunlp/EasyInstruct"},{"id":"http://arxiv.org/abs/2402.03025v1","updated":"2024-02-05T14:06:15Z","published":"2024-02-05T14:06:15Z","title":"Understanding and Guiding Weakly Supervised Entity Alignment with\n Potential Isomorphism Propagation","summary":" Weakly Supervised Entity Alignment (EA) is the task of identifying equivalent\nentities across diverse knowledge graphs (KGs) using only a limited number of\nseed alignments. Despite substantial advances in aggregation-based weakly\nsupervised EA, the underlying mechanisms in this setting remain unexplored. In\nthis paper, we present a propagation perspective to analyze weakly supervised\nEA and explain the existing aggregation-based EA models. Our theoretical\nanalysis reveals that these models essentially seek propagation operators for\npairwise entity similarities. We further prove that, despite the structural\nheterogeneity of different KGs, the potentially aligned entities within\naggregation-based EA models have isomorphic subgraphs, which is the core\npremise of EA but has not been investigated. 
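The propagation view of weakly supervised entity alignment described above can be made concrete with a small sketch. This is a generic neighborhood-propagation scheme over pairwise similarities, not PipEA's exact operator; the damping factor and toy graphs are assumptions:

# Generic sketch: propagating pairwise entity similarities through the
# neighborhoods of two knowledge graphs (not the paper's actual operator).
import numpy as np

def propagate_similarities(S, A1, A2, alpha=0.5, iters=10):
    # S: |G1| x |G2| seed similarity matrix; A1, A2: row-normalized adjacencies.
    for _ in range(iters):
        S = alpha * (A1 @ S @ A2.T) + (1 - alpha) * S  # neighbors of aligned pairs grow similar
    return S

A1 = np.array([[0.0, 1.0], [1.0, 0.0]])
A2 = np.array([[0.0, 1.0], [1.0, 0.0]])
S0 = np.array([[1.0, 0.0], [0.0, 0.0]])               # one seed alignment
print(propagate_similarities(S0, A1, A2))             # similarity spreads to neighbor pair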
Leveraging this insight, we\nintroduce a potential isomorphism propagation operator to enhance the\npropagation of neighborhood information across KGs. We develop a general EA\nframework, PipEA, incorporating this operator to improve the accuracy of every\ntype of aggregation-based model without altering the learning process.\nExtensive experiments substantiate our theoretical findings and demonstrate\nPipEA's significant performance gains over state-of-the-art weakly supervised\nEA methods. Our work not only advances the field but also enhances our\ncomprehension of aggregation-based weakly supervised EA.\n","authors":["Yuanyi Wang","Wei Tang","Haifeng Sun","Zirui Zhuang","Xiaoyuan Fu","Jingyu Wang","Qi Qi","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2402.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02932v1","updated":"2024-02-05T11:55:30Z","published":"2024-02-05T11:55:30Z","title":"Domain Adaptation of Multilingual Semantic Search -- Literature Review","summary":" This literature review gives an overview of current approaches to perform\ndomain adaptation in a low-resource setting and approaches to perform\nmultilingual semantic search in a low-resource setting. We developed a new\ntypology to cluster domain adaptation approaches based on the part of dense\ntextual information retrieval systems which they adapt, focusing on how to\ncombine them efficiently. We also explore the possibilities of combining\nmultilingual semantic search with domain adaptation approaches for dense\nretrievers in a low-resource setting.\n","authors":["Anna Bringmann","Anastasia Zhukova"],"pdf_url":"https://arxiv.org/pdf/2402.02932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02855v1","updated":"2024-02-05T10:16:20Z","published":"2024-02-05T10:16:20Z","title":"Dynamic Sparse Learning: A Novel Paradigm for Efficient Recommendation","summary":" In the realm of deep learning-based recommendation systems, the increasing\ncomputational demands, driven by the growing number of users and items, pose a\nsignificant challenge to practical deployment. This challenge is primarily\ntwofold: reducing the model size while effectively learning user and item\nrepresentations for efficient recommendations. Despite considerable\nadvancements in model compression and architecture search, prevalent approaches\nface notable constraints. These include substantial additional computational\ncosts from pre-training/re-training in model compression and an extensive\nsearch space in architecture design. Additionally, managing complexity and\nadhering to memory constraints is problematic, especially in scenarios with\nstrict time or space limitations. Addressing these issues, this paper\nintroduces a novel learning paradigm, Dynamic Sparse Learning (DSL), tailored\nfor recommendation models. DSL innovatively trains a lightweight sparse model\nfrom scratch, periodically evaluating and dynamically adjusting each weight's\nsignificance and the model's sparsity distribution during the training. This\napproach ensures a consistent and minimal parameter budget throughout the full\nlearning lifecycle, paving the way for \"end-to-end\" efficiency from training to\ninference. Our extensive experimental results underline DSL's effectiveness,\nsignificantly reducing training and inference costs while delivering comparable\nrecommendation performance.\n","authors":["Shuyao Wang","Yongduo Sui","Jiancan Wu","Zhi Zheng","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2402.02855v1.pdf","comment":"10 pages, 5 figures, 4 tables.
Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2402.02844v1","updated":"2024-02-05T09:57:15Z","published":"2024-02-05T09:57:15Z","title":"Comparing Knowledge Sources for Open-Domain Scientific Claim\n Verification","summary":" The increasing rate at which scientific knowledge is discovered and health\nclaims shared online has highlighted the importance of developing efficient\nfact-checking systems for scientific claims. The usual setting for this task in\nthe literature assumes that the documents containing the evidence for claims\nare already provided and annotated or contained in a limited corpus. This\nrenders the systems unrealistic for real-world settings where knowledge sources\nwith potentially millions of documents need to be queried to find relevant\nevidence. In this paper, we perform an array of experiments to test the\nperformance of open-domain claim verification systems. We test the final\nverdict prediction of systems on four datasets of biomedical and health claims\nin different settings. While keeping the pipeline's evidence selection and\nverdict prediction parts constant, document retrieval is performed over three\ncommon knowledge sources (PubMed, Wikipedia, Google) and using two different\ninformation retrieval techniques. We show that PubMed works better with\nspecialized biomedical claims, while Wikipedia is more suited for everyday\nhealth concerns. Likewise, BM25 excels in retrieval precision, while semantic\nsearch excels in recall of relevant evidence. We discuss the results, outline\nfrequent retrieval patterns and challenges, and provide promising future\ndirections.\n","authors":["Juraj Vladika","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2402.02844v1.pdf","comment":"Accepted to EACL 2024"},{"id":"http://arxiv.org/abs/2402.02842v1","updated":"2024-02-05T09:53:08Z","published":"2024-02-05T09:53:08Z","title":"Trinity: Syncretizing Multi-/Long-tail/Long-term Interests All in One","summary":" Interest modeling in recommender systems has been a constant topic for\nimproving user experience, and typical interest modeling tasks (e.g.\nmulti-interest, long-tail interest and long-term interest) have been\ninvestigated in many existing works. However, most of them only consider one\ninterest in isolation, while neglecting their interrelationships. In this\npaper, we argue that these tasks suffer from a common \"interest amnesia\"\nproblem, and a solution exists to mitigate it simultaneously. We figure that\nlong-term cues can be the cornerstone since they reveal multi-interest and\nclarify long-tail interest. Inspired by the observation, we propose a novel and\nunified framework in the retrieval stage, \"Trinity\", to solve the interest\namnesia problem and improve multiple interest modeling tasks. We construct a\nreal-time clustering system that enables us to project items into enumerable\nclusters, and calculate statistical interest histograms over these clusters.\nBased on these histograms, Trinity recognizes underdelivered themes and remains\nstable when facing emerging hot topics. Trinity is more appropriate for\nlarge-scale industry scenarios because of its modest computational overheads.\nIts derived retrievers have been deployed on the recommender system of Douyin,\nsignificantly improving user experience and retention.
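The cluster-histogram idea at the core of Trinity can be illustrated with a tiny sketch. Everything here (cluster names, click logs, the under-delivery rule) is an illustrative assumption, not the deployed system:

# Sketch: items map to clusters; a user's long-term behaviors form a statistical
# interest histogram; under-delivered themes are clusters the user engages with
# but rarely receives in recent recommendations.
from collections import Counter

item_to_cluster = {"i1": "c_anime", "i2": "c_anime", "i3": "c_news", "i4": "c_food"}
long_term_clicks = ["i1", "i2", "i1", "i3"]
recent_impressions = ["i3", "i4", "i3"]

interest = Counter(item_to_cluster[i] for i in long_term_clicks)
delivered = Counter(item_to_cluster[i] for i in recent_impressions)

underdelivered = {c: n for c, n in interest.items() if delivered.get(c, 0) < n}
print(underdelivered)  # e.g. {'c_anime': 3} -> boost retrieval from this cluster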
We believe that such\npractical experience can be well generalized to other scenarios.\n","authors":["Jing Yan","Liu Jiang","Jianfei Cui","Zhichen Zhao","Xingyan Bin","Feng Zhang","Zuotao Liu"],"pdf_url":"https://arxiv.org/pdf/2402.02842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02816v1","updated":"2024-02-05T08:56:24Z","published":"2024-02-05T08:56:24Z","title":"Intersectional Two-sided Fairness in Recommendation","summary":" Fairness of recommender systems (RS) has attracted increasing attention\nrecently. Based on the involved stakeholders, the fairness of RS can be divided\ninto user fairness, item fairness, and two-sided fairness which considers both\nuser and item fairness simultaneously. However, we argue that the\nintersectional two-sided unfairness may still exist even if the RS is two-sided\nfair, which is observed and shown by empirical studies on real-world data in\nthis paper, and has not been well-studied previously. To mitigate this problem,\nwe propose a novel approach called Intersectional Two-sided Fairness\nRecommendation (ITFR). Our method utilizes a sharpness-aware loss to perceive\ndisadvantaged groups, and then uses collaborative loss balance to develop\nconsistent distinguishing abilities for different intersectional groups.\nAdditionally, predicted score normalization is leveraged to align positive\npredicted scores to fairly treat positives in different intersectional groups.\nExtensive experiments and analyses on three public datasets show that our\nproposed approach effectively alleviates the intersectional two-sided\nunfairness and consistently outperforms previous state-of-the-art methods.\n","authors":["Yifan Wang","Peijie Sun","Weizhi Ma","Min Zhang","Yuan Zhang","Peng Jiang","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2402.02816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02803v1","updated":"2024-02-05T08:25:22Z","published":"2024-02-05T08:25:22Z","title":"Large Language Model Distilling Medication Recommendation Model","summary":" The recommendation of medication is a vital aspect of intelligent healthcare\nsystems, as it involves prescribing the most suitable drugs based on a\npatient's specific health needs. Unfortunately, many sophisticated models\ncurrently in use tend to overlook the nuanced semantics of medical data, while\nonly relying heavily on identities. Furthermore, these models face significant\nchallenges in handling cases involving patients who are visiting the hospital\nfor the first time, as they lack prior prescription histories to draw upon. To\ntackle these issues, we harness the powerful semantic comprehension and\ninput-agnostic characteristics of Large Language Models (LLMs). Our research\naims to transform existing medication recommendation methodologies using LLMs.\nIn this paper, we introduce a novel approach called Large Language Model\nDistilling Medication Recommendation (LEADER). We begin by creating appropriate\nprompt templates that enable LLMs to suggest medications effectively. However,\nthe straightforward integration of LLMs into recommender systems leads to an\nout-of-corpus issue specific to drugs. We handle it by adapting the LLMs with a\nnovel output layer and a refined tuning loss function. Although LLM-based\nmodels exhibit remarkable capabilities, they are plagued by high computational\ncosts during inference, which is impractical for the healthcare sector. 
To\nmitigate this, we have developed a feature-level knowledge distillation\ntechnique, which transfers the LLM's proficiency to a more compact model.\nExtensive experiments conducted on two real-world datasets, MIMIC-III and\nMIMIC-IV, demonstrate that our proposed model not only delivers effective\nresults but also is efficient. To ease the reproducibility of our experiments,\nwe release the implementation code online.\n","authors":["Qidong Liu","Xian Wu","Xiangyu Zhao","Yuanshao Zhu","Zijian Zhang","Feng Tian","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.02803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01253v2","updated":"2024-02-05T07:25:33Z","published":"2024-02-02T09:20:48Z","title":"RimiRec: Modeling Refined Multi-interest in Hierarchical Structure for\n Recommendation","summary":" Industrial recommender systems usually consist of the retrieval stage and the\nranking stage, to handle the billion-scale of users and items. The retrieval\nstage retrieves candidate items relevant to user interests for recommendations\nand has attracted much attention. Frequently, a user shows refined\nmulti-interests in a hierarchical structure. For example, a user likes Conan\nand Kuroba Kaito, which are the roles in hierarchical structure \"Animation,\nJapanese Animation, Detective Conan\". However, most existing methods ignore\nthis hierarchical nature, and simply average the fine-grained interest\ninformation. Therefore, we propose a novel two-stage approach to explicitly\nmodeling refined multi-interest in a hierarchical structure for recommendation.\nIn the first hierarchical multi-interest mining stage, the hierarchical\nclustering and transformer-based model adaptively generate circles or\nsub-circles that users are interested in. In the second stage, the partition of\nretrieval space allows the EBR models to deal only with items within each\ncircle and accurately capture users' refined interests. Experimental results\nshow that the proposed approach achieves state-of-the-art performance. Our\nframework has also been deployed at Lofter.\n","authors":["Haolei Pei","Yuanyuan Xu","Yangping Zhu","Yuan Nie"],"pdf_url":"https://arxiv.org/pdf/2402.01253v2.pdf","comment":"4 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.02764v1","updated":"2024-02-05T06:52:53Z","published":"2024-02-05T06:52:53Z","title":"List-aware Reranking-Truncation Joint Model for Search and\n Retrieval-augmented Generation","summary":" The results of information retrieval (IR) are usually presented in the form\nof a ranked list of candidate documents, such as web search for humans and\nretrieval-augmented generation for large language models (LLMs). List-aware\nretrieval aims to capture the list-level contextual features to return a better\nlist, mainly including reranking and truncation. Reranking finely re-scores the\ndocuments in the list. Truncation dynamically determines the cut-off point of\nthe ranked list to achieve the trade-off between overall relevance and avoiding\nmisinformation from irrelevant documents. Previous studies treat them as two\nseparate tasks and model them separately. However, the separation is not\noptimal. First, it is hard to share the contextual information of the ranking\nlist between the two tasks. Second, the separate pipeline usually meets the\nerror accumulation problem, where the small error from the reranking stage can\nlargely affect the truncation stage. 
To solve these problems, we propose a\nReranking-Truncation joint model (GenRT) that can perform the two tasks\nconcurrently. GenRT integrates reranking and truncation via a generative\nparadigm based on an encoder-decoder architecture. We also design novel loss\nfunctions for joint optimization to make the model learn both tasks. Sharing\nparameters in the joint model is conducive to making full use of the common\nmodeling information of the two tasks. Besides, the two tasks are performed\nconcurrently and co-optimized to solve the error accumulation problem between\nseparate stages. Experiments on public learning-to-rank benchmarks and\nopen-domain Q\&A tasks show that our method achieves SOTA performance on both\nreranking and truncation tasks for web search and retrieval-augmented LLMs.\n","authors":["Shicheng Xu","Liang Pang","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.02764v1.pdf","comment":"Accepted by WWW 2024"},{"id":"http://arxiv.org/abs/2401.02130v3","updated":"2024-02-05T06:25:44Z","published":"2024-01-04T08:31:47Z","title":"Spectral-Based Graph Neural Networks for Complementary Item\n Recommendation","summary":" Modeling complementary relationships greatly helps recommender systems to\naccurately and promptly recommend the subsequent items when one item is\npurchased. Unlike traditional similar relationships, items with complementary\nrelationships may be purchased successively (such as iPhone and Airpods Pro),\nand they not only share relevance but also exhibit dissimilarity. Since the two\nattributes are opposites, modeling complementary relationships is challenging.\nPrevious attempts to exploit these relationships have either ignored or\noversimplified the dissimilarity attribute, resulting in ineffective modeling\nand an inability to balance the two attributes. Since Graph Neural Networks\n(GNNs) can capture the relevance and dissimilarity between nodes in the\nspectral domain, we can leverage spectral-based GNNs to effectively understand\nand model complementary relationships. In this study, we present a novel\napproach called Spectral-based Complementary Graph Neural Networks (SComGNN)\nthat utilizes the spectral properties of complementary item graphs. We make the\nfirst observation that complementary relationships consist of low-frequency and\nmid-frequency components, corresponding to the relevance and dissimilarity\nattributes, respectively. Based on this spectral observation, we design\nspectral graph convolutional networks with low-pass and mid-pass filters to\ncapture the low-frequency and mid-frequency components. Additionally, we\npropose a two-stage attention mechanism to adaptively integrate and balance the\ntwo attributes. Experimental results on four e-commerce datasets demonstrate\nthe effectiveness of our model, with SComGNN significantly outperforming\nexisting baseline models.\n","authors":["Haitong Luo","Xuying Meng","Suhang Wang","Hanyun Cao","Weiyao Zhang","Yequan Wang","Yujun Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.02130v3.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2402.02718v1","updated":"2024-02-05T04:28:08Z","published":"2024-02-05T04:28:08Z","title":"Denoising Time Cycle Modeling for Recommendation","summary":" Recently, modeling temporal patterns of user-item interactions has attracted\nmuch attention in recommender systems. We argue that existing methods ignore\nthe variety of temporal patterns of user behaviors.
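The low-pass/mid-pass spectral split that SComGNN builds on can be shown directly on a toy graph. This is a hedged sketch of the spectral intuition only (frequency cutoffs and the graph are assumptions, not the authors' filters):

# Sketch: split a graph signal into low- and mid-frequency components via the
# eigendecomposition of the symmetric normalized Laplacian.
import numpy as np

A = np.array([[0, 1, 1, 0], [1, 0, 1, 0], [1, 1, 0, 1], [0, 0, 1, 0]], dtype=float)
d = A.sum(axis=1)
L = np.eye(4) - A / np.sqrt(np.outer(d, d))       # eigenvalues lie in [0, 2]
w, U = np.linalg.eigh(L)

x = np.random.default_rng(0).normal(size=(4, 1))  # a node feature (graph signal)
low = U @ np.diag((w < 0.7).astype(float)) @ (U.T @ x)                # "relevance"
mid = U @ np.diag(((w >= 0.7) & (w < 1.3)).astype(float)) @ (U.T @ x) # "dissimilarity"

A learned filter would replace the hard masks with polynomial responses over w, but the decomposition into frequency bands is the same.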
We define the subset of\nuser behaviors that are irrelevant to the target item as noise, which limits\nthe performance of target-related time cycle modeling and affects the\nrecommendation performance. In this paper, we propose Denoising Time Cycle\nModeling (DiCycle), a novel approach to denoise user behaviors and select the\nsubset of user behaviors that are highly related to the target item. DiCycle is\nable to explicitly model diverse time cycle patterns for recommendation.\nExtensive experiments are conducted on both public benchmarks and a real-world\ndataset, demonstrating the superior performance of DiCycle over\nstate-of-the-art recommendation methods.\n","authors":["Sicong Xie","Qunwei Li","Weidi Xu","Kaiming Shen","Shaohu Chen","Wenliang Zhong"],"pdf_url":"https://arxiv.org/pdf/2402.02718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03486v1","updated":"2024-02-05T19:58:40Z","published":"2024-02-05T19:58:40Z","title":"Early prediction of onset of sepsis in Clinical Setting","summary":" This study proposes the use of Machine Learning models to predict the early\nonset of sepsis using deidentified clinical data from Montefiore Medical Center\nin Bronx, NY, USA. A supervised learning approach was adopted, wherein an\nXGBoost model was trained utilizing 80\% of the train dataset, encompassing 107\nfeatures (including the original and derived features). Subsequently, the model\nwas evaluated on the remaining 20\% of the test data. The model was validated\non prospective data that was entirely unseen during the training phase. To\nassess the model's performance at the individual patient level and timeliness\nof the prediction, a normalized utility score was employed, a widely recognized\nscoring methodology for sepsis detection, as outlined in the PhysioNet Sepsis\nChallenge paper. Metrics such as F1 Score, Sensitivity, Specificity, and Flag\nRate were also devised. The model achieved a normalized utility score of 0.494\non test data and 0.378 on prospective data at threshold 0.3. The F1 scores were\n80.8\% and 67.1\% respectively for the test data and the prospective data for\nthe same threshold, highlighting its potential to be integrated into clinical\ndecision-making processes effectively. These results are a testament to the\nmodel's robust predictive capabilities and its potential to substantially\nimpact clinical decision-making processes.\n","authors":["Fahim Mohammad","Lakshmi Arunachalam","Samanway Sadhu","Boudewijn Aasman","Shweta Garg","Adil Ahmed","Silvie Colman","Meena Arunachalam","Sudhir Kulkarni","Parsa Mirhaji"],"pdf_url":"https://arxiv.org/pdf/2402.03486v1.pdf","comment":"16 pages, 6 figures and 7 tables"},{"id":"http://arxiv.org/abs/2402.03484v1","updated":"2024-02-05T19:56:27Z","published":"2024-02-05T19:56:27Z","title":"Harnessing PubMed User Query Logs for Post Hoc Explanations of\n Recommended Similar Articles","summary":" Searching for a related article based on a reference article is an integral\npart of scientific research. PubMed, like many academic search engines, has a\n\"similar articles\" feature that recommends articles relevant to the current\narticle viewed by a user. Explaining recommended items can be of great utility\nto users, particularly in the literature search process. With more than a\nmillion biomedical papers being published each year, explaining the recommended\nsimilar articles would facilitate researchers and clinicians in searching for\nrelated articles.
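Returning to the sepsis study above: its pipeline (80/20 split, XGBoost over 107 features, decision threshold 0.3) can be sketched as follows. The data here are synthetic stand-ins, not the Montefiore dataset, and hyperparameters are assumptions:

# Hedged sketch of the abstract's training/evaluation loop with synthetic data.
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 107))              # 107 original + derived features
y = (rng.random(1000) < 0.1).astype(int)      # rare positive label (sepsis onset)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
model = XGBClassifier(n_estimators=200, max_depth=4).fit(X_tr, y_tr)

flags = model.predict_proba(X_te)[:, 1] >= 0.3  # threshold 0.3, as in the abstract
print("flag rate:", flags.mean())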
Nonetheless, the majority of current literature\nrecommendation systems lack explanations for their suggestions. We employ a\npost hoc approach to explaining recommendations by identifying relevant tokens\nin the titles of similar articles. Our major contribution is building PubCLogs\nby repurposing 5.6 million pairs of coclicked articles from PubMed's user query\nlogs. Using our PubCLogs dataset, we train the Highlight Similar Article Title\n(HSAT), a transformer-based model designed to select the most relevant parts of\nthe title of a similar article, based on the title and abstract of a seed\narticle. HSAT demonstrates strong performance in our empirical evaluations,\nachieving an F1 score of 91.72 percent on the PubCLogs test set, considerably\noutperforming several baselines including BM25 (70.62), MPNet (67.11), MedCPT\n(62.22), GPT-3.5 (46.00), and GPT-4 (64.89). Additional evaluations on a\nseparate, manually annotated test set further verifies HSAT's performance.\nMoreover, participants of our user study indicate a preference for HSAT, due to\nits superior balance between conciseness and comprehensiveness. Our study\nsuggests that repurposing user query logs of academic search engines can be a\npromising way to train state-of-the-art models for explaining literature\nrecommendation.\n","authors":["Ashley Shin","Qiao Jin","James Anibal","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2402.03484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03481v1","updated":"2024-02-05T19:53:34Z","published":"2024-02-05T19:53:34Z","title":"FINEST: Stabilizing Recommendations by Rank-Preserving Fine-Tuning","summary":" Modern recommender systems may output considerably different recommendations\ndue to small perturbations in the training data. Changes in the data from a\nsingle user will alter the recommendations as well as the recommendations of\nother users. In applications like healthcare, housing, and finance, this\nsensitivity can have adverse effects on user experience. We propose a method to\nstabilize a given recommender system against such perturbations. This is a\nchallenging task due to (1) the lack of a ``reference'' rank list that can be\nused to anchor the outputs; and (2) the computational challenges in ensuring\nthe stability of rank lists with respect to all possible perturbations of\ntraining data. Our method, FINEST, overcomes these challenges by obtaining\nreference rank lists from a given recommendation model and then fine-tuning the\nmodel under simulated perturbation scenarios with rank-preserving\nregularization on sampled items. Our experiments on real-world datasets\ndemonstrate that FINEST can ensure that recommender models output stable\nrecommendations under a wide range of different perturbations without\ncompromising next-item prediction accuracy.\n","authors":["Sejoon Oh","Berk Ustun","Julian McAuley","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.03481v1.pdf","comment":"Accepted at the 6th FAccTRec Workshop on Responsible Recommendation @\n ACM RecSys 2023"},{"id":"http://arxiv.org/abs/2402.03464v1","updated":"2024-02-05T19:19:08Z","published":"2024-02-05T19:19:08Z","title":"A Fuzzy Approach to Record Linkages","summary":" Record Linkage is the process of identifying and unifying records from\nvarious independent data sources. Existing strategies, which can be either\ndeterministic or probabilistic, often fail to link records satisfactorily under\nuncertainty. 
This paper describes an indigenously (locally) developed fuzzy\nlinkage method, based on fuzzy set techniques, which can effectively account\nfor this uncertainty prevalent in the disparate data sources and address the\nshortcomings of the existing approaches. Extensive testing, evaluation and\ncomparisons have demonstrated the efficacy of this fuzzy approach for record\nlinkages.\n","authors":["Pratik K. Biswas"],"pdf_url":"https://arxiv.org/pdf/2402.03464v1.pdf","comment":"Journal Paper (9 pages, 6 Figures)"},{"id":"http://arxiv.org/abs/2402.03450v1","updated":"2024-02-05T19:03:23Z","published":"2024-02-05T19:03:23Z","title":"Recommendation Fairness in Social Networks Over Time","summary":" In social recommender systems, it is crucial that the recommendation models\nprovide equitable visibility for different demographic groups, such as gender\nor race. Most existing research has addressed this problem by only studying\nindividual static snapshots of networks that typically change over time. To\naddress this gap, we study the evolution of recommendation fairness over time\nand its relation to dynamic network properties. We examine three real-world\ndynamic networks by evaluating the fairness of six recommendation algorithms\nand analyzing the association between fairness and network properties over\ntime. We further study how interventions on network properties influence\nfairness by examining counterfactual scenarios with alternative evolution\noutcomes and differing network properties. Our results on empirical datasets\nsuggest that recommendation fairness improves over time, regardless of the\nrecommendation method. We also find that two network properties, minority\nratio, and homophily ratio, exhibit stable correlations with fairness over\ntime. Our counterfactual study further suggests that an extreme homophily ratio\npotentially contributes to unfair recommendations even with a balanced minority\nratio. Our work provides insights into the evolution of fairness within dynamic\nnetworks in social science. We believe that our findings will help system\noperators and policymakers to better comprehend the implications of temporal\nchanges and interventions targeting fairness in social networks.\n","authors":["Meng Cao","Hussain Hussain","Sandipan Sikdar","Denis Helic","Markus Strohmaier","Roman Kern"],"pdf_url":"https://arxiv.org/pdf/2402.03450v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2402.03312v1","updated":"2024-02-05T18:59:52Z","published":"2024-02-05T18:59:52Z","title":"Test-Time Adaptation for Depth Completion","summary":" It is common to observe performance degradation when transferring models\ntrained on some (source) datasets to target testing data due to a domain gap\nbetween them. Existing methods for bridging this gap, such as domain adaptation\n(DA), may require the source data on which the model was trained (often not\navailable), while others, i.e., source-free DA, require many passes through the\ntesting data. We propose an online test-time adaptation method for depth\ncompletion, the task of inferring a dense depth map from a single image and\nassociated sparse depth map, that closes the performance gap in a single pass.\nWe first present a study on how the domain shift in each data modality affects\nmodel performance. 
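The fuzzy record-linkage abstract above does not spell out its formulation, so the following is only a generic hedged sketch of the idea: per-field similarities become fuzzy membership degrees that are aggregated into a graded link score, rather than a hard deterministic match:

# Generic sketch (not the paper's method): fuzzy membership per field,
# aggregated by a weighted mean; a threshold then grades the linkage decision.
from difflib import SequenceMatcher

def membership(a: str, b: str) -> float:
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()  # degree in [0, 1]

def link_degree(rec1: dict, rec2: dict, weights: dict) -> float:
    total = sum(weights.values())
    return sum(w * membership(rec1[f], rec2[f]) for f, w in weights.items()) / total

r1 = {"name": "Jon Smith", "city": "Dallas"}
r2 = {"name": "John Smyth", "city": "Dalas"}
print(link_degree(r1, r2, {"name": 0.7, "city": 0.3}))  # e.g. treat >= 0.8 as a link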
Based on our observations that the sparse depth modality\nexhibits a much smaller covariate shift than the image, we design an embedding\nmodule trained in the source domain that preserves a mapping from features\nencoding only sparse depth to those encoding image and sparse depth. During\ntest time, sparse depth features are projected using this map as a proxy for\nsource domain features and are used as guidance to train a set of auxiliary\nparameters (i.e., adaptation layer) to align image and sparse depth features\nfrom the target test domain to that of the source domain. We evaluate our\nmethod on indoor and outdoor scenarios and show that it improves over baselines\nby an average of 21.1%.\n","authors":["Hyoungseob Park","Anjali Gupta","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2402.03312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03311v1","updated":"2024-02-05T18:59:41Z","published":"2024-02-05T18:59:41Z","title":"HASSOD: Hierarchical Adaptive Self-Supervised Object Detection","summary":" The human visual perception system demonstrates exceptional capabilities in\nlearning without explicit supervision and understanding the part-to-whole\ncomposition of objects. Drawing inspiration from these two abilities, we\npropose Hierarchical Adaptive Self-Supervised Object Detection (HASSOD), a\nnovel approach that learns to detect objects and understand their compositions\nwithout human supervision. HASSOD employs a hierarchical adaptive clustering\nstrategy to group regions into object masks based on self-supervised visual\nrepresentations, adaptively determining the number of objects per image.\nFurthermore, HASSOD identifies the hierarchical levels of objects in terms of\ncomposition, by analyzing coverage relations between masks and constructing\ntree structures. This additional self-supervised learning task leads to\nimproved detection performance and enhanced interpretability. Lastly, we\nabandon the inefficient multi-round self-training process utilized in prior\nmethods and instead adapt the Mean Teacher framework from semi-supervised\nlearning, which leads to a smoother and more efficient training process.\nThrough extensive experiments on prevalent image datasets, we demonstrate the\nsuperiority of HASSOD over existing methods, thereby advancing the state of the\nart in self-supervised object detection. Notably, we improve Mask AR from 20.2\nto 22.5 on LVIS, and from 17.0 to 26.0 on SA-1B. Project page:\nhttps://HASSOD-NeurIPS23.github.io.\n","authors":["Shengcao Cao","Dhiraj Joshi","Liang-Yan Gui","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03311v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2402.03309v1","updated":"2024-02-05T18:59:31Z","published":"2024-02-05T18:59:31Z","title":"AONeuS: A Neural Rendering Framework for Acoustic-Optical Sensor Fusion","summary":" Underwater perception and 3D surface reconstruction are challenging problems\nwith broad applications in construction, security, marine archaeology, and\nenvironmental monitoring. Treacherous operating conditions, fragile\nsurroundings, and limited navigation control often dictate that submersibles\nrestrict their range of motion and, thus, the baseline over which they can\ncapture measurements. In the context of 3D scene reconstruction, it is\nwell-known that smaller baselines make reconstruction more challenging. 
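The test-time adaptation scheme for depth completion described above (sparse-depth features projected through a source-trained map act as a proxy for source features, and a small adaptation layer aligns target features to them) can be sketched with toy modules. All module names, shapes, and the use of plain linear layers are illustrative assumptions:

# Hedged sketch: align target-domain features to proxy source features in one pass.
import torch
import torch.nn as nn

feat_dim = 64
proxy_map = nn.Linear(feat_dim, feat_dim)   # embedding module trained in source domain (frozen)
adapt = nn.Linear(feat_dim, feat_dim)        # auxiliary adaptation layer (trainable)
opt = torch.optim.Adam(adapt.parameters(), lr=1e-4)

for _ in range(10):                          # single pass over the test stream
    f_sparse = torch.randn(8, feat_dim)      # stand-in: features from sparse depth only
    f_joint = torch.randn(8, feat_dim)       # stand-in: features from image + sparse depth
    with torch.no_grad():
        target = proxy_map(f_sparse)         # proxy for source-domain features
    loss = nn.functional.mse_loss(adapt(f_joint), target)
    opt.zero_grad()
    loss.backward()
    opt.step()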
Our\nwork develops a physics-based multimodal acoustic-optical neural surface\nreconstruction framework (AONeuS) capable of effectively integrating\nhigh-resolution RGB measurements with low-resolution depth-resolved imaging\nsonar measurements. By fusing these complementary modalities, our framework can\nreconstruct accurate high-resolution 3D surfaces from measurements captured\nover heavily-restricted baselines. Through extensive simulations and in-lab\nexperiments, we demonstrate that AONeuS dramatically outperforms recent\nRGB-only and sonar-only inverse-differentiable-rendering--based surface\nreconstruction methods. A website visualizing the results of our paper is\nlocated at this address: https://aoneus.github.io/\n","authors":["Mohamad Qadri","Kevin Zhang","Akshay Hinduja","Michael Kaess","Adithya Pediredla","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2402.03309v1.pdf","comment":"First two authors contributed equally. Paper website:\n https://aoneus.github.io/"},{"id":"http://arxiv.org/abs/2402.03305v1","updated":"2024-02-05T18:58:38Z","published":"2024-02-05T18:58:38Z","title":"Do Diffusion Models Learn Semantically Meaningful and Efficient\n Representations?","summary":" Diffusion models are capable of impressive feats of image generation with\nuncommon juxtapositions such as astronauts riding horses on the moon with\nproperly placed shadows. These outputs indicate the ability to perform\ncompositional generalization, but how do the models do so? We perform\ncontrolled experiments on conditional DDPMs learning to generate 2D spherical\nGaussian bumps centered at specified $x$- and $y$-positions. Our results show\nthat the emergence of semantically meaningful latent representations is key to\nachieving high performance. En route to successful performance over learning,\nthe model traverses three distinct phases of latent representations: (phase A)\nno latent structure, (phase B) a 2D manifold of disordered states, and (phase\nC) a 2D ordered manifold. Corresponding to each of these phases, we identify\nqualitatively different generation behaviors: 1) multiple bumps are generated,\n2) one bump is generated but at inaccurate $x$ and $y$ locations, 3) a bump is\ngenerated at the correct $x$ and y location. Furthermore, we show that even\nunder imbalanced datasets where features ($x$- versus $y$-positions) are\nrepresented with skewed frequencies, the learning process for $x$ and $y$ is\ncoupled rather than factorized, demonstrating that simple vanilla-flavored\ndiffusion models cannot learn efficient representations in which localization\nin $x$ and $y$ are factorized into separate 1D tasks. These findings suggest\nthe need for future work to find inductive biases that will push generative\nmodels to discover and exploit factorizable independent structures in their\ninputs, which will be required to vault these models into more data-efficient\nregimes.\n","authors":["Qiyao Liang","Ziming Liu","Ila Fiete"],"pdf_url":"https://arxiv.org/pdf/2402.03305v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.03303v1","updated":"2024-02-05T18:58:19Z","published":"2024-02-05T18:58:19Z","title":"Nevermind: Instruction Override and Moderation in Large Language Models","summary":" Given the impressive capabilities of recent Large Language Models (LLMs), we\ninvestigate and benchmark the most popular proprietary and different sized open\nsource models on the task of explicit instruction following in conflicting\nsituations, e.g. overrides. 
These include the ability of the model to override\nthe knowledge within the weights of the model, the ability to override (or\nmoderate) extracted knowledge in the prompt, and lastly the ability to perform\na full jailbreak. Experimentation performed suggests several key findings to\nimprove instruction following - larger models perform the best in following\ninstructions that override internal and contextual instructions, and are\nobedient, even to a fault. When scaling to longer contexts via rope scaling, a\nsignificant buffer needs to be maintained from the edge of the perplexity cliff\nin order to maintain instruction following capabilities. Finally, we observe\nimproving instruction following, and subsequently instruction\noverrides/jailbreaks, is fundamentally at odds with the ability of a language\nmodel to follow given safety filters or guidelines. Thus, we postulate the most\neffective approach for safe, trustworthy AI should be handled externally to the\nLLM itself.\n","authors":["Edward Kim"],"pdf_url":"https://arxiv.org/pdf/2402.03303v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2402.03302v1","updated":"2024-02-05T18:58:11Z","published":"2024-02-05T18:58:11Z","title":"Swin-UMamba: Mamba-based UNet with ImageNet-based pretraining","summary":" Accurate medical image segmentation demands the integration of multi-scale\ninformation, spanning from local features to global dependencies. However, it\nis challenging for existing methods to model long-range global information,\nwhere convolutional neural networks (CNNs) are constrained by their local\nreceptive fields, and vision transformers (ViTs) suffer from the high quadratic\ncomplexity of their attention mechanism. Recently, Mamba-based models have\ngained great attention for their impressive ability in long sequence modeling.\nSeveral studies have demonstrated that these models can outperform popular\nvision models in various tasks, offering higher accuracy, lower memory\nconsumption, and less computational burden. However, existing Mamba-based\nmodels are mostly trained from scratch and do not explore the power of\npretraining, which has been proven to be quite effective for data-efficient\nmedical image analysis. This paper introduces a novel Mamba-based model,\nSwin-UMamba, designed specifically for medical image segmentation tasks,\nleveraging the advantages of ImageNet-based pretraining. Our experimental\nresults reveal the vital role of ImageNet-based training in enhancing the\nperformance of Mamba-based models. Swin-UMamba demonstrates superior\nperformance with a large margin compared to CNNs, ViTs, and the latest\nMamba-based models. Notably, on AbdomenMRI, Endoscopy, and Microscopy datasets,\nSwin-UMamba outperforms its closest counterpart U-Mamba by an average score of\n3.58%. The code and models of Swin-UMamba are publicly available at:\nhttps://github.com/JiarunLiu/Swin-UMamba\n","authors":["Jiarun Liu","Hao Yang","Hong-Yu Zhou","Yan Xi","Lequan Yu","Yizhou Yu","Yong Liang","Guangming Shi","Shaoting Zhang","Hairong Zheng","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03302v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2402.03300v1","updated":"2024-02-05T18:55:32Z","published":"2024-02-05T18:55:32Z","title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open\n Language Models","summary":" Mathematical reasoning poses a significant challenge for language models due\nto its complex and structured nature.
In this paper, we introduce DeepSeekMath\n7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B\nmath-related tokens sourced from Common Crawl, together with natural language\nand code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the\ncompetition-level MATH benchmark without relying on external toolkits and\nvoting techniques, approaching the performance level of Gemini-Ultra and GPT-4.\nSelf-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH.\nThe mathematical reasoning capability of DeepSeekMath is attributed to two key\nfactors: First, we harness the significant potential of publicly available web\ndata through a meticulously engineered data selection pipeline. Second, we\nintroduce Group Relative Policy Optimization (GRPO), a variant of Proximal\nPolicy Optimization (PPO), that enhances mathematical reasoning abilities while\nconcurrently optimizing the memory usage of PPO.\n","authors":["Zhihong Shao","Peiyi Wang","Qihao Zhu","Runxin Xu","Junxiao Song","Mingchuan Zhang","Y. K. Li","Y. Wu","Daya Guo"],"pdf_url":"https://arxiv.org/pdf/2402.03300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03299v1","updated":"2024-02-05T18:54:43Z","published":"2024-02-05T18:54:43Z","title":"GUARD: Role-playing to Generate Natural-language Jailbreakings to Test\n Guideline Adherence of Large Language Models","summary":" The discovery of \"jailbreaks\" to bypass safety filters of Large Language\nModels (LLMs) and harmful responses has encouraged the community to implement\nsafety measures. One major safety measure is to proactively test the LLMs with\njailbreaks prior to the release. Therefore, such testing will require a method\nthat can generate jailbreaks massively and efficiently. In this paper, we\nfollow a novel yet intuitive strategy to generate jailbreaks in the style of\nhuman generation. We propose a role-playing system that assigns four\ndifferent roles to the user LLMs to collaborate on new jailbreaks. Furthermore,\nwe collect existing jailbreaks and split them into different independent\ncharacteristics using clustering frequency and semantic patterns sentence by\nsentence. We organize these characteristics into a knowledge graph, making them\nmore accessible and easier to retrieve. Our system of different roles will\nleverage this knowledge graph to generate new jailbreaks, which have proved\neffective in inducing LLMs to generate unethical or guideline-violating\nresponses. In addition, we also pioneer a setting in our system that will\nautomatically follow the government-issued guidelines to generate jailbreaks to\ntest whether LLMs follow the guidelines accordingly. We refer to our system as\nGUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have\nempirically validated the effectiveness of GUARD on three cutting-edge\nopen-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a\nwidely-utilized commercial LLM (ChatGPT).
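The group-relative advantage at the heart of GRPO, as summarized in the DeepSeekMath abstract above, can be sketched in a few lines. This follows the commonly cited formulation (normalize each sampled response's reward within its group, avoiding a learned value critic); details may differ from the paper:

# Sketch: group-relative advantages for G responses sampled from one prompt.
import numpy as np

def group_relative_advantages(rewards: np.ndarray) -> np.ndarray:
    # rewards: scalar scores for G sampled responses to the same prompt
    return (rewards - rewards.mean()) / (rewards.std() + 1e-8)

print(group_relative_advantages(np.array([1.0, 0.0, 0.0, 1.0])))

These per-response advantages then plug into a PPO-style clipped policy objective, which is what lets GRPO drop PPO's value network and its memory cost.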
Moreover, our work extends to the\nrealm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing\nGUARD's versatility and contributing valuable insights for the development of\nsafer, more reliable LLM-based applications across diverse modalities.\n","authors":["Haibo Jin","Ruoxi Chen","Andy Zhou","Jinyin Chen","Yang Zhang","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03299v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2402.03295v1","updated":"2024-02-05T18:51:17Z","published":"2024-02-05T18:51:17Z","title":"Ginger: An Efficient Curvature Approximation with Linear Complexity for\n General Neural Networks","summary":" Second-order optimization approaches like the generalized Gauss-Newton method\nare considered more powerful as they utilize the curvature information of the\nobjective function with preconditioning matrices. Albeit offering tempting\ntheoretical benefits, they are not easily applicable to modern deep learning.\nThe major reason is the quadratic memory and cubic time complexity of\ncomputing the inverse of the matrix. These requirements are infeasible even\nwith state-of-the-art hardware. In this work, we propose Ginger, an\neigendecomposition for the inverse of the generalized Gauss-Newton matrix. Our\nmethod enjoys efficient linear memory and time complexity for each iteration.\nInstead of approximating the conditioning matrix, we directly maintain its\ninverse to make the approximation more accurate. We provide the convergence\nresult of Ginger for non-convex objectives. Our experiments on different tasks\nwith different model architectures verify the effectiveness of our method. Our\ncode is publicly available.\n","authors":["Yongchang Hao","Yanshuai Cao","Lili Mou"],"pdf_url":"https://arxiv.org/pdf/2402.03295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03293v1","updated":"2024-02-05T18:50:39Z","published":"2024-02-05T18:50:39Z","title":"Flora: Low-Rank Adapters Are Secretly Gradient Compressors","summary":" Despite large neural networks demonstrating remarkable abilities to complete\ndifferent tasks, they require excessive memory usage to store the optimization\nstates for training. To alleviate this, the low-rank adaptation (LoRA) is\nproposed to reduce the optimization states by training fewer parameters.\nHowever, LoRA restricts overall weight update matrices to be low-rank, limiting\nthe model performance. In this work, we investigate the dynamics of LoRA and\nidentify that it can be approximated by a random projection. Based on this\nobservation, we propose Flora, which is able to achieve high-rank updates by\nresampling the projection matrices while enjoying the sublinear space\ncomplexity of optimization states. We conduct experiments across different\ntasks and model architectures to verify the effectiveness of our approach.\n","authors":["Yongchang Hao","Yanshuai Cao","Lili Mou"],"pdf_url":"https://arxiv.org/pdf/2402.03293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03292v1","updated":"2024-02-05T18:50:27Z","published":"2024-02-05T18:50:27Z","title":"Zero-shot Object-Level OOD Detection with Context-Aware Inpainting","summary":" Machine learning algorithms are increasingly provided as black-box cloud\nservices or pre-trained models, without access to their training data.
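The Flora abstract above reduces to a simple mechanism that is easy to sketch: compress each gradient with a random projection (the low-rank-adapter view) and resample the projection periodically so the accumulated update can become high-rank. A minimal numerical illustration, with assumed dimensions and schedule:

# Sketch of the resampled random-projection idea behind Flora (illustrative).
import numpy as np

rng = np.random.default_rng(0)
d, r = 512, 16
W_update = np.zeros((d, d))

for step in range(100):
    if step % 20 == 0:                   # resample the projection matrix
        P = rng.normal(size=(d, r)) / np.sqrt(r)
    G = rng.normal(size=(d, d))          # stand-in for a true gradient
    G_compressed = (G @ P) @ P.T         # only G @ P (d x r) needs storing in practice
    W_update -= 0.01 * G_compressed

print(np.linalg.matrix_rank(W_update))  # exceeds r thanks to resampling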
This\nmotivates the problem of zero-shot out-of-distribution (OOD) detection.\nConcretely, we aim to detect OOD objects that do not belong to the classifier's\nlabel set but are erroneously classified as in-distribution (ID) objects. Our\napproach, RONIN, uses an off-the-shelf diffusion model to replace detected\nobjects with inpainting. RONIN conditions the inpainting process with the\npredicted ID label, drawing the input object closer to the in-distribution\ndomain. As a result, the reconstructed object is very close to the original in\nthe ID cases and far in the OOD cases, allowing RONIN to effectively\ndistinguish ID and OOD samples. Throughout extensive experiments, we\ndemonstrate that RONIN achieves competitive results compared to previous\napproaches across several datasets, both in zero-shot and non-zero-shot\nsettings.\n","authors":["Quang-Huy Nguyen","Jin Peng Zhou","Zhenzhen Liu","Khanh-Huyen Bui","Kilian Q. Weinberger","Dung D. Le"],"pdf_url":"https://arxiv.org/pdf/2402.03292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03290v1","updated":"2024-02-05T18:49:17Z","published":"2024-02-05T18:49:17Z","title":"InstanceDiffusion: Instance-level Control for Image Generation","summary":" Text-to-image diffusion models produce high quality images but do not offer\ncontrol over individual instances in the image. We introduce InstanceDiffusion\nthat adds precise instance-level control to text-to-image diffusion models.\nInstanceDiffusion supports free-form language conditions per instance and\nallows flexible ways to specify instance locations such as simple single\npoints, scribbles, bounding boxes or intricate instance segmentation masks, and\ncombinations thereof. We propose three major changes to text-to-image models\nthat enable precise instance-level control. Our UniFusion block enables\ninstance-level conditions for text-to-image models, the ScaleU block improves\nimage fidelity, and our Multi-instance Sampler improves generations for\nmultiple instances. InstanceDiffusion significantly surpasses specialized\nstate-of-the-art models for each location condition. Notably, on the COCO\ndataset, we outperform previous state-of-the-art by 20.4% AP$_{50}^\\text{box}$\nfor box inputs, and 25.4% IoU for mask inputs.\n","authors":["Xudong Wang","Trevor Darrell","Sai Saketh Rambhatla","Rohit Girdhar","Ishan Misra"],"pdf_url":"https://arxiv.org/pdf/2402.03290v1.pdf","comment":"Preprint; Project page:\n https://people.eecs.berkeley.edu/~xdwang/projects/InstDiff/"},{"id":"http://arxiv.org/abs/2402.03289v1","updated":"2024-02-05T18:47:04Z","published":"2024-02-05T18:47:04Z","title":"Make Every Move Count: LLM-based High-Quality RTL Code Generation Using\n MCTS","summary":" Existing large language models (LLMs) for register transfer level code\ngeneration face challenges like compilation failures and suboptimal power,\nperformance, and area (PPA) efficiency. This is due to the lack of PPA\nawareness in conventional transformer decoding algorithms. In response, we\npresent an automated transformer decoding algorithm that integrates Monte Carlo\ntree-search for lookahead, guiding the transformer to produce compilable,\nfunctionally correct, and PPA-optimized code. Empirical evaluation with a\nfine-tuned language model on RTL codesets shows that our proposed technique\nconsistently generates functionally correct code compared to prompting-only\nmethods and effectively addresses the PPA-unawareness drawback of naive large\nlanguage models. 
For the largest design generated by the state-of-the-art LLM\n(16-bit adder), our technique can achieve a 31.8% improvement in the area-delay\nproduct.\n","authors":["Matthew DeLorenzo","Animesh Basak Chowdhury","Vasudev Gohil","Shailja Thakur","Ramesh Karri","Siddharth Garg","Jeyavijayan Rajendran"],"pdf_url":"https://arxiv.org/pdf/2402.03289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03287v1","updated":"2024-02-05T18:43:05Z","published":"2024-02-05T18:43:05Z","title":"A Lennard-Jones Layer for Distribution Normalization","summary":" We introduce the Lennard-Jones layer (LJL) for the equalization of the\ndensity of 2D and 3D point clouds through systematically rearranging points\nwithout destroying their overall structure (distribution normalization). LJL\nsimulates a dissipative process of repulsive and weakly attractive interactions\nbetween individual points by considering the nearest neighbor of each point at\na given moment in time. This pushes the particles into a potential valley,\nreaching a well-defined stable configuration that approximates an equidistant\nsampling after the stabilization process. We apply LJLs to redistribute\nrandomly generated point clouds into a randomized uniform distribution.\nMoreover, LJLs are embedded in the generation process of point cloud networks\nby adding them at later stages of the inference process. The improvements in 3D\npoint cloud generation utilizing LJLs are evaluated qualitatively and\nquantitatively. Finally, we apply LJLs to improve the point distribution of a\nscore-based 3D point cloud denoising network. In general, we demonstrate that\nLJLs are effective for distribution normalization which can be applied at\nnegligible cost without retraining the given neural network.\n","authors":["Mulun Na","Jonathan Klein","Biao Zhang","Wojtek Pałubicki","Sören Pirk","Dominik L. Michels"],"pdf_url":"https://arxiv.org/pdf/2402.03287v1.pdf","comment":"Upon request, we are happy to share the source code to generate the\n results presented in this paper. Please contact the first or the last author\n of this manuscript"},{"id":"http://arxiv.org/abs/2402.03286v1","updated":"2024-02-05T18:42:34Z","published":"2024-02-05T18:42:34Z","title":"Training-Free Consistent Text-to-Image Generation","summary":" Text-to-image models offer a new level of creative flexibility by allowing\nusers to guide the image generation process through natural language. However,\nusing these models to consistently portray the same subject across diverse\nprompts remains challenging. Existing approaches fine-tune the model to teach\nit new words that describe specific user-provided subjects or add image\nconditioning to the model. These methods require lengthy per-subject\noptimization or large-scale pre-training. Moreover, they struggle to align\ngenerated images with text prompts and face difficulties in portraying multiple\nsubjects. Here, we present ConsiStory, a training-free approach that enables\nconsistent subject generation by sharing the internal activations of the\npretrained model. We introduce a subject-driven shared attention block and\ncorrespondence-based feature injection to promote subject consistency between\nimages. Additionally, we develop strategies to encourage layout diversity while\nmaintaining subject consistency. We compare ConsiStory to a range of baselines,\nand demonstrate state-of-the-art performance on subject consistency and text\nalignment, without requiring a single optimization step. 
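The Lennard-Jones layer described above lends itself to a compact sketch: each point takes a small step along the LJ force exerted by its current nearest neighbor, which repels overly close points and weakly attracts distant ones until density equalizes. Parameters, step size, and the force clipping below are illustrative assumptions:

# Sketch of a nearest-neighbor Lennard-Jones relaxation on a 2D point cloud.
import numpy as np

def lj_force(r, eps=1.0, sigma=0.05):
    # positive = repulsive along the separation direction
    return 24 * eps * (2 * (sigma / r) ** 12 - (sigma / r) ** 6) / r

pts = np.random.default_rng(0).random((200, 2))
for _ in range(50):                                   # dissipative relaxation steps
    dist = np.linalg.norm(pts[:, None] - pts[None], axis=-1)
    np.fill_diagonal(dist, np.inf)
    nn = dist.argmin(axis=1)                          # nearest neighbor per point
    delta = pts - pts[nn]
    r = np.linalg.norm(delta, axis=1, keepdims=True)
    f = np.clip(lj_force(r), -100.0, 100.0)           # clip to keep the step stable
    pts += 1e-4 * f * (delta / r)                     # move along the LJ force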
Finally, ConsiStory\ncan naturally extend to multi-subject scenarios, and even enable training-free\npersonalization for common objects.\n","authors":["Yoad Tewel","Omri Kaduri","Rinon Gal","Yoni Kasten","Lior Wolf","Gal Chechik","Yuval Atzmon"],"pdf_url":"https://arxiv.org/pdf/2402.03286v1.pdf","comment":"Project page is in https://consistory-paper.github.io"},{"id":"http://arxiv.org/abs/2302.02450v2","updated":"2024-02-05T18:40:58Z","published":"2023-02-05T18:22:29Z","title":"Regularization and Optimization in Model-Based Clustering","summary":" Due to their conceptual simplicity, k-means algorithm variants have been\nextensively used for unsupervised cluster analysis. However, one main\nshortcoming of these algorithms is that they essentially fit a mixture of\nidentical spherical Gaussians to data that vastly deviates from such a\ndistribution. In comparison, general Gaussian Mixture Models (GMMs) can fit\nricher structures but require estimating a quadratic number of parameters per\ncluster to represent the covariance matrices. This poses two main issues: (i)\nthe underlying optimization problems are challenging due to their larger number\nof local minima, and (ii) their solutions can overfit the data. In this work,\nwe design search strategies that circumvent both issues. We develop more\neffective optimization algorithms for general GMMs, and we combine these\nalgorithms with regularization strategies that avoid overfitting. Through\nextensive computational analyses, we observe that optimization or\nregularization in isolation does not substantially improve cluster recovery.\nHowever, combining these techniques permits a completely new level of\nperformance previously unachieved by k-means algorithm variants, unraveling\nvastly different cluster structures. These results shed new light on the\ncurrent status quo between GMM and k-means methods and suggest the more\nfrequent use of general GMMs for data exploration. To facilitate such\napplications, we provide open-source code as well as Julia packages\n(UnsupervisedClustering.jl and RegularizedCovarianceMatrices.jl) implementing\nthe proposed techniques.\n","authors":["Raphael Araujo Sampaio","Joaquim Dias Garcia","Marcus Poggi","Thibaut Vidal"],"pdf_url":"https://arxiv.org/pdf/2302.02450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03284v1","updated":"2024-02-05T18:39:47Z","published":"2024-02-05T18:39:47Z","title":"Deal, or no deal (or who knows)? Forecasting Uncertainty in\n Conversations using Large Language Models","summary":" Effective interlocutors account for the uncertain goals, beliefs, and\nemotions of others. But even the best human conversationalist cannot perfectly\nanticipate the trajectory of a dialogue. How well can language models represent\ninherent uncertainty in conversations? We propose FortUne Dial, an expansion of\nthe long-standing \"conversation forecasting\" task: instead of just accuracy,\nevaluation is conducted with uncertainty-aware metrics, effectively enabling\nabstention on individual instances. We study two ways in which language models\npotentially represent outcome uncertainty (internally, using scores and\ndirectly, using tokens) and propose fine-tuning strategies to improve\ncalibration of both representations. 
Experiments on eight difficult negotiation\ncorpora demonstrate that our proposed fine-tuning strategies (a traditional\nsupervision strategy and an off-policy reinforcement learning strategy) can\ncalibrate smaller open-source models to compete with pre-trained models 10x\ntheir size.\n","authors":["Anthony Sicilia","Hyunwoo Kim","Khyathi Raghavi Chandu","Malihe Alikhani","Jack Hessel"],"pdf_url":"https://arxiv.org/pdf/2402.03284v1.pdf","comment":"2 Figures; 7 Tables; 27 pages"},{"id":"http://arxiv.org/abs/2402.03282v1","updated":"2024-02-05T18:38:55Z","published":"2024-02-05T18:38:55Z","title":"A Framework for Partially Observed Reward-States in RLHF","summary":" The study of reinforcement learning from human feedback (RLHF) has gained\nprominence in recent years due to its role in the development of LLMs.\nNeuroscience research shows that human responses to stimuli are known to depend\non partially-observed \"internal states.\" Unfortunately, current models of RLHF\ndo not take this into consideration. Moreover, most RLHF models do not\naccount for intermediate feedback, which is gaining importance in empirical\nwork and can help improve both sample complexity and alignment. To address\nthese limitations, we model RLHF as reinforcement learning with partially\nobserved reward-states (PORRL). We show reductions from the two dominant\nforms of human feedback in RLHF - cardinal and dueling feedback - to PORRL. For\ncardinal feedback, we develop generic statistically efficient algorithms and\ninstantiate them to present POR-UCRL and POR-UCBVI. For dueling feedback, we\nshow that a naive reduction to cardinal feedback fails to achieve sublinear\ndueling regret. We then present the first explicit reduction that converts\nguarantees for cardinal regret to dueling regret. We show that our models and\nguarantees in both settings generalize and extend existing ones. Finally, we\nidentify a recursive structure on our model that could improve the statistical\nand computational tractability of PORRL, giving examples from past work on RLHF\nas well as learning perfect reward machines, which PORRL subsumes.\n","authors":["Chinmaya Kausik","Mirco Mutti","Aldo Pacchiano","Ambuj Tewari"],"pdf_url":"https://arxiv.org/pdf/2402.03282v1.pdf","comment":"47 pages. 13 pages for the main paper, 34 pages for the references\n and appendix"},{"id":"http://arxiv.org/abs/2302.09167v4","updated":"2024-02-05T18:35:36Z","published":"2023-02-17T22:40:07Z","title":"Mixed Traffic Control and Coordination from Pixels","summary":" Traffic congestion is a persistent problem in our society. Previous methods\nfor traffic control have proven futile in alleviating current congestion levels,\nleading researchers to explore ideas with robot vehicles given the increased\nemergence of vehicles with different levels of autonomy on our roads. This\ngives rise to mixed traffic control, where robot vehicles regulate human-driven\nvehicles through reinforcement learning (RL). However, most existing studies\nuse precise observations that require domain expertise and hand engineering for\neach road network's observation space. Additionally, precise observations use\nglobal information, such as environment outflow, and local information, i.e.,\nvehicle positions and velocities. Obtaining this information requires updating\nexisting road infrastructure with vast sensor environments and communication to\npotentially unwilling human drivers. 
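A toy illustration of the uncertainty-aware evaluation described in the FortUne Dial entry above; this thresholded selective-accuracy metric and all names in it are assumptions for illustration, not the paper's exact metrics:

```python
import numpy as np

def selective_accuracy(confidence, correct, threshold=0.7):
    """Answer only when confidence clears the threshold; abstain otherwise."""
    answered = confidence >= threshold
    coverage = answered.mean()
    accuracy = correct[answered].mean() if answered.any() else float("nan")
    return coverage, accuracy

rng = np.random.default_rng(1)
confidence = rng.random(1000)                     # model's stated outcome probability
correct = rng.random(1000) < confidence           # a roughly calibrated model
print(selective_accuracy(confidence, correct))    # (coverage, accuracy on answered)
```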
We consider image observations, a modality\nthat has not been extensively explored for mixed traffic control via RL, as the\nalternative: 1) images do not require a complete re-imagination of the\nobservation space from environment to environment; 2) images are ubiquitous\nthrough satellite imagery, in-car camera systems, and traffic monitoring\nsystems; and 3) images only require communication to equipment. In this work,\nwe show that robot vehicles using image observations can achieve performance\ncompetitive with using precise information on environments, including ring,\nfigure eight, intersection, merge, and bottleneck. In certain scenarios, our\napproach even outperforms using precision observations, e.g., up to 8% increase\nin average vehicle velocity in the merge environment, despite only using local\ntraffic information as opposed to global traffic information.\n","authors":["Michael Villarreal","Bibek Poudel","Jia Pan","Weizi Li"],"pdf_url":"https://arxiv.org/pdf/2302.09167v4.pdf","comment":"Accepted to IEEE International Conference on Robotics and Automation\n (ICRA), 2024"},{"id":"http://arxiv.org/abs/2310.05707v3","updated":"2024-02-05T18:33:44Z","published":"2023-10-09T13:29:37Z","title":"Guiding Language Model Math Reasoning with Planning Tokens","summary":" Large language models (LLMs) have recently attracted considerable interest\nfor their ability to perform complex reasoning tasks, such as chain-of-thought\nreasoning. However, most of the existing approaches to enhance this ability\nrely heavily on data-driven methods, while neglecting the structural aspects of\nthe model's reasoning capacity. We find that while LLMs can manage individual\nreasoning steps well, they struggle with maintaining consistency across an\nentire reasoning chain. To solve this, we introduce planning tokens at the\nstart of each reasoning step, serving as a guide for the model, and add their\nembeddings to the model parameters. Our approach requires a negligible increase\nin trainable parameters (just 0.001%) and can be applied through either full\nfine-tuning or a more parameter-efficient scheme. We demonstrate our method's\neffectiveness by applying it to three different LLMs, showing notable accuracy\nimprovements across three math word problem datasets w.r.t. standard\nfine-tuning baselines.\n","authors":["Xinyi Wang","Lucas Caccia","Oleksiy Ostapenko","Xingdi Yuan","William Yang Wang","Alessandro Sordoni"],"pdf_url":"https://arxiv.org/pdf/2310.05707v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14652v2","updated":"2024-02-05T18:30:30Z","published":"2023-11-24T18:35:00Z","title":"One Pass Streaming Algorithm for Super Long Token Attention\n Approximation in Sublinear Space","summary":" Attention computation takes both the time complexity of $O(n^2)$ and the\nspace complexity of $O(n^2)$ simultaneously, which makes deploying Large\nLanguage Models (LLMs) in streaming applications that involve long contexts\nrequire substantial computational resources. At the recent OpenAI DevDay (Nov 6,\n2023), OpenAI released a new model that is able to support a 128K-long\ndocument. In this paper, we focus on the memory-efficient issue when context\nlength $n$ is much greater than 128K ($n \\gg 2^d$). Considering a single-layer\nself-attention with Query, Key, and Value matrices $Q, K, V \\in \\mathbb{R}^{n\n\\times d}$, the polynomial method approximates the attention output $T \\in\n\\mathbb{R}^{n \\times d}$. 
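As a hedged sketch of the planning-token idea from the entry above (the token names are invented, and the real method learns step-type assignments rather than hand-placing them), the Hugging Face calls below show how newly added tokens contribute only a few embedding rows of trainable parameters:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Hypothetical planning tokens, one per arithmetic step type; only their new
# embedding rows add trainable parameters to the model.
plan_tokens = ["<plan_add>", "<plan_sub>", "<plan_mul>"]
tok.add_tokens(plan_tokens)
model.resize_token_embeddings(len(tok))

text = "<plan_add> 3 + 4 = 7 <plan_mul> 7 * 2 = 14"
print(tok(text)["input_ids"])
```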
It accomplishes this by constructing $U_1, U_2 \\in\n\\mathbb{R}^{n \\times t}$ to expedite attention ${\\sf Attn}(Q, K, V)$\ncomputation within $n^{1+o(1)}$ time executions. Despite this, computing the\napproximated attention matrix $U_1U_2^\\top \\in \\mathbb{R}^{n \\times n}$ still\nnecessitates $O(n^2)$ space, leading to significant memory usage. In response\nto these challenges, we introduce a new algorithm that only reads one pass of\nthe data in a streaming fashion. This method employs sublinear space $o(n)$ to\nstore three sketch matrices, alleviating the need for exact $K, V$ storage.\nNotably, our algorithm exhibits exceptional memory-efficient performance with\nsuper-long tokens. As the token length $n$ increases, our error guarantee\ndiminishes while the memory usage remains nearly constant. This unique\nattribute underscores the potential of our technique in efficiently handling\nLLMs in streaming applications.\n","authors":["Raghav Addanki","Chenyang Li","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2311.14652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03271v1","updated":"2024-02-05T18:28:44Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n Seeking in Large Language Models","summary":" In the face of uncertainty, the ability to seek information is of fundamental\nimportance. In many practical applications, such as medical diagnosis and\ntroubleshooting, the information needed to solve the task is not initially\ngiven, and has to be actively sought by asking follow-up questions (for\nexample, a doctor asking a patient for more details about their symptoms). In\nthis work, we introduce Uncertainty of Thoughts (UoT), an algorithm to augment\nlarge language models with the ability to actively seek information by asking\neffective questions. UoT combines 1) an uncertainty-aware simulation approach\nwhich enables the model to simulate possible future scenarios and how likely\nthey are to occur, 2) uncertainty-based rewards motivated by information gain\nwhich incentivizes the model to seek information, and 3) a reward propagation\nscheme to select the optimal question to ask in a way that maximizes the\nexpected reward. In experiments on medical diagnosis, troubleshooting and the\n'20 Questions' game, UoT achieves an average performance improvement of 57.8%\nin the rate of successful task completion across multiple LLMs compared with\ndirect prompting, and also improves efficiency (i.e., the number of questions\nneeded to complete the task).\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2402.03270v1","updated":"2024-02-05T18:27:46Z","published":"2024-02-05T18:27:46Z","title":"Multiclass Classification Procedure for Detecting Attacks on MQTT-IoT\n Protocol","summary":" The large number of sensors and actuators that make up the Internet of Things\nobliges these systems to use diverse technologies and protocols. This means\nthat IoT networks are more heterogeneous than traditional networks. This gives\nrise to new challenges in cybersecurity to protect these systems and devices\nwhich are characterized by being connected continuously to the Internet.\nIntrusion detection systems (IDS) are used to protect IoT systems from the\nvarious anomalies and attacks at the network level. 
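The memory argument in the streaming-attention entry above comes down to how the low-rank product is associated; a small numpy demonstration (shapes assumed for illustration):

```python
import numpy as np

n, d, t = 10_000, 64, 16                 # sequence length, head dim, sketch rank
U1, U2 = np.random.randn(n, t), np.random.randn(n, t)
V = np.random.randn(n, d)

# (U1 @ U2.T) @ V would materialize an n x n matrix: O(n^2) memory.
# Reassociating as U1 @ (U2.T @ V) costs O(n t d) time and O(n t) memory.
out = U1 @ (U2.T @ V)
print(out.shape)                          # (10000, 64)
```

The streaming algorithm in the entry goes further by replacing exact K, V storage with sketch matrices, but the reassociation above is the core reason the n x n matrix never needs to exist.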
Intrusion Detection Systems\n(IDS) can be improved through machine learning techniques. Our work focuses on\ncreating classification models that can feed an IDS, using a dataset containing\nattack frames from an IoT system that uses the MQTT protocol. We address two\ntypes of methods for classifying the attacks, ensemble methods and deep\nlearning models, more specifically recurrent networks, with very satisfactory\nresults.\n","authors":["Hector Alaiz-Moreton","Jose Aveleira-Mata","Jorge Ondicol-Garcia","Angel Luis Muñoz-Castañeda","Isaías García","Carmen Benavides"],"pdf_url":"https://arxiv.org/pdf/2402.03270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03269v1","updated":"2024-02-05T18:27:27Z","published":"2024-02-05T18:27:27Z","title":"ISPA: Inter-Species Phonetic Alphabet for Transcribing Animal Sounds","summary":" Traditionally, bioacoustics has relied on spectrograms and continuous,\nper-frame audio representations for the analysis of animal sounds, also serving\nas input to machine learning models. Meanwhile, the International Phonetic\nAlphabet (IPA) system has provided an interpretable, language-independent\nmethod for transcribing human speech sounds. In this paper, we introduce ISPA\n(Inter-Species Phonetic Alphabet), a precise, concise, and interpretable system\ndesigned for transcribing animal sounds into text. We compare acoustics-based\nand feature-based methods for transcribing and classifying animal sounds,\ndemonstrating their comparable performance with baseline methods utilizing\ncontinuous, dense audio representations. By representing animal sounds with\ntext, we effectively treat them as a \"foreign language,\" and we show that\nestablished human language ML paradigms and models, such as language models,\ncan be successfully applied to improve performance.\n","authors":["Masato Hagiwara","Marius Miron","Jen-Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2402.03269v1.pdf","comment":"Accepted at XAI-AI Workshop (IEEEXplore track) @ ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.03268v1","updated":"2024-02-05T18:25:51Z","published":"2024-02-05T18:25:51Z","title":"Understanding the Reasoning Ability of Language Models From the\n Perspective of Reasoning Paths Aggregation","summary":" Pre-trained language models (LMs) are able to perform complex reasoning\nwithout explicit fine-tuning. To understand how pre-training with a next-token\nprediction objective contributes to the emergence of such reasoning capability,\nwe propose that we can view an LM as deriving new conclusions by aggregating\nindirect reasoning paths seen at pre-training time. We found this perspective\neffective in two important cases of reasoning: logic reasoning with knowledge\ngraphs (KGs) and math reasoning with math word problems (MWPs). More\nspecifically, we formalize the reasoning paths as random walk paths on the\nknowledge/reasoning graphs. Analyses of learned LM distributions suggest that a\nweighted sum of relevant random walk path probabilities is a reasonable way to\nexplain how LMs reason. 
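The "weighted sum of random walk path probabilities" view from the reasoning-paths entry above is easy to render on a toy graph; the adjacency matrix and path-length weights below are assumptions for illustration:

```python
import numpy as np

# A tiny knowledge graph as an adjacency matrix.
A = np.array([[0, 1, 0, 0],
              [0, 0, 1, 1],
              [0, 0, 0, 1],
              [0, 0, 0, 0]], dtype=float)
row_sums = A.sum(axis=1, keepdims=True)
P = np.divide(A, row_sums, out=np.zeros_like(A), where=row_sums > 0)

# Aggregate k-step walk probabilities with assumed length weights.
weights = [0.5, 0.3, 0.2]
score = sum(w * np.linalg.matrix_power(P, k + 1) for k, w in enumerate(weights))
print(score[0])    # weighted reachability of every entity from entity 0
```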
Experiments and analysis on multiple KG and MWP\ndatasets reveal the effect of training on random walk paths and suggest that\naugmenting unlabeled random walk reasoning paths can improve real-world\nmulti-step reasoning performance.\n","authors":["Xinyi Wang","Alfonso Amayuelas","Kexun Zhang","Liangming Pan","Wenhu Chen","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00024v2","updated":"2024-02-05T18:24:51Z","published":"2024-01-05T18:31:34Z","title":"Comparative Analysis of LLaMA and ChatGPT Embeddings for Molecule\n Embedding","summary":" Purpose: Large Language Models (LLMs) like ChatGPT and LLaMA are increasingly\nrecognized for their potential in the field of cheminformatics, particularly in\ninterpreting Simplified Molecular Input Line Entry System (SMILES), a standard\nmethod for representing chemical structures. These LLMs can decode SMILES\nstrings into vector representations, providing a novel approach to\nunderstanding chemical graphs.\n Methods: We investigate the performance of ChatGPT and LLaMA in embedding\nSMILES strings. Our evaluation focuses on two key applications: molecular\nproperty (MP) prediction and drug-drug interaction (DDI) prediction, both\nessential in drug development and healthcare.\n Results: We find that SMILES embeddings generated using LLaMA outperform\nthose from ChatGPT in both MP and DDI prediction tasks. Notably, LLaMA-based\nSMILES embeddings show results comparable to existing methods in both\nprediction tasks.\n Conclusion: The application of LLMs in cheminformatics, particularly in\nutilizing SMILES embeddings, shows significant promise for advancing drug\ndevelopment. This includes improving the prediction of chemical properties and\nfacilitating the drug discovery process. GitHub:\nhttps://github.com/sshaghayeghs/LLaMA-VS-ChatGPT\n","authors":["Shaghayegh Sadeghi","Alan Bui","Ali Forooghi","Jianguo Lu","Alioune Ngom"],"pdf_url":"https://arxiv.org/pdf/2402.00024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03264v1","updated":"2024-02-05T18:22:21Z","published":"2024-02-05T18:22:21Z","title":"MobilityGPT: Enhanced Human Mobility Modeling with a GPT model","summary":" Generative models have shown promising results in capturing human mobility\ncharacteristics and generating synthetic trajectories. However, it remains\nchallenging to ensure that the generated geospatial mobility data is\nsemantically realistic, including consistent location sequences, and reflects\nreal-world characteristics, such as constraining on geospatial limits. To\naddress these issues, we reformulate human mobility modeling as an autoregressive\ngeneration task, leveraging Generative Pre-trained Transformer (GPT). To ensure\ncontrollable generation and alleviate the above challenges, we propose a\ngeospatially-aware generative model, MobilityGPT. We propose a gravity-based\nsampling method to train a transformer for semantic sequence similarity. Then,\nwe constrain the training process via a road connectivity matrix that\nprovides the connectivity of sequences in trajectory generation, thereby\nkeeping generated trajectories in geospatial limits. Lastly, we construct a\nReinforcement Learning from Trajectory Feedback (RLTF) mechanism to minimize the travel\ndistance between the training and the synthetically generated trajectories. 
Our\nexperiments on real-world datasets demonstrate that MobilityGPT outperforms\nstate-of-the-art methods in generating high-quality mobility trajectories that\nare closest to real data in terms of origin-destination similarity, trip\nlength, travel radius, link, and gravity distributions.\n","authors":["Ammar Haydari","Dongjie Chen","Zhengfeng Lai","Chen-Nee Chuah"],"pdf_url":"https://arxiv.org/pdf/2402.03264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03256v1","updated":"2024-02-05T18:14:28Z","published":"2024-02-05T18:14:28Z","title":"Learning Best-in-Class Policies for the Predict-then-Optimize Framework","summary":" We propose a novel family of decision-aware surrogate losses, called\nPerturbation Gradient (PG) losses, for the predict-then-optimize framework.\nThese losses directly approximate the downstream decision loss and can be\noptimized using off-the-shelf gradient-based methods. Importantly, unlike\nexisting surrogate losses, the approximation error of our PG losses vanishes as\nthe number of samples grows. This implies that optimizing our surrogate loss\nyields a best-in-class policy asymptotically, even in misspecified settings.\nThis is the first such result in misspecified settings, and we provide numerical\nevidence confirming our PG losses substantively outperform existing proposals\nwhen the underlying model is misspecified and the noise is not centrally\nsymmetric. Insofar as misspecification is commonplace in practice -- especially\nwhen we might prefer a simpler, more interpretable model -- PG losses offer a\nnovel, theoretically justified, method for computationally tractable\ndecision-aware learning.\n","authors":["Michael Huang","Vishal Gupta"],"pdf_url":"https://arxiv.org/pdf/2402.03256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03254v1","updated":"2024-02-05T18:12:28Z","published":"2024-02-05T18:12:28Z","title":"Minimum Description Length and Generalization Guarantees for\n Representation Learning","summary":" A major challenge in designing efficient statistical supervised learning\nalgorithms is finding representations that perform well not only on available\ntraining samples but also on unseen data. While the study of representation\nlearning has spurred much interest, most existing such approaches are\nheuristic, and very little is known about theoretical generalization\nguarantees.\n In this paper, we establish a compressibility framework that allows us to\nderive upper bounds on the generalization error of a representation learning\nalgorithm in terms of the \"Minimum Description Length\" (MDL) of the labels or\nthe latent variables (representations). Rather than the mutual information\nbetween the encoder's input and the representation, which is often believed in\nthe related literature to reflect the algorithm's generalization capability but,\nin fact, falls short of doing so, our new bounds involve the \"multi-letter\"\nrelative entropy between the distribution of the representations (or labels) of\nthe training and test sets and a fixed prior. In particular, these new bounds\nreflect the structure of the encoder and are not vacuous for deterministic\nalgorithms. Our compressibility approach, which is information-theoretic in\nnature, builds upon that of Blum-Langford for PAC-MDL bounds and introduces two\nessential ingredients: block-coding and lossy-compression. The latter allows\nour approach to subsume the so-called geometrical compressibility as a special\ncase. 
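To make the perturbation idea in the PG-losses entry above concrete, here is a hedged toy sketch (not the paper's estimator): the gradient of the downstream decision loss is probed by central finite differences of the predicted costs, and a finite perturbation size is precisely what yields nonzero gradients through the discrete argmin:

```python
import numpy as np

def decision(c_pred):
    return int(np.argmin(c_pred))          # downstream optimizer: cheapest item

def decision_loss(c_pred, c_true):
    return c_true[decision(c_pred)]        # true cost of the induced decision

def pg_style_gradient(c_pred, c_true, h=1.5):
    """Central finite differences of the decision loss wrt predicted costs.
    A finite h is essential: infinitesimal perturbations never flip the argmin,
    so the exact gradient of this piecewise-constant loss is zero a.e."""
    g = np.zeros_like(c_pred)
    for i in range(len(c_pred)):
        e = np.zeros_like(c_pred)
        e[i] = h
        g[i] = (decision_loss(c_pred + e, c_true)
                - decision_loss(c_pred - e, c_true)) / (2 * h)
    return g

c_true = np.array([3.0, 1.0, 2.0])
c_pred = np.array([1.0, 2.0, 3.0])         # prediction picks item 0 (true cost 3)
print(pg_style_gradient(c_pred, c_true))
```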
To the best of the authors' knowledge, the established generalization\nbounds are the first of their kind for Information Bottleneck (IB) type\nencoders and representation learning. Finally, we partly exploit the\ntheoretical results by introducing a new data-dependent prior. Numerical\nsimulations illustrate the advantages of such well-chosen priors over classical\npriors used in IB.\n","authors":["Milad Sefidgaran","Abdellatif Zaidi","Piotr Krasnowski"],"pdf_url":"https://arxiv.org/pdf/2402.03254v1.pdf","comment":"Accepted and presented at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2402.03252v1","updated":"2024-02-05T18:09:48Z","published":"2024-02-05T18:09:48Z","title":"Fair Active Ranking from Pairwise Preferences","summary":" We investigate the problem of probably approximately correct and fair (PACF)\nranking of items by adaptively evoking pairwise comparisons. Given a set of $n$\nitems that belong to disjoint groups, our goal is to find an $(\\epsilon,\n\\delta)$-PACF-Ranking according to a fair objective function that we propose.\nWe assume access to an oracle, wherein, for each query, the learner can choose\na pair of items and receive stochastic winner feedback from the oracle. Our\nproposed objective function asks to minimize the $\\ell_q$ norm of the error of\nthe groups, where the error of a group is the $\\ell_p$ norm of the error of all\nthe items within that group, for $p, q \\geq 1$. This generalizes the objective\nfunction of $\\epsilon$-Best-Ranking, proposed by Saha & Gopalan (2019).\n By adopting our objective function, we gain the flexibility to explore\nfundamental fairness concepts like equal or proportionate errors within a\nunified framework. Adjusting parameters $p$ and $q$ allows tailoring to\nspecific fairness preferences. We present both group-blind and group-aware\nalgorithms and analyze their sample complexity. We provide matching lower\nbounds up to certain logarithmic factors for group-blind algorithms. For a\nrestricted class of group-aware algorithms, we show that we can get reasonable\nlower bounds. We conduct comprehensive experiments on both real-world and\nsynthetic datasets to complement our theoretical findings.\n","authors":["Sruthi Gorantla","Sara Ahmadian"],"pdf_url":"https://arxiv.org/pdf/2402.03252v1.pdf","comment":"39 pages, 3.1 MB"},{"id":"http://arxiv.org/abs/2402.03251v1","updated":"2024-02-05T18:09:33Z","published":"2024-02-05T18:09:33Z","title":"CLIP Can Understand Depth","summary":" Recent studies on generalizing CLIP for monocular depth estimation reveal\nthat CLIP pre-trained on web-crawled data is inefficient for deriving proper\nsimilarities between image patches and depth-related prompts. In this paper, we\nadapt CLIP for meaningful quality of monocular depth estimation with dense\nprediction, without fine-tuning its original vision-language alignment. By\njointly training a compact deconvolutional decoder with a tiny learnable\nembedding matrix named mirror, as a static prompt for its text encoder, CLIP is\nenabled to understand depth. With this approach, our model exhibits impressive\nperformance matching several previous state-of-the-art vision-only models on\nthe NYU Depth v2 and KITTI datasets, outperforming every CLIP-based depth\nestimation model by a large margin. Experiments on temporal depth consistency\nand spatial continuity demonstrate that the prior knowledge of CLIP can be\neffectively refined by our proposed framework. 
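The nested-norm objective in the fair-ranking entry above is easy to state in code; a minimal sketch with assumed error values (the paper estimates these errors via pairwise comparisons, which is not modeled here):

```python
import numpy as np

def fair_objective(item_errors, groups, p=2.0, q=2.0):
    """l_q norm over groups of the l_p norm of item errors within each group."""
    group_errors = [np.linalg.norm(item_errors[groups == g], ord=p)
                    for g in np.unique(groups)]
    return np.linalg.norm(np.array(group_errors), ord=q)

item_errors = np.array([0.10, 0.30, 0.05, 0.40, 0.20])
groups = np.array([0, 0, 1, 1, 1])
print(fair_objective(item_errors, groups, p=2, q=2))            # proportionate errors
print(fair_objective(item_errors, groups, p=np.inf, q=np.inf))  # equal worst-case errors
```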
Furthermore, an ablation study\non mirror proves that the resulting model estimates depth utilizing knowledge\nnot only from the image encoder but also the text encoder, despite not being given\nany human-written prompt. This research demonstrates that, through\nminimal adjustments, the prior knowledge of vision-language foundation models,\nsuch as CLIP, can be generalized even to domains where learning during\npretraining is challenging. We hope to facilitate future work focused on methods to\nadjust suboptimal prior knowledge of vision-language models using non-human\nlanguage prompts, achieving performance on par with task-specific\nstate-of-the-art methodologies.\n","authors":["Dunam Kim","Seokju Lee"],"pdf_url":"https://arxiv.org/pdf/2402.03251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03244v1","updated":"2024-02-05T17:59:00Z","published":"2024-02-05T17:59:00Z","title":"Skill Set Optimization: Reinforcing Language Model Behavior via\n Transferable Skills","summary":" Large language models (LLMs) have recently been used for sequential decision\nmaking in interactive environments. However, leveraging environment reward\nsignals for continual LLM actor improvement is not straightforward. We propose\nSkill Set Optimization (SSO) for improving LLM actor performance through\nconstructing and refining sets of transferable skills. SSO constructs skills by\nextracting common subtrajectories with high rewards and generating subgoals and\ninstructions to represent each skill. These skills are provided to the LLM\nactor in-context to reinforce behaviors with high rewards. Then, SSO further\nrefines the skill set by pruning skills that do not continue to result in high\nrewards. We evaluate our method in the classic videogame NetHack and the text\nenvironment ScienceWorld to demonstrate SSO's ability to optimize a set of\nskills and perform in-context policy improvement. SSO outperforms baselines by\n40% in our custom NetHack task and outperforms the previous state-of-the-art in\nScienceWorld by 35%.\n","authors":["Kolby Nottingham","Bodhisattwa Prasad Majumder","Bhavana Dalvi Mishra","Sameer Singh","Peter Clark","Roy Fox"],"pdf_url":"https://arxiv.org/pdf/2402.03244v1.pdf","comment":"8 pages, preprint"},{"id":"http://arxiv.org/abs/2402.03243v1","updated":"2024-02-05T17:58:17Z","published":"2024-02-05T17:58:17Z","title":"PINN-BO: A Black-box Optimization Algorithm using Physics-Informed\n Neural Networks","summary":" Black-box optimization is a powerful approach for discovering global optima\nin noisy and expensive black-box functions, a problem widely encountered in\nreal-world scenarios. Recently, there has been a growing interest in leveraging\ndomain knowledge to enhance the efficacy of machine learning methods. Partial\nDifferential Equations (PDEs) often provide an effective means for elucidating\nthe fundamental principles governing the black-box functions. In this paper, we\npropose PINN-BO, a black-box optimization algorithm employing Physics-Informed\nNeural Networks that integrates the knowledge from Partial Differential\nEquations (PDEs) to improve the sample efficiency of the optimization. We\nanalyze the theoretical behavior of our algorithm in terms of regret bound\nusing advances in NTK theory and prove that, by using the PDE alongside the\nblack-box function evaluations, PINN-BO achieves a tighter regret bound. 
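A toy, assumption-laden sketch of the skill-extraction step described in the SSO entry above (real SSO scores subtrajectories by reward and generates subgoals and instructions with an LLM; here, "skills" are simply frequent length-2 action bigrams mined from high-return episodes):

```python
from collections import Counter

# High-reward episodes as (action sequence, return) pairs.
trajectories = [
    (["open", "walk", "pickup", "use"], 10.0),
    (["walk", "pickup", "use", "drop"], 9.0),
    (["open", "drop", "walk"], 1.0),
]

counts = Counter()
for actions, ret in trajectories:
    if ret >= 5.0:                                  # keep high-reward episodes only
        for i in range(len(actions) - 1):
            counts[tuple(actions[i:i + 2])] += 1    # length-2 subtrajectories

skills = [seq for seq, c in counts.items() if c >= 2]   # common across episodes
print(skills)   # [('walk', 'pickup'), ('pickup', 'use')]
```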
We\nperform several experiments on a variety of optimization tasks and show that\nour algorithm is more sample-efficient compared to existing methods.\n","authors":["Dat Phan-Trong","Hung The Tran","Alistair Shilton","Sunil Gupta"],"pdf_url":"https://arxiv.org/pdf/2402.03243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03241v1","updated":"2024-02-05T17:56:41Z","published":"2024-02-05T17:56:41Z","title":"FROSTER: Frozen CLIP Is A Strong Teacher for Open-Vocabulary Action\n Recognition","summary":" In this paper, we introduce FROSTER, an effective framework for\nopen-vocabulary action recognition. The CLIP model has achieved remarkable\nsuccess in a range of image-based tasks, benefiting from its strong\ngeneralization capability stemming from pretraining on massive image-text pairs.\nHowever, applying CLIP directly to the open-vocabulary action recognition task\nis challenging due to the absence of temporal information in CLIP's\npretraining. Further, fine-tuning CLIP on action recognition datasets may lead\nto overfitting and hinder its generalizability, resulting in unsatisfactory\nresults when dealing with unseen actions.\n To address these issues, FROSTER employs a residual feature distillation\napproach to ensure that CLIP retains its generalization capability while\neffectively adapting to the action recognition task. Specifically, the residual\nfeature distillation treats the frozen CLIP model as a teacher to maintain the\ngeneralizability exhibited by the original CLIP and supervises the feature\nlearning for the extraction of video-specific features to bridge the gap\nbetween images and videos. Meanwhile, it uses a residual sub-network for\nfeature distillation to reach a balance between the two distinct objectives of\nlearning generalizable and video-specific features.\n We extensively evaluate FROSTER on open-vocabulary action recognition\nbenchmarks under both base-to-novel and cross-dataset settings. FROSTER\nconsistently achieves state-of-the-art performance on all datasets across the\nboard. Project page: https://visual-ai.github.io/froster.\n","authors":["Xiaohu Huang","Hao Zhou","Kun Yao","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2402.03241v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2209.03275v2","updated":"2024-02-05T17:54:04Z","published":"2022-09-07T16:27:34Z","title":"Multimodal Speech Enhancement Using Burst Propagation","summary":" This paper proposes MBURST, a novel multimodal solution for audio-visual\nspeech enhancement that considers the most recent neurological discoveries\nregarding pyramidal cells of the prefrontal cortex and other brain regions. The\nso-called burst propagation implements several criteria to address the credit\nassignment problem in a more biologically plausible manner: steering the sign\nand magnitude of plasticity through feedback, multiplexing the feedback and\nfeedforward information across layers through different weight connections,\napproximating feedback and feedforward connections, and linearizing the\nfeedback signals. 
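A minimal sketch of a residual feature-distillation loss in the spirit of the FROSTER entry above (the shapes, projector architecture, and MSE objective are assumptions, not the paper's exact design): the student feature keeps an identity path toward the frozen teacher while a small residual branch absorbs video-specific adaptation.

```python
import torch
import torch.nn as nn

class ResidualBranch(nn.Module):
    """Student feature plus a learned residual, pulled toward the frozen teacher."""
    def __init__(self, dim=512):
        super().__init__()
        self.proj = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, dim))

    def forward(self, f_student):
        return f_student + self.proj(f_student)     # identity path keeps generality

f_student = torch.randn(8, 512, requires_grad=True)  # tuned video features
f_teacher = torch.randn(8, 512)                      # frozen CLIP features (no grad)
branch = ResidualBranch()
loss = nn.functional.mse_loss(branch(f_student), f_teacher)
loss.backward()
print(float(loss))
```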
MBURST benefits from such capabilities to learn correlations\nbetween the noisy signal and the visual stimuli, thus attributing meaning to\nthe speech by amplifying relevant information and suppressing noise.\nExperiments conducted over a Grid Corpus and CHiME3-based dataset show that\nMBURST can reproduce similar mask reconstructions to the multimodal\nbackpropagation-based baseline while demonstrating outstanding energy\nefficiency management, reducing the neuron firing rates to values up to\n\textbf{$70\%$} lower. Such a feature implies more sustainable implementations,\nsuitable and desirable for hearing aids or any other similar embedded systems.\n","authors":["Mohsin Raza","Leandro A. Passos","Ahmed Khubaib","Ahsan Adeel"],"pdf_url":"https://arxiv.org/pdf/2209.03275v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03235v1","updated":"2024-02-05T17:52:58Z","published":"2024-02-05T17:52:58Z","title":"ActiveAnno3D -- An Active Learning Framework for Multi-Modal 3D Object\n Detection","summary":" The curation of large-scale datasets is still costly and requires much time\nand resources. Data is often manually labeled, and the challenge of creating\nhigh-quality datasets remains. In this work, we fill the research gap using\nactive learning for multi-modal 3D object detection. We propose ActiveAnno3D,\nan active learning framework to select data samples for labeling that are of\nmaximum informativeness for training. We explore various continuous training\nmethods and integrate the most efficient method regarding computational demand\nand detection performance. Furthermore, we perform extensive experiments and\nablation studies with BEVFusion and PV-RCNN on the nuScenes and TUM Traffic\nIntersection dataset. We show that we can achieve almost the same performance\nwith PV-RCNN and the entropy-based query strategy when using only half of the\ntraining data (77.25 mAP compared to 83.50 mAP) of the TUM Traffic Intersection\ndataset. BEVFusion achieved an mAP of 64.31 when using half of the training\ndata and 75.0 mAP when using the complete nuScenes dataset. We integrate our\nactive learning framework into the proAnno labeling tool to enable AI-assisted\ndata selection and labeling and minimize the labeling costs. Finally, we\nprovide code, weights, and visualization results on our website:\nhttps://active3d-framework.github.io/active3d-framework.\n","authors":["Ahmed Ghita","Bjørk Antoniussen","Walter Zimmer","Ross Greer","Christian Creß","Andreas Møgelmose","Mohan M. Trivedi","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2402.03235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03232v1","updated":"2024-02-05T17:45:12Z","published":"2024-02-05T17:45:12Z","title":"Smart Flow Matching: On The Theory of Flow Matching Algorithms with\n Applications","summary":" The paper presents the exact formula for the vector field that minimizes the\nloss for the standard flow. This formula depends analytically on a given\ndistribution $\rho_0$ and an unknown one $\rho_1$. Based on the presented formula,\na new loss and algorithm for training a vector field model in the style of\nConditional Flow Matching are provided. Our loss, in comparison to the standard\nConditional Flow Matching approach, exhibits smaller variance when evaluated\nthrough Monte Carlo sampling methods. 
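The entropy-based query strategy mentioned in the ActiveAnno3D entry above reduces to a few lines; a generic sketch with mock per-sample class confidences (not the framework's actual scoring code):

```python
import numpy as np

def entropy_query(probs, k):
    """Return indices of the k samples with the highest predictive entropy."""
    ent = -(probs * np.log(probs.clip(1e-12))).sum(axis=1)
    return np.argsort(-ent)[:k]

rng = np.random.default_rng(2)
probs = rng.dirichlet(np.ones(5), size=100)   # mock per-sample class confidences
print(entropy_query(probs, k=10))             # send these to the labelers first
```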
Numerical experiments on synthetic models\nand on high-dimensional tabular data demonstrate better learning\nresults with the presented algorithm.\n","authors":["Gleb Ryzhakov","Svetlana Pavlova","Egor Sevriugov","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2402.03232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03231v1","updated":"2024-02-05T17:44:21Z","published":"2024-02-05T17:44:21Z","title":"Improved prediction of future user activity in online A/B testing","summary":" In online randomized experiments or A/B tests, accurate predictions of\nparticipant inclusion rates are of paramount importance. These predictions not\nonly guide experimenters in optimizing the experiment's duration but also\nenhance the precision of treatment effect estimates. In this paper we present a\nnovel, straightforward, and scalable Bayesian nonparametric approach for\npredicting the rate at which individuals will be exposed to interventions\nwithin the realm of online A/B testing. Our approach stands out by offering\ndual prediction capabilities: it forecasts both the quantity of new customers\nexpected in future time windows and, unlike available alternative methods, the\nnumber of times they will be observed. We derive closed-form expressions for\nthe posterior distributions of the quantities needed to form predictions about\nfuture user activity, thereby bypassing the need for numerical algorithms such\nas Markov chain Monte Carlo. After a comprehensive exposition of our model, we\ntest its performance on experiments on real and simulated data, where we show\nits superior performance with respect to existing alternatives in the\nliterature.\n","authors":["Lorenzo Masoero","Mario Beraha","Thomas Richardson","Stefano Favaro"],"pdf_url":"https://arxiv.org/pdf/2402.03231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03230v1","updated":"2024-02-05T17:43:02Z","published":"2024-02-05T17:43:02Z","title":"CT-based Anatomical Segmentation for Thoracic Surgical Planning: A\n Benchmark Study for 3D U-shaped Deep Learning Models","summary":" Recent rising interest in patient-specific thoracic surgical planning and\nsimulation requires efficient and robust creation of digital anatomical models\nfrom automatic medical image segmentation algorithms. Deep learning (DL) is now\nstate-of-the-art in various radiological tasks, and U-shaped DL models have\nparticularly excelled in medical image segmentation since the inception of the\n2D UNet. To date, many variants of U-shaped models have been proposed through the\nintegration of different attention mechanisms and network configurations.\nLeveraging the recent development of large multi-label databases, systematic\nbenchmark studies for these models can provide valuable insights for clinical\ndeployment and future model designs, but such studies are still rare. We\nconduct the first benchmark study for variants of 3D U-shaped models (3DUNet,\nSTUNet, AttentionUNet, SwinUNETR, FocalSegNet, and a novel 3D SwinUnet with\nfour variants) with a focus on CT-based anatomical segmentation for thoracic\nsurgery. Our study systematically examines the impact of different attention\nmechanisms, number of resolution stages, and network configurations on\nsegmentation accuracy and computational complexity. To allow cross-reference\nwith other recent benchmarking studies, we also included a performance\nassessment of the BTCV abdominal structural segmentation. 
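Referring back to the Smart Flow Matching entry above: for context, here is a minimal sketch of the standard Conditional Flow Matching loss it builds on (assumed linear-path form; the paper's contribution is a lower-variance replacement for the regression target, which is not shown):

```python
import torch
import torch.nn as nn

# Velocity-field model: input is (x_t, t), output a 2D velocity.
model = nn.Sequential(nn.Linear(3, 64), nn.SiLU(), nn.Linear(64, 2))

def cfm_loss(x0, x1):
    t = torch.rand(x0.size(0), 1)
    xt = (1 - t) * x0 + t * x1            # point on the straight path x0 -> x1
    target = x1 - x0                      # conditional velocity along that path
    v = model(torch.cat([xt, t], dim=1))
    return ((v - target) ** 2).mean()     # Monte Carlo estimate of the CFM loss

x0 = torch.randn(128, 2)                  # samples from rho_0
x1 = torch.randn(128, 2) + 4.0            # samples from rho_1
print(float(cfm_loss(x0, x1)))
```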
With the STUNet\nranking at the top, our study demonstrated the value of CNN-based U-shaped\nmodels for the investigated tasks and the benefit of residual blocks in network\nconfiguration designs to boost segmentation performance.\n","authors":["Arash Harirpoush","Amirhossein Rasoulian","Marta Kersten-Oertel","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.03230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03227v1","updated":"2024-02-05T17:38:49Z","published":"2024-02-05T17:38:49Z","title":"IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of\n brain MR images","summary":" In MRI studies, the aggregation of imaging data from multiple acquisition\nsites enhances sample size but may introduce site-related variabilities that\nhinder consistency in subsequent analyses. Deep learning methods for image\ntranslation have emerged as a solution for harmonizing MR images across sites.\nIn this study, we introduce IGUANe (Image Generation with Unified Adversarial\nNetworks), an original 3D model that leverages the strengths of domain\ntranslation and straightforward application of style transfer methods for\nmulticenter brain MR image harmonization. IGUANe extends the CycleGAN architecture\nby integrating an arbitrary number of domains for training through a\nmany-to-one strategy. During inference, the model can be applied to any image,\neven from an unknown acquisition site, making it a universal generator for\nharmonization. Trained on a dataset comprising T1-weighted images from 11\ndifferent scanners, IGUANe was evaluated on data from unseen sites. The\nassessments included the transformation of MR images with traveling subjects,\nthe preservation of pairwise distances between MR images within domains, the\nevolution of volumetric patterns related to age and Alzheimer's disease\n(AD), and the performance in age regression and patient classification tasks.\nComparisons with other harmonization and normalization methods suggest that\nIGUANe better preserves individual information in MR images and is more\nsuitable for maintaining and reinforcing variabilities related to age and AD.\nFuture studies may further assess IGUANe in other multicenter contexts, either\nusing the same model or retraining it for applications to different image\nmodalities.\n","authors":["Vincent Roca","Grégory Kuchcinski","Jean-Pierre Pruvo","Dorian Manouvriez","Renaud Lopes"],"pdf_url":"https://arxiv.org/pdf/2402.03227v1.pdf","comment":"23 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.03226v1","updated":"2024-02-05T17:37:46Z","published":"2024-02-05T17:37:46Z","title":"FuseMoE: Mixture-of-Experts Transformers for Fleximodal Fusion","summary":" As machine learning models in critical fields increasingly grapple with\nmultimodal data, they face the dual challenges of handling a wide array of\nmodalities, often incomplete due to missing elements, and the temporal\nirregularity and sparsity of collected samples. Successfully leveraging this\ncomplex data, while overcoming the scarcity of high-quality training samples,\nis key to improving these models' predictive performance. We introduce\n``FuseMoE'', a mixture-of-experts framework incorporated with an innovative\ngating function. Designed to integrate a diverse number of modalities, FuseMoE\nis effective in managing scenarios with missing modalities and irregularly\nsampled data trajectories. 
Theoretically, our unique gating function\ncontributes to enhanced convergence rates, leading to better performance in\nmultiple downstream tasks. The practical utility of FuseMoE in the real world is\nvalidated by a challenging set of clinical risk prediction tasks.\n","authors":["Xing Han","Huy Nguyen","Carl Harris","Nhat Ho","Suchi Saria"],"pdf_url":"https://arxiv.org/pdf/2402.03226v1.pdf","comment":"35 pages, 8 tables, 5 figures"},{"id":"http://arxiv.org/abs/2402.03220v1","updated":"2024-02-05T17:30:42Z","published":"2024-02-05T17:30:42Z","title":"The Benefits of Reusing Batches for Gradient Descent in Two-Layer\n Networks: Breaking the Curse of Information and Leap Exponents","summary":" We investigate the training dynamics of two-layer neural networks when\nlearning multi-index target functions. We focus on multi-pass gradient descent\n(GD) that reuses the batches multiple times and show that it significantly\nchanges the conclusion about which functions are learnable compared to\nsingle-pass gradient descent. In particular, multi-pass GD with finite stepsize\nis found to overcome the limitations of gradient flow and single-pass GD given\nby the information exponent (Ben Arous et al., 2021) and leap exponent (Abbe et\nal., 2023) of the target function. We show that upon re-using batches, the\nnetwork achieves in just two time steps an overlap with the target subspace\neven for functions not satisfying the staircase property (Abbe et al., 2021).\nWe characterize the (broad) class of functions efficiently learned in finite\ntime. The proof of our results is based on the analysis of the Dynamical\nMean-Field Theory (DMFT). We further provide a closed-form description of the\ndynamical process of the low-dimensional projections of the weights, and\nnumerical experiments illustrating the theory.\n","authors":["Yatin Dandi","Emanuele Troiani","Luca Arnaboldi","Luca Pesce","Lenka Zdeborová","Florent Krzakala"],"pdf_url":"https://arxiv.org/pdf/2402.03220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03216v1","updated":"2024-02-05T17:26:49Z","published":"2024-02-05T17:26:49Z","title":"BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity\n Text Embeddings Through Self-Knowledge Distillation","summary":" In this paper, we present a new embedding model, called M3-Embedding, which\nis distinguished for its versatility in Multi-Linguality, Multi-Functionality,\nand Multi-Granularity. It can support more than 100 working languages, leading\nto new state-of-the-art performances on multi-lingual and cross-lingual\nretrieval tasks. It can simultaneously perform the three common retrieval\nfunctionalities of an embedding model: dense retrieval, multi-vector retrieval,\nand sparse retrieval, which provides a unified model foundation for real-world\nIR applications. It is able to process inputs of different granularities,\nspanning from short sentences to long documents of up to 8192 tokens. The\neffective training of M3-Embedding involves the following technical\ncontributions. We propose a novel self-knowledge distillation approach, where\nthe relevance scores from different retrieval functionalities can be integrated\nas the teacher signal to enhance the training quality. We also optimize the\nbatching strategy, enabling a large batch size and high training throughput to\nensure the discriminativeness of embeddings. To the best of our knowledge,\nM3-Embedding is the first embedding model that realizes such a strong\nversatility. 
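For orientation on the FuseMoE entry above, below is the standard softmax top-k mixture-of-experts gate as a baseline sketch only; FuseMoE's contribution is a different, innovative gating function that is not reproduced here:

```python
import torch
import torch.nn as nn

class TopKGate(nn.Module):
    """Route each token to k experts; weights renormalized over the chosen ones."""
    def __init__(self, dim, n_experts, k=2):
        super().__init__()
        self.scorer = nn.Linear(dim, n_experts)
        self.k = k

    def forward(self, x):
        logits = self.scorer(x)
        top_vals, top_idx = logits.topk(self.k, dim=-1)
        weights = torch.softmax(top_vals, dim=-1)   # mixture weights over chosen experts
        return weights, top_idx

gate = TopKGate(dim=32, n_experts=8)
w, idx = gate(torch.randn(4, 32))
print(w.shape, idx.shape)    # torch.Size([4, 2]) torch.Size([4, 2])
```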
The model and code will be publicly available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Jianlv Chen","Shitao Xiao","Peitian Zhang","Kun Luo","Defu Lian","Zheng Liu"],"pdf_url":"https://arxiv.org/pdf/2402.03216v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2402.03214v1","updated":"2024-02-05T17:25:04Z","published":"2024-02-05T17:25:04Z","title":"Organic or Diffused: Can We Distinguish Human Art from AI-generated\n Images?","summary":" The advent of generative AI images has completely disrupted the art world.\nDistinguishing AI-generated images from human art is a challenging problem whose\nimpact is growing over time. The failure to address this problem allows bad\nactors to defraud individuals paying a premium for human art, and companies\nwhose stated policies forbid AI imagery. This is also critical for AI model\ntrainers, who need to filter training data to avoid potential model collapse.\nThere are several different approaches to distinguishing human art from AI\nimages, including classifiers trained by supervised learning, research tools\ntargeting diffusion models, and identification by professional artists using\ntheir knowledge of artistic techniques. In this paper, we seek to understand\nhow well these approaches can perform against today's modern generative models\nin both benign and adversarial settings. We curate real human art across 7\nstyles, generate matching images from 5 generative models, and apply 8\ndetectors (5 automated detectors and 3 different human groups including 180\ncrowdworkers, 4000+ professional artists, and 13 expert artists experienced at\ndetecting AI). Both Hive and expert artists do very well, but make mistakes in\ndifferent ways (Hive is weaker against adversarial perturbations while expert\nartists produce more false positives). We believe these weaknesses will\nremain as models continue to evolve, and use our data to demonstrate why a\ncombined team of human and automated detectors provides the best combination of\naccuracy and robustness.\n","authors":["Anna Yoo Jeong Ha","Josephine Passananti","Ronik Bhaskar","Shawn Shan","Reid Southen","Haitao Zheng","Ben Y. Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.03214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03207v1","updated":"2024-02-05T17:17:57Z","published":"2024-02-05T17:17:57Z","title":"Light and Optimal Schrödinger Bridge Matching","summary":" Schr\\"odinger Bridges (SB) have recently gained the attention of the ML\ncommunity as a promising extension of classic diffusion models which is also\ninterconnected to the Entropic Optimal Transport (EOT). Recent solvers for SB\nexploit the pervasive bridge matching procedures. Such procedures aim to\nrecover a stochastic process transporting the mass between distributions given\nonly a transport plan between them. In particular, given the EOT plan, these\nprocedures can be adapted to solve SB. This fact is heavily exploited by recent\nworks giving rise to matching-based SB solvers. The cornerstone here is\nrecovering the EOT plan: recent works either use heuristic approximations\n(e.g., the minibatch OT) or establish iterative matching procedures which by\ndesign accumulate error during training. We address these\nlimitations and propose a novel procedure to learn SB which we call the\n\\textbf{optimal Schr\\"odinger bridge matching}. 
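Returning to the M3-Embedding entry above: its self-knowledge distillation integrates the relevance scores of the three retrieval modes into a teacher signal for each individual head. A toy numpy rendering under assumed score shapes and a simple sum-based integration (the real training recipe may differ):

```python
import numpy as np

def softmax(s):
    e = np.exp(s - s.max())
    return e / e.sum()

rng = np.random.default_rng(3)
dense, sparse, multi = rng.normal(size=(3, 8))    # scores for 8 candidate passages

teacher = softmax(dense + sparse + multi)         # integrated teacher distribution
kd_loss = sum(-(teacher * np.log(softmax(s))).sum()
              for s in (dense, sparse, multi))    # cross-entropy per retrieval head
print(kd_loss)
```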
It exploits the optimal\nparameterization of the diffusion process and provably recovers the SB process\n\\textbf{(a)} with a single bridge matching step and \\textbf{(b)} with an arbitrary\ntransport plan as the input. Furthermore, we show that the optimal bridge\nmatching objective coincides with the recently discovered energy-based modeling\n(EBM) objectives to learn EOT/SB. Inspired by this observation, we develop a\nlight solver (which we call LightSB-M) to implement optimal matching in\npractice using the Gaussian mixture parameterization of the Schr\\"odinger\npotential. We experimentally showcase the performance of our solver in a range\nof practical tasks. The code for the LightSB-M solver can be found at\n\\url{https://github.com/SKholkin/LightSB-Matching}.\n","authors":["Nikita Gushchin","Sergei Kholkin","Evgeny Burnaev","Alexander Korotin"],"pdf_url":"https://arxiv.org/pdf/2402.03207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03204v1","updated":"2024-02-05T17:15:00Z","published":"2024-02-05T17:15:00Z","title":"Multi-agent Reinforcement Learning for Energy Saving in Multi-Cell\n Massive MIMO Systems","summary":" We develop a multi-agent reinforcement learning (MARL) algorithm to minimize\nthe total energy consumption of multiple massive MIMO (multiple-input\nmultiple-output) base stations (BSs) in a multi-cell network while preserving\nthe overall quality-of-service (QoS) by making decisions on the multi-level\nadvanced sleep modes (ASMs) and antenna switching of these BSs. The problem is\nmodeled as a decentralized partially observable Markov decision process\n(DEC-POMDP) to enable collaboration between individual BSs, which is necessary\nto tackle inter-cell interference. A multi-agent proximal policy optimization\n(MAPPO) algorithm is designed to learn a collaborative BS control policy. To\nenhance its scalability, a modified version called MAPPO-neighbor policy is\nfurther proposed. Simulation results demonstrate that the trained MAPPO agent\nachieves better performance compared to baseline policies. Specifically,\ncompared to the auto sleep mode 1 (symbol-level sleeping) algorithm, the\nMAPPO-neighbor policy reduces power consumption by approximately 8.7% during\nlow-traffic hours and improves energy efficiency by approximately 19% during\nhigh-traffic hours.\n","authors":["Tianzhang Cai","Qichen Wang","Shuai Zhang","Özlem Tuğfe Demir","Cicek Cavdar"],"pdf_url":"https://arxiv.org/pdf/2402.03204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15222v2","updated":"2024-02-05T17:13:41Z","published":"2024-01-26T22:19:31Z","title":"Transfer Learning for the Prediction of Entity Modifiers in Clinical\n Text: Application to Opioid Use Disorder Case Detection","summary":" Background: The semantics of entities extracted from a clinical text can be\ndramatically altered by modifiers, including entity negation, uncertainty,\nconditionality, severity, and subject. Existing models for determining\nmodifiers of clinical entities involve regular expressions or feature weights\nthat are trained independently for each modifier.\n Methods: We develop and evaluate a multi-task transformer architecture design\nwhere modifiers are learned and predicted jointly using the publicly available\nSemEval 2015 Task 14 corpus and a new Opioid Use Disorder (OUD) data set that\ncontains modifiers shared with SemEval as well as novel modifiers specific for\nOUD. 
We evaluate the effectiveness of our multi-task learning approach versus\npreviously published systems and assess the feasibility of transfer learning\nfor clinical entity modifiers when only a portion of clinical modifiers are\nshared.\n Results: Our approach achieved state-of-the-art results on the ShARe corpus\nfrom SemEval 2015 Task 14, showing an increase of 1.1% on weighted accuracy,\n1.7% on unweighted accuracy, and 10% on micro F1 scores.\n Conclusions: We show that learned weights from our shared model can be\neffectively transferred to a new partially matched data set, validating the use\nof transfer learning for clinical text modifiers.\n","authors":["Abdullateef I. Almudaifer","Whitney Covington","JaMor Hairston","Zachary Deitch","Ankit Anand","Caleb M. Carroll","Estera Crisan","William Bradford","Lauren Walter","Eaton Ellen","Sue S. Feldman","John D. Osborne"],"pdf_url":"https://arxiv.org/pdf/2401.15222v2.pdf","comment":"18 pages, 2 figures, 6 tables. To be submitted to the Journal of\n Biomedical Semantics"},{"id":"http://arxiv.org/abs/2402.03201v1","updated":"2024-02-05T17:12:21Z","published":"2024-02-05T17:12:21Z","title":"Guidance with Spherical Gaussian Constraint for Conditional Diffusion","summary":" Recent advances in diffusion models attempt to handle conditional generative\ntasks by utilizing a differentiable loss function for guidance without the need\nfor additional training. While these methods have achieved some success, they\noften compromise on sample quality and require small guidance step sizes,\nleading to longer sampling processes. This paper reveals that the fundamental\nissue lies in the manifold deviation during the sampling process when loss\nguidance is employed. We theoretically show the existence of manifold deviation\nby establishing a certain lower bound for the estimation error of the loss\nguidance. To mitigate this problem, we propose Diffusion with Spherical\nGaussian constraint (DSG), drawing inspiration from the concentration\nphenomenon in high-dimensional Gaussian distributions. DSG effectively\nconstrains the guidance step within the intermediate data manifold through\noptimization and enables the use of larger guidance steps. Furthermore, we\npresent a closed-form solution for DSG denoising with the Spherical Gaussian\nconstraint. Notably, DSG can seamlessly integrate as a plugin module within\nexisting training-free conditional diffusion methods. Implementing DSG merely\ninvolves a few lines of additional code with almost no extra computational\noverhead, yet it leads to significant performance improvements. Comprehensive\nexperimental results in various conditional generation tasks validate the\nsuperiority and adaptability of DSG in terms of both sample quality and time\nefficiency.\n","authors":["Lingxiao Yang","Shutong Ding","Yifan Cai","Jingyi Yu","Jingya Wang","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2402.03201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15574v2","updated":"2024-02-05T17:00:41Z","published":"2023-12-25T01:00:58Z","title":"Faster Rates for Switchback Experiments","summary":" Switchback experimental design, wherein a single unit (e.g., a whole system)\nis exposed to a single random treatment for interspersed blocks of time,\ntackles both cross-unit and temporal interference. 
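The concentration phenomenon behind the DSG entry above can be sketched directly: a d-dimensional N(mu, sigma^2 I) sample lies near the sphere of radius sigma * sqrt(d) around mu, so a guided denoising step can be projected back onto that sphere. This is an illustrative projection only, not the paper's closed-form constrained solution:

```python
import torch

def project_to_sphere(x, mu, sigma):
    """Project x onto the sphere of radius sigma * sqrt(d) centered at mu."""
    radius = sigma * x.numel() ** 0.5
    offset = x - mu
    return mu + radius * offset / offset.norm().clamp_min(1e-12)

d, sigma = 1024, 0.3
mu = torch.zeros(d)
x_guided = mu + sigma * torch.randn(d) + 0.5   # guidance pushed the step off-sphere
x_dsg = project_to_sphere(x_guided, mu, sigma)
print(float((x_dsg - mu).norm()))              # ~ sigma * sqrt(d) = 9.6
```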
Hu and Wager (2022) recently\nproposed a treatment-effect estimator that truncates the beginnings of blocks\nand established a $T^{-1/3}$ rate for estimating the global average treatment\neffect (GATE) in a Markov setting with rapid mixing. They claim this rate is\noptimal and suggest focusing instead on a different (and design-dependent)\nestimand so as to enjoy a faster rate. For the same design we propose an\nalternative estimator that uses the whole block and surprisingly show that it\nin fact achieves an estimation rate of $\\sqrt{\\log T/T}$ for the original\ndesign-independent GATE estimand under the same assumptions.\n","authors":["Su Jia","Nathan Kallus","Christina Lee Yu"],"pdf_url":"https://arxiv.org/pdf/2312.15574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03191v1","updated":"2024-02-05T16:57:24Z","published":"2024-02-05T16:57:24Z","title":"Isotropy, Clusters, and Classifiers","summary":" Whether embedding spaces use all their dimensions equally, i.e., whether they\nare isotropic, has been a recent subject of discussion. Evidence has been\naccrued both for and against enforcing isotropy in embedding spaces. In the\npresent paper, we stress that isotropy imposes requirements on the embedding\nspace that are not compatible with the presence of clusters -- which also\nnegatively impacts linear classification objectives. We demonstrate this fact\nempirically and use it to shed light on previous results from the literature.\n","authors":["Timothee Mickus","Stig-Arne Grönroos","Joseph Attieh"],"pdf_url":"https://arxiv.org/pdf/2402.03191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03190v1","updated":"2024-02-05T16:56:11Z","published":"2024-02-05T16:56:11Z","title":"Unified Hallucination Detection for Multimodal Large Language Models","summary":" Despite significant strides in multimodal tasks, Multimodal Large Language\nModels (MLLMs) are plagued by the critical issue of hallucination. The reliable\ndetection of such hallucinations in MLLMs has, therefore, become a vital aspect\nof model evaluation and the safeguarding of practical application deployment.\nPrior research in this domain has been constrained by a narrow focus on\nsingular tasks, an inadequate range of hallucination categories addressed, and\na lack of detailed granularity. In response to these challenges, our work\nexpands the investigative horizons of hallucination detection. We present a\nnovel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate\nthe evaluation of advancements in hallucination detection methods.\nAdditionally, we unveil a novel unified multimodal hallucination detection\nframework, UNIHD, which leverages a suite of auxiliary tools to validate the\noccurrence of hallucinations robustly. We demonstrate the effectiveness of\nUNIHD through meticulous evaluation and comprehensive analysis. We also provide\nstrategic insights on the application of specific tools for addressing various\ncategories of hallucinations.\n","authors":["Xiang Chen","Chenxi Wang","Yida Xue","Ningyu Zhang","Xiaoyan Yang","Qiang Li","Yue Shen","Jinjie Gu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03190v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2402.03187v1","updated":"2024-02-05T16:51:59Z","published":"2024-02-05T16:51:59Z","title":"How Good is a Single Basin?","summary":" The multi-modal nature of neural loss landscapes is often considered to be\nthe main driver behind the empirical success of deep ensembles. 
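A quick probe of the isotropy-versus-clusters tension from the isotropy entry above (illustrative only): clustered embeddings concentrate variance in a few directions, so their singular-value spread is far from the uniform spectrum that isotropy would require.

```python
import numpy as np

rng = np.random.default_rng(4)
# Two well-separated clusters: variance concentrates along the separating direction.
emb = np.vstack([rng.normal(size=(500, 64)), rng.normal(size=(500, 64)) + 8.0])

s = np.linalg.svd(emb - emb.mean(axis=0), compute_uv=False)
print("min/max singular value ratio:", s.min() / s.max())   # << 1: anisotropic
```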
In this work,\nwe probe this belief by constructing various \"connected\" ensembles which are\nrestricted to lie in the same basin. Through our experiments, we demonstrate\nthat increased connectivity indeed negatively impacts performance. However,\nwhen incorporating the knowledge from other basins implicitly through\ndistillation, we show that the gap in performance can be mitigated by\nre-discovering (multi-basin) deep ensembles within a single basin. Thus, we\nconjecture that while the extra-basin knowledge is at least partially present\nin any given basin, it cannot be easily harnessed without learning it from\nother basins.\n","authors":["Kai Lion","Lorenzo Noci","Thomas Hofmann","Gregor Bachmann"],"pdf_url":"https://arxiv.org/pdf/2402.03187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03183v1","updated":"2024-02-05T16:47:13Z","published":"2024-02-05T16:47:13Z","title":"Predicting Configuration Performance in Multiple Environments with\n Sequential Meta-learning","summary":" Learning and predicting the performance of given software configurations are\nof high importance to many software engineering activities. While configurable\nsoftware systems will almost certainly face diverse running environments (e.g.,\nversion, hardware, and workload), current work often either builds performance\nmodels under a single environment or fails to properly handle data from diverse\nsettings, hence restricting their accuracy for new environments. In this paper,\nwe target configuration performance learning under multiple environments. We do\nso by designing SeMPL - a meta-learning framework that learns the common\nunderstanding from configurations measured in distinct (meta) environments and\ngeneralizes them to the unforeseen target environment. What makes it unique is\nthat, unlike common meta-learning frameworks (e.g., MAML and MetaSGD) that train\nthe meta environments in parallel, we train them sequentially, one at a time.\nThe order of training naturally allows discriminating the contributions among\nmeta environments in the meta-model built, which fits better with the\ncharacteristic of configuration data that is known to dramatically differ\nbetween different environments. Through comparing with 15 state-of-the-art\nmodels under nine systems, our extensive experimental results demonstrate that\nSeMPL performs considerably better on 89% of the systems with up to 99%\naccuracy improvement, while being data-efficient, leading to a maximum of 3.86x\nspeedup. All code and data can be found at our repository:\nhttps://github.com/ideas-labo/SeMPL.\n","authors":["Jingzhi Gong","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03183v1.pdf","comment":"This paper has been accepted by FSE'24"},{"id":"http://arxiv.org/abs/2402.03182v1","updated":"2024-02-05T16:46:35Z","published":"2024-02-05T16:46:35Z","title":"Empowering Time Series Analysis with Large Language Models: A Survey","summary":" Recently, remarkable progress has been made with large language models\n(LLMs), demonstrating their unprecedented capability across a variety of natural\nlanguage tasks. However, completely training a large general-purpose model from\nscratch is challenging for time series analysis, due to the large volumes\nand varieties of time series data, as well as the non-stationarity that leads\nto concept drift impeding continuous model adaptation and re-training. 
Recent\nadvances have shown that pre-trained LLMs can be exploited to capture complex\ndependencies in time series data and facilitate various applications. In this\nsurvey, we provide a systematic overview of existing methods that leverage LLMs\nfor time series analysis. Specifically, we first state the challenges and\nmotivations of applying language models in the context of time series as well\nas brief preliminaries of LLMs. Next, we summarize the general pipeline for\nLLM-based time series analysis, categorize existing methods into different\ngroups (i.e., direct query, tokenization, prompt design, fine-tune, and model\nintegration), and highlight the key ideas within each group. We also discuss\nthe applications of LLMs for both general and spatial-temporal time series\ndata, tailored to specific domains. Finally, we thoroughly discuss future\nresearch opportunities to empower time series analysis with LLMs.\n","authors":["Yushan Jiang","Zijie Pan","Xikun Zhang","Sahil Garg","Anderson Schneider","Yuriy Nevmyvaka","Dongjin Song"],"pdf_url":"https://arxiv.org/pdf/2402.03182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03179v1","updated":"2024-02-05T16:45:38Z","published":"2024-02-05T16:45:38Z","title":"Cool-chic video: Learned video coding with 800 parameters","summary":" We propose a lightweight learned video codec with 900 multiplications per\ndecoded pixel and 800 parameters overall. To the best of our knowledge, this is\none of the neural video codecs with the lowest decoding complexity. It is built\nupon the overfitted image codec Cool-chic and supplements it with an inter\ncoding module to leverage the video's temporal redundancies. The proposed model\nis able to compress videos using both low-delay and random access\nconfigurations and achieves rate-distortion close to AVC while out-performing\nother overfitted codecs such as FFNeRV. The system is made open-source:\norange-opensource.github.io/Cool-Chic.\n","authors":["Thomas Leguay","Théo Ladune","Pierrick Philippe","Olivier Déforges"],"pdf_url":"https://arxiv.org/pdf/2402.03179v1.pdf","comment":"10 pages, published in Data Compression Conference 2024"},{"id":"http://arxiv.org/abs/2402.03177v1","updated":"2024-02-05T16:44:17Z","published":"2024-02-05T16:44:17Z","title":"CIDAR: Culturally Relevant Instruction Dataset For Arabic","summary":" Instruction tuning has emerged as a prominent methodology for teaching Large\nLanguage Models (LLMs) to follow instructions. However, current instruction\ndatasets predominantly cater to English or are derived from English-dominated\nLLMs, resulting in inherent biases toward Western culture. This bias\nsignificantly impacts the linguistic structures of non-English languages such\nas Arabic, which has a distinct grammar reflective of the diverse cultures\nacross the Arab region. This paper addresses this limitation by introducing\nCIDAR: https://hf.co/datasets/arbml/CIDAR, the first open Arabic\ninstruction-tuning dataset culturally-aligned by human reviewers. CIDAR\ncontains 10,000 instruction and output pairs that represent the Arab region. We\ndiscuss the cultural relevance of CIDAR via the analysis and comparison to\nother models fine-tuned on other datasets. Our experiments show that CIDAR can\nhelp enrich research efforts in aligning LLMs with the Arabic culture. All the\ncode is available at https://github.com/ARBML/CIDAR.\n","authors":["Zaid Alyafeai","Khalid Almubarak","Ahmed Ashraf","Deema Alnuhait","Saied Alshahrani","Gubran A. Q. 
Abdulrahman","Gamil Ahmed","Qais Gawah","Zead Saleh","Mustafa Ghaleb","Yousef Ali","Maged S. Al-Shaibani"],"pdf_url":"https://arxiv.org/pdf/2402.03177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03176v1","updated":"2024-02-05T16:43:53Z","published":"2024-02-05T16:43:53Z","title":"Comparison of Topic Modelling Approaches in the Banking Context","summary":" Topic modelling is a prominent task for automatic topic extraction in many\napplications such as sentiment analysis and recommendation systems. The\napproach is vital for service industries to monitor their customer discussions.\nThe use of traditional approaches such as Latent Dirichlet Allocation (LDA) for\ntopic discovery has shown great performances, however, they are not consistent\nin their results as these approaches suffer from data sparseness and inability\nto model the word order in a document. Thus, this study presents the use of\nKernel Principal Component Analysis (KernelPCA) and K-means Clustering in the\nBERTopic architecture. We have prepared a new dataset using tweets from\ncustomers of Nigerian banks and we use this to compare the topic modelling\napproaches. Our findings showed KernelPCA and K-means in the BERTopic\narchitecture-produced coherent topics with a coherence score of 0.8463.\n","authors":["Bayode Ogunleye","Tonderai Maswera","Laurence Hirsch","Jotham Gaudoin","Teresa Brunsdon"],"pdf_url":"https://arxiv.org/pdf/2402.03176v1.pdf","comment":"14 pages, Journal of Applied Science"},{"id":"http://arxiv.org/abs/2402.03175v1","updated":"2024-02-05T16:42:10Z","published":"2024-02-05T16:42:10Z","title":"The Matrix: A Bayesian learning model for LLMs","summary":" In this paper, we introduce a Bayesian learning model to understand the\nbehavior of Large Language Models (LLMs). We explore the optimization metric of\nLLMs, which is based on predicting the next token, and develop a novel model\ngrounded in this principle. Our approach involves constructing an ideal\ngenerative text model represented by a multinomial transition probability\nmatrix with a prior, and we examine how LLMs approximate this matrix. We\ndiscuss the continuity of the mapping between embeddings and multinomial\ndistributions, and present the Dirichlet approximation theorem to approximate\nany prior. Additionally, we demonstrate how text generation by LLMs aligns with\nBayesian learning principles and delve into the implications for in-context\nlearning, specifically explaining why in-context learning emerges in larger\nmodels where prompts are considered as samples to be updated. Our findings\nindicate that the behavior of LLMs is consistent with Bayesian Learning,\noffering new insights into their functioning and potential applications.\n","authors":["Siddhartha Dalal","Vishal Misra"],"pdf_url":"https://arxiv.org/pdf/2402.03175v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03174v1","updated":"2024-02-05T16:41:17Z","published":"2024-02-05T16:41:17Z","title":"Decentralized Event-Triggered Online Learning for Safe Consensus of\n Multi-Agent Systems with Gaussian Process Regression","summary":" Consensus control in multi-agent systems has received significant attention\nand practical implementation across various domains. However, managing\nconsensus control under unknown dynamics remains a significant challenge for\ncontrol design due to system uncertainties and environmental disturbances. This\npaper presents a novel learning-based distributed control law, augmented by an\nauxiliary dynamics. 
Gaussian processes are harnessed to compensate for the\nunknown components of the multi-agent system. For continuous enhancement of the\npredictive performance of the Gaussian process model, a data-efficient online\nlearning strategy with a decentralized event-triggered mechanism is proposed.\nFurthermore, the control performance of the proposed approach is ensured via\nthe Lyapunov theory, based on a probabilistic guarantee for prediction error\nbounds. To demonstrate the efficacy of the proposed learning-based controller,\na comparative analysis is conducted, contrasting it with both conventional\ndistributed control laws and offline learning methodologies.\n","authors":["Xiaobing Dai","Zewen Yang","Mengtian Xu","Fangzhou Liu","Georges Hattab","Sandra Hirche"],"pdf_url":"https://arxiv.org/pdf/2402.03174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14736v2","updated":"2024-02-05T16:41:10Z","published":"2023-11-21T19:12:18Z","title":"Data Diversity Matters for Robust Instruction Tuning","summary":" Recent works have shown that by curating high-quality and diverse instruction\ntuning datasets, we can significantly improve instruction-following\ncapabilities. However, creating such datasets is difficult and most works rely\non manual curation or proprietary language models. Automatic data curation is\ndifficult as it is still not clear how we can define diversity for instruction\ntuning, how diversity and quality depend on one another, and how we can optimize\ndataset quality and diversity. To resolve these issues, we propose a new\nalgorithm, Quality-Diversity Instruction Tuning (QDIT). QDIT provides a simple\nmethod to simultaneously control dataset diversity and quality, allowing us to\nconduct an in-depth study on the effect of diversity and quality on instruction\ntuning performance. From this study we draw two key insights: (1) there is a\nnatural tradeoff between data diversity and quality, and (2) increasing data\ndiversity significantly improves the worst-case instruction-following\nperformance, thereby improving robustness. We validate the performance of\nQDIT on several large-scale instruction tuning datasets, where we find it can\nsubstantially improve worst- and average-case performance compared to\nquality-driven data selection.\n","authors":["Alexander Bukharin","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.14736v2.pdf","comment":"22 pages, 18 figures"},{"id":"http://arxiv.org/abs/2402.03171v1","updated":"2024-02-05T16:39:15Z","published":"2024-02-05T16:39:15Z","title":"Homograph Attacks on Maghreb Sentiment Analyzers","summary":" We examine the impact of homograph attacks on the Sentiment Analysis (SA)\ntask of different Arabic dialects from the Maghreb North-African countries.\nHomograph attacks result in a 65.3% decrease in transformer classification from\nan F1-score of 0.95 to 0.33 when data is written in \"Arabizi\". The goal of this\nstudy is to highlight LLMs' weaknesses and to prioritize ethical and\nresponsible Machine Learning.\n","authors":["Fatima Zahra Qachfar","Rakesh M. 
Verma"],"pdf_url":"https://arxiv.org/pdf/2402.03171v1.pdf","comment":"NAML, North Africans in Machine Leaning, NeurIPS, Neural Information\n Processing Systems"},{"id":"http://arxiv.org/abs/2402.03170v1","updated":"2024-02-05T16:39:12Z","published":"2024-02-05T16:39:12Z","title":"Is Mamba Capable of In-Context Learning?","summary":" This work provides empirical evidence that Mamba, a newly proposed selective\nstructured state space model, has similar in-context learning (ICL)\ncapabilities as transformers. We evaluated Mamba on tasks involving simple\nfunction approximation as well as more complex natural language processing\nproblems. Our results demonstrate that across both categories of tasks, Mamba\nmatches the performance of transformer models for ICL. Further analysis reveals\nthat like transformers, Mamba appears to solve ICL problems by incrementally\noptimizing its internal representations. Overall, our work suggests that Mamba\ncan be an efficient alternative to transformers for ICL tasks involving longer\ninput sequences.\n","authors":["Riccardo Grazzi","Julien Siems","Simon Schrodi","Thomas Brox","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2402.03170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03169v1","updated":"2024-02-05T16:38:30Z","published":"2024-02-05T16:38:30Z","title":"A Random Matrix Approach to Low-Multilinear-Rank Tensor Approximation","summary":" This work presents a comprehensive understanding of the estimation of a\nplanted low-rank signal from a general spiked tensor model near the\ncomputational threshold. Relying on standard tools from the theory of large\nrandom matrices, we characterize the large-dimensional spectral behavior of the\nunfoldings of the data tensor and exhibit relevant signal-to-noise ratios\ngoverning the detectability of the principal directions of the signal. These\nresults allow to accurately predict the reconstruction performance of truncated\nmultilinear SVD (MLSVD) in the non-trivial regime. This is particularly\nimportant since it serves as an initialization of the higher-order orthogonal\niteration (HOOI) scheme, whose convergence to the best low-multilinear-rank\napproximation depends entirely on its initialization. We give a sufficient\ncondition for the convergence of HOOI and show that the number of iterations\nbefore convergence tends to $1$ in the large-dimensional limit.\n","authors":["Hugo Lebeau","Florent Chatelain","Romain Couillet"],"pdf_url":"https://arxiv.org/pdf/2402.03169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18765v3","updated":"2024-02-05T16:37:42Z","published":"2023-10-28T17:28:07Z","title":"Rethinking Semi-Supervised Imbalanced Node Classification from\n Bias-Variance Decomposition","summary":" This paper introduces a new approach to address the issue of class imbalance\nin graph neural networks (GNNs) for learning on graph-structured data. Our\napproach integrates imbalanced node classification and Bias-Variance\nDecomposition, establishing a theoretical framework that closely relates data\nimbalance to model variance. We also leverage graph augmentation technique to\nestimate the variance, and design a regularization term to alleviate the impact\nof imbalance. Exhaustive tests are conducted on multiple benchmarks, including\nnaturally imbalanced datasets and public-split class-imbalanced datasets,\ndemonstrating that our approach outperforms state-of-the-art methods in various\nimbalanced scenarios. 
This work provides a novel theoretical perspective for\naddressing the problem of imbalanced node classification in GNNs.\n","authors":["Divin Yan","Gengchen Wei","Chen Yang","Shengzhong Zhang","Zengfeng Huang"],"pdf_url":"https://arxiv.org/pdf/2310.18765v3.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.13397v2","updated":"2024-02-05T16:37:11Z","published":"2023-10-20T10:12:06Z","title":"Equivariant Deep Weight Space Alignment","summary":" Permutation symmetries of deep networks make basic operations like model\nmerging and similarity estimation challenging. In many cases, aligning the\nweights of the networks, i.e., finding optimal permutations between their\nweights, is necessary. Unfortunately, weight alignment is an NP-hard problem.\nPrior research has mainly focused on solving relaxed versions of the alignment\nproblem, leading to either time-consuming methods or sub-optimal solutions. To\naccelerate the alignment process and improve its quality, we propose a novel\nframework aimed at learning to solve the weight alignment problem, which we\nname Deep-Align. To that end, we first prove that weight alignment adheres to\ntwo fundamental symmetries and then propose a deep architecture that respects\nthese symmetries. Notably, our framework does not require any labeled data. We\nprovide a theoretical analysis of our approach and evaluate Deep-Align on\nseveral types of network architectures and learning setups. Our experimental\nresults indicate that a feed-forward pass with Deep-Align produces better or\nequivalent alignments compared to those produced by current optimization\nalgorithms. Additionally, our alignments can be used as an effective\ninitialization for other methods, leading to improved solutions with a\nsignificant speedup in convergence.\n","authors":["Aviv Navon","Aviv Shamsian","Ethan Fetaya","Gal Chechik","Nadav Dym","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2310.13397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13544v2","updated":"2024-02-05T16:36:51Z","published":"2023-11-22T17:37:42Z","title":"Piecewise Polynomial Regression of Tame Functions via Integer\n Programming","summary":" We consider approximating so-called tame functions, a class of nonsmooth,\nnonconvex functions, with piecewise polynomial functions. Tame functions appear\nin a wide range of applications: functions encountered in the training of deep\nneural networks with all common activations, value functions of mixed-integer\nprograms, or wave functions of small molecules. We bound the quality of\napproximation of a tame function by a piecewise polynomial function with a\ngiven number of segments on any full-dimensional cube. We also present the\nfirst-ever mixed-integer programming formulation of piecewise polynomial\nregression. Together, these can be used to estimate tame functions. We\ndemonstrate promising computational results.\n","authors":["Gilles Bareilles","Johannes Aspman","Jiri Nemecek","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2311.13544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03167v1","updated":"2024-02-05T16:35:30Z","published":"2024-02-05T16:35:30Z","title":"Decentralized Bilevel Optimization over Graphs: Loopless Algorithmic\n Update and Transient Iteration Complexity","summary":" Stochastic bilevel optimization (SBO) is becoming increasingly essential in\nmachine learning due to its versatility in handling nested structures. 
To\naddress large-scale SBO, decentralized approaches have emerged as effective\nparadigms in which nodes communicate with immediate neighbors without a central\nserver, thereby improving communication efficiency and enhancing algorithmic\nrobustness. However, current decentralized SBO algorithms face challenges,\nincluding expensive inner-loop updates and an unclear understanding of the\ninfluence of network topology, data heterogeneity, and the nested bilevel\nalgorithmic structures. In this paper, we introduce a single-loop decentralized\nSBO (D-SOBA) algorithm and establish its transient iteration complexity, which,\nfor the first time, clarifies the joint influence of network topology and data\nheterogeneity on decentralized bilevel algorithms. D-SOBA achieves the\nstate-of-the-art asymptotic rate, asymptotic gradient/Hessian complexity, and\ntransient iteration complexity under more relaxed assumptions compared to\nexisting methods. Numerical experiments validate our theoretical findings.\n","authors":["Boao Kong","Shuchen Zhu","Songtao Lu","Xinmeng Huang","Kun Yuan"],"pdf_url":"https://arxiv.org/pdf/2402.03167v1.pdf","comment":"37 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.15393v2","updated":"2024-02-05T16:33:05Z","published":"2023-10-23T22:51:58Z","title":"DoGE: Domain Reweighting with Generalization Estimation","summary":" The coverage and composition of the pretraining data significantly impact\nthe generalization ability of Large Language Models (LLMs). Despite their\nimportance, recent LLMs still rely on heuristics and trial and error to\nincrease or reduce the influence of data-domains. We propose DOmain reweighting\nwith Generalization Estimation (DoGE), which optimizes the probability of\nsampling from each domain (domain weights) in a principled way. Our approach is\na two-stage process consisting of (i) training a proxy model to obtain domain\nweights using a bi-level optimization algorithm; (ii) training a larger base\nmodel by sampling training domains according to the learned domain weights. In\nour experiments, we extensively show how DoGE improves the generalization of\nthe base model to any target data mixture. On the SlimPajama dataset, our base\nmodel gets better perplexity and few-shot reasoning accuracies across $6$ tasks\ncompared to baseline methods. Moreover, aiming to generalize to out-of-domain\ntarget tasks, which are unseen in the pretraining corpus (OOD domain), DoGE can\neffectively identify inter-domain dependencies and consistently achieves\nbetter test perplexity on the target domain.\n","authors":["Simin Fan","Matteo Pagliardini","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2310.15393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09944v2","updated":"2024-02-05T16:33:03Z","published":"2023-09-18T17:04:04Z","title":"DiffusionWorldViewer: Exposing and Broadening the Worldview Reflected by\n Generative Text-to-Image Models","summary":" Generative text-to-image (TTI) models produce high-quality images from short\ntextual descriptions and are widely used in academic and creative domains. Like\nhumans, TTI models have a worldview, a conception of the world learned from\ntheir training data and task that influences the images they generate for a\ngiven prompt. However, the worldviews of TTI models are often hidden from\nusers, making it challenging for users to build intuition about TTI outputs,\nand they are often misaligned with users' worldviews, resulting in output\nimages that do not match user expectations. 
In response, we introduce\nDiffusionWorldViewer, an interactive interface that exposes a TTI model's\nworldview across output demographics and provides editing tools for aligning\noutput images with user perspectives. In a user study with 18 diverse TTI\nusers, we find that DiffusionWorldViewer helps users represent their varied\nviewpoints in generated images and challenge the limited worldview reflected in\ncurrent TTI models.\n","authors":["Zoe De Simone","Angie Boggust","Arvind Satyanarayan","Ashia Wilson"],"pdf_url":"https://arxiv.org/pdf/2309.09944v2.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.03158v1","updated":"2024-02-05T16:27:59Z","published":"2024-02-05T16:27:59Z","title":"Optimal and Near-Optimal Adaptive Vector Quantization","summary":" Quantization is a fundamental optimization for many machine-learning use\ncases, including compressing gradients, model weights and activations, and\ndatasets. The most accurate form of quantization is \\emph{adaptive}, where the\nerror is minimized with respect to a given input, rather than optimizing for\nthe worst case. However, optimal adaptive quantization methods are considered\ninfeasible in terms of both their runtime and memory requirements.\n We revisit the Adaptive Vector Quantization (AVQ) problem and present\nalgorithms that find optimal solutions with asymptotically improved time and\nspace complexity. We also present an even faster near-optimal algorithm for\nlarge inputs. Our experiments show our algorithms may open the door to using\nAVQ more extensively in a variety of machine learning applications.\n","authors":["Ran Ben-Basat","Yaniv Ben-Itzhak","Michael Mitzenmacher","Shay Vargaftik"],"pdf_url":"https://arxiv.org/pdf/2402.03158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03156v1","updated":"2024-02-05T16:24:12Z","published":"2024-02-05T16:24:12Z","title":"DogSurf: Quadruped Robot Capable of GRU-based Surface Recognition for\n Blind Person Navigation","summary":" This paper introduces DogSurf - a new approach to using quadruped robots to\nhelp visually impaired people navigate in the real world. The presented method\nallows the quadruped robot to detect slippery surfaces and to use audio and\nhaptic feedback to inform the user when to stop. A state-of-the-art GRU-based\nneural network architecture with a mean accuracy of 99.925% is proposed for the\ntask of multiclass surface classification for quadruped robots. A dataset was\ncollected on a Unitree Go1 Edu robot. The dataset and code have been posted to\nthe public domain.\n","authors":["Artem Bazhenov","Vladimir Berman","Sergei Satsevich","Olga Shalopanova","Miguel Altamirano Cabrera","Artem Lykov","Dzmitry Tsetserukou"],"pdf_url":"https://arxiv.org/pdf/2402.03156v1.pdf","comment":"This paper has been accepted for publication at the HRI2024\n conference"},{"id":"http://arxiv.org/abs/2305.16905v3","updated":"2024-02-05T16:22:57Z","published":"2023-05-26T13:19:15Z","title":"Improving Neural Additive Models with Bayesian Principles","summary":" Neural additive models (NAMs) enhance the transparency of deep neural\nnetworks by handling input features in separate additive sub-networks. However,\nthey lack inherent mechanisms that provide calibrated uncertainties and enable\nselection of relevant features and interactions. 
Approaching NAMs from a\nBayesian perspective, we augment them in three primary ways, namely by a)\nproviding credible intervals for the individual additive sub-networks; b)\nestimating the marginal likelihood to perform an implicit selection of features\nvia an empirical Bayes procedure; and c) facilitating the ranking of feature\npairs as candidates for second-order interaction in fine-tuned models. In\nparticular, we develop Laplace-approximated NAMs (LA-NAMs), which show improved\nempirical performance on tabular datasets and challenging real-world medical\ntasks.\n","authors":["Kouroche Bouchiat","Alexander Immer","Hugo Yèche","Gunnar Rätsch","Vincent Fortuin"],"pdf_url":"https://arxiv.org/pdf/2305.16905v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16368v2","updated":"2024-02-05T16:20:08Z","published":"2023-05-25T11:45:46Z","title":"Neural incomplete factorization: learning preconditioners for the\n conjugate gradient method","summary":" Finding suitable preconditioners to accelerate iterative solution methods,\nsuch as the conjugate gradient method, is an active area of research. In this\npaper, we develop a computationally efficient data-driven approach to replace\nthe typically hand-engineered algorithms with neural networks. Optimizing the\ncondition number of the linear system directly is computationally infeasible.\nInstead, our method generates an incomplete factorization of the matrix and is,\ntherefore, referred to as neural incomplete factorization (NeuralIF). For\nefficient training, we utilize a stochastic approximation of the Frobenius loss\nthat only requires matrix-vector multiplications. At the core of our method is\na novel message-passing block, inspired by sparse matrix theory, that aligns\nwith the objective of finding a sparse factorization of the matrix. By\nreplacing conventional preconditioners used within the conjugate gradient\nmethod with data-driven models based on graph neural networks, we accelerate\nthe iterative solving procedure. We evaluate our proposed method on both a\nsynthetic and a real-world problem arising from scientific computing and show\nits ability to reduce the solving time while remaining computationally\nefficient.\n","authors":["Paul Häusner","Ozan Öktem","Jens Sjölund"],"pdf_url":"https://arxiv.org/pdf/2305.16368v2.pdf","comment":"Under review. 18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.03153v1","updated":"2024-02-05T16:19:53Z","published":"2024-02-05T16:19:53Z","title":"Learning solutions of parametric Navier-Stokes with physics-informed\n neural networks","summary":" We leverage Physics-Informed Neural Networks (PINNs) to learn solution\nfunctions of parametric Navier-Stokes Equations (NSE). Our proposed approach\nresults in a feasible optimization problem setup that bypasses PINNs'\nlimitations in converging to solutions of highly nonlinear parametric PDEs like\nthe NSE. We consider the parameter(s) of interest as inputs of PINNs along with\nspatio-temporal coordinates, and train PINNs on generated numerical solutions\nof parametric PDEs for instances of the parameters. We perform experiments on\nthe classical 2D flow past cylinder problem, aiming to learn velocity and\npressure functions over a range of Reynolds numbers as the parameter of\ninterest. Provision of training data from generated numerical simulations allows for\ninterpolation of the solution functions for a range of parameters. 
Therefore,\nwe compare PINNs with unconstrained conventional Neural Networks (NN) on this\nproblem setup to investigate the effectiveness of considering the PDE\nregularization in the loss function. We show that our proposed approach results\nin optimizing PINN models that learn the solution functions while making sure\nthat flow predictions are in line with the conservation laws of mass and\nmomentum. Our results show that the PINN yields more accurate prediction of\ngradients than the NN model; this is clearly visible in the predicted vorticity\nfields, given that neither model was trained on vorticity labels.\n","authors":["M. Naderibeni","M. J. T. Reinders","L. Wu","D. M. J. Tax"],"pdf_url":"https://arxiv.org/pdf/2402.03153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03149v1","updated":"2024-02-05T16:16:17Z","published":"2024-02-05T16:16:17Z","title":"A Comparative Analysis of Microrings Based Incoherent Photonic GEMM\n Accelerators","summary":" Several microring resonator (MRR) based analog photonic architectures have\nbeen proposed to accelerate general matrix-matrix multiplications (GEMMs) in\ndeep neural networks with exceptional throughput and energy efficiency. To\nimplement GEMM functions, these MRR-based architectures, in general, manipulate\noptical signals in five different ways: (i) Splitting (copying) of multiple\noptical signals to achieve a certain fan-out, (ii) Aggregation (multiplexing)\nof multiple optical signals to achieve a certain fan-in, (iii) Modulation of\noptical signals to imprint input values onto analog signal amplitude, (iv)\nWeighting of modulated optical signals to achieve analog input-weight\nmultiplication, (v) Summation of optical signals. The MRR-based GEMM\naccelerators undertake the first four ways of signal manipulation in an\narbitrary order, ignoring the possible impact of the order of these\nmanipulations on their performance. In this paper, we conduct a detailed\nanalysis of accelerator organizations with three different orders of these\nmanipulations: (1) Modulation-Aggregation-Splitting-Weighting (MASW), (2)\nAggregation-Splitting-Modulation-Weighting (ASMW), and (3)\nSplitting-Modulation-Weighting-Aggregation (SMWA). We show that these\norganizations affect the crosstalk noise and optical signal losses in different\nmagnitudes, which endows these organizations with different levels of\nprocessing parallelism at the circuit level, and different magnitudes of\nthroughput and energy-area efficiency at the system level. Our evaluation\nresults for four CNN models show that SMWA organization achieves up to\n4.4$\\times$, 5$\\times$, and 5.2$\\times$ better throughput, energy efficiency,\nand area-energy efficiency, respectively, compared to ASMW and MASW\norganizations on average.\n","authors":["Sairam Sri Vatsavai","Venkata Sai Praneeth Karempudi","Alo Oluwaseun","Ishan Thakkar"],"pdf_url":"https://arxiv.org/pdf/2402.03149v1.pdf","comment":"Accepted at ISQED 2024"},{"id":"http://arxiv.org/abs/2402.03146v1","updated":"2024-02-05T16:13:00Z","published":"2024-02-05T16:13:00Z","title":"A Multi-step Loss Function for Robust Learning of the Dynamics in\n Model-based Reinforcement Learning","summary":" In model-based reinforcement learning, most algorithms rely on simulating\ntrajectories from one-step models of the dynamics learned on data. A critical\nchallenge of this approach is the compounding of one-step prediction errors as\nthe length of the trajectory grows. 
In this paper we tackle this issue by using\na multi-step objective to train one-step models. Our objective is a weighted\nsum of the mean squared error (MSE) loss at various future horizons. We find\nthat this new loss is particularly useful when the data is noisy (additive\nGaussian noise in the observations), which is often the case in real-life\nenvironments. To support the multi-step loss, first we study its properties in\ntwo tractable cases: i) uni-dimensional linear system, and ii) two-parameter\nnon-linear system. Second, we show in a variety of tasks (environments or\ndatasets) that the models learned with this loss achieve a significant\nimprovement in terms of the averaged R2-score on future prediction horizons.\nFinally, in the pure batch reinforcement learning setting, we demonstrate that\none-step models serve as strong baselines when dynamics are deterministic,\nwhile multi-step models would be more advantageous in the presence of noise,\nhighlighting the potential of our approach in real-world applications.\n","authors":["Abdelhakim Benechehab","Albert Thomas","Giuseppe Paolo","Maurizio Filippone","Balázs Kégl"],"pdf_url":"https://arxiv.org/pdf/2402.03146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03145v1","updated":"2024-02-05T16:12:36Z","published":"2024-02-05T16:12:36Z","title":"SafEDMD: A certified learning architecture tailored to data-driven\n control of nonlinear dynamical systems","summary":" The Koopman operator serves as the theoretical backbone for machine learning\nof dynamical control systems, where the operator is heuristically approximated\nby extended dynamic mode decomposition (EDMD). In this paper, we propose\nStability- and certificate-oriented EDMD (SafEDMD): a novel EDMD-based learning\narchitecture which comes along with rigorous certificates, resulting in a\nreliable surrogate model generated in a data-driven fashion. To ensure\ntrustworthiness of SafEDMD, we derive proportional error bounds, which vanish\nat the origin and are tailored for control tasks, leading to certified\ncontroller design based on semi-definite programming. We illustrate the\ndeveloped machinery by means of several benchmark examples and highlight the\nadvantages over state-of-the-art methods.\n","authors":["Robin Strässer","Manuel Schaller","Karl Worthmann","Julian Berberich","Frank Allgöwer"],"pdf_url":"https://arxiv.org/pdf/2402.03145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01951v7","updated":"2024-02-05T16:12:30Z","published":"2023-06-02T23:23:34Z","title":"GAD-NR: Graph Anomaly Detection via Neighborhood Reconstruction","summary":" Graph Anomaly Detection (GAD) is a technique used to identify abnormal nodes\nwithin graphs, finding applications in network security, fraud detection,\nsocial media spam detection, and various other domains. A common method for GAD\nis Graph Auto-Encoders (GAEs), which encode graph data into node\nrepresentations and identify anomalies by assessing the reconstruction quality\nof the graphs based on these representations. However, existing GAE models are\nprimarily optimized for direct link reconstruction, resulting in nodes\nconnected in the graph being clustered in the latent space. As a result, they\nexcel at detecting cluster-type structural anomalies but struggle with more\ncomplex structural anomalies that do not conform to clusters. 
To address this\nlimitation, we propose a novel solution called GAD-NR, a new variant of GAE\nthat incorporates neighborhood reconstruction for graph anomaly detection.\nGAD-NR aims to reconstruct the entire neighborhood of a node, encompassing the\nlocal structure, self-attributes, and neighbor attributes, based on the\ncorresponding node representation. By comparing the neighborhood reconstruction\nloss between anomalous nodes and normal nodes, GAD-NR can effectively detect\nany anomalies. Extensive experimentation conducted on six real-world datasets\nvalidates the effectiveness of GAD-NR, showcasing significant improvements (by\nup to 30% in AUC) over state-of-the-art competitors. The source code for GAD-NR\nis openly available. Importantly, the comparative analysis reveals that the\nexisting methods perform well only in detecting one or two types of anomalies\nout of the three types studied. In contrast, GAD-NR excels at detecting all\nthree types of anomalies across the datasets, demonstrating its comprehensive\nanomaly detection capabilities.\n","authors":["Amit Roy","Juan Shu","Jia Li","Carl Yang","Olivier Elshocht","Jeroen Smeets","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2306.01951v7.pdf","comment":"Accepted at the 17th ACM International Conference on Web Search and\n Data Mining (WSDM-2024)"},{"id":"http://arxiv.org/abs/2402.03142v1","updated":"2024-02-05T16:11:43Z","published":"2024-02-05T16:11:43Z","title":"Less is KEN: a Universal and Simple Non-Parametric Pruning Algorithm for\n Large Language Models","summary":" Neural network pruning has become increasingly crucial due to the complexity\nof neural network models and their widespread use in various fields. Existing\npruning algorithms often suffer from limitations such as architecture\nspecificity, excessive complexity and reliance on complex calculations,\nrendering them impractical for real-world applications. In this paper, we\npropose KEN: a straightforward, universal and unstructured pruning algorithm\nbased on Kernel Density Estimation (KDE). KEN aims to construct optimized\ntransformer models by selectively preserving the most significant parameters\nwhile restoring others to their pre-training state. This approach maintains\nmodel performance while allowing storage of only the optimized subnetwork,\nleading to significant memory savings. Extensive evaluations on seven\ntransformer models demonstrate that KEN achieves equal or better performance\nthan the original models with a minimum parameter reduction of 25%. In-depth\ncomparisons against other pruning and PEFT algorithms confirm KEN's\neffectiveness. Furthermore, we introduce KEN_viz, an explainability tool that\nvisualizes the optimized model composition and the subnetwork selected by KEN.\n","authors":["Michele Mastromattei","Fabio Massimo Zanzotto"],"pdf_url":"https://arxiv.org/pdf/2402.03142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03141v1","updated":"2024-02-05T16:11:03Z","published":"2024-02-05T16:11:03Z","title":"Boosting Long-Delayed Reinforcement Learning with Auxiliary\n Short-Delayed Task","summary":" Reinforcement learning is challenging in delayed scenarios, a common\nreal-world situation where observations and interactions occur with delays.\nState-of-the-art (SOTA) state-augmentation techniques either suffer from the\nstate-space explosion as the number of delayed steps grows, or from performance\ndegeneration in stochastic environments. 
To address these challenges, our novel\nAuxiliary-Delayed Reinforcement Learning (AD-RL) leverages an auxiliary\nshort-delayed task to accelerate the learning on a long-delayed task without\ncompromising the performance in stochastic environments. Specifically, AD-RL\nlearns the value function in the short-delayed task and then employs it with\nthe bootstrapping and policy improvement techniques in the long-delayed task.\nWe theoretically show that this can greatly reduce the sample complexity\ncompared to directly learning on the original long-delayed task. On\ndeterministic and stochastic benchmarks, our method remarkably outperforms the\nSOTAs in both sample efficiency and policy performance.\n","authors":["Qingyuan Wu","Simon Sinong Zhan","Yixuan Wang","Chung-Wei Lin","Chen Lv","Qi Zhu","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.03141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03139v1","updated":"2024-02-05T16:09:35Z","published":"2024-02-05T16:09:35Z","title":"Enhancing Neural Subset Selection: Integrating Background Information\n into Set Representations","summary":" Learning neural subset selection tasks, such as compound selection in\nAI-aided drug discovery, have become increasingly pivotal across diverse\napplications. The existing methodologies in the field primarily concentrate on\nconstructing models that capture the relationship between utility function\nvalues and subsets within their respective supersets. However, these approaches\ntend to overlook the valuable information contained within the superset when\nutilizing neural networks to model set functions. In this work, we address this\noversight by adopting a probabilistic perspective. Our theoretical findings\ndemonstrate that when the target value is conditioned on both the input set and\nsubset, it is essential to incorporate an \\textit{invariant sufficient\nstatistic} of the superset into the subset of interest for effective learning.\nThis ensures that the output value remains invariant to permutations of the\nsubset and its corresponding superset, enabling identification of the specific\nsuperset from which the subset originated. Motivated by these insights, we\npropose a simple yet effective information aggregation module designed to merge\nthe representations of subsets and supersets from a permutation invariance\nperspective. Comprehensive empirical evaluations across diverse tasks and\ndatasets validate the enhanced efficacy of our approach over conventional\nmethods, underscoring the practicality and potency of our proposed strategies\nin real-world contexts.\n","authors":["Binghui Xie","Yatao Bian","Kaiwen zhou","Yongqiang Chen","Peilin Zhao","Bo Han","Wei Meng","James Cheng"],"pdf_url":"https://arxiv.org/pdf/2402.03139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03138v1","updated":"2024-02-05T16:08:58Z","published":"2024-02-05T16:08:58Z","title":"Just Cluster It: An Approach for Exploration in High-Dimensions using\n Clustering and Pre-Trained Representations","summary":" In this paper we adopt a representation-centric perspective on exploration in\nreinforcement learning, viewing exploration fundamentally as a density\nestimation problem. We investigate the effectiveness of clustering\nrepresentations for exploration in 3-D environments, based on the observation\nthat the importance of pixel changes between transitions is less pronounced in\n3-D environments compared to 2-D environments, where pixel changes between\ntransitions are typically distinct and significant. 
We propose a method that\nperforms episodic and global clustering on random representations and on\npre-trained DINO representations to count states, i.e., estimate pseudo-counts.\nSurprisingly, even random features can be clustered effectively to count states\nin 3-D environments; however, when these become visually more complex,\npre-trained DINO representations are more effective thanks to the pre-trained\ninductive biases in the representations. Overall, this presents a pathway for\nintegrating pre-trained biases into exploration. We evaluate our approach on\nthe VizDoom and Habitat environments, demonstrating that our method surpasses\nother well-known exploration methods in these settings.\n","authors":["Stefan Sylvius Wagner","Stefan Harmeling"],"pdf_url":"https://arxiv.org/pdf/2402.03138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12420v2","updated":"2024-02-05T16:06:14Z","published":"2023-08-23T20:42:32Z","title":"Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature","summary":" As Distributed Ledger Technologies (DLTs) rapidly evolve, their impacts\nextend beyond technology, influencing environmental and societal aspects. This\nevolution has increased publications, making manual literature analysis\nincreasingly challenging. We address this with a Natural Language Processing\n(NLP)-based systematic literature review method to explore the intersection of\nDistributed Ledger Technology (DLT) with its Environmental, Social, and\nGovernance (ESG) aspects. Our approach involves building and refining a\ndirected citation network from 107 seed papers to a corpus of 24,539\npublications and fine-tuning a transformer-based language model for Named\nEntity Recognition (NER) on DLT and ESG domains. Applying this model, we\ndistilled the corpus to 505 key publications, enabling an inaugural literature\nreview and temporal graph analysis of DLT's evolution in ESG contexts. Our\ncontributions include an adaptable and scalable NLP-driven systematic\nliterature review methodology and a unique NER dataset of 54,808 entities,\ntailored for DLT and ESG research. Our inaugural literature review demonstrates\ntheir applicability and effectiveness in analyzing DLT's evolution and impacts,\nproving invaluable for stakeholders in the DLT domain.\n","authors":["Walter Hernandez","Kamil Tylinski","Alastair Moore","Niall Roche","Nikhil Vadgama","Horst Treiblmaier","Jiangbo Shangguan","Paolo Tasca","Jiahua Xu"],"pdf_url":"https://arxiv.org/pdf/2308.12420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03137v1","updated":"2024-02-05T16:05:32Z","published":"2024-02-05T16:05:32Z","title":"Sociolinguistically Informed Interpretability: A Case Study on Hinglish\n Emotion Classification","summary":" Emotion classification is a challenging task in NLP due to the inherent\nidiosyncratic and subjective nature of linguistic expression, especially with\ncode-mixed data. Pre-trained language models (PLMs) have achieved high\nperformance for many tasks and languages, but it remains to be seen whether\nthese models learn and are robust to the differences in emotional expression\nacross languages. Sociolinguistic studies have shown that Hinglish speakers\nswitch to Hindi when expressing negative emotions and to English when\nexpressing positive emotions. To understand if language models can learn these\nassociations, we study the effect of language on emotion prediction across 3\nPLMs on a Hinglish emotion classification dataset. 
Using LIME and token-level\nlanguage ID, we find that models do learn these associations between language\nchoice and emotional expression. Moreover, having code-mixed data present in\nthe pre-training can augment that learning when task-specific data is scarce.\nWe also conclude from the misclassifications that the models may overgeneralise\nthis heuristic to other infrequent examples where this sociolinguistic\nphenomenon does not apply.\n","authors":["Kushal Tatariya","Heather Lent","Johannes Bjerva","Miryam de Lhoneux"],"pdf_url":"https://arxiv.org/pdf/2402.03137v1.pdf","comment":"5 pages, Accepted to SIGTYP 2024 @ EACL"},{"id":"http://arxiv.org/abs/2402.03131v1","updated":"2024-02-05T15:57:32Z","published":"2024-02-05T15:57:32Z","title":"Constrained Decoding for Cross-lingual Label Projection","summary":" Zero-shot cross-lingual transfer utilizing multilingual LLMs has become a\npopular learning paradigm for low-resource languages with no labeled training\ndata. However, for NLP tasks that involve fine-grained predictions on words and\nphrases, the performance of zero-shot cross-lingual transfer learning lags far\nbehind supervised fine-tuning methods. Therefore, it is common to exploit\ntranslation and label projection to further improve the performance by (1)\ntranslating training data that is available in a high-resource language (e.g.,\nEnglish) together with the gold labels into low-resource languages, and/or (2)\ntranslating test data in low-resource languages to a high-resource language to\nrun inference on, then projecting the predicted span-level labels back onto the\noriginal test data. However, state-of-the-art marker-based label projection\nmethods suffer from translation quality degradation due to the extra label\nmarkers injected in the input to the translation model. In this work, we\nexplore a new direction that leverages constrained decoding for label\nprojection to overcome the aforementioned issues. Our new method not only can\npreserve the quality of translated texts but also has the versatility of being\napplicable to both translating training and translating test data strategies.\nThis versatility is crucial as our experiments reveal that translating test\ndata can lead to a considerable boost in performance compared to translating\nonly training data. We evaluate on two cross-lingual transfer tasks, namely\nNamed Entity Recognition and Event Argument Extraction, spanning 20 languages.\nThe results demonstrate that our approach outperforms the state-of-the-art\nmarker-based method by a large margin and also shows better performance than\nother label projection methods that rely on external word alignment.\n","authors":["Duong Minh Le","Yang Chen","Alan Ritter","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2402.03131v1.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2402.03126v1","updated":"2024-02-05T15:51:49Z","published":"2024-02-05T15:51:49Z","title":"How Free is Parameter-Free Stochastic Optimization?","summary":" We study the problem of parameter-free stochastic optimization, inquiring\nwhether, and under what conditions, fully parameter-free methods exist:\nthese are methods that achieve convergence rates competitive with optimally\ntuned methods, without requiring significant knowledge of the true problem\nparameters. 
Existing parameter-free methods can only be considered\n``partially'' parameter-free, as they require some non-trivial knowledge of the\ntrue problem parameters, such as a bound on the stochastic gradient norms, a\nbound on the distance to a minimizer, etc. In the non-convex setting, we\ndemonstrate that a simple hyperparameter search technique results in a fully\nparameter-free method that outperforms more sophisticated state-of-the-art\nalgorithms. We also provide a similar result in the convex setting with access\nto noisy function values under mild noise assumptions. Finally, assuming only\naccess to stochastic gradients, we establish a lower bound that renders fully\nparameter-free stochastic convex optimization infeasible, and provide a method\nwhich is (partially) parameter-free up to the limit indicated by our lower\nbound.\n","authors":["Amit Attia","Tomer Koren"],"pdf_url":"https://arxiv.org/pdf/2402.03126v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2402.03124v1","updated":"2024-02-05T15:51:34Z","published":"2024-02-05T15:51:34Z","title":"Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks","summary":" Gradient inversion attacks aim to reconstruct local training data from\nintermediate gradients exposed in the federated learning framework. Despite\nsuccessful attacks, all previous methods, starting from reconstructing a single\ndata point and then relaxing the single-image limit to batch level, are only\ntested under hard label constraints. Even for single-image reconstruction, we\nstill lack an analysis-based algorithm to recover augmented soft labels. In\nthis work, we change the focus from enlarging batchsize to investigating the\nhard label constraints, considering a more realistic circumstance where label\nsmoothing and mixup techniques are used in the training process. In particular,\nwe are the first to initiate a novel algorithm to simultaneously recover the\nground-truth augmented label and the input feature of the last fully-connected\nlayer from single-input gradients, and provide a necessary condition for any\nanalytical-based label recovery methods. Extensive experiments testify to the\nlabel recovery accuracy, as well as the benefits to the following image\nreconstruction. We believe soft labels in classification tasks are worth\nfurther attention in gradient inversion attacks.\n","authors":["Yanbo Wang","Jian Liang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2402.03124v1.pdf","comment":"ICLR2024 poster The prior submission version had a bug in the image\n reconstruction implementation, which has been corrected without harm to the\n main conclusions"},{"id":"http://arxiv.org/abs/2402.03119v1","updated":"2024-02-05T15:47:54Z","published":"2024-02-05T15:47:54Z","title":"Good Teachers Explain: Explanation-Enhanced Knowledge Distillation","summary":" Knowledge Distillation (KD) has proven effective for compressing large\nteacher models into smaller student models. While it is well known that student\nmodels can achieve similar accuracies as the teachers, it has also been shown\nthat they nonetheless often do not learn the same function. It is, however,\noften highly desirable that the student's and teacher's functions share similar\nproperties such as basing the prediction on the same input features, as this\nensures that students learn the 'right features' from the teachers. 
In this\nwork, we explore whether this can be achieved by not only optimizing the\nclassic KD loss but also the similarity of the explanations generated by the\nteacher and the student. Despite the idea being simple and intuitive, we find\nthat our proposed 'explanation-enhanced' KD (e$^2$KD) (1) consistently provides\nlarge gains in terms of accuracy and student-teacher agreement, (2) ensures\nthat the student learns from the teacher to be right for the right reasons and\nto give similar explanations, and (3) is robust with respect to the model\narchitectures, the amount of training data, and even works with 'approximate',\npre-computed explanations.\n","authors":["Amin Parchami-Araghi","Moritz Böhle","Sukrut Rao","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2402.03119v1.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2307.13147v2","updated":"2024-02-05T15:47:06Z","published":"2023-07-24T22:01:22Z","title":"Extending Path-Dependent NJ-ODEs to Noisy Observations and a Dependent\n Observation Framework","summary":" The Path-Dependent Neural Jump Ordinary Differential Equation (PD-NJ-ODE) is\na model for predicting continuous-time stochastic processes with irregular and\nincomplete observations. In particular, the method learns optimal forecasts\ngiven irregularly sampled time series of incomplete past observations. So far\nthe process itself and the coordinate-wise observation times were assumed to be\nindependent and observations were assumed to be noiseless. In this work we\ndiscuss two extensions to lift these restrictions and provide theoretical\nguarantees as well as empirical examples for them. In particular, we can lift\nthe assumption of independence by extending the theory to much more realistic\nsettings of conditional independence without any need to change the algorithm.\nMoreover, we introduce a new loss function, which allows us to deal with noisy\nobservations and explain why the previously used loss function did not lead to\na consistent estimator.\n","authors":["William Andersson","Jakob Heiss","Florian Krach","Josef Teichmann"],"pdf_url":"https://arxiv.org/pdf/2307.13147v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03116v1","updated":"2024-02-05T15:45:59Z","published":"2024-02-05T15:45:59Z","title":"Feature-Action Design Patterns for Storytelling Visualizations with Time\n Series Data","summary":" We present a method to create storytelling visualization with time series\ndata. Many personal decisions nowadays rely on access to dynamic data\nregularly, as we have seen during the COVID-19 pandemic. It is thus desirable\nto construct storytelling visualization for dynamic data that is selected by an\nindividual for a specific context. Because of the need to tell data-dependent\nstories, predefined storyboards based on known data cannot accommodate dynamic\ndata easily nor scale up to many different individuals and contexts. Motivated\ninitially by the need to communicate time series data during the COVID-19\npandemic, we developed a novel computer-assisted method for meta-authoring of\nstories, which enables the design of storyboards that include feature-action\npatterns in anticipation of potential features that may appear in dynamically\narrived or selected data. In addition to meta-storyboards involving COVID-19\ndata, we also present storyboards for telling stories about progress in a\nmachine learning workflow. 
Our approach is complementary to traditional methods\nfor authoring storytelling visualization, and provides an efficient means to\nconstruct data-dependent storyboards for different data-streams of similar\ncontexts.\n","authors":["Saiful Khan","Scott Jones","Benjamin Bach","Jaehoon Cha","Min Chen","Julie Meikle","Jonathan C Roberts","Jeyan Thiyagalingam","Jo Wood","Panagiotis D. Ritsos"],"pdf_url":"https://arxiv.org/pdf/2402.03116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03115v1","updated":"2024-02-05T15:45:55Z","published":"2024-02-05T15:45:55Z","title":"Discovering interpretable models of scientific image data with deep\n learning","summary":" How can we find interpretable, domain-appropriate models of natural phenomena\ngiven some complex, raw data such as images? Can we use such models to derive\nscientific insight from the data? In this paper, we propose some methods for\nachieving this. In particular, we implement disentangled representation\nlearning, sparse deep neural network training and symbolic regression, and\nassess their usefulness in forming interpretable models of complex image data.\nWe demonstrate their relevance to the field of bioimaging using a well-studied\ntest problem of classifying cell states in microscopy data. We find that such\nmethods can produce highly parsimonious models that achieve $\\sim98\\%$ of the\naccuracy of black-box benchmark models, with a tiny fraction of the complexity.\nWe explore the utility of such interpretable models in producing scientific\nexplanations of the underlying biological phenomenon.\n","authors":["Christopher J. Soelistyo","Alan R. Lowe"],"pdf_url":"https://arxiv.org/pdf/2402.03115v1.pdf","comment":"33 pages (including appendices), 27 figures"},{"id":"http://arxiv.org/abs/2402.03112v1","updated":"2024-02-05T15:44:43Z","published":"2024-02-05T15:44:43Z","title":"Infrared Spectra Prediction for Diazo Groups Utilizing a Machine\n Learning Approach with Structural Attention Mechanism","summary":" Infrared (IR) spectroscopy is a pivotal technique in chemical research for\nelucidating molecular structures and dynamics through vibrational and\nrotational transitions. However, the intricate molecular fingerprints\ncharacterized by unique vibrational and rotational patterns present substantial\nanalytical challenges. Here, we present a machine learning approach employing a\nStructural Attention Mechanism tailored to enhance the prediction and\ninterpretation of infrared spectra, particularly for diazo compounds. Our model\ndistinguishes itself by honing in on chemical information proximal to\nfunctional groups, thereby significantly bolstering the accuracy, robustness,\nand interpretability of spectral predictions. This method not only demystifies\nthe correlations between infrared spectral features and molecular structures\nbut also offers a scalable and efficient paradigm for dissecting complex\nmolecular interactions.\n","authors":["Chengchun Liu","Fanyang Mo"],"pdf_url":"https://arxiv.org/pdf/2402.03112v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.03110v1","updated":"2024-02-05T15:38:01Z","published":"2024-02-05T15:38:01Z","title":"Non-Stationary Latent Auto-Regressive Bandits","summary":" We consider the stochastic multi-armed bandit problem with non-stationary\nrewards. We present a novel formulation of non-stationarity in the environment\nwhere changes in the mean reward of the arms over time are due to some unknown,\nlatent, auto-regressive (AR) state of order $k$. 
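As a rough illustration of that environment, the sketch below simulates arm means driven by a latent AR(k) state; the coefficients, loadings, and noise scales are invented for the toy, not taken from the paper.

```python
import numpy as np

def simulate_latent_ar_bandit(T=500, k=2, n_arms=3, seed=0):
    rng = np.random.default_rng(seed)
    phi = np.array([0.6, 0.3])          # assumed AR coefficients (k = 2)
    loadings = rng.normal(size=n_arms)  # how each arm tracks the state
    z = list(rng.normal(size=k))        # latent state history
    rewards = np.empty((T, n_arms))
    for t in range(T):
        # z_t = sum_i phi_i * z_{t-i} + noise: the unobserved AR(k) state.
        z_t = phi @ np.array(z[-k:][::-1]) + 0.1 * rng.normal()
        z.append(z_t)
        # Mean reward of every arm drifts with the shared latent state.
        rewards[t] = loadings * z_t + 0.05 * rng.normal(size=n_arms)
    return rewards
```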
We call this new environment\nthe latent AR bandit. Different forms of the latent AR bandit appear in many\nreal-world settings, especially in emerging scientific fields such as\nbehavioral health or education where there are few mechanistic models of the\nenvironment. If the AR order $k$ is known, we propose an algorithm that\nachieves $\\tilde{O}(k\\sqrt{T})$ regret in this setting. Empirically, our\nalgorithm outperforms standard UCB across multiple non-stationary environments,\neven if $k$ is mis-specified.\n","authors":["Anna L. Trella","Walter Dempsey","Finale Doshi-Velez","Susan A. Murphy"],"pdf_url":"https://arxiv.org/pdf/2402.03110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15022v2","updated":"2024-02-05T15:36:44Z","published":"2024-01-26T17:29:01Z","title":"Applications of artificial intelligence in the analysis of\n histopathology images of gliomas: a review","summary":" In recent years, the diagnosis of gliomas has become increasingly complex.\nAnalysis of glioma histopathology images using artificial intelligence (AI)\noffers new opportunities to support diagnosis and outcome prediction. To give\nan overview of the current state of research, this review examines 70 publicly\navailable research studies that have proposed AI-based methods for whole-slide\nhistopathology images of human gliomas, covering the diagnostic tasks of\nsubtyping (16/70), grading (23/70), molecular marker prediction (13/70), and\nsurvival prediction (27/70). All studies were reviewed with regard to\nmethodological aspects as well as clinical applicability. It was found that the\nfocus of current research is the assessment of hematoxylin and eosin-stained\ntissue sections of adult-type diffuse gliomas. The majority of studies (49/70)\nare based on the publicly available glioblastoma and low-grade glioma datasets\nfrom The Cancer Genome Atlas (TCGA) and only a few studies employed other\ndatasets in isolation (10/70) or in addition to the TCGA datasets (11/70).\nCurrent approaches mostly rely on convolutional neural networks (53/70) for\nanalyzing tissue at 20x magnification (30/70). A new field of research is the\nintegration of clinical data, omics data, or magnetic resonance imaging\n(27/70). So far, AI-based methods have achieved promising results, but are not\nyet used in real clinical settings. Future work should focus on the independent\nvalidation of methods on larger, multi-site datasets with high-quality and\nup-to-date clinical and molecular pathology annotations to demonstrate routine\napplicability.\n","authors":["Jan-Philipp Redlich","Friedrich Feuerhake","Joachim Weis","Nadine S. Schaadt","Sarah Teuber-Hanselmann","Christoph Buck","Sabine Luttmann","Andrea Eberle","Stefan Nikolin","Arno Appenzeller","Andreas Portmann","André Homeyer"],"pdf_url":"https://arxiv.org/pdf/2401.15022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11677v2","updated":"2024-02-05T15:33:18Z","published":"2023-10-18T03:00:15Z","title":"Improved Sample Complexity Analysis of Natural Policy Gradient Algorithm\n with General Parameterization for Infinite Horizon Discounted Reward Markov\n Decision Processes","summary":" We consider the problem of designing sample efficient learning algorithms for\ninfinite horizon discounted reward Markov Decision Process. Specifically, we\npropose the Accelerated Natural Policy Gradient (ANPG) algorithm that utilizes\nan accelerated stochastic gradient descent process to obtain the natural policy\ngradient. 
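For intuition, the natural policy gradient is the solution x of the Fisher system F x = g, where g is the vanilla policy gradient. The sketch below solves that system with conjugate gradient from Fisher-vector products alone; this is a generic stand-in to show the structure, not ANPG's accelerated SGD inner loop.

```python
import numpy as np

def natural_gradient(fisher_vec_prod, g, iters=50, tol=1e-8):
    # Conjugate-gradient solve of F x = g; only products F @ v are needed.
    x = np.zeros_like(g)
    r = g.copy()
    p = r.copy()
    rs = float(r @ r)
    for _ in range(iters):
        Fp = fisher_vec_prod(p)
        alpha = rs / float(p @ Fp)
        x += alpha * p
        r -= alpha * Fp
        rs_new = float(r @ r)
        if rs_new ** 0.5 < tol:
            break
        p = r + (rs_new / rs) * p
        rs = rs_new
    return x  # approximates F^{-1} g, the natural gradient direction
```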
ANPG achieves $\\mathcal{O}({\\epsilon^{-2}})$ sample complexity and\n$\\mathcal{O}(\\epsilon^{-1})$ iteration complexity with general parameterization\nwhere $\\epsilon$ defines the optimality error. This improves the\nstate-of-the-art sample complexity by a $\\log(\\frac{1}{\\epsilon})$ factor. ANPG\nis a first-order algorithm and unlike some existing literature, does not\nrequire the unverifiable assumption that the variance of importance sampling\n(IS) weights is upper bounded. In the class of Hessian-free and IS-free\nalgorithms, ANPG beats the best-known sample complexity by a factor of\n$\\mathcal{O}(\\epsilon^{-\\frac{1}{2}})$ and simultaneously matches their\nstate-of-the-art iteration complexity.\n","authors":["Washim Uddin Mondal","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2310.11677v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03104v1","updated":"2024-02-05T15:32:10Z","published":"2024-02-05T15:32:10Z","title":"High-dimensional Bayesian Optimization via Covariance Matrix Adaptation\n Strategy","summary":" Bayesian Optimization (BO) is an effective method for finding the global\noptimum of expensive black-box functions. However, it is well known that\napplying BO to high-dimensional optimization problems is challenging. To\naddress this issue, a promising solution is to use a local search strategy that\npartitions the search domain into local regions with high likelihood of\ncontaining the global optimum, and then use BO to optimize the objective\nfunction within these regions. In this paper, we propose a novel technique for\ndefining the local regions using the Covariance Matrix Adaptation (CMA)\nstrategy. Specifically, we use CMA to learn a search distribution that can\nestimate the probabilities of data points being the global optimum of the\nobjective function. Based on this search distribution, we then define the local\nregions consisting of data points with high probabilities of being the global\noptimum. Our approach serves as a meta-algorithm as it can incorporate existing\nblack-box BO optimizers, such as BO, TuRBO, and BAxUS, to find the global\noptimum of the objective function within our derived local regions. We evaluate\nour proposed method on various benchmark synthetic and real-world problems. The\nresults demonstrate that our method outperforms existing state-of-the-art\ntechniques.\n","authors":["Lam Ngo","Huong Ha","Jeffrey Chan","Vu Nguyen","Hongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.03104v1.pdf","comment":"31 pages, 17 figures"},{"id":"http://arxiv.org/abs/2402.03099v1","updated":"2024-02-05T15:28:43Z","published":"2024-02-05T15:28:43Z","title":"Intent-based Prompt Calibration: Enhancing prompt optimization with\n synthetic boundary cases","summary":" Prompt engineering is a challenging and important task due to the high\nsensitivity of Large Language Models (LLMs) to the given prompt and the\ninherent ambiguity of a textual task instruction. Automatic prompt engineering\nis essential to achieve optimized performance from LLMs. Recent studies have\ndemonstrated the capabilities of LLMs to automatically conduct prompt\nengineering by employing a meta-prompt that incorporates the outcomes of the\nlast trials and proposes an improved prompt. However, this requires a\nhigh-quality benchmark to compare different prompts, which is difficult and\nexpensive to acquire in many real-world use cases. 
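A meta-prompt refinement loop of this flavor might look like the following sketch, where `llm` is a hypothetical text-completion callable and every prompt string is invented; it only illustrates the trial-history feedback structure.

```python
def calibrate_prompt(task, llm, rounds=3):
    # `llm` is an assumed callable str -> str, not a real API.
    prompt = f"Instruction: {task}"
    history = []
    for _ in range(rounds):
        # 1) Generate synthetic boundary cases that stress the prompt.
        cases = llm(f"List 5 tricky boundary inputs for:\n{prompt}")
        # 2) Assess the current prompt on those cases (LLM self-critique).
        verdict = llm(f"Critique how '{prompt}' handles:\n{cases}")
        history.append((prompt, cases, verdict))
        # 3) Propose a refined prompt from the accumulated trial history.
        prompt = llm(f"Rewrite the instruction to fix the issues:\n{history}")
    return prompt
```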
In this work, we introduce a\nnew method for automatic prompt engineering, using a calibration process that\niteratively refines the prompt to the user intent. During the optimization\nprocess, the system jointly generates synthetic data of boundary use cases and\noptimizes the prompt according to the generated dataset. We demonstrate the\neffectiveness of our method with respect to strong proprietary models on\nreal-world tasks such as moderation and generation. Our method outperforms\nstate-of-the-art methods with a limited number of annotated samples.\nFurthermore, we validate the advantages of each one of the system's key\ncomponents. Our system is built in a modular way, facilitating easy adaptation\nto other tasks. The code is available\n$\\href{https://github.com/Eladlev/AutoPrompt}{here}$.\n","authors":["Elad Levi","Eli Brosh","Matan Friedmann"],"pdf_url":"https://arxiv.org/pdf/2402.03099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03095v1","updated":"2024-02-05T15:25:40Z","published":"2024-02-05T15:25:40Z","title":"Transcending Adversarial Perturbations: Manifold-Aided Adversarial\n Examples with Legitimate Semantics","summary":" Deep neural networks were significantly vulnerable to adversarial examples\nmanipulated by malicious tiny perturbations. Although most conventional\nadversarial attacks ensured the visual imperceptibility between adversarial\nexamples and corresponding raw images by minimizing their geometric distance,\nthese constraints on geometric distance led to limited attack transferability,\ninferior visual quality, and human-imperceptible interpretability. In this\npaper, we proposed a supervised semantic-transformation generative model to\ngenerate adversarial examples with real and legitimate semantics, wherein an\nunrestricted adversarial manifold containing continuous semantic variations was\nconstructed for the first time to realize a legitimate transition from\nnon-adversarial examples to adversarial ones. Comprehensive experiments on\nMNIST and industrial defect datasets showed that our adversarial examples not\nonly exhibited better visual quality but also achieved superior attack\ntransferability and more effective explanations for model vulnerabilities,\nindicating their great potential as generic adversarial examples. The code and\npre-trained models were available at https://github.com/shuaili1027/MAELS.git.\n","authors":["Shuai Li","Xiaoyu Jiang","Xiaoguang Ma"],"pdf_url":"https://arxiv.org/pdf/2402.03095v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03094v1","updated":"2024-02-05T15:25:32Z","published":"2024-02-05T15:25:32Z","title":"Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object\n Detector","summary":" This paper addresses the challenge of cross-domain few-shot object detection\n(CD-FSOD), aiming to develop an accurate object detector for novel domains with\nminimal labeled examples. While transformer-based open-set detectors e.g.,\nDE-ViT~\\cite{zhang2023detect} have excelled in both open-vocabulary object\ndetection and traditional few-shot object detection, detecting categories\nbeyond those seen during training, we thus naturally raise two key questions:\n1) can such open-set detection methods easily generalize to CD-FSOD? 2) If no,\nhow to enhance the results of open-set methods when faced with significant\ndomain gaps? To address the first question, we introduce several metrics to\nquantify domain variances and establish a new CD-FSOD benchmark with diverse\ndomain metric values. 
Some State-Of-The-Art (SOTA) open-set object detection\nmethods are evaluated on this benchmark, with evident performance degradation\nobserved across out-of-domain datasets. This indicates the failure of adopting\nopen-set detectors directly for CD-FSOD. Subsequently, to overcome the\nperformance degradation issue and also to answer the second proposed question,\nwe endeavor to enhance the vanilla DE-ViT. With several novel components\nincluding finetuning, a learnable prototype module, and a lightweight attention\nmodule, we present an improved Cross-Domain Vision Transformer for CD-FSOD\n(CD-ViTO). Experiments show that our CD-ViTO achieves impressive results on\nboth out-of-domain and in-domain target datasets, establishing new SOTAs for\nboth CD-FSOD and FSOD. All the datasets, codes, and models will be released to\nthe community.\n","authors":["Yuqian Fu","Yu Wang","Yixuan Pan","Lian Huai","Xingyu Qiu","Zeyu Shangguan","Tong Liu","Lingjie Kong","Yanwei Fu","Luc Van Gool","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.03094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03415v2","updated":"2024-02-05T15:16:56Z","published":"2023-11-06T09:44:00Z","title":"PowerFlowNet: Power Flow Approximation Using Message Passing Graph\n Neural Networks","summary":" Accurate and efficient power flow (PF) analysis is crucial in modern\nelectrical networks' operation and planning. Therefore, there is a need for\nscalable algorithms that can provide accurate and fast solutions for both small\nand large scale power networks. As the power network can be interpreted as a\ngraph, Graph Neural Networks (GNNs) have emerged as a promising approach for\nimproving the accuracy and speed of PF approximations by exploiting information\nsharing via the underlying graph structure. In this study, we introduce\nPowerFlowNet, a novel GNN architecture for PF approximation that showcases\nsimilar performance to the traditional Newton-Raphson method but achieves it\n4 times faster in the simple IEEE 14-bus system and 145 times faster in the\nrealistic case of the French high voltage network (6470rte). Meanwhile, it\nsignificantly outperforms other traditional approximation methods, such as the\nDC relaxation method, in terms of performance and execution time, thereby\nmaking PowerFlowNet a highly promising solution for real-world PF analysis.\nFurthermore, we verify the efficacy of our approach by conducting an in-depth\nexperimental evaluation, thoroughly examining the performance, scalability,\ninterpretability, and architectural dependability of PowerFlowNet. The\nevaluation provides insights into the behavior and potential applications of\nGNNs in power system analysis.\n","authors":["Nan Lin","Stavros Orfanoudakis","Nathan Ordonez Cardenas","Juan S. Giraldo","Pedro P. Vergara"],"pdf_url":"https://arxiv.org/pdf/2311.03415v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.03086v1","updated":"2024-02-05T15:14:08Z","published":"2024-02-05T15:14:08Z","title":"Dual Lagrangian Learning for Conic Optimization","summary":" This paper presents Dual Lagrangian Learning (DLL), a principled learning\nmethodology that combines conic duality theory with the representation power of\nML models. DLL leverages conic duality to provide dual-feasible solutions, and\ntherefore valid Lagrangian dual bounds, for parametric linear and nonlinear\nconic optimization problems. 
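The weak-duality mechanism behind such bounds is easiest to see in the linear special case: for min c^T x s.t. Ax = b, x >= 0, any y with A^T y <= c certifies the lower bound b^T y. A minimal sketch (LP only, not DLL's conic projection layers):

```python
import numpy as np

def dual_bound(A, b, c, y):
    # Weak duality: if A^T y <= c, then b^T y lower-bounds the optimum.
    slack = c - A.T @ y
    assert np.all(slack >= -1e-9), "y is not dual feasible"
    return float(b @ y)
```

DLL's contribution, per the abstract, is learning models whose outputs are guaranteed to be such dual-feasible points in the general conic setting.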
The paper introduces differentiable conic\nprojection layers, a systematic dual completion procedure, and a\nself-supervised learning framework. The effectiveness of DLL is demonstrated on\nlinear and nonlinear parametric optimization problems for which DLL provides\nvalid dual bounds within 0.5% of optimality.\n","authors":["Mathieu Tanneau","Pascal Van Hentenryck"],"pdf_url":"https://arxiv.org/pdf/2402.03086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03082v1","updated":"2024-02-05T15:13:20Z","published":"2024-02-05T15:13:20Z","title":"Visual Text Meets Low-level Vision: A Comprehensive Survey on Visual\n Text Processing","summary":" Visual text, a pivotal element in both document and scene images, speaks\nvolumes and attracts significant attention in the computer vision domain.\nBeyond visual text detection and recognition, the field of visual text\nprocessing has experienced a surge in research, driven by the advent of\nfundamental generative models. However, challenges persist due to the unique\nproperties and features that distinguish text from general objects. Effectively\nleveraging these unique textual characteristics is crucial in visual text\nprocessing, as observed in our study. In this survey, we present a\ncomprehensive, multi-perspective analysis of recent advancements in this field.\nInitially, we introduce a hierarchical taxonomy encompassing areas ranging from\ntext image enhancement and restoration to text image manipulation, followed by\ndifferent learning paradigms. Subsequently, we conduct an in-depth discussion\nof how specific textual features such as structure, stroke, semantics, style,\nand spatial context are seamlessly integrated into various tasks. Furthermore,\nwe explore available public datasets and benchmark the reviewed methods on\nseveral widely-used datasets. Finally, we identify principal challenges and\npotential avenues for future research. Our aim is to establish this survey as a\nfundamental resource, fostering continued exploration and innovation in the\ndynamic area of visual text processing.\n","authors":["Yan Shu","Weichao Zeng","Zhenhang Li","Fangmin Zhao","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.03082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03081v1","updated":"2024-02-05T15:12:15Z","published":"2024-02-05T15:12:15Z","title":"Preference-Conditioned Language-Guided Abstraction","summary":" Learning from demonstrations is a common way for users to teach robots, but\nit is prone to spurious feature correlations. Recent work constructs state\nabstractions, i.e. visual representations containing task-relevant features,\nfrom language as a way to perform more generalizable learning. However, these\nabstractions also depend on a user's preference for what matters in a task,\nwhich may be hard to describe or infeasible to exhaustively specify using\nlanguage alone. How do we construct abstractions to capture these latent\npreferences? We observe that how humans behave reveals how they see the world.\nOur key insight is that changes in human behavior inform us that there are\ndifferences in preferences for how humans see the world, i.e. their state\nabstractions. In this work, we propose using language models (LMs) to query for\nthose preferences directly given knowledge that a change in behavior has\noccurred. 
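A hedged sketch of that query step, with `lm` a hypothetical text-completion function rather than any specific API:

```python
def infer_preference(lm, task_text, behavior_before, behavior_after):
    # Ask the LM to explain a behavior change as a hidden preference.
    return lm(
        "A person changed how they perform a task.\n"
        f"Task: {task_text}\n"
        f"Before: {behavior_before}\nAfter: {behavior_after}\n"
        "What hidden preference most plausibly explains the change?"
    )
```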
In our framework, we use the LM in two ways: first, given a text\ndescription of the task and knowledge of behavioral change between states, we\nquery the LM for possible hidden preferences; second, given the most likely\npreference, we query the LM to construct the state abstraction. In this\nframework, the LM is also able to ask the human directly when uncertain about\nits own estimate. We demonstrate our framework's ability to construct effective\npreference-conditioned abstractions in simulated experiments, a user study, as\nwell as on a real Spot robot performing mobile manipulation tasks.\n","authors":["Andi Peng","Andreea Bobu","Belinda Z. Li","Theodore R. Sumers","Ilia Sucholutsky","Nishanth Kumar","Thomas L. Griffiths","Julie A. Shah"],"pdf_url":"https://arxiv.org/pdf/2402.03081v1.pdf","comment":"HRI 2024"},{"id":"http://arxiv.org/abs/2402.03077v1","updated":"2024-02-05T15:09:41Z","published":"2024-02-05T15:09:41Z","title":"Markov Persuasion Processes: Learning to Persuade from Scratch","summary":" In Bayesian persuasion, an informed sender strategically discloses\ninformation to a receiver so as to persuade them to undertake desirable\nactions. Recently, growing attention has been devoted to settings in which\nsender and receivers interact sequentially. In particular, Markov persuasion\nprocesses (MPPs) have been introduced to capture sequential scenarios where a\nsender faces a stream of myopic receivers in a Markovian environment. The MPPs\nstudied so far in the literature suffer from issues that prevent them from\nbeing fully operational in practice, e.g., they assume that the sender knows\nreceivers' rewards. We fix such issues by addressing MPPs where the sender has\nno knowledge about the environment. We design a learning algorithm for the\nsender, working with partial feedback. We prove that its regret with respect to\nan optimal information-disclosure policy grows sublinearly in the number of\nepisodes, as is the case for the loss in persuasiveness accumulated while\nlearning. Moreover, we provide a lower bound for our setting matching the\nguarantees of our algorithm.\n","authors":["Francesco Bacchiocchi","Francesco Emanuele Stradi","Matteo Castiglioni","Alberto Marchesi","Nicola Gatti"],"pdf_url":"https://arxiv.org/pdf/2402.03077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03927v2","updated":"2024-02-05T15:04:23Z","published":"2023-07-08T07:58:53Z","title":"Fast Empirical Scenarios","summary":" We seek to extract a small number of representative scenarios from large and\nhigh-dimensional panel data that are consistent with sample moments. Among our\ntwo novel algorithms, the first identifies scenarios that have not been observed\nbefore, and comes with a scenario-based representation of covariance matrices.\nThe second proposal picks important data points from states of the world that\nhave already realized, and are consistent with higher-order sample moment\ninformation. 
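A crude version of such moment-matched selection can be written as a greedy search over observed rows; this toy matches only first and second moments and is not the paper's algorithm.

```python
import numpy as np

def select_scenarios(X, k):
    # Greedily pick k rows of X whose equal-weight average best matches
    # the first two sample moments of the full panel.
    target = np.concatenate([X.mean(axis=0), (X ** 2).mean(axis=0)])
    chosen = []
    for _ in range(k):
        best, best_err = None, np.inf
        for i in range(len(X)):
            if i in chosen:
                continue
            S = X[chosen + [i]]
            m = np.concatenate([S.mean(axis=0), (S ** 2).mean(axis=0)])
            err = float(np.sum((m - target) ** 2))
            if err < best_err:
                best, best_err = i, err
        chosen.append(best)
    return chosen  # indices of the selected scenarios
```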
Both algorithms are efficient to compute, and lend themselves to\nconsistent scenario-based modeling and high-dimensional numerical integration.\nExtensive numerical benchmarking studies and an application in portfolio\noptimization favor the proposed algorithms.\n","authors":["Michael Multerer","Paul Schneider","Rohan Sen"],"pdf_url":"https://arxiv.org/pdf/2307.03927v2.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.03072v1","updated":"2024-02-05T15:02:35Z","published":"2024-02-05T15:02:35Z","title":"Learning to Abstract Visuomotor Mappings using Meta-Reinforcement\n Learning","summary":" We investigated the human capacity to acquire multiple visuomotor mappings\nfor de novo skills. Using a grid navigation paradigm, we tested whether\ncontextual cues implemented as different \"grid worlds\", allow participants to\nlearn two distinct key-mappings more efficiently. Our results indicate that\nwhen contextual information is provided, task performance is significantly\nbetter. The same held true for meta-reinforcement learning agents that differed\nin whether or not they receive contextual information when performing the task.\nWe evaluated their accuracy in predicting human performance in the task and\nanalyzed their internal representations. The results indicate that contextual\ncues allow the formation of separate representations in space and time when\nusing different visuomotor mappings, whereas the absence of them favors sharing\none representation. While both strategies can allow learning of multiple\nvisuomotor mappings, we showed contextual cues provide a computational\nadvantage in terms of how many mappings can be learned.\n","authors":["Carlos A. Velazquez-Vargas","Isaac Ray Christian","Jordan A. Taylor","Sreejan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.03072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06644v2","updated":"2024-02-05T15:02:32Z","published":"2023-10-10T14:07:37Z","title":"Zero-Level-Set Encoder for Neural Distance Fields","summary":" Neural shape representation generally refers to representing 3D geometry\nusing neural networks, e.g., to compute a signed distance or occupancy value at\na specific spatial position. In this paper, we present a novel encoder-decoder\nneural network for embedding 3D shapes in a single forward pass. Our\narchitecture is based on a multi-scale hybrid system incorporating graph-based\nand voxel-based components, as well as a continuously differentiable decoder.\nFurthermore, the network is trained to solve the Eikonal equation and only\nrequires knowledge of the zero-level set for training and inference. This means\nthat in contrast to most previous work, our network is able to output valid\nsigned distance fields without explicit prior knowledge of non-zero distance\nvalues or shape occupancy. We further propose a modification of the loss\nfunction in case that surface normals are not well defined, e.g., in the\ncontext of non-watertight surfaces and non-manifold geometry. Overall, this can\nhelp reduce the computational overhead of training and evaluating neural\ndistance fields, as well as enabling the application to difficult shapes. We\nfinally demonstrate the efficacy, generalizability and scalability of our\nmethod on datasets consisting of deforming shapes, both based on simulated data\nand raw 3D scans. 
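The Eikonal-style training mentioned above typically combines a zero-level-set term with a unit-gradient-norm penalty; a minimal PyTorch sketch, where `sdf` is a hypothetical network and the 0.1 weight is an arbitrary choice:

```python
import torch

def eikonal_loss(sdf, surface_pts, space_pts):
    # Zero-level-set term: the field should vanish on surface samples.
    surf = sdf(surface_pts).abs().mean()
    # Eikonal term: enforce |grad f| = 1 at off-surface samples.
    space_pts = space_pts.clone().requires_grad_(True)
    f = sdf(space_pts)
    (grad,) = torch.autograd.grad(f.sum(), space_pts, create_graph=True)
    eik = ((grad.norm(dim=-1) - 1.0) ** 2).mean()
    return surf + 0.1 * eik
```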
We further show single-class and multi-class encoding, on\nboth fixed and variable vertex-count inputs, showcasing a wide range of\npossible applications.\n","authors":["Stefan Rhys Jeske","Jonathan Klein","Dominik L. Michels","Jan Bender"],"pdf_url":"https://arxiv.org/pdf/2310.06644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00494v2","updated":"2024-02-05T15:00:21Z","published":"2023-07-02T06:55:31Z","title":"Improving Protein Optimization with Smoothed Fitness Landscapes","summary":" The ability to engineer novel proteins with higher fitness for a desired\nproperty would be revolutionary for biotechnology and medicine. Modeling the\ncombinatorially large space of sequences is infeasible; prior methods often\nconstrain optimization to a small mutational radius, but this drastically\nlimits the design space. Instead of heuristics, we propose smoothing the\nfitness landscape to facilitate protein optimization. First, we formulate\nprotein fitness as a graph signal, then use Tikhonov regularization to smooth the\nfitness landscape. We find optimizing in this smoothed landscape leads to\nimproved performance across multiple methods in the GFP and AAV benchmarks.\nSecond, we achieve state-of-the-art results utilizing discrete energy-based\nmodels and MCMC in the smoothed landscape. Our method, called Gibbs sampling\nwith Graph-based Smoothing (GGS), demonstrates a unique ability to achieve 2.5\nfold fitness improvement (with in-silico evaluation) over its training set. GGS\ndemonstrates potential to optimize proteins in the limited data regime. Code:\nhttps://github.com/kirjner/GGS\n","authors":["Andrew Kirjner","Jason Yim","Raman Samusevich","Shahar Bracha","Tommi Jaakkola","Regina Barzilay","Ila Fiete"],"pdf_url":"https://arxiv.org/pdf/2307.00494v2.pdf","comment":"ICLR 2024. Code: https://github.com/kirjner/GGS"},{"id":"http://arxiv.org/abs/2402.03055v1","updated":"2024-02-05T14:42:45Z","published":"2024-02-05T14:42:45Z","title":"Probabilistic Actor-Critic: Learning to Explore with PAC-Bayes\n Uncertainty","summary":" We introduce Probabilistic Actor-Critic (PAC), a novel reinforcement learning\nalgorithm with improved continuous control performance thanks to its ability to\nmitigate the exploration-exploitation trade-off. PAC achieves this by\nseamlessly integrating stochastic policies and critics, creating a dynamic\nsynergy between the estimation of critic uncertainty and actor training. The\nkey contribution of our PAC algorithm is that it explicitly models and infers\nepistemic uncertainty in the critic through Probably Approximately\nCorrect-Bayesian (PAC-Bayes) analysis. This incorporation of critic uncertainty\nenables PAC to adapt its exploration strategy as it learns, guiding the actor's\ndecision-making process. PAC compares favorably against fixed or pre-scheduled\nexploration schemes of the prior art. The synergy between stochastic policies\nand critics, guided by PAC-Bayes analysis, represents a fundamental step\ntowards a more adaptive and effective exploration strategy in deep\nreinforcement learning. 
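Returning to the graph-based smoothing step above: Tikhonov regularization of a fitness signal y on a graph with Laplacian L has a closed form, sketched below; this shows the generic operation, not GGS itself.

```python
import numpy as np

def smooth_fitness(L, y, lam=1.0):
    # argmin_f ||f - y||^2 + lam * f^T L f  solves  (I + lam L) f = y.
    return np.linalg.solve(np.eye(len(y)) + lam * L, y)
```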
We report empirical evaluations demonstrating PAC's\nenhanced stability and improved performance over the state of the art in\ndiverse continuous control problems.\n","authors":["Bahareh Tasdighi","Nicklas Werge","Yi-Shan Wu","Melih Kandemir"],"pdf_url":"https://arxiv.org/pdf/2402.03055v1.pdf","comment":"18 pages, 4 figures, 7 tables"},{"id":"http://arxiv.org/abs/2204.04510v2","updated":"2024-02-05T14:39:29Z","published":"2022-04-09T16:38:25Z","title":"Translating Subgraphs to Nodes Makes Simple GNNs Strong and Efficient\n for Subgraph Representation Learning","summary":" Subgraph representation learning has emerged as an important problem, but it\nis by default approached with specialized graph neural networks on a large\nglobal graph. These models demand extensive memory and computational resources\nbut struggle to model hierarchical structures of subgraphs. In this paper, we\npropose Subgraph-To-Node (S2N) translation, a novel formulation for learning\nrepresentations of subgraphs. Specifically, given a set of subgraphs in the\nglobal graph, we construct a new graph by coarsely transforming subgraphs into\nnodes. Demonstrating both theoretical and empirical evidence, S2N not only\nsignificantly reduces memory and computational costs compared to\nstate-of-the-art models but also outperforms them by capturing both local and\nglobal structures of the subgraph. By leveraging graph coarsening methods, our\nmethod outperforms baselines even in a data-scarce setting with insufficient\nsubgraphs. Our experiments on eight benchmarks demonstrate that fine-tuned\nmodels with S2N translation can process 183 -- 711 times more subgraph samples\nthan state-of-the-art models at a better or similar performance level.\n","authors":["Dongkwan Kim","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2204.04510v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2402.03053v1","updated":"2024-02-05T14:36:51Z","published":"2024-02-05T14:36:51Z","title":"Multi-Lingual Malaysian Embedding: Leveraging Large Language Models for\n Semantic Representations","summary":" In this work, we present a comprehensive exploration of finetuning Malaysian\nlanguage models, specifically Llama2 and Mistral, on embedding tasks involving\nnegative and positive pairs. We release two distinct models tailored for\nSemantic Similarity and Retrieval-Augmented Generation (RAG).\n For Semantic Similarity, our 600 million parameter Llama2 model outperforms\nOpenAI text-embedding-ada-002 across all recall@k metrics for b.cari.com.my,\nc.cari.com.my, Malay news, and Malaysian Twitter test sets.\n In the realm of RAG models, our approach proves competitive with OpenAI\ntext-embedding-ada-002 in the Malaysian context. 
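The recall@k metrics cited here can be computed directly from the embeddings; a small sketch assuming one relevant document per query (an assumption for the toy, not the paper's exact protocol):

```python
import numpy as np

def recall_at_k(query_emb, doc_emb, relevant, k=5):
    # relevant[i] is the index of query i's single relevant document.
    q = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
    d = doc_emb / np.linalg.norm(doc_emb, axis=1, keepdims=True)
    topk = np.argsort(-(q @ d.T), axis=1)[:, :k]
    return float(np.mean([relevant[i] in topk[i] for i in range(len(q))]))
```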
Notably, our 2 billion\nparameter Llama2 model achieves superior Recall@5, Recall@10 for the \"Melayu\"\nkeyword research papers dataset and excels in Recall@3, Recall@5, and Recall@10\nfor the lom.agc.gov.my dataset.\n These findings underscore the effectiveness of our finetuning strategy and\nhighlight the performance gains in both Semantic Similarity and RAG tasks.\n All models released at\nhttps://huggingface.co/collections/mesolitica/malaysian-embedding-6523612bfe5881ad35f81b99\n","authors":["Husein Zolkepli","Aisyah Razak","Kamarul Adha","Ariff Nazhan"],"pdf_url":"https://arxiv.org/pdf/2402.03053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03049v1","updated":"2024-02-05T14:33:56Z","published":"2024-02-05T14:33:56Z","title":"EasyInstruct: An Easy-to-use Instruction Processing Framework for Large\n Language Models","summary":" In recent years, instruction tuning has gained increasing attention and\nemerged as a crucial technique to enhance the capabilities of Large Language\nModels (LLMs). To construct high-quality instruction datasets, many instruction\nprocessing approaches have been proposed, aiming to achieve a delicate balance\nbetween data quantity and data quality. Nevertheless, due to inconsistencies\nthat persist among various instruction processing methods, there is no standard\nopen-source instruction processing implementation framework available for the\ncommunity, which hinders practitioners from further developing and advancing.\nTo facilitate instruction processing research and development, we present\nEasyInstruct, an easy-to-use instruction processing framework for LLMs, which\nmodularizes instruction generation, selection, and prompting, while also\nconsidering their combination and interaction. EasyInstruct is publicly\nreleased and actively maintained at https://github.com/zjunlp/EasyInstruct,\nalong with a running demo App at\nhttps://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for\nbroader research centered on instruction data.\n","authors":["Yixin Ou","Ningyu Zhang","Honghao Gui","Ziwen Xu","Shuofei Qiao","Zhen Bi","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03049v1.pdf","comment":"Ongoing work; the project website is at\n https://zjunlp.github.io/project/EasyInstruct, code is at\n https://github.com/zjunlp/EasyInstruct, demo is at\n https://huggingface.co/spaces/zjunlp/EasyInstruct"},{"id":"http://arxiv.org/abs/2402.03048v1","updated":"2024-02-05T14:33:52Z","published":"2024-02-05T14:33:52Z","title":"Cooperative Learning with Gaussian Processes for Euler-Lagrange Systems\n Tracking Control under Switching Topologies","summary":" This work presents an innovative learning-based approach to tackle the\ntracking control problem of Euler-Lagrange multi-agent systems with partially\nunknown dynamics operating under switching communication topologies. The\napproach leverages a correlation-aware cooperative algorithm framework built\nupon Gaussian process regression, which adeptly captures inter-agent\ncorrelations for uncertainty predictions. A standout feature is its exceptional\nefficiency in deriving the aggregation weights achieved by circumventing the\ncomputationally intensive posterior variance calculations. Through Lyapunov\nstability analysis, the distributed control law ensures bounded tracking errors\nwith high probability. 
Simulation experiments validate the protocol's efficacy\nin effectively managing complex scenarios, establishing it as a promising\nsolution for robust tracking control in multi-agent systems characterized by\nuncertain dynamics and dynamic communication structures.\n","authors":["Zewen Yang","Songbo Dong","Armin Lederer","Xiaobing Dai","Siyu Chen","Stefan Sosnowski","Georges Hattab","Sandra Hirche"],"pdf_url":"https://arxiv.org/pdf/2402.03048v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2402.03047v1","updated":"2024-02-05T14:32:57Z","published":"2024-02-05T14:32:57Z","title":"PFDM: Parser-Free Virtual Try-on via Diffusion Model","summary":" Virtual try-on can significantly improve the garment shopping experiences in\nboth online and in-store scenarios, attracting broad interest in computer\nvision. However, to achieve high-fidelity try-on performance, most\nstate-of-the-art methods still rely on accurate segmentation masks, which are\noften produced by near-perfect parsers or manual labeling. To overcome the\nbottleneck, we propose a parser-free virtual try-on method based on the\ndiffusion model (PFDM). Given two images, PFDM can \"wear\" garments on the\ntarget person seamlessly by implicitly warping without any other information.\nTo learn the model effectively, we synthesize many pseudo-images and construct\nsample pairs by wearing various garments on persons. Supervised by the\nlarge-scale expanded dataset, we fuse the person and garment features using a\nproposed Garment Fusion Attention (GFA) mechanism. Experiments demonstrate that\nour proposed PFDM can successfully handle complex cases, synthesize\nhigh-fidelity images, and outperform both state-of-the-art parser-free and\nparser-based models.\n","authors":["Yunfang Niu","Dong Yi","Lingxiang Wu","Zhiwei Liu","Pengxiang Cai","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03047v1.pdf","comment":"Accepted by IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.03046v1","updated":"2024-02-05T14:32:00Z","published":"2024-02-05T14:32:00Z","title":"Open RL Benchmark: Comprehensive Tracked Experiments for Reinforcement\n Learning","summary":" In many Reinforcement Learning (RL) papers, learning curves are useful\nindicators to measure the effectiveness of RL algorithms. However, the complete\nraw data of the learning curves are rarely available. As a result, it is\nusually necessary to reproduce the experiments from scratch, which can be\ntime-consuming and error-prone. We present Open RL Benchmark, a set of fully\ntracked RL experiments, including not only the usual data such as episodic\nreturn, but also all algorithm-specific and system metrics. Open RL Benchmark\nis community-driven: anyone can download, use, and contribute to the data. At\nthe time of writing, more than 25,000 runs have been tracked, for a cumulative\nduration of more than 8 years. Open RL Benchmark covers a wide range of RL\nlibraries and reference implementations. Special care is taken to ensure that\neach experiment is precisely reproducible by providing not only the full\nparameters, but also the versions of the dependencies used to generate it. In\naddition, Open RL Benchmark comes with a command-line interface (CLI) for easy\nfetching and generating figures to present the results. In this document, we\ninclude two case studies to demonstrate the usefulness of Open RL Benchmark in\npractice. 
To the best of our knowledge, Open RL Benchmark is the first RL\nbenchmark of its kind, and the authors hope that it will improve and facilitate\nthe work of researchers in the field.\n","authors":["Shengyi Huang","Quentin Gallouédec","Florian Felten","Antonin Raffin","Rousslan Fernand Julien Dossa","Yanxiao Zhao","Ryan Sullivan","Viktor Makoviychuk","Denys Makoviichuk","Mohamad H. Danesh","Cyril Roumégous","Jiayi Weng","Chufan Chen","Md Masudur Rahman","João G. M. Araújo","Guorui Quan","Daniel Tan","Timo Klein","Rujikorn Charakorn","Mark Towers","Yann Berthelot","Kinal Mehta","Dipam Chakraborty","Arjun KG","Valentin Charraut","Chang Ye","Zichen Liu","Lucas N. Alegre","Alexander Nikulin","Xiao Hu","Tianlin Liu","Jongwook Choi","Brent Yi"],"pdf_url":"https://arxiv.org/pdf/2402.03046v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2402.03043v1","updated":"2024-02-05T14:29:54Z","published":"2024-02-05T14:29:54Z","title":"SIDU-TXT: An XAI Algorithm for NLP with a Holistic Assessment Approach","summary":" Explainable AI (XAI) aids in deciphering 'black-box' models. While several\nmethods have been proposed and evaluated primarily in the image domain, the\nexploration of explainability in the text domain remains a growing research\narea. In this paper, we delve into the applicability of XAI methods for the\ntext domain. In this context, the 'Similarity Difference and Uniqueness' (SIDU)\nXAI method, recognized for its superior capability in localizing entire salient\nregions in image-based classification, is extended to textual data. The extended\nmethod, SIDU-TXT, utilizes feature activation maps from 'black-box' models to\ngenerate heatmaps at a granular, word-based level, thereby providing\nexplanations that highlight contextually significant textual elements crucial\nfor model predictions. Given the absence of a unified standard for assessing\nXAI methods, this study applies a holistic three-tiered comprehensive\nevaluation framework: Functionally-Grounded, Human-Grounded and\nApplication-Grounded, to assess the effectiveness of the proposed SIDU-TXT\nacross various experiments. We find that, in the sentiment analysis task of a movie\nreview dataset, SIDU-TXT excels in both functionally and human-grounded\nevaluations, demonstrating superior performance through quantitative and\nqualitative analyses compared to benchmarks like Grad-CAM and LIME. In the\napplication-grounded evaluation within the sensitive and complex legal domain\nof asylum decision-making, SIDU-TXT and Grad-CAM demonstrate comparable\nperformances, each with its own set of strengths and weaknesses. However, both\nmethods fall short of entirely fulfilling the sophisticated criteria of expert\nexpectations, highlighting the imperative need for additional research in XAI\nmethods suitable for such domains.\n","authors":["Mohammad N. S. Jahromi","Satya. M. Muddamsetty","Asta Sofie Stage Jarlner","Anna Murphy Høgenhaug","Thomas Gammeltoft-Hansen","Thomas B. Moeslund"],"pdf_url":"https://arxiv.org/pdf/2402.03043v1.pdf","comment":"Preprint submitted to Elsevier on Jan 5th, 2024"},{"id":"http://arxiv.org/abs/2306.14233v2","updated":"2024-02-05T14:25:03Z","published":"2023-06-25T12:50:17Z","title":"Attention-Refined Unrolling for Sparse Sequential micro-Doppler\n Reconstruction","summary":" The reconstruction of micro-Doppler signatures of human movements is a key\nenabler for fine-grained activity recognition in wireless sensing. 
In Joint\nCommunication and Sensing (JCS) systems, unlike in dedicated radar sensing\nsystems, a suitable trade-off between sensing accuracy and communication\noverhead has to be attained. It follows that the micro-Doppler has to be\nreconstructed from incomplete windows of channel estimates obtained from\ncommunication packets. Existing approaches exploit compressed sensing, but\nproduce very poor reconstructions when only a few channel measurements are\navailable, which is often the case with real communication patterns. In\naddition, the large number of iterations they need to converge hinders their\nuse in real-time systems. In this work, we propose and validate STAR, a neural\nnetwork that reconstructs micro-Doppler sequences of human movement even from\nhighly incomplete channel measurements. STAR is based upon a new architectural\ndesign that combines a single unrolled iterative hard-thresholding layer with\nan attention mechanism, used at its output. This results in an interpretable\nand lightweight architecture that reaps the benefits of both model-based and\ndata driven solutions. STAR is evaluated on a public JCS dataset of 60 GHz\nchannel measurements of human activity traces. Experimental results show that\nit substantially outperforms state-of-the-art techniques in terms of the\nreconstructed micro-Doppler quality. Remarkably, STAR enables human activity\nrecognition with satisfactory accuracy even with 90% of missing channel\nmeasurements, for which existing techniques fail.\n","authors":["Riccardo Mazzieri","Jacopo Pegoraro","Michele Rossi"],"pdf_url":"https://arxiv.org/pdf/2306.14233v2.pdf","comment":"16 pages, 10 figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.03040v1","updated":"2024-02-05T14:24:46Z","published":"2024-02-05T14:24:46Z","title":"InteractiveVideo: User-Centric Controllable Video Generation with\n Synergistic Multimodal Instructions","summary":" We introduce $\\textit{InteractiveVideo}$, a user-centric framework for video\ngeneration. Different from traditional generative approaches that operate based\non user-provided images or text, our framework is designed for dynamic\ninteraction, allowing users to instruct the generative model through various\nintuitive mechanisms during the whole generation process, e.g. text and image\nprompts, painting, drag-and-drop, etc. We propose a Synergistic Multimodal\nInstruction mechanism, designed to seamlessly integrate users' multimodal\ninstructions into generative models, thus facilitating a cooperative and\nresponsive interaction between user inputs and the generative process. This\napproach enables iterative and fine-grained refinement of the generation result\nthrough precise and effective user instructions. With\n$\\textit{InteractiveVideo}$, users are given the flexibility to meticulously\ntailor key aspects of a video. 
They can paint the reference image, edit\nsemantics, and adjust video motions until their requirements are fully met.\nCode, models, and demo are available at\nhttps://github.com/invictus717/InteractiveVideo\n","authors":["Yiyuan Zhang","Yuhao Kang","Zhixin Zhang","Xiaohan Ding","Sanyuan Zhao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2402.03040v1.pdf","comment":"Code, models, and demo are available at\n https://github.com/invictus717/InteractiveVideo"},{"id":"http://arxiv.org/abs/2402.03038v1","updated":"2024-02-05T14:23:43Z","published":"2024-02-05T14:23:43Z","title":"Automatic Combination of Sample Selection Strategies for Few-Shot\n Learning","summary":" In few-shot learning, such as meta-learning, few-shot fine-tuning or\nin-context learning, the limited number of samples used to train a model has a\nsignificant impact on the overall success. Although a large number of sample\nselection strategies exist, their impact on the performance of few-shot\nlearning is not extensively known, as most of them have been so far evaluated\nin typical supervised settings only. In this paper, we thoroughly investigate\nthe impact of 20 sample selection strategies on the performance of 5 few-shot\nlearning approaches over 8 image and 6 text datasets. In addition, we propose a\nnew method for automatic combination of sample selection strategies (ACSESS)\nthat leverages the strengths and complementary information of the individual\nstrategies. The experimental results show that our method consistently\noutperforms the individual selection strategies, as well as the recently\nproposed method for selecting support examples for in-context learning. We also\nshow a strong modality, dataset and approach dependence for the majority of\nstrategies as well as their dependence on the number of shots - demonstrating\nthat the sample selection strategies play a significant role for lower numbers\nof shots, but regress to random selection at higher numbers of shots.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2402.03038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01271v2","updated":"2024-02-05T14:21:14Z","published":"2023-06-02T05:07:42Z","title":"Towards Understanding Clean Generalization and Robust Overfitting in\n Adversarial Training","summary":" Similar to the surprising performance in standard deep learning, deep nets\ntrained by adversarial training also generalize well for $\\textit{unseen clean\ndata (natural data)}$. However, although adversarial training can achieve low\nrobust training error, there exists a significant $\\textit{robust\ngeneralization gap}$. We call this phenomenon the $\\textit{Clean Generalization\nand Robust Overfitting (CGRO)}$. In this work, we study the CGRO phenomenon in\nadversarial training from two views: $\\textit{representation complexity}$ and\n$\\textit{training dynamics}$. Specifically, we consider a binary classification\nsetting with $N$ separated training data points. $\\textit{First}$, we prove\nthat, under the assumption that there is a\n$\\operatorname{poly}(D)$-size clean classifier (where $D$ is the data\ndimension), a ReLU net with only $O(N D)$ extra parameters is able to leverage\nrobust memorization to achieve the CGRO, while a robust classifier still requires\nexponential representation complexity in the worst case. 
$\\textit{Next}$, we focus\non a structured-data case to analyze training dynamics, where we train a\ntwo-layer convolutional network with $O(N D)$ width against adversarial\nperturbation. We then show that a three-stage phase transition occurs during the\nlearning process and the network provably converges to a robust memorization\nregime, which thereby results in the CGRO. $\\textit{Besides}$, we also\nempirically verify our theoretical analysis by experiments in real-image\nrecognition datasets.\n","authors":["Binghui Li","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2306.01271v2.pdf","comment":"28 pages, comments welcome"},{"id":"http://arxiv.org/abs/2402.01262v2","updated":"2024-02-05T14:14:02Z","published":"2024-02-02T09:33:07Z","title":"Cascaded Scaling Classifier: class incremental learning with probability\n scaling","summary":" Humans are capable of acquiring new knowledge and transferring learned\nknowledge into different domains, incurring little forgetting. The same\nability, called Continual Learning, is challenging to achieve when operating\nwith neural networks due to the forgetting affecting past learned tasks when\nlearning new ones. This forgetting can be mitigated by replaying stored samples\nfrom past tasks, but a large memory size may be needed for long sequences of\ntasks; moreover, this could lead to overfitting on saved samples. In this\npaper, we propose a novel regularisation approach and a novel incremental\nclassifier called, respectively, Margin Dampening and Cascaded Scaling\nClassifier. The first combines a soft constraint and a knowledge distillation\napproach to preserve past learned knowledge while allowing the model to learn\nnew patterns effectively. The latter is a gated incremental classifier, helping\nthe model modify past predictions without directly interfering with them. This\nis achieved by modifying the output of the model with auxiliary scaling\nfunctions. We empirically show that our approach performs well on multiple\nbenchmarks against well-established baselines, and we also study each component\nof our proposal and how the combinations of such components affect the final\nresults.\n","authors":["Jary Pomponi","Alessio Devoto","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2402.01262v2.pdf","comment":"Paper under review. The official code is available\n https://github.com/jaryP/Cascaded-Scaling-Classifier"},{"id":"http://arxiv.org/abs/2402.03028v1","updated":"2024-02-05T14:12:35Z","published":"2024-02-05T14:12:35Z","title":"Functional SDE approximation inspired by a deep operator network\n architecture","summary":" A novel approach to approximate solutions of Stochastic Differential\nEquations (SDEs) by Deep Neural Networks is derived and analysed. The\narchitecture is inspired by the notion of Deep Operator Networks (DeepONets),\nwhich is based on operator learning in function spaces in terms of a reduced\nbasis also represented in the network. In our setting, we make use of a\npolynomial chaos expansion (PCE) of stochastic processes and call the\ncorresponding architecture SDEONet. The PCE has been used extensively in the\narea of uncertainty quantification (UQ) with parametric partial differential\nequations. This, however, is not the case with SDEs, where classical sampling\nmethods dominate and functional approaches are rarely seen. A main challenge\nwith truncated PCEs occurs due to the drastic growth of the number of\ncomponents with respect to the maximum polynomial degree and the number of\nbasis elements. 
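That growth is easy to quantify: with d input variables and total polynomial degree at most p, the chaos basis has C(d + p, p) elements, as the snippet below illustrates.

```python
from math import comb

def pce_basis_size(d, p):
    # Number of multi-indices alpha with |alpha| <= p in d variables.
    return comb(d + p, p)

print(pce_basis_size(10, 4))  # 1001 terms already for modest d and p
```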
The proposed SDEONet architecture aims to alleviate the issue\nof exponential complexity by learning an optimal sparse truncation of the\nWiener chaos expansion. A complete convergence and complexity analysis is\npresented, making use of recent Neural Network approximation results. Numerical\nexperiments illustrate the promising performance of the suggested approach in\n1D and higher dimensions.\n","authors":["Martin Eigel","Charles Miranda"],"pdf_url":"https://arxiv.org/pdf/2402.03028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03025v1","updated":"2024-02-05T14:06:15Z","published":"2024-02-05T14:06:15Z","title":"Understanding and Guiding Weakly Supervised Entity Alignment with\n Potential Isomorphism Propagation","summary":" Weakly Supervised Entity Alignment (EA) is the task of identifying equivalent\nentities across diverse knowledge graphs (KGs) using only a limited number of\nseed alignments. Despite substantial advances in aggregation-based weakly\nsupervised EA, the underlying mechanisms in this setting remain unexplored. In\nthis paper, we present a propagation perspective to analyze weakly supervised\nEA and explain the existing aggregation-based EA models. Our theoretical\nanalysis reveals that these models essentially seek propagation operators for\npairwise entity similarities. We further prove that, despite the structural\nheterogeneity of different KGs, the potentially aligned entities within\naggregation-based EA models have isomorphic subgraphs, which is the core\npremise of EA but has not been investigated. Leveraging this insight, we\nintroduce a potential isomorphism propagation operator to enhance the\npropagation of neighborhood information across KGs. We develop a general EA\nframework, PipEA, incorporating this operator to improve the accuracy of every\ntype of aggregation-based model without altering the learning process.\nExtensive experiments substantiate our theoretical findings and demonstrate\nPipEA's significant performance gains over state-of-the-art weakly supervised\nEA methods. Our work not only advances the field but also enhances our\ncomprehension of aggregation-based weakly supervised EA.\n","authors":["Yuanyi Wang","Wei Tang","Haifeng Sun","Zirui Zhuang","Xiaoyuan Fu","Jingyu Wang","Qi Qi","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2402.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03021v1","updated":"2024-02-05T14:00:53Z","published":"2024-02-05T14:00:53Z","title":"Data-induced multiscale losses and efficient multirate gradient descent\n schemes","summary":" This paper investigates the impact of multiscale data on machine learning\nalgorithms, particularly in the context of deep learning. A dataset is\nmultiscale if its distribution shows large variations in scale across different\ndirections. This paper reveals multiscale structures in the loss landscape,\nincluding its gradients and Hessians inherited from the data. Correspondingly,\nit introduces a novel gradient descent approach, drawing inspiration from\nmultiscale algorithms used in scientific computing. 
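A toy version of a multirate scheme assigns per-coordinate step sizes inversely proportional to each coordinate's curvature scale; the example below is an illustrative guess at the flavor, not the paper's scheme.

```python
import numpy as np

def multirate_gd(grad, x, scales, base_lr=0.1, steps=100):
    # Coordinates with large curvature scale take proportionally
    # smaller steps, so all scales contract at a similar rate.
    lrs = base_lr / np.asarray(scales)
    for _ in range(steps):
        x = x - lrs * grad(x)
    return x

# Toy multiscale quadratic f(x) = 0.5 * (100 * x0^2 + x1^2):
grad = lambda x: np.array([100.0, 1.0]) * x
print(multirate_gd(grad, np.array([1.0, 1.0]), scales=[100.0, 1.0]))
```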
This approach seeks to\ntranscend empirical learning rate selection, offering a more systematic,\ndata-informed strategy to enhance training efficiency, especially in the later\nstages.\n","authors":["Juncai He","Liangchen Liu","Yen-Hsi Tsai"],"pdf_url":"https://arxiv.org/pdf/2402.03021v1.pdf","comment":"28 pages, 4 figures, submitted under review"},{"id":"http://arxiv.org/abs/2402.03019v1","updated":"2024-02-05T14:00:13Z","published":"2024-02-05T14:00:13Z","title":"Taylor Videos for Action Recognition","summary":" Effectively extracting motions from video is a critical and long-standing\nproblem for action recognition. This problem is very challenging because\nmotions (i) do not have an explicit form, (ii) have various concepts such as\ndisplacement, velocity, and acceleration, and (iii) often contain noise caused\nby unstable pixels. Addressing these challenges, we propose the Taylor video, a\nnew video format that highlights the dominant motions (e.g., a waving hand) in\neach of its frames, named Taylor frames. Taylor video is named after the Taylor\nseries, which approximates a function at a given point using important terms.\nIn the scenario of videos, we define an implicit motion-extraction function\nwhich aims to extract motions from a video temporal block. In this block, using\nthe frames, the difference frames, and higher-order difference frames, we\nperform Taylor expansion to approximate this function at the starting frame. We\nshow the summation of the higher-order terms in the Taylor series gives us\ndominant motion patterns, where static objects, small and unstable motions are\nremoved. Experimentally we show that Taylor videos are effective inputs to\npopular architectures including 2D CNNs, 3D CNNs, and transformers. When used\nindividually, Taylor videos yield competitive action recognition accuracy\ncompared to RGB videos and optical flow. When fused with RGB or optical flow\nvideos, further accuracy improvement is achieved.\n","authors":["Lei Wang","Xiuyuan Yuan","Tom Gedeon","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.03019v1.pdf","comment":"Research report"},{"id":"http://arxiv.org/abs/2402.03017v1","updated":"2024-02-05T13:55:54Z","published":"2024-02-05T13:55:54Z","title":"Toward Green and Human-Like Artificial Intelligence: A Complete Survey\n on Contemporary Few-Shot Learning Approaches","summary":" Despite deep learning's widespread success, its data-hungry and\ncomputationally expensive nature makes it impractical for many data-constrained\nreal-world applications. Few-Shot Learning (FSL) aims to address these\nlimitations by enabling rapid adaptation to novel learning tasks, seeing\nsignificant growth in recent years. This survey provides a comprehensive\noverview of the field's latest advancements. Initially, FSL is formally\ndefined, and its relationship with different learning fields is presented. A\nnovel taxonomy is introduced, extending previously proposed ones, and\nreal-world applications in classic and novel fields are described. Finally,\nrecent trends shaping the field, outstanding challenges, and promising future\nresearch directions are discussed.\n","authors":["Georgios Tsoumplekas","Vladislav Li","Vasileios Argyriou","Anastasios Lytos","Eleftherios Fountoukidis","Sotirios K. Goudos","Ioannis D. Moscholios","Panagiotis Sarigiannidis"],"pdf_url":"https://arxiv.org/pdf/2402.03017v1.pdf","comment":"35 pages, 9 figures. 
Submitted to ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2402.03014v1","updated":"2024-02-05T13:52:56Z","published":"2024-02-05T13:52:56Z","title":"Whom to Trust? Elective Learning for Distributed Gaussian Process\n Regression","summary":" This paper introduces an innovative approach to enhance distributed\ncooperative learning using Gaussian process (GP) regression in multi-agent\nsystems (MASs). The key contribution of this work is the development of an\nelective learning algorithm, namely prior-aware elective distributed GP\n(Pri-GP), which empowers agents with the capability to selectively request\npredictions from neighboring agents based on their trustworthiness. The\nproposed Pri-GP effectively improves individual prediction accuracy, especially\nin cases where the prior knowledge of an agent is incorrect. Moreover, it\neliminates the need for computationally intensive variance calculations for\ndetermining aggregation weights in distributed GP. Furthermore, we establish a\nprediction error bound within the Pri-GP framework, ensuring the reliability of\npredictions, which is regarded as a crucial property in safety-critical MAS\napplications.\n","authors":["Zewen Yang","Xiaobing Dai","Akshat Dubey","Sandra Hirche","Georges Hattab"],"pdf_url":"https://arxiv.org/pdf/2402.03014v1.pdf","comment":"9 pages, conference preprint"},{"id":"http://arxiv.org/abs/2308.10699v2","updated":"2024-02-05T13:52:22Z","published":"2023-08-21T13:09:31Z","title":"Cost-Efficient Online Decision Making: A Combinatorial Multi-Armed\n Bandit Approach","summary":" Online decision making plays a crucial role in numerous real-world\napplications. In many scenarios, the decision is made based on performing a\nsequence of tests on the incoming data points. However, performing all tests\ncan be expensive and is not always possible. In this paper, we provide a novel\nformulation of the online decision making problem based on combinatorial\nmulti-armed bandits and take the (possibly stochastic) cost of performing tests\ninto account. Based on this formulation, we provide a new framework for\ncost-efficient online decision making which can utilize posterior sampling or\nBayesUCB for exploration. We provide a theoretical analysis of Thompson\nSampling for cost-efficient online decision making, and present various\nexperimental results that demonstrate the applicability of our framework to\nreal-world problems.\n","authors":["Arman Rahbar","Niklas Åkerblom","Morteza Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2308.10699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03011v1","updated":"2024-02-05T13:50:08Z","published":"2024-02-05T13:50:08Z","title":"On the Impact of Output Perturbation on Fairness in Binary Linear\n Classification","summary":" We theoretically study how differential privacy interacts with both\nindividual and group fairness in binary linear classification. More precisely,\nwe focus on the output perturbation mechanism, a classic approach in\nprivacy-preserving machine learning. We derive high-probability bounds on the\nlevel of individual and group fairness that the perturbed models can achieve\ncompared to the original model. Hence, for individual fairness, we prove that\nthe impact of output perturbation on the level of fairness is bounded but grows\nwith the dimension of the model. 
For group fairness, we show that this impact\nis determined by the distribution of so-called angular margins, that is, the signed\nmargins of the non-private model re-scaled by the norm of each example.\n","authors":["Vitalii Emelianov","Michaël Perrot"],"pdf_url":"https://arxiv.org/pdf/2402.03011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03008v1","updated":"2024-02-05T13:47:41Z","published":"2024-02-05T13:47:41Z","title":"Diffusive Gibbs Sampling","summary":" The inadequate mixing of conventional Markov Chain Monte Carlo (MCMC) methods\nfor multi-modal distributions presents a significant challenge in practical\napplications such as Bayesian inference and molecular dynamics. Addressing\nthis, we propose Diffusive Gibbs Sampling (DiGS), an innovative family of\nsampling methods designed for effective sampling from distributions\ncharacterized by distant and disconnected modes. DiGS integrates recent\ndevelopments in diffusion models, leveraging Gaussian convolution to create an\nauxiliary noisy distribution that bridges isolated modes in the original space\nand applying Gibbs sampling to alternately draw samples from both spaces. Our\napproach exhibits a better mixing property for sampling multi-modal\ndistributions than state-of-the-art methods such as parallel tempering. We\ndemonstrate that our sampler attains substantially improved results across\nvarious tasks, including mixtures of Gaussians, Bayesian neural networks and\nmolecular dynamics.\n","authors":["Wenlin Chen","Mingtian Zhang","Brooks Paige","José Miguel Hernández-Lobato","David Barber"],"pdf_url":"https://arxiv.org/pdf/2402.03008v1.pdf","comment":"15 pages, 11 figures, 4 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2402.03006v1","updated":"2024-02-05T13:46:04Z","published":"2024-02-05T13:46:04Z","title":"On the development of a practical Bayesian optimisation algorithm for\n expensive experiments and simulations with changing environmental conditions","summary":" Experiments in engineering are typically conducted in controlled environments\nwhere parameters can be set to any desired value. This assumes that the same\napplies in a real-world setting -- an assumption that is often incorrect as\nmany experiments are influenced by uncontrollable environmental conditions such\nas temperature, humidity and wind speed. When optimising such experiments, the\nfocus should lie on finding optimal values conditionally on these\nuncontrollable variables. This article extends Bayesian optimisation to the\noptimisation of systems in changing environments that include controllable and\nuncontrollable parameters. The extension fits a global surrogate model over all\ncontrollable and environmental variables but optimises only the controllable\nparameters conditional on measurements of the uncontrollable variables. The\nmethod is validated on two synthetic test functions and the effects of the\nnoise level, the number of environmental parameters, the parameter\nfluctuation, the variability of the uncontrollable parameters, and the\neffective domain size are investigated. ENVBO, the proposed algorithm resulting\nfrom this investigation, is applied to a wind farm simulator with eight\ncontrollable and one environmental parameter. ENVBO finds solutions for the\nfull domain of the environmental variable that outperform results from\noptimisation algorithms that only focus on a fixed environmental value in all\nbut one case while using a fraction of their evaluation budget. 
This makes the\nproposed approach very sample-efficient and cost-effective. An off-the-shelf\nopen-source version of ENVBO is available via the NUBO Python package.\n","authors":["Mike Diessner","Kevin J. Wilson","Richard D. Whalley"],"pdf_url":"https://arxiv.org/pdf/2402.03006v1.pdf","comment":"23 pages, 10 figures"},{"id":"http://arxiv.org/abs/2402.02998v1","updated":"2024-02-05T13:37:00Z","published":"2024-02-05T13:37:00Z","title":"Careful with that Scalpel: Improving Gradient Surgery with an EMA","summary":" Beyond minimizing a single training loss, many deep learning estimation\npipelines rely on an auxiliary objective to quantify and encourage desirable\nproperties of the model (e.g. performance on another dataset, robustness,\nagreement with a prior). Although the simplest approach to incorporating an\nauxiliary loss is to sum it with the training loss as a regularizer, recent\nworks have shown that one can improve performance by blending the gradients\nbeyond a simple sum; this is known as gradient surgery. We cast the problem as\na constrained minimization problem where the auxiliary objective is minimized\namong the set of minimizers of the training loss. To solve this bilevel\nproblem, we follow a parameter update direction that combines the training loss\ngradient and the orthogonal projection of the auxiliary gradient to the\ntraining gradient. In a setting where gradients come from mini-batches, we\nexplain how, using a moving average of the training loss gradients, we can\ncarefully maintain this critical orthogonality property. We demonstrate that\nour method, Bloop, can lead to much better performances on NLP and vision\nexperiments than other gradient surgery methods without EMA.\n","authors":["Yu-Guan Hsieh","James Thornton","Eugene Ndiaye","Michal Klein","Marco Cuturi","Pierre Ablin"],"pdf_url":"https://arxiv.org/pdf/2402.02998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02996v1","updated":"2024-02-05T13:34:21Z","published":"2024-02-05T13:34:21Z","title":"Text-Guided Image Clustering","summary":" Image clustering divides a collection of images into meaningful groups,\ntypically interpreted post-hoc via human-given annotations. Those are usually\nin the form of text, begging the question of using text as an abstraction for\nimage clustering. Current image clustering methods, however, neglect the use of\ngenerated textual descriptions. We, therefore, propose Text-Guided Image\nClustering, i.e., generating text using image captioning and visual\nquestion-answering (VQA) models and subsequently clustering the generated text.\nFurther, we introduce a novel approach to inject task- or domain knowledge for\nclustering by prompting VQA models. Across eight diverse image clustering\ndatasets, our results show that the obtained text representations often\noutperform image features. Additionally, we propose a counting-based cluster\nexplainability method. Our evaluations show that the derived keyword-based\nexplanations describe clusters better than the respective cluster accuracy\nsuggests. 
Overall, this research challenges traditional approaches and paves\nthe way for a paradigm shift in image clustering, using generated text.\n","authors":["Andreas Stephan","Lukas Miklautz","Kevin Sidak","Jan Philip Wahle","Bela Gipp","Claudia Plant","Benjamin Roth"],"pdf_url":"https://arxiv.org/pdf/2402.02996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02992v1","updated":"2024-02-05T13:31:28Z","published":"2024-02-05T13:31:28Z","title":"Decoding-time Realignment of Language Models","summary":" Aligning language models with human preferences is crucial for reducing\nerrors and biases in these models. Alignment techniques, such as reinforcement\nlearning from human feedback (RLHF), are typically cast as optimizing a\ntradeoff between human preference rewards and a proximity regularization term\nthat encourages staying close to the unaligned model. Selecting an appropriate\nlevel of regularization is critical: insufficient regularization can lead to\nreduced model capabilities due to reward hacking, whereas excessive\nregularization hinders alignment. Traditional methods for finding the optimal\nregularization level require retraining multiple models with varying\nregularization strengths. This process, however, is resource-intensive,\nespecially for large models. To address this challenge, we propose\ndecoding-time realignment (DeRa), a simple method to explore and evaluate\ndifferent regularization strengths in aligned models without retraining. DeRa\nenables control over the degree of alignment, allowing users to smoothly\ntransition between unaligned and aligned models. It also enhances the\nefficiency of hyperparameter tuning by enabling the identification of effective\nregularization strengths using a validation dataset.\n","authors":["Tianlin Liu","Shangmin Guo","Leonardo Bianco","Daniele Calandriello","Quentin Berthet","Felipe Llinares","Jessica Hoffmann","Lucas Dixon","Michal Valko","Mathieu Blondel"],"pdf_url":"https://arxiv.org/pdf/2402.02992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02989v1","updated":"2024-02-05T13:27:41Z","published":"2024-02-05T13:27:41Z","title":"DexDiffuser: Generating Dexterous Grasps with Diffusion Models","summary":" We introduce DexDiffuser, a novel dexterous grasping method that generates,\nevaluates, and refines grasps on partial object point clouds. DexDiffuser\nincludes the conditional diffusion-based grasp sampler DexSampler and the\ndexterous grasp evaluator DexEvaluator. DexSampler generates high-quality\ngrasps conditioned on object point clouds by iterative denoising of randomly\nsampled grasps. We also introduce two grasp refinement strategies:\nEvaluator-Guided Diffusion (EGD) and Evaluator-based Sampling Refinement (ESR).\nOur simulation and real-world experiments on the Allegro Hand consistently\ndemonstrate that DexDiffuser outperforms the state-of-the-art multi-finger\ngrasp generation method FFHNet with an, on average, 21.71--22.20\\% higher grasp\nsuccess rate.\n","authors":["Zehang Weng","Haofei Lu","Danica Kragic","Jens Lundell"],"pdf_url":"https://arxiv.org/pdf/2402.02989v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2305.14383v2","updated":"2024-02-05T13:23:17Z","published":"2023-05-22T11:49:21Z","title":"A Rational Model of Dimension-reduced Human Categorization","summary":" Humans tend to categorize objects based on a few key features. We propose a\nrational model of categorization that utilizes a mixture of probabilistic\nprincipal component analyzers (mPPCA). 
This model represents each category with\nreduced feature dimensions and allows local features to be shared across\ncategories to facilitate few-shot learning. Theoretically, we identify the\nnecessary and sufficient condition for dimension-reduced representation to\noutperform full-dimension representation. We then show the superior performance\nof mPPCA in predicting human categorization over exemplar and prototype models\nin a behavioral experiment. When combined with a convolutional neural\nnetwork, the mPPCA classifier with a single principal component dimension for\neach category achieves comparable performance to ResNet with a linear\nclassifier on the ${\tt CIFAR-10H}$ human categorization dataset.\n","authors":["Yifan Hong","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2305.14383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02986v1","updated":"2024-02-05T13:16:38Z","published":"2024-02-05T13:16:38Z","title":"A Safety-Adapted Loss for Pedestrian Detection in Automated Driving","summary":" In safety-critical domains like automated driving (AD), errors by the object\ndetector may endanger pedestrians and other vulnerable road users (VRU). As\ncommon evaluation metrics are not an adequate safety indicator, recent works\nemploy approaches to identify safety-critical VRU and back-annotate the risk to\nthe object detector. However, those approaches do not consider the safety\nfactor in the deep neural network (DNN) training process. Thus,\nstate-of-the-art DNNs penalize all misdetections equally irrespective of their\ncriticality. Subsequently, to mitigate the occurrence of critical failure\ncases, i.e., false negatives, a safety-aware training strategy might be\nrequired to enhance the detection performance for critical pedestrians. In this\npaper, we propose a novel safety-aware loss variation that leverages the\nestimated per-pedestrian criticality scores during training. We exploit the\nreachability set-based time-to-collision (TTC-RSB) metric from the motion\ndomain along with distance information to account for the worst-case threat\nquantifying the criticality. Our evaluation results using RetinaNet and FCOS on\nthe nuScenes dataset demonstrate that training the models with our safety-aware\nloss function mitigates the misdetection of critical pedestrians without\nsacrificing performance for the general case, i.e., pedestrians outside the\nsafety-critical zone.\n","authors":["Maria Lyssenko","Piyush Pimplikar","Maarten Bieshaar","Farzad Nozarian","Rudolph Triebel"],"pdf_url":"https://arxiv.org/pdf/2402.02986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02985v1","updated":"2024-02-05T13:16:12Z","published":"2024-02-05T13:16:12Z","title":"Unsupervised semantic segmentation of high-resolution UAV imagery for\n road scene parsing","summary":" Two challenges are presented when parsing road scenes in UAV images. First,\nthe high resolution of UAV images makes processing difficult. Second,\nsupervised deep learning methods require a large amount of manual annotations\nto train robust and accurate models. In this paper, an unsupervised road\nparsing framework that leverages recent advances in vision language models and\nfoundation computer vision models is introduced. Initially, a vision language\nmodel is employed to efficiently process ultra-large resolution UAV images to\nquickly detect road regions of interest in the images. Subsequently, the vision\nfoundation model SAM is utilized to generate masks for the road regions without\ncategory information. 
Following that, a self-supervised representation learning\nnetwork extracts feature representations from all masked regions. Finally, an\nunsupervised clustering algorithm is applied to cluster these feature\nrepresentations and assign IDs to each cluster. The masked regions are combined\nwith the corresponding IDs to generate initial pseudo-labels, which initiate an\niterative self-training process for regular semantic segmentation. The proposed\nmethod achieves an impressive 89.96% mIoU on the development dataset without\nrelying on any manual annotation. Particularly noteworthy is the extraordinary\nflexibility of the proposed method, which even goes beyond the limitations of\nhuman-defined categories and is able to acquire knowledge of new categories\nfrom the dataset itself.\n","authors":["Zihan Ma","Yongshang Li","Ronggui Ma","Chen Liang"],"pdf_url":"https://arxiv.org/pdf/2402.02985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01171v2","updated":"2024-02-05T13:15:39Z","published":"2023-07-03T17:30:09Z","title":"Quantum Neural Estimation of Entropies","summary":" Entropy measures quantify the amount of information and correlation present\nin a quantum system. In practice, when the quantum state is unknown and only\ncopies thereof are available, one must resort to the estimation of such entropy\nmeasures. Here we propose a variational quantum algorithm for estimating the\nvon Neumann and R\\'enyi entropies, as well as the measured relative entropy and\nmeasured R\\'enyi relative entropy. Our approach first parameterizes a\nvariational formula for the measure of interest by a quantum circuit and a\nclassical neural network, and then optimizes the resulting objective over\nparameter space. Numerical simulations of our quantum algorithm are provided,\nusing a noiseless quantum simulator. The algorithm provides accurate estimates\nof the various entropy measures for the examples tested, which renders it as a\npromising approach for usage in downstream tasks.\n","authors":["Ziv Goldfeld","Dhrumil Patel","Sreejith Sreekumar","Mark M. Wilde"],"pdf_url":"https://arxiv.org/pdf/2307.01171v2.pdf","comment":"14 pages, 2 figures; see also independent works of Shin, Lee, and\n Jeong at arXiv:2306.14566v1 and Lee, Kwon, and Lee at arXiv:2307.13511v2"},{"id":"http://arxiv.org/abs/2402.02980v1","updated":"2024-02-05T13:12:33Z","published":"2024-02-05T13:12:33Z","title":"Review on Fault Diagnosis and Fault-Tolerant Control Scheme for Robotic\n Manipulators: Recent Advances in AI, Machine Learning, and Digital Twin","summary":" This comprehensive review article delves into the intricate realm of\nfault-tolerant control (FTC) schemes tailored for robotic manipulators. Our\nexploration spans the historical evolution of FTC, tracing its development over\ntime, and meticulously examines the recent breakthroughs fueled by the\nsynergistic integration of cutting-edge technologies such as artificial\nintelligence (AI), machine learning (ML), and digital twin technologies (DTT).\nThe article places a particular emphasis on the transformative influence these\ncontemporary trends exert on the landscape of robotic manipulator control and\nfault tolerance.\n By delving into the historical context, our aim is to provide a comprehensive\nunderstanding of the evolution of FTC schemes. This journey encompasses the\ntransition from model-based and signal-based schemes to the role of sensors,\nsetting the stage for an exploration of the present-day paradigm shift enabled\nby AI, ML, and DTT. 
The narrative unfolds as we dissect the intricate interplay\nbetween these advanced technologies and their applications in enhancing fault\ntolerance within the domain of robotic manipulators. Our review critically\nevaluates the impact of these advancements, shedding light on the novel\nmethodologies, techniques, and applications that have emerged in recent times.\n The overarching goal of this article is to present a comprehensive\nperspective on the current state of fault diagnosis and fault-tolerant control\nwithin the context of robotic manipulators, positioning our exploration within\nthe broader framework of AI, ML, and DTT advancements. Through a meticulous\nexamination of both historical foundations and contemporary innovations, this\nreview significantly contributes to the existing body of knowledge, offering\nvaluable insights for researchers, practitioners, and enthusiasts navigating\nthe dynamic landscape of robotic manipulator control.\n","authors":["Md Muzakkir Quamar","Ali Nasir"],"pdf_url":"https://arxiv.org/pdf/2402.02980v1.pdf","comment":"24 pages, 6 figures"},{"id":"http://arxiv.org/abs/2211.05006v2","updated":"2024-02-05T13:00:51Z","published":"2022-11-09T16:35:42Z","title":"Almost Tight Error Bounds on Differentially Private Continual Counting","summary":" The first large-scale deployment of private federated learning uses\ndifferentially private counting in the continual release model as a subroutine\n(Google AI blog titled \"Federated Learning with Formal Differential Privacy\nGuarantees\"). In this case, a concrete bound on the error is very relevant to\nreduce the privacy parameter. The standard mechanism for continual counting is\nthe binary mechanism. We present a novel mechanism and show that its mean\nsquared error is both asymptotically optimal and a factor 10 smaller than the\nerror of the binary mechanism. We also show that the constants in our analysis\nare almost tight by giving non-asymptotic lower and upper bounds that differ\nonly in the constants of lower-order terms. Our algorithm is a matrix mechanism\nfor the counting matrix and takes constant time per release. We also use our\nexplicit factorization of the counting matrix to give an upper bound on the\nexcess risk of the private learning algorithm of Denisov et al. (NeurIPS 2022).\nOur lower bound for any continual counting mechanism is the first tight lower\nbound on continual counting under approximate differential privacy. It is\nachieved using a new lower bound on a certain factorization norm, denoted by\n$\gamma_F(\cdot)$, in terms of the singular values of the matrix. In\nparticular, we show that for any complex matrix, $A \in \mathbb{C}^{m \times\nn}$, \[ \gamma_F(A) \geq \frac{1}{\sqrt{m}}\|A\|_1, \] where $\|\cdot\|_1$\ndenotes the Schatten-1 norm.\n We believe this technique will be useful in proving lower bounds for a larger\nclass of linear queries. 
To illustrate the power of this technique, we show the\nfirst lower bound on the mean squared error for answering parity queries.\n","authors":["Monika Henzinger","Jalaj Upadhyay","Sarvagya Upadhyay"],"pdf_url":"https://arxiv.org/pdf/2211.05006v2.pdf","comment":"Updated the citations to include two papers we learned about since\n version 01"},{"id":"http://arxiv.org/abs/2402.02977v1","updated":"2024-02-05T12:58:29Z","published":"2024-02-05T12:58:29Z","title":"Variational Flow Models: Flowing in Your Style","summary":" We introduce a variational inference interpretation for models of \"posterior\nflows\" - generalizations of \"probability flows\" to a broader class of\nstochastic processes not necessarily diffusion processes. We coin the resulting\nmodels as \"Variational Flow Models\". Additionally, we propose a systematic\ntraining-free method to transform the posterior flow of a \"linear\" stochastic\nprocess characterized by the equation Xt = at * X0 + st * X1 into a straight\nconstant-speed (SC) flow, reminiscent of Rectified Flow. This transformation\nfacilitates fast sampling along the original posterior flow without training a\nnew model of the SC flow. The flexibility of our approach allows us to extend\nour transformation to inter-convert two posterior flows from distinct \"linear\"\nstochastic processes. Moreover, we can easily integrate high-order numerical\nsolvers into the transformed SC flow, further enhancing sampling accuracy and\nefficiency. Rigorous theoretical analysis and extensive experimental results\nsubstantiate the advantages of our framework.\n","authors":["Kien Do","Duc Kieu","Toan Nguyen","Dang Nguyen","Hung Le","Dung Nguyen","Thin Nguyen"],"pdf_url":"https://arxiv.org/pdf/2402.02977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02976v1","updated":"2024-02-05T12:58:03Z","published":"2024-02-05T12:58:03Z","title":"Boosting, Voting Classifiers and Randomized Sample Compression Schemes","summary":" In boosting, we aim to leverage multiple weak learners to produce a strong\nlearner. At the center of this paradigm lies the concept of building the strong\nlearner as a voting classifier, which outputs a weighted majority vote of the\nweak learners. While many successful boosting algorithms, such as the iconic\nAdaBoost, produce voting classifiers, their theoretical performance has long\nremained sub-optimal: the best known bounds on the number of training examples\nnecessary for a voting classifier to obtain a given accuracy has so far always\ncontained at least two logarithmic factors above what is known to be achievable\nby general weak-to-strong learners. In this work, we break this barrier by\nproposing a randomized boosting algorithm that outputs voting classifiers whose\ngeneralization error contains a single logarithmic dependency on the sample\nsize. 
We obtain this result by building a general framework that extends sample\ncompression methods to support randomized learning algorithms based on\nsub-sampling.\n","authors":["Arthur da Cunha","Kasper Green Larsen","Martin Ritzert"],"pdf_url":"https://arxiv.org/pdf/2402.02976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.03482v2","updated":"2024-02-05T12:56:43Z","published":"2022-02-07T19:40:20Z","title":"Navigating Neural Space: Revisiting Concept Activation Vectors to\n Overcome Directional Divergence","summary":" With a growing interest in understanding neural network prediction\nstrategies, Concept Activation Vectors (CAVs) have emerged as a popular tool\nfor modeling human-understandable concepts in the latent space. Commonly, CAVs\nare computed by leveraging linear classifiers optimizing the separability of\nlatent representations of samples with and without a given concept. However, in\nthis paper we show that such a separability-oriented computation leads to\nsolutions, which may diverge from the actual goal of precisely modeling the\nconcept direction. This discrepancy can be attributed to the significant\ninfluence of distractor directions, i.e., signals unrelated to the concept,\nwhich are picked up by filters (i.e., weights) of linear models to optimize\nclass-separability. To address this, we introduce pattern-based CAVs, solely\nfocussing on concept signals, thereby providing more accurate concept\ndirections. We evaluate various CAV methods in terms of their alignment with\nthe true concept direction and their impact on CAV applications, including\nconcept sensitivity testing and model correction for shortcut behavior caused\nby data artifacts. We demonstrate the benefits of pattern-based CAVs using the\nPediatric Bone Age, ISIC2019, and FunnyBirds datasets with VGG, ResNet, and\nEfficientNet model architectures.\n","authors":["Frederik Pahde","Maximilian Dreyer","Leander Weber","Moritz Weckbecker","Christopher J. Anders","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2202.03482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02972v1","updated":"2024-02-05T12:50:30Z","published":"2024-02-05T12:50:30Z","title":"Retrieval-Augmented Score Distillation for Text-to-3D Generation","summary":" Text-to-3D generation has achieved significant success by incorporating\npowerful 2D diffusion models, but insufficient 3D prior knowledge also leads to\nthe inconsistency of 3D geometry. Recently, since large-scale multi-view\ndatasets have been released, fine-tuning the diffusion model on the multi-view\ndatasets becomes a mainstream to solve the 3D inconsistency problem. However,\nit has confronted with fundamental difficulties regarding the limited quality\nand diversity of 3D data, compared with 2D data. To sidestep these trade-offs,\nwe explore a retrieval-augmented approach tailored for score distillation,\ndubbed RetDream. We postulate that both expressiveness of 2D diffusion models\nand geometric consistency of 3D assets can be fully leveraged by employing the\nsemantically relevant assets directly within the optimization process. To this\nend, we introduce novel framework for retrieval-based quality enhancement in\ntext-to-3D generation. We leverage the retrieved asset to incorporate its\ngeometric prior in the variational objective and adapt the diffusion model's 2D\nprior toward view consistency, achieving drastic improvements in both geometry\nand fidelity of generated scenes. 
We conduct extensive experiments to\ndemonstrate that RetDream exhibits superior quality with increased geometric\nconsistency. Project page is available at https://ku-cvlab.github.io/RetDream/.\n","authors":["Junyoung Seo","Susung Hong","Wooseok Jang","Inès Hyeonsu Kim","Minseop Kwak","Doyup Lee","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2402.02972v1.pdf","comment":"Project Page: https://ku-cvlab.github.io/RetDream/"},{"id":"http://arxiv.org/abs/2402.02969v1","updated":"2024-02-05T12:47:19Z","published":"2024-02-05T12:47:19Z","title":"Towards Understanding the Word Sensitivity of Attention Layers: A Study\n via Random Features","summary":" Unveiling the reasons behind the exceptional success of transformers requires\na better understanding of why attention layers are suitable for NLP tasks. In\nparticular, such tasks require predictive models to capture contextual meaning\nwhich often depends on one or few words, even if the sentence is long. Our work\nstudies this key property, dubbed word sensitivity (WS), in the prototypical\nsetting of random features. We show that attention layers enjoy high WS,\nnamely, there exists a vector in the space of embeddings that largely perturbs\nthe random attention features map. The argument critically exploits the role of\nthe softmax in the attention layer, highlighting its benefit compared to other\nactivations (e.g., ReLU). In contrast, the WS of standard random features is of\norder $1/\\sqrt{n}$, $n$ being the number of words in the textual sample, and\nthus it decays with the length of the context. We then translate these results\non the word sensitivity into generalization bounds: due to their low WS, random\nfeatures provably cannot learn to distinguish between two sentences that differ\nonly in a single word; in contrast, due to their high WS, random attention\nfeatures have higher generalization capabilities. We validate our theoretical\nresults with experimental evidence over the BERT-Base word embeddings of the\nimdb review dataset.\n","authors":["Simone Bombari","Marco Mondelli"],"pdf_url":"https://arxiv.org/pdf/2402.02969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02968v1","updated":"2024-02-05T12:47:09Z","published":"2024-02-05T12:47:09Z","title":"Delving into Multi-modal Multi-task Foundation Models for Road Scene\n Understanding: From Learning Paradigm Perspectives","summary":" Foundation models have indeed made a profound impact on various fields,\nemerging as pivotal components that significantly shape the capabilities of\nintelligent systems. In the context of intelligent vehicles, leveraging the\npower of foundation models has proven to be transformative, offering notable\nadvancements in visual understanding. Equipped with multi-modal and multi-task\nlearning capabilities, multi-modal multi-task visual understanding foundation\nmodels (MM-VUFMs) effectively process and fuse data from diverse modalities and\nsimultaneously handle various driving-related tasks with powerful adaptability,\ncontributing to a more holistic understanding of the surrounding scene. In this\nsurvey, we present a systematic analysis of MM-VUFMs specifically designed for\nroad scenes. Our objective is not only to provide a comprehensive overview of\ncommon practices, referring to task-specific models, unified multi-modal\nmodels, unified multi-task models, and foundation model prompting techniques,\nbut also to highlight their advanced capabilities in diverse learning\nparadigms. 
These paradigms include open-world understanding, efficient transfer\nfor road scenes, continual learning, interactive and generative capability.\nMoreover, we provide insights into key challenges and future trends, such as\nclosed-loop driving systems, interpretability, embodied driving agents, and\nworld models. To facilitate researchers in staying abreast of the latest\ndevelopments in MM-VUFMs for road scenes, we have established a continuously\nupdated repository at https://github.com/rolsheng/MM-VUFM4DS\n","authors":["Sheng Luo","Wei Chen","Wanxin Tian","Rui Liu","Luanxuan Hou","Xiubao Zhang","Haifeng Shen","Ruiqi Wu","Shuyi Geng","Yi Zhou","Ling Shao","Yi Yang","Bojun Gao","Qun Li","Guobin Wu"],"pdf_url":"https://arxiv.org/pdf/2402.02968v1.pdf","comment":"24 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.17870v2","updated":"2024-02-05T12:43:24Z","published":"2024-01-31T14:27:35Z","title":"Efficient Subseasonal Weather Forecast using Teleconnection-informed\n Transformers","summary":" Subseasonal forecasting, which is pivotal for agriculture, water resource\nmanagement, and early warning of disasters, faces challenges due to the chaotic\nnature of the atmosphere. Recent advances in machine learning (ML) have\nrevolutionized weather forecasting by achieving competitive predictive skills\nto numerical models. However, training such foundation models requires\nthousands of GPU days, which causes substantial carbon emissions and limits\ntheir broader applicability. Moreover, ML models tend to fool the pixel-wise\nerror scores by producing smoothed results which lack physical consistency and\nmeteorological meaning. To deal with the aforementioned problems, we propose a\nteleconnection-informed transformer. Our architecture leverages the pretrained\nPangu model to achieve good initial weights and integrates a\nteleconnection-informed temporal module to improve predictability in an\nextended temporal range. Remarkably, by adjusting 1.1% of the Pangu model's\nparameters, our method enhances predictability on four surface and five\nupper-level atmospheric variables at a two-week lead time. Furthermore, the\nteleconnection-filtered features improve the spatial granularity of outputs\nsignificantly, indicating their potential physical consistency. Our research\nunderscores the importance of atmospheric and oceanic teleconnections in\ndriving future weather conditions. Besides, it presents a resource-efficient\npathway for researchers to leverage existing foundation models on versatile\ndownstream tasks.\n","authors":["Shan Zhao","Zhitong Xiong","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.17870v2.pdf","comment":"Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2402.01376v2","updated":"2024-02-05T12:42:52Z","published":"2024-02-02T13:00:38Z","title":"LoTR: Low Tensor Rank Weight Adaptation","summary":" In this paper we generalize and extend an idea of low-rank adaptation (LoRA)\nof large language models (LLMs) based on Transformer architecture. Widely used\nLoRA-like methods of fine-tuning LLMs are based on matrix factorization of\ngradient update. We introduce LoTR, a novel approach for parameter-efficient\nfine-tuning of LLMs which represents a gradient update to parameters in a form\nof tensor decomposition. Low-rank adapter for each layer is constructed as a\nproduct of three matrices, and tensor structure arises from sharing left and\nright multipliers of this product among layers. 
Simultaneous compression of a\nsequence of layers with low-rank tensor representation allows LoTR to achieve\neven better parameter efficiency than LoRA, especially for deep models.\nMoreover, the core tensor does not depend on original weight dimension and can\nbe made arbitrarily small, which allows for extremely cheap and fast downstream\nfine-tuning.\n","authors":["Daniel Bershatsky","Daria Cherniuk","Talgat Daulbaev","Aleksandr Mikhalev","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2402.01376v2.pdf","comment":"Submitted; missing author and sections were added;"},{"id":"http://arxiv.org/abs/2402.02964v1","updated":"2024-02-05T12:42:21Z","published":"2024-02-05T12:42:21Z","title":"Mixed Noise and Posterior Estimation with Conditional DeepGEM","summary":" Motivated by indirect measurements and applications from nanometrology with a\nmixed noise model, we develop a novel algorithm for jointly estimating the\nposterior and the noise parameters in Bayesian inverse problems. We propose to\nsolve the problem by an expectation maximization (EM) algorithm. Based on the\ncurrent noise parameters, we learn in the E-step a conditional normalizing flow\nthat approximates the posterior. In the M-step, we propose to find the noise\nparameter updates again by an EM algorithm, which has analytical formulas. We\ncompare the training of the conditional normalizing flow with the forward and\nreverse KL, and show that our model is able to incorporate information from\nmany measurements, unlike previous approaches.\n","authors":["Paul Hagemann","Johannes Hertrich","Maren Casfor","Sebastian Heidenreich","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2402.02964v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2209.03275v2","updated":"2024-02-05T17:54:04Z","published":"2022-09-07T16:27:34Z","title":"Multimodal Speech Enhancement Using Burst Propagation","summary":" This paper proposes MBURST, a novel multimodal solution for audio-visual\nspeech enhancement that considers the most recent neurological discoveries\nregarding pyramidal cells of the prefrontal cortex and other brain regions. The\nso-called burst propagation implements several criteria to address the credit\nassignment problem in a more biologically plausible manner: steering the sign\nand magnitude of plasticity through feedback, multiplexing the feedback and\nfeedforward information across layers through different weight connections,\napproximating feedback and feedforward connections, and linearizing the\nfeedback signals. MBURST benefits from such capabilities to learn correlations\nbetween the noisy signal and the visual stimuli, thus attributing meaning to\nthe speech by amplifying relevant information and suppressing noise.\nExperiments conducted over a Grid Corpus and CHiME3-based dataset show that\nMBURST can reproduce similar mask reconstructions to the multimodal\nbackpropagation-based baseline while demonstrating outstanding energy\nefficiency management, reducing the neuron firing rates to values up to\n\textbf{$70\%$} lower. Such a feature implies more sustainable implementations,\nsuitable and desirable for hearing aids or any other similar embedded systems.\n","authors":["Mohsin Raza","Leandro A. 
Passos","Ahmed Khubaib","Ahsan Adeel"],"pdf_url":"https://arxiv.org/pdf/2209.03275v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03190v1","updated":"2024-02-05T16:56:11Z","published":"2024-02-05T16:56:11Z","title":"Unified Hallucination Detection for Multimodal Large Language Models","summary":" Despite significant strides in multimodal tasks, Multimodal Large Language\nModels (MLLMs) are plagued by the critical issue of hallucination. The reliable\ndetection of such hallucinations in MLLMs has, therefore, become a vital aspect\nof model evaluation and the safeguarding of practical application deployment.\nPrior research in this domain has been constrained by a narrow focus on\nsingular tasks, an inadequate range of hallucination categories addressed, and\na lack of detailed granularity. In response to these challenges, our work\nexpands the investigative horizons of hallucination detection. We present a\nnovel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate\nthe evaluation of advancements in hallucination detection methods.\nAdditionally, we unveil a novel unified multimodal hallucination detection\nframework, UNIHD, which leverages a suite of auxiliary tools to validate the\noccurrence of hallucinations robustly. We demonstrate the effectiveness of\nUNIHD through meticulous evaluation and comprehensive analysis. We also provide\nstrategic insights on the application of specific tools for addressing various\ncategories of hallucinations.\n","authors":["Xiang Chen","Chenxi Wang","Yida Xue","Ningyu Zhang","Xiaoyan Yang","Qiang Li","Yue Shen","Jinjie Gu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03190v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2402.03040v1","updated":"2024-02-05T14:24:46Z","published":"2024-02-05T14:24:46Z","title":"InteractiveVideo: User-Centric Controllable Video Generation with\n Synergistic Multimodal Instructions","summary":" We introduce $\\textit{InteractiveVideo}$, a user-centric framework for video\ngeneration. Different from traditional generative approaches that operate based\non user-provided images or text, our framework is designed for dynamic\ninteraction, allowing users to instruct the generative model through various\nintuitive mechanisms during the whole generation process, e.g. text and image\nprompts, painting, drag-and-drop, etc. We propose a Synergistic Multimodal\nInstruction mechanism, designed to seamlessly integrate users' multimodal\ninstructions into generative models, thus facilitating a cooperative and\nresponsive interaction between user inputs and the generative process. This\napproach enables iterative and fine-grained refinement of the generation result\nthrough precise and effective user instructions. With\n$\\textit{InteractiveVideo}$, users are given the flexibility to meticulously\ntailor key aspects of a video. 
They can paint the reference image, edit\nsemantics, and adjust video motions until their requirements are fully met.\nCode, models, and demo are available at\nhttps://github.com/invictus717/InteractiveVideo\n","authors":["Yiyuan Zhang","Yuhao Kang","Zhixin Zhang","Xiaohan Ding","Sanyuan Zhao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2402.03040v1.pdf","comment":"Code, models, and demo are available at\n https://github.com/invictus717/InteractiveVideo"},{"id":"http://arxiv.org/abs/2402.02936v1","updated":"2024-02-05T11:58:08Z","published":"2024-02-05T11:58:08Z","title":"Panoramic Image Inpainting With Gated Convolution And Contextual\n Reconstruction Loss","summary":" Deep learning-based methods have demonstrated encouraging results in tackling\nthe task of panoramic image inpainting. However, it is challenging for existing\nmethods to distinguish valid pixels from invalid pixels and find suitable\nreferences for corrupted areas, thus leading to artifacts in the inpainted\nresults. In response to these challenges, we propose a panoramic image\ninpainting framework that consists of a Face Generator, a Cube Generator, a\nside branch, and two discriminators. We use the Cubemap Projection (CMP) format\nas network input. The generator employs gated convolutions to distinguish valid\npixels from invalid ones, while a side branch is designed utilizing contextual\nreconstruction (CR) loss to guide the generators to find the most suitable\nreference patch for inpainting the missing region. The proposed method is\ncompared with state-of-the-art (SOTA) methods on SUN360 Street View dataset in\nterms of PSNR and SSIM. Experimental results and ablation study demonstrate\nthat the proposed method outperforms SOTA both quantitatively and\nqualitatively.\n","authors":["Li Yu","Yanjun Gao","Farhad Pakdaman","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2402.02936v1.pdf","comment":"Copyright 2024 IEEE - to appear in IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.02836v1","updated":"2024-02-05T09:45:38Z","published":"2024-02-05T09:45:38Z","title":"Perceptual Learned Image Compression via End-to-End JND-Based\n Optimization","summary":" Emerging Learned image Compression (LC) achieves significant improvements in\ncoding efficiency by end-to-end training of neural networks for compression. An\nimportant benefit of this approach over traditional codecs is that any\noptimization criteria can be directly applied to the encoder-decoder networks\nduring training. Perceptual optimization of LC to comply with the Human Visual\nSystem (HVS) is among such criteria, which has not been fully explored yet.\nThis paper addresses this gap by proposing a novel framework to integrate Just\nNoticeable Distortion (JND) principles into LC. Leveraging existing JND\ndatasets, three perceptual optimization methods are proposed to integrate JND\ninto the LC training process: (1) Pixel-Wise JND Loss (PWL) prioritizes\npixel-by-pixel fidelity in reproducing JND characteristics, (2) Image-Wise JND\nLoss (IWL) emphasizes on overall imperceptible degradation levels, and (3)\nFeature-Wise JND Loss (FWL) aligns the reconstructed image features with\nperceptually significant features. Experimental evaluations demonstrate the\neffectiveness of JND integration, highlighting improvements in rate-distortion\nperformance and visual quality, compared to baseline methods. 
The proposed\nmethods add no extra complexity after training.\n","authors":["Farhad Pakdaman","Sanaz Nami","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2402.02836v1.pdf","comment":"Copyright 2024 IEEE - Submitted to IEEE ICIP 2024"},{"id":"http://arxiv.org/abs/2402.02733v1","updated":"2024-02-05T05:25:33Z","published":"2024-02-05T05:25:33Z","title":"ToonAging: Face Re-Aging upon Artistic Portrait Style Transfer","summary":" Face re-aging is a prominent field in computer vision and graphics, with\nsignificant applications in photorealistic domains such as movies, advertising,\nand live streaming. Recently, the need to apply face re-aging to\nnon-photorealistic images, like comics, illustrations, and animations, has\nemerged as an extension in various entertainment sectors. However, the absence\nof a network capable of seamlessly editing the apparent age on NPR images means\nthat these tasks have been confined to a naive approach, applying each task\nsequentially. This often results in unpleasant artifacts and a loss of facial\nattributes due to domain discrepancies. In this paper, we introduce a novel\none-stage method for face re-aging combined with portrait style transfer,\nexecuted in a single generative step. We leverage existing face re-aging and\nstyle transfer networks, both trained within the same PR domain. Our method\nuniquely fuses distinct latent vectors, each responsible for managing\naging-related attributes and NPR appearance. Adopting an exemplar-based\napproach, our method offers greater flexibility than domain-level fine-tuning\napproaches, which typically require separate training or fine-tuning for each\ndomain. This effectively addresses the limitation of requiring paired datasets\nfor re-aging and domain-level, data-driven approaches for stylization. Our\nexperiments show that our model can effortlessly generate re-aged images while\nsimultaneously transferring the style of examples, maintaining both natural\nappearance and controllability.\n","authors":["Bumsoo Kim","Abdul Muqeet","Kyuchul Lee","Sanghyun Seo"],"pdf_url":"https://arxiv.org/pdf/2402.02733v1.pdf","comment":"8 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.04369v2","updated":"2024-02-05T03:08:52Z","published":"2023-08-08T16:15:35Z","title":"SSTFormer: Bridging Spiking Neural Network and Memory Support\n Transformer for Frame-Event based Recognition","summary":" Event camera-based pattern recognition is a newly arising research topic in\nrecent years. Current researchers usually transform the event streams into\nimages, graphs, or voxels, and adopt deep neural networks for event-based\nclassification. Although good performance can be achieved on simple event\nrecognition datasets, their results may still be limited due to the\nfollowing two issues. Firstly, they adopt spatially sparse event streams for\nrecognition only, which may fail to capture the color and detailed texture\ninformation well. Secondly, they adopt either Spiking Neural Networks (SNN) for\nenergy-efficient recognition with suboptimal results, or Artificial Neural\nNetworks (ANN) for energy-intensive, high-performance recognition. However,\nfew of them consider achieving a balance between these two aspects. In this\npaper, we formally propose to recognize patterns by fusing RGB frames and event\nstreams simultaneously and propose a new RGB frame-event recognition framework\nto address the aforementioned issues. 
The proposed method contains four main\nmodules, i.e., memory support Transformer network for RGB frame encoding,\nspiking neural network for raw event stream encoding, multi-modal bottleneck\nfusion module for RGB-Event feature aggregation, and prediction head. Due to\nthe scarcity of RGB-Event based classification datasets, we also propose a\nlarge-scale PokerEvent dataset which contains 114 classes and 27102\nframe-event pairs recorded using a DVS346 event camera. Extensive experiments\non two RGB-Event based classification datasets fully validated the\neffectiveness of our proposed framework. We hope this work will boost the\ndevelopment of pattern recognition by fusing RGB frames and event streams. Both\nour dataset and source code of this work will be released at\nhttps://github.com/Event-AHU/SSTFormer.\n","authors":["Xiao Wang","Zongzhen Wu","Yao Rong","Lin Zhu","Bo Jiang","Jin Tang","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.04369v2.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2309.01516v3","updated":"2024-02-05T22:43:45Z","published":"2023-09-04T10:48:29Z","title":"MultiWay-Adapter: Adapting large-scale multi-modal models for scalable\n image-text retrieval","summary":" As Multimodal Large Language Models (MLLMs) grow in size, adapting them to\nspecialized tasks becomes increasingly challenging due to high computational\nand memory demands. Indeed, traditional fine-tuning methods are costly, due to\nthe need for extensive, task-specific training. While efficient adaptation\nmethods exist that aim to reduce these costs, in practice they suffer from\nshallow inter-modal alignment, which severely hurts model effectiveness. To\ntackle these computational challenges and improve inter-modal alignment, we\nintroduce the MultiWay-Adapter (MWA), a novel framework featuring an 'Alignment\nEnhancer'. This enhancer deepens inter-modal alignment, enabling high\ntransferability with minimal tuning effort. Our experiments show that unlike\nprior efficient tuning approaches, MWA maintains model effectiveness, while\nreducing training time by up to 57%. MWA is also lightweight, increasing model\nsize by only 2-3% (in terms of parameters) for state-of-the-art foundation\nmodels like BEiT-3 Large. These results demonstrate that MWA provides an\nefficient and effective adaptation method for MLLMs, significantly broadening\ntheir applicability.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa"],"pdf_url":"https://arxiv.org/pdf/2309.01516v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03513v1","updated":"2024-02-05T21:01:01Z","published":"2024-02-05T21:01:01Z","title":"Video Super-Resolution for Optimized Bitrate and Green Online Streaming","summary":" Conventional per-title encoding schemes strive to optimize encoding\nresolutions to deliver the utmost perceptual quality for each bitrate ladder\nrepresentation. Nevertheless, maintaining encoding time within an acceptable\nthreshold is equally imperative in online streaming applications. Furthermore,\nmodern client devices are equipped with the capability for fast\ndeep-learning-based video super-resolution (VSR) techniques, enhancing the\nperceptual quality of the decoded bitstream. This suggests that opting for\nlower resolutions in representations during the encoding process can curtail\nthe overall energy consumption without substantially compromising perceptual\nquality. 
In this context, this paper introduces a video super-resolution-based\nlatency-aware optimized bitrate encoding scheme (ViSOR) designed for online\nadaptive streaming applications. ViSOR determines the encoding resolution for\neach target bitrate, ensuring the highest achievable perceptual quality after\nVSR within the bound of a maximum acceptable latency. Random forest-based\nprediction models are trained to predict the perceptual quality after VSR and\nthe encoding time for each resolution using the spatiotemporal features\nextracted for each video segment. Experimental results show that ViSOR\ntargeting fast super-resolution convolutional neural network (FSRCNN) achieves\nan overall average bitrate reduction of 24.65 % and 32.70 % to maintain the\nsame PSNR and VMAF, compared to the HTTP Live Streaming (HLS) bitrate ladder\nencoding of 4 s segments using the x265 encoder, when the maximum acceptable\nlatency for each representation is set as two seconds. Considering a just\nnoticeable difference (JND) of six VMAF points, the average cumulative storage\nconsumption and encoding energy for each segment is reduced by 79.32 % and\n68.21 %, respectively, contributing towards greener streaming.\n","authors":["Vignesh V Menon","Prajit T Rajendran","Amritha Premkumar","Benjamin Bross","Detlev Marpe"],"pdf_url":"https://arxiv.org/pdf/2402.03513v1.pdf","comment":"2024 Picture Coding Symposium (PCS)"},{"id":"http://arxiv.org/abs/2402.03413v1","updated":"2024-02-05T16:13:52Z","published":"2024-02-05T16:13:52Z","title":"Perceptual Video Quality Assessment: A Survey","summary":" Perceptual video quality assessment plays a vital role in the field of video\nprocessing due to the existence of quality degradations introduced in various\nstages of video signal acquisition, compression, transmission and display. With\nthe advancement of internet communication and cloud service technology, video\ncontent and traffic are growing exponentially, which further emphasizes the\nrequirement for accurate and rapid assessment of video quality. Therefore,\nnumerous subjective and objective video quality assessment studies have been\nconducted over the past two decades for both generic videos and specific videos\nsuch as streaming, user-generated content (UGC), 3D, virtual and augmented\nreality (VR and AR), high frame rate (HFR), audio-visual, etc. This survey\nprovides an up-to-date and comprehensive review of these video quality\nassessment studies. Specifically, we first review the subjective video quality\nassessment methodologies and databases, which are necessary for validating the\nperformance of video quality metrics. Second, the objective video quality\nassessment algorithms for general purposes are surveyed and concluded according\nto the methodologies utilized in the quality measures. Third, we overview the\nobjective video quality assessment measures for specific applications and\nemerging topics. Finally, the performances of the state-of-the-art video\nquality assessment measures are compared and analyzed. 
This survey provides a\nsystematic overview of both classical works and recent progress in the realm\nof video quality assessment, which can help other researchers quickly access\nthe field and conduct relevant research.\n","authors":["Xiongkuo Min","Huiyu Duan","Wei Sun","Yucheng Zhu","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2402.03413v1.pdf","comment":null}]},"2024-02-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2402.02643v1","updated":"2024-02-04T23:42:02Z","published":"2024-02-04T23:42:02Z","title":"LLM-Enhanced Data Management","summary":" Machine learning (ML) techniques for optimizing data management problems have\nbeen extensively studied and widely deployed in the past five years. However,\ntraditional ML methods have limitations on generalizability (adapting to\ndifferent scenarios) and inference ability (understanding the context).\nFortunately, large language models (LLMs) have shown high generalizability and\nhuman-competitive abilities in understanding context, which are promising for\ndata management tasks (e.g., database diagnosis, database tuning). However,\nexisting LLMs have several limitations: hallucination, high cost, and low\naccuracy for complicated tasks. To address these challenges, we design LLMDB,\nan LLM-enhanced data management paradigm which has generalizability and high\ninference ability while avoiding hallucination, reducing LLM cost, and\nachieving high accuracy. LLMDB embeds domain-specific knowledge to avoid\nhallucination by LLM fine-tuning and prompt engineering. LLMDB reduces the high\ncost of LLMs by vector databases which provide semantic search and caching\nabilities. LLMDB improves the task accuracy by an LLM agent which provides\nmultiple-round inference and pipeline executions. We showcase three real-world\nscenarios that LLMDB can well support, including query rewrite, database\ndiagnosis and data analytics. We also summarize the open research challenges of\nLLMDB.\n","authors":["Xuanhe Zhou","Xinyang Zhao","Guoliang Li"],"pdf_url":"https://arxiv.org/pdf/2402.02643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02639v1","updated":"2024-02-04T23:23:51Z","published":"2024-02-04T23:23:51Z","title":"\"It's how you do things that matters\": Attending to Process to Better\n Serve Indigenous Communities with Language Technologies","summary":" Indigenous languages are historically under-served by Natural Language\nProcessing (NLP) technologies, but this is changing for some languages with the\nrecent scaling of large multilingual models and an increased focus by the NLP\ncommunity on endangered languages. This position paper explores ethical\nconsiderations in building NLP technologies for Indigenous languages, based on\nthe premise that such projects should primarily serve Indigenous communities.\nWe report on interviews with 17 researchers working in or with Aboriginal\nand/or Torres Strait Islander communities on language technology projects in\nAustralia. 
Drawing on insights from the interviews, we recommend practices for\nNLP researchers to increase attention to the process of engagements with\nIndigenous communities, rather than focusing only on decontextualised\nartefacts.\n","authors":["Ned Cooper","Courtney Heldreth","Ben Hutchinson"],"pdf_url":"https://arxiv.org/pdf/2402.02639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02636v1","updated":"2024-02-04T23:04:02Z","published":"2024-02-04T23:04:02Z","title":"Can Large Language Models Learn Independent Causal Mechanisms?","summary":" Despite impressive performance on language modelling and complex reasoning\ntasks, Large Language Models (LLMs) fall short on the same tasks in uncommon\nsettings or with distribution shifts, exhibiting some lack of generalisation\nability. This issue has usually been alleviated by feeding more training data\ninto the LLM. However, this method is brittle, as the scope of tasks may not be\nreadily predictable or may evolve, and updating the model with new data\ngenerally requires extensive additional training. By contrast, systems, such as\ncausal models, that learn abstract variables and causal relationships can\ndemonstrate increased robustness against changes in the distribution. One\nreason for this success is the existence and use of Independent Causal\nMechanisms (ICMs) representing high-level concepts that only sparsely interact.\nIn this work, we apply two concepts from causality to learn ICMs within LLMs.\nWe develop a new LLM architecture composed of multiple sparsely interacting\nlanguage modelling modules. We introduce a routing scheme to induce\nspecialisation of the network into domain-specific modules. We also present a\nMutual Information minimisation objective that trains a separate module to\nlearn abstraction and domain-invariant mechanisms. We show that such causal\nconstraints can improve out-of-distribution performance on abstract and causal\nreasoning tasks.\n","authors":["Gaël Gendron","Bao Trung Nguyen","Alex Yuxuan Peng","Michael Witbrock","Gillian Dobbie"],"pdf_url":"https://arxiv.org/pdf/2402.02636v1.pdf","comment":"17 pages, 8 pages for the main paper and 9 pages for references and\n appendices, 12 figures"},{"id":"http://arxiv.org/abs/2402.02633v1","updated":"2024-02-04T22:56:56Z","published":"2024-02-04T22:56:56Z","title":"Predicting Machine Translation Performance on Low-Resource Languages:\n The Role of Domain Similarity","summary":" Fine-tuning and testing a multilingual large language model is expensive and\nchallenging for low-resource languages (LRLs). While previous studies have\npredicted the performance of natural language processing (NLP) tasks using\nmachine learning methods, they primarily focus on high-resource languages,\noverlooking LRLs and shifts across domains. Focusing on LRLs, we investigate\nthree factors: the size of the fine-tuning corpus, the domain similarity\nbetween fine-tuning and testing corpora, and the language similarity between\nsource and target languages. We employ classical regression models to assess\nhow these factors impact the model's performance. Our results indicate that\ndomain similarity has the most critical impact on predicting the performance of\nMachine Translation models.\n","authors":["Eric Khiu","Hasti Toossi","David Anugraha","Jinyu Liu","Jiaxu Li","Juan Armando Parra Flores","Leandro Acros Roman","A. 
Seza Doğruöz","En-Shiun Annie Lee"],"pdf_url":"https://arxiv.org/pdf/2402.02633v1.pdf","comment":"13 pages, 5 figures, accepted to EACL 2024, findings"},{"id":"http://arxiv.org/abs/2402.02632v1","updated":"2024-02-04T22:53:38Z","published":"2024-02-04T22:53:38Z","title":"GIRT-Model: Automated Generation of Issue Report Templates","summary":" Platforms such as GitHub and GitLab introduce Issue Report Templates (IRTs)\nto enable more effective issue management and better alignment with developer\nexpectations. However, these templates are not widely adopted in most\nrepositories, and there is currently no tool available to aid developers in\ngenerating them. In this work, we introduce GIRT-Model, an assistant language\nmodel that automatically generates IRTs based on the developer's instructions\nregarding the structure and necessary fields. We create GIRT-Instruct, a\ndataset comprising pairs of instructions and IRTs, with the IRTs sourced from\nGitHub repositories. We use GIRT-Instruct to instruction-tune a T5-base model\nto create the GIRT-Model. In our experiments, GIRT-Model outperforms general\nlanguage models (T5 and Flan-T5 with different parameter sizes) in IRT\ngeneration by achieving significantly higher scores in ROUGE, BLEU, METEOR, and\nhuman evaluation. Additionally, we analyze the effectiveness of GIRT-Model in a\nuser study in which participants wrote short IRTs with GIRT-Model. Our results\nshow that the participants find GIRT-Model useful in the automated generation\nof templates. We hope that through the use of GIRT-Model, we can encourage more\ndevelopers to adopt IRTs in their repositories. We publicly release our code,\ndataset, and model at https://github.com/ISE-Research/girt-model.\n","authors":["Nafiseh Nikeghbal","Amir Hossein Kargaran","Abbas Heydarnoori"],"pdf_url":"https://arxiv.org/pdf/2402.02632v1.pdf","comment":"Accepted to be published at the 21st IEEE/ACM International\n Conference on Mining Software Repositories (MSR 2024)"},{"id":"http://arxiv.org/abs/2402.00798v2","updated":"2024-02-04T22:16:48Z","published":"2024-02-01T17:30:50Z","title":"Formal-LLM: Integrating Formal Language and Natural Language for\n Controllable LLM-based Agents","summary":" Recent advancements on Large Language Models (LLMs) enable AI Agents to\nautomatically generate and execute multi-step plans to solve complex tasks.\nHowever, since LLM's content generation process is hardly controllable, current\nLLM-based agents frequently generate invalid or non-executable plans, which\njeopardizes the performance of the generated plans and corrupts users' trust in\nLLM-based agents. In response, this paper proposes a novel ``Formal-LLM''\nframework for LLM-based agents by integrating the expressiveness of natural\nlanguage and the precision of formal language. Specifically, the framework\nallows human users to express their requirements or constraints for the\nplanning process as an automaton. A stack-based LLM plan generation process is\nthen conducted under the supervision of the automaton to ensure that the\ngenerated plan satisfies the constraints, making the planning process\ncontrollable. We conduct experiments on both benchmark tasks and practical\nreal-life tasks, and our framework achieves over 50% overall performance\nincrease, which validates the feasibility and effectiveness of employing\nFormal-LLM to guide the plan generation of agents, preventing the agents from\ngenerating invalid and unsuccessful plans. 
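To make the automaton-supervised plan generation described in the Formal-LLM entry above concrete, here is a minimal sketch: the automaton exposes only the permitted actions at each state, so whatever the model picks, the resulting plan is valid by construction. The example automaton and the deterministic choose() stub are hypothetical, not the paper's framework:

# A hypothetical automaton over plan actions: state -> {action: next_state}.
AUTOMATON = {
    "start": {"retrieve": "retrieved"},
    "retrieved": {"retrieve": "retrieved", "summarize": "summarized"},
    "summarized": {"answer": "done"},
}

def choose(allowed, state):
    # Stand-in for the LLM's choice among automaton-permitted actions;
    # a real agent would prompt the model with only these options.
    return sorted(allowed)[-1]

def generate_plan(max_steps=10):
    state, plan = "start", []
    while state != "done" and len(plan) < max_steps:
        allowed = AUTOMATON.get(state, {})
        if not allowed:  # dead end: no valid continuation exists
            break
        action = choose(allowed, state)
        plan.append(action)
        state = allowed[action]
    return plan, state

print(generate_plan())  # (['retrieve', 'summarize', 'answer'], 'done')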
Further, more controllable LLM-based\nagents can facilitate the broader utilization of LLM in application scenarios\nwhere high validity of planning is essential. The work is open-sourced at\nhttps://github.com/agiresearch/Formal-LLM.\n","authors":["Zelong Li","Wenyue Hua","Hao Wang","He Zhu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00798v2.pdf","comment":"21 pages, 6 figures; comments and suggestions are welcome"},{"id":"http://arxiv.org/abs/2402.02625v1","updated":"2024-02-04T22:12:29Z","published":"2024-02-04T22:12:29Z","title":"Enhancing Transformer RNNs with Multiple Temporal Perspectives","summary":" We introduce the concept of multiple temporal perspectives, a novel approach\napplicable to Recurrent Neural Network (RNN) architectures for enhancing their\nunderstanding of sequential data. This method involves maintaining diverse\ntemporal views of previously encountered text, significantly enriching the\nlanguage models' capacity to interpret context. To show the efficacy of this\napproach, we incorporate it into the Receptance Weighted Key Value (RWKV)\narchitecture, addressing its inherent challenge of retaining all historical\ninformation within a single hidden state. Notably, this improvement is achieved\nwith a minimal increase in the number of parameters --even as little as\n$0.04\\%$ of the original number of parameters. Further, the additional\nparameters necessary for the multiple temporal perspectives are fine-tuned with\nminimal computational overhead, avoiding the need for a full pre-training. The\nresulting model maintains linear computational complexity during prompt\ninference, ensuring consistent efficiency across various sequence lengths. The\nempirical results and ablation studies included in our research validate the\neffectiveness of our approach, showcasing improved performance across multiple\nbenchmarks. The code, model weights and datasets are open-sourced at:\nhttps://github.com/RazvanDu/TemporalRNNs.\n","authors":["Razvan-Gabriel Dumitru","Darius Peteleaza","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2402.02625v1.pdf","comment":"11 pages, 8 figures, 4 tables, in review for ICML 2024"},{"id":"http://arxiv.org/abs/2308.07134v4","updated":"2024-02-04T22:08:05Z","published":"2023-08-14T13:41:09Z","title":"Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models has revolutionized\nvarious AI research domains. Transformers-based Large Language Models (LLMs)\nhave gradually replaced CNNs and RNNs to unify fields of computer vision and\nnatural language processing. Compared with independent data samples such as\nimages, videos or texts, graphs usually contain rich structural and relational\ninformation. Meanwhile, language, especially natural language, being one of the\nmost expressive mediums, excels in describing complex structures. However,\nexisting work on incorporating graph problems into the generative language\nmodeling framework remains very limited. Considering the rising prominence of\nLLMs, it becomes essential to explore whether LLMs can also replace GNNs as the\nfoundation model for graphs. In this paper, we propose InstructGLM\n(Instruction-finetuned Graph Language Model) with highly scalable prompts based\non natural language instructions. We use natural language to describe\nmulti-scale geometric structure of the graph and then instruction finetune an\nLLM to perform graph tasks, which enables Generative Graph Learning. 
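A small sketch of the graph-to-text idea in the InstructGLM entry above: describe a node's multi-scale (multi-hop) neighborhood in natural language so a generative LM can be instruction-tuned on graph tasks. The serialization format and the toy graph are assumptions, not the paper's prompt templates:

def describe_node(node, adj, features, hops=2):
    # Serialize a node's multi-hop neighborhood into an instruction prompt.
    lines = [f"Node {node} has text feature: {features[node]}."]
    frontier, seen = {node}, {node}
    for h in range(1, hops + 1):
        frontier = {m for n in frontier for m in adj.get(n, [])} - seen
        if not frontier:
            break
        seen |= frontier
        names = ", ".join(str(n) for n in sorted(frontier))
        lines.append(f"{h}-hop neighbors: {names}.")
    lines.append(f"Task: predict the category of node {node}.")
    return " ".join(lines)

adj = {1: [2, 3], 2: [1, 4], 3: [1], 4: [2]}
features = {1: "GNN survey", 2: "attention", 3: "graph sampling", 4: "LLM agents"}
print(describe_node(1, adj, features))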
Our method\nsurpasses all GNN baselines on ogbn-arxiv, Cora and PubMed datasets,\nunderscoring its effectiveness and shedding light on generative LLMs as a new\nfoundation model for graph machine learning. Our code is open-sourced at\nhttps://github.com/agiresearch/InstructGLM.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v4.pdf","comment":"In EACL 2024"},{"id":"http://arxiv.org/abs/2402.02622v1","updated":"2024-02-04T21:44:09Z","published":"2024-02-04T21:44:09Z","title":"DenseFormer: Enhancing Information Flow in Transformers via Depth\n Weighted Averaging","summary":" The transformer architecture from Vaswani et al. (2017) is now ubiquitous\nacross application domains, from natural language processing to speech\nprocessing and image understanding. We propose DenseFormer, a simple\nmodification to the standard architecture that improves the perplexity of the\nmodel without increasing its size -- adding a few thousand parameters for\nlarge-scale models in the 100B-parameter range. Our approach relies on an\nadditional averaging step after each transformer block, which computes a\nweighted average of current and past representations -- we refer to this\noperation as Depth-Weighted-Average (DWA). The learned DWA weights exhibit\ncoherent patterns of information flow, revealing the strong and structured\nreuse of activations from distant layers. Experiments demonstrate that\nDenseFormer is more data efficient, reaching the same perplexity as much deeper\ntransformer models, and that for the same perplexity, these new models\noutperform transformer baselines in terms of memory efficiency and inference\ntime.\n","authors":["Matteo Pagliardini","Amirkeivan Mohtashami","Francois Fleuret","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2402.02622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02617v1","updated":"2024-02-04T21:24:54Z","published":"2024-02-04T21:24:54Z","title":"Layer-Wise Analysis of Self-Supervised Acoustic Word Embeddings: A Study\n on Speech Emotion Recognition","summary":" The efficacy of self-supervised speech models has been validated, yet the\noptimal utilization of their representations remains challenging across diverse\ntasks. In this study, we delve into Acoustic Word Embeddings (AWEs), a\nfixed-length feature derived from continuous representations, to explore their\nadvantages in specific tasks. AWEs have previously shown utility in capturing\nacoustic discriminability. In light of this, we propose measuring layer-wise\nsimilarity between AWEs and word embeddings, aiming to further investigate the\ninherent context within AWEs. Moreover, we evaluate the contribution of AWEs,\nin comparison to other types of speech features, in the context of Speech\nEmotion Recognition (SER). Through a comparative experiment and a layer-wise\naccuracy analysis on two distinct corpora, IEMOCAP and ESD, we explore\ndifferences between AWEs and raw self-supervised representations, as well as\nthe proper utilization of AWEs alone and in combination with word embeddings.\nOur findings underscore the acoustic context conveyed by AWEs and showcase the\nhighly competitive SER accuracies by appropriately employing AWEs.\n","authors":["Alexandra Saliba","Yuanchao Li","Ramon Sanabria","Catherine Lai"],"pdf_url":"https://arxiv.org/pdf/2402.02617v1.pdf","comment":"Accepted to ICASSP2024 Self-supervision in Audio, Speech and Beyond\n (SASB) workshop. 
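The Depth-Weighted-Average step in the DenseFormer entry above is small enough to sketch in numpy: after block i, replace its output with a learned weighted average over the embeddings and the outputs of all blocks up to i. The initialization, the stand-in "block", and the plain-array weight storage are assumptions; the paper's learned weights and training loop are not reproduced:

import numpy as np

class DepthWeightedAverage:
    # One weight vector per depth; weights initialized to keep only the
    # newest representation, which recovers the plain transformer.
    def __init__(self, depth):
        self.w = [np.eye(1, i + 2, i + 1).ravel() for i in range(depth)]

    def __call__(self, history, i):
        stacked = np.stack(history)  # (i + 2, seq, dim): x0..xi plus new output
        return np.tensordot(self.w[i], stacked, axes=1)  # (seq, dim)

seq, dim, depth = 3, 8, 4
rng = np.random.default_rng(0)
dwa = DepthWeightedAverage(depth)
h = [rng.standard_normal((seq, dim))]  # x0: embedding output
for i in range(depth):
    block_out = h[-1] + 0.1 * rng.standard_normal((seq, dim))  # stand-in block
    h.append(dwa(h + [block_out], i))  # DWA over x0..xi and the new output
print(h[-1].shape)  # (3, 8)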
First two authors contributed equally"},{"id":"http://arxiv.org/abs/2402.02611v1","updated":"2024-02-04T20:56:09Z","published":"2024-02-04T20:56:09Z","title":"PuzzleBench: Can LLMs Solve Challenging First-Order Combinatorial\n Reasoning Problems?","summary":" Recent works have explored the use of LLMs for reasoning tasks focussing on\nrelatively simple problems, such as logical question answering. In our work, we\nwish to tackle more complicated problems, significantly expanding the\ncapabilities of these models. Particularly, we explore whether LLMs can solve\nchallenging first-order combinatorial reasoning problems, an example being the\npopular puzzle Sudoku. These problems have an underlying first-order structure\ndescribed by a general description in natural language and can be instantiated\nto instances of varying sizes. Moreover, these problems are computationally\nintensive, requiring several reasoning steps to reach the solution. We present\nPuzzleBench, a dataset of 31 such challenging puzzles. We observe that LLMs, even\nwhen aided by symbolic solvers, perform rather poorly on our benchmark. In\nresponse, we propose a new approach, Puzzle-LM, which combines LLMs with both\nsymbolic solvers and program interpreters, enabling them to reason about such\nchallenging problems. We also show how feedback from smaller solved instances\ncan help improve this reasoning ability.\n","authors":["Chinmay Mittal","Krishna Kartik","Mausam","Parag Singla"],"pdf_url":"https://arxiv.org/pdf/2402.02611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10642v4","updated":"2024-02-04T20:39:33Z","published":"2023-11-17T16:58:52Z","title":"Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as\n an Alternative to Attention Layers in Transformers","summary":" This work presents an analysis of the effectiveness of using standard shallow\nfeed-forward networks to mimic the behavior of the attention mechanism in the\noriginal Transformer model, a state-of-the-art architecture for\nsequence-to-sequence tasks. We substitute key elements of the attention\nmechanism in the Transformer with simple feed-forward networks, trained using\nthe original components via knowledge distillation. Our experiments, conducted\non the IWSLT2017 dataset, reveal the capacity of these \"attentionless\nTransformers\" to rival the performance of the original architecture. Through\nrigorous ablation studies, and experimenting with various replacement network\ntypes and sizes, we offer insights that support the viability of our approach.\nThis not only sheds light on the adaptability of shallow feed-forward networks\nin emulating attention mechanisms but also underscores their potential to\nstreamline complex architectures for sequence-to-sequence tasks.\n","authors":["Vukasin Bozic","Danilo Dordevic","Daniele Coppola","Joseph Thommes","Sidak Pal Singh"],"pdf_url":"https://arxiv.org/pdf/2311.10642v4.pdf","comment":"Accepted at AAAI24 (https://aaai.org/aaai-conference/)"},{"id":"http://arxiv.org/abs/2402.02591v1","updated":"2024-02-04T19:54:44Z","published":"2024-02-04T19:54:44Z","title":"On the performance of phonetic algorithms in microtext normalization","summary":" User-generated content published on microblogging social networks constitutes\na priceless source of information. However, microtexts usually deviate from the\nstandard lexical and grammatical rules of the language, thus making their\nprocessing by traditional intelligent systems very difficult. 
As an answer,\nmicrotext normalization consists in transforming those non-standard microtexts\ninto standard well-written texts as a preprocessing step, allowing traditional\napproaches to continue with their usual processing. Given the importance of\nphonetic phenomena in non-standard text formation, an essential element of the\nknowledge base of a normalizer would be the phonetic rules that encode these\nphenomena, which can be found in the so-called phonetic algorithms.\n In this work we experiment with a wide range of phonetic algorithms for the\nEnglish language. The aim of this study is to determine the best phonetic\nalgorithms within the context of candidate generation for microtext\nnormalization. In other words, we intend to find those algorithms that taking\nas input non-standard terms to be normalized allow us to obtain as output the\nsmallest possible sets of normalization candidates which still contain the\ncorresponding target standard words. As it will be stated, the choice of the\nphonetic algorithm will depend heavily on the capabilities of the candidate\nselection mechanism which we usually find at the end of a microtext\nnormalization pipeline. The faster it can make the right choices among big\nenough sets of candidates, the more we can sacrifice on the precision of the\nphonetic algorithms in favour of coverage in order to increase the overall\nperformance of the normalization system.\n KEYWORDS: microtext normalization; phonetic algorithm; fuzzy matching;\nTwitter; texting\n","authors":["Yerai Doval","Manuel Vilares","Jesús Vilares"],"pdf_url":"https://arxiv.org/pdf/2402.02591v1.pdf","comment":"Accepted for publication in journal Expert Systems with Applications"},{"id":"http://arxiv.org/abs/2309.06364v3","updated":"2024-02-04T19:46:38Z","published":"2023-09-06T15:00:44Z","title":"Framework-Based Qualitative Analysis of Free Responses of Large Language\n Models: Algorithmic Fidelity","summary":" Today, using Large-scale generative Language Models (LLMs) it is possible to\nsimulate free responses to interview questions like those traditionally\nanalyzed using qualitative research methods. Qualitative methodology\nencompasses a broad family of techniques involving manual analysis of\nopen-ended interviews or conversations conducted freely in natural language.\nHere we consider whether artificial \"silicon participants\" generated by LLMs\nmay be productively studied using qualitative methods aiming to produce\ninsights that could generalize to real human populations. The key concept in\nour analysis is algorithmic fidelity, a term introduced by Argyle et al. (2023)\ncapturing the degree to which LLM-generated outputs mirror human\nsub-populations' beliefs and attitudes. By definition, high algorithmic\nfidelity suggests latent beliefs elicited from LLMs may generalize to real\nhumans, whereas low algorithmic fidelity renders such research invalid. Here we\nused an LLM to generate interviews with silicon participants matching specific\ndemographic characteristics one-for-one with a set of human participants. Using\nframework-based qualitative analysis, we showed the key themes obtained from\nboth human and silicon participants were strikingly similar. However, when we\nanalyzed the structure and tone of the interviews we found even more striking\ndifferences. We also found evidence of the hyper-accuracy distortion described\nby Aher et al. (2023). 
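To ground the candidate-generation setup studied in the phonetic-algorithms entry above: index the standard lexicon by a phonetic key, then return every standard word that shares the key of the non-standard input token. A simplified classic Soundex stands in here for the many algorithms the paper compares, and the tiny lexicon is hypothetical:

from collections import defaultdict

CODES = {c: d for cs, d in [("bfpv", "1"), ("cgjkqsxz", "2"), ("dt", "3"),
                            ("l", "4"), ("mn", "5"), ("r", "6")] for c in cs}

def soundex(word):
    # Classic Soundex: first letter plus up to three digits, with vowels
    # dropped and adjacent duplicate codes collapsed.
    word = word.lower()
    head, prev, digits = word[0], CODES.get(word[0], ""), []
    for c in word[1:]:
        d = CODES.get(c, "")
        if d and d != prev:
            digits.append(d)
        if c not in "hw":  # h and w do not reset the previous code
            prev = d
    return (head + "".join(digits) + "000")[:4].upper()

lexicon = ["tomorrow", "tonight", "right", "write", "great"]
index = defaultdict(set)
for w in lexicon:
    index[soundex(w)].add(w)

def candidates(token):
    # All standard words sharing the non-standard token's phonetic key.
    return index[soundex(token)]

print(candidates("tomoro"))  # {'tomorrow'}

The paper's trade-off shows up directly here: a coarser key yields larger candidate sets (better coverage, more work for the selector), a finer key yields smaller sets that may miss the target word.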
We conclude that the LLM we tested (GPT-3.5) does not\nhave sufficient algorithmic fidelity to expect research on it to generalize to\nhuman populations. However, the rapid pace of LLM research makes it plausible\nthis could change in the future. Thus, we stress the need to establish epistemic\nnorms now around how to assess validity of LLM-based qualitative research,\nespecially concerning the need to ensure representation of heterogeneous lived\nexperiences.\n","authors":["Aliya Amirova","Theodora Fteropoulli","Nafiso Ahmed","Martin R. Cowie","Joel Z. Leibo"],"pdf_url":"https://arxiv.org/pdf/2309.06364v3.pdf","comment":"52 pages, 5 tables, 5 figures"},{"id":"http://arxiv.org/abs/2401.04700v2","updated":"2024-02-04T19:04:13Z","published":"2024-01-09T18:03:15Z","title":"Model Editing Can Hurt General Abilities of Large Language Models","summary":" One critical challenge that has emerged is the presence of hallucinations in\nthe output of large language models (LLMs) due to false or outdated knowledge.\nSince retraining LLMs with updated information is resource-intensive, there has\nbeen a growing interest in model editing. However, current model editing\nmethods, while effective in improving editing performance in various scenarios,\noften overlook potential side effects on the general abilities of LLMs. In this\npaper, we raise concerns that model editing inherently improves the factuality\nof the model, but may come at the cost of a significant degradation of these\ngeneral abilities. Systematically, we analyze side effects by evaluating four\npopular editing methods on three LLMs across eight representative task\ncategories. Extensive empirical research reveals that current model editing\nmethods are difficult to couple well with LLMs to simultaneously improve the\nfactuality and maintain the general abilities such as reasoning, question\nanswering, etc. Strikingly, the use of a specific method to edit LLaMA-1 (7B)\nresulted in a drastic performance degradation to nearly 0 on all selected tasks\nwith just a single edit. Therefore, we advocate for more research efforts to\nminimize the loss of general abilities acquired during LLM pre-training and to\nultimately preserve them during model editing.\n","authors":["Jia-Chen Gu","Hao-Xiang Xu","Jun-Yu Ma","Pan Lu","Zhen-Hua Ling","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2401.04700v2.pdf","comment":"Add new results on LLaMA-2 (7B)"},{"id":"http://arxiv.org/abs/2402.02572v1","updated":"2024-02-04T17:32:52Z","published":"2024-02-04T17:32:52Z","title":"A Quantitative Discourse Analysis of Asian Workers in the US Historical\n Newspapers","summary":" Warning: This paper contains examples of offensive language targeting\nmarginalized populations. The digitization of historical texts invites\nresearchers to explore the large-scale corpus of historical texts with\ncomputational methods. In this study, we present computational text analysis on\na relatively understudied topic of how Asian workers are represented in\nhistorical newspapers in the United States. We found that the word \"coolie\" was\nsemantically different in some States (e.g., Massachusetts, Rhode Island,\nWyoming, Oklahoma, and Arkansas) with the different discourses around coolie.\nWe also found that then-Confederate newspapers and then-Union newspapers formed\ndistinctive discourses by measuring over-represented words. Newspapers from\nthen-Confederate States associated coolie with slavery-related words. 
In\naddition, we found Asians were perceived to be inferior to European immigrants\nand subjected to the target of racism. This study contributes to supplementing\nthe qualitative analysis of racism in the United States with quantitative\ndiscourse analysis.\n","authors":["Jaihyun Park","Ryan Cordell"],"pdf_url":"https://arxiv.org/pdf/2402.02572v1.pdf","comment":"3rd International Conference on Natural Language Processing for\n Digital Humanities (NLP4DH)"},{"id":"http://arxiv.org/abs/2401.15077v2","updated":"2024-02-04T17:18:34Z","published":"2024-01-26T18:59:01Z","title":"EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty","summary":" Autoregressive decoding makes the inference of Large Language Models (LLMs)\ntime-consuming. In this paper, we reconsider speculative sampling and derive\ntwo key observations. Firstly, autoregression at the feature\n(second-to-top-layer) level is more straightforward than at the token level.\nSecondly, the inherent uncertainty in feature (second-to-top-layer) level\nautoregression constrains its performance. Based on these insights, we\nintroduce EAGLE (Extrapolation Algorithm for Greater Language-model\nEfficiency), a simple yet highly efficient speculative sampling framework. By\nincorporating a token sequence advanced by one time step, EAGLE effectively\nresolves the uncertainty, enabling precise second-to-top-layer feature\nprediction with minimal overhead. We conducted comprehensive evaluations of\nEAGLE, including all models from the Vicuna and LLaMA2-Chat series, the MoE\nmodel Mixtral 8x7B Instruct, and tasks in dialogue, code generation,\nmathematical reasoning, and instruction following. For LLaMA2-Chat 70B, EAGLE\nachieved a latency speedup ratio of 2.7x-3.5x, doubled throughput, while\nmaintaining the distribution of the generated text.\n","authors":["Yuhui Li","Fangyun Wei","Chao Zhang","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.15077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02564v1","updated":"2024-02-04T16:56:08Z","published":"2024-02-04T16:56:08Z","title":"A Truly Joint Neural Architecture for Segmentation and Parsing","summary":" Contemporary multilingual dependency parsers can parse a diverse set of\nlanguages, but for Morphologically Rich Languages (MRLs), performance is\nattested to be lower than other languages. The key challenge is that, due to\nhigh morphological complexity and ambiguity of the space-delimited input\ntokens, the linguistic units that act as nodes in the tree are not known in\nadvance. Pre-neural dependency parsers for MRLs subscribed to the joint\nmorpho-syntactic hypothesis, stating that morphological segmentation and\nsyntactic parsing should be solved jointly, rather than as a pipeline where\nsegmentation precedes parsing. However, neural state-of-the-art parsers to date\nuse a strict pipeline. In this paper we introduce a joint neural architecture\nwhere a lattice-based representation preserving all morphological ambiguity of\nthe input is provided to an arc-factored model, which then solves the\nmorphological segmentation and syntactic parsing tasks at once. Our experiments\non Hebrew, a rich and highly ambiguous MRL, demonstrate state-of-the-art\nperformance on parsing, tagging and segmentation of the Hebrew section of UD,\nusing a single model. 
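Returning to the EAGLE entry above: EAGLE builds on speculative sampling, whose generic greedy draft-and-verify loop is sketched below (EAGLE's feature-level, one-token-ahead drafting is not reproduced). The toy integer "models" are hypothetical; with greedy verification the output provably matches plain decoding with the target model, which is why the trick preserves the generated distribution:

def speculative_decode(draft_next, target_next, prompt, n_tokens=8, k=3):
    # draft_next/target_next map a token sequence to its next token (greedy).
    seq = list(prompt)
    while len(seq) - len(prompt) < n_tokens:
        # 1) the cheap draft model proposes k tokens
        draft = []
        for _ in range(k):
            draft.append(draft_next(seq + draft))
        # 2) the target model verifies the draft left to right
        for i in range(k):
            t = target_next(seq + draft[:i])
            if t != draft[i]:
                seq += draft[:i] + [t]  # keep verified prefix + correction
                break
        else:
            seq += draft  # whole draft accepted: k tokens for one pass
    return seq[:len(prompt) + n_tokens]

# Toy models over integer tokens: the target emits (last + 1) mod 5; the
# draft agrees except when the context length is a multiple of 4.
target = lambda s: (s[-1] + 1) % 5
draft = lambda s: (s[-1] + 1) % 5 if len(s) % 4 else (s[-1] + 2) % 5
print(speculative_decode(draft, target, [0]))  # [0, 1, 2, 3, 4, 0, 1, 2, 3]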
This proposed architecture is LLM-based and language\nagnostic, providing a solid foundation for MRLs to obtain further performance\nimprovements and bridge the gap with other languages.\n","authors":["Danit Yshaayahu Levi","Reut Tsarfaty"],"pdf_url":"https://arxiv.org/pdf/2402.02564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02563v1","updated":"2024-02-04T16:45:01Z","published":"2024-02-04T16:45:01Z","title":"DefInt: A Default-interventionist Framework for Efficient Reasoning with\n Hybrid Large Language Models","summary":" Large language models (LLMs) have shown impressive emergent abilities in a\nwide range of tasks, but still face challenges in handling complex reasoning\nproblems. Previous works like chain-of-thought (CoT) and tree-of-thoughts (ToT)\nhave predominantly focused on enhancing accuracy, but overlook the rapidly\nincreasing token cost, which could be particularly problematic for open-ended\nreal-world tasks with huge solution spaces. Motivated by the dual process\ntheory of human cognition, we propose a Default-Interventionist framework\n(DefInt) to unleash the synergistic potential of hybrid LLMs. By default,\nDefInt uses smaller-scale language models to generate low-cost reasoning\nthoughts, which resembles the fast intuitions produced by System 1. If the\nintuitions are considered to have low confidence, DefInt will invoke the\nreflective reasoning of scaled-up language models as the intervention of System\n2, which can override the default thoughts and rectify the reasoning process.\nExperiments on five representative reasoning tasks show that DefInt\nconsistently achieves state-of-the-art reasoning accuracy and solution\ndiversity. More importantly, it substantially reduces the token cost by 49%-79%\ncompared to the second most accurate baselines. Specifically, the open-ended tasks\nhave an average 75% token cost reduction. Code repo with all prompts will be\nreleased upon publication.\n","authors":["Yu Shang","Yu Li","Fengli Xu","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2402.02563v1.pdf","comment":"18 pages, 10 figures, 14 tables"},{"id":"http://arxiv.org/abs/2311.03099v2","updated":"2024-02-04T16:28:06Z","published":"2023-11-06T13:43:07Z","title":"Language Models are Super Mario: Absorbing Abilities from Homologous\n Models as a Free Lunch","summary":" In this paper, we unveil that Language Models (LMs) can acquire new\ncapabilities by assimilating parameters from homologous models without\nretraining or GPUs. We first introduce DARE to set most delta parameters (i.e.,\nthe disparity between fine-tuned and pre-trained parameters) to zeros without\naffecting the abilities of Supervised Fine-Tuning (SFT) LMs, which randomly\nDrops delta parameters with a ratio p And REscales the remaining ones by 1/(1 -\np) to approximate the original embeddings. Then, we use DARE as a versatile\nplug-and-play technique to sparsify delta parameters of multiple SFT homologous\nmodels for mitigating parameter interference and merge them into a single model\nby parameter fusing. We experiment with encoder- and decoder-based LMs, showing\nthat: (1) SFT delta parameter value ranges are typically small (within 0.005)\nwith extreme redundancy, and DARE can effortlessly eliminate 90% or even 99% of\nthem. (2) DARE can merge multiple task-specific LMs into one LM with diverse\ncapabilities. 
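The DARE operation quoted above (drop each delta parameter with probability p, rescale survivors by 1/(1 - p), then fuse the sparsified deltas onto the pretrained weights) is simple enough to sketch directly; the toy one-tensor "models" below are hypothetical:

import numpy as np

def dare_merge(pretrained, finetuned_list, p=0.9, seed=0):
    # Per fine-tuned model: delta = finetuned - pretrained; drop entries with
    # probability p, rescale the survivors by 1/(1 - p), and sum onto the base.
    rng = np.random.default_rng(seed)
    merged = {k: v.copy() for k, v in pretrained.items()}
    for ft in finetuned_list:
        for k in merged:
            delta = ft[k] - pretrained[k]
            keep = rng.random(delta.shape) >= p  # keep with probability 1 - p
            merged[k] += np.where(keep, delta / (1.0 - p), 0.0)
    return merged

base = {"w": np.zeros((2, 2))}
ft_a = {"w": base["w"] + 0.001 * np.ones((2, 2))}
ft_b = {"w": base["w"] - 0.002 * np.ones((2, 2))}
print(dare_merge(base, [ft_a, ft_b], p=0.5)["w"])

The rescaling keeps each sparsified delta an unbiased estimate of the original one, which is why dropping 90%+ of the entries can leave the fine-tuned abilities intact.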
For instance, the amalgamation of WizardLM and WizardMath\nsignificantly enhances the GSM8K zero-shot accuracy of WizardLM from 2.2 to\n66.3, retaining the instruction-following proficiency while surpassing\nWizardMath's 64.2 performance. Our merged LM also ranks first among models with\n7 billion parameters on the Open LLM Leaderboard.\n","authors":["Le Yu","Bowen Yu","Haiyang Yu","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2311.03099v2.pdf","comment":"24 pages, 21 figures"},{"id":"http://arxiv.org/abs/2402.02559v1","updated":"2024-02-04T16:23:16Z","published":"2024-02-04T16:23:16Z","title":"NavHint: Vision and Language Navigation Agent with a Hint Generator","summary":" Existing work on vision and language navigation mainly relies on\nnavigation-related losses to establish the connection between vision and\nlanguage modalities, neglecting aspects of helping the navigation agent build a\ndeep understanding of the visual environment. In our work, we provide indirect\nsupervision to the navigation agent through a hint generator that provides\ndetailed visual descriptions. The hint generator assists the navigation agent\nin developing a global understanding of the visual environment. It directs the\nagent's attention toward related navigation details, including the relevant\nsub-instruction, potential challenges in recognition and ambiguities in\ngrounding, and the targeted viewpoint description. To train the hint generator,\nwe construct a synthetic dataset based on landmarks in the instructions and\nvisible and distinctive objects in the visual environment. We evaluate our\nmethod on the R2R and R4R datasets and achieve state-of-the-art on several\nmetrics. The experimental results demonstrate that generating hints not only\nenhances the navigation performance but also helps improve the interpretability\nof the agent's actions.\n","authors":["Yue Zhang","Quan Guo","Parisa Kordjamshidi"],"pdf_url":"https://arxiv.org/pdf/2402.02559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10688v3","updated":"2024-02-04T16:19:59Z","published":"2023-10-14T17:01:37Z","title":"A decoder-only foundation model for time-series forecasting","summary":" Motivated by recent advances in large language models for Natural Language\nProcessing (NLP), we design a time-series foundation model for forecasting\nwhose out-of-the-box zero-shot performance on a variety of public datasets\ncomes close to the accuracy of state-of-the-art supervised forecasting models\nfor each individual dataset. Our model is based on pretraining a\npatched-decoder style attention model on a large time-series corpus, and can\nwork well across different forecasting history lengths, prediction lengths and\ntemporal granularities.\n","authors":["Abhimanyu Das","Weihao Kong","Rajat Sen","Yichen Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.10688v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02558v1","updated":"2024-02-04T16:18:01Z","published":"2024-02-04T16:18:01Z","title":"Enhancing Robustness in Biomedical NLI Models: A Probing Approach for\n Clinical Trials","summary":" Large Language Models have revolutionized various fields and industries, such\nas Conversational AI, Content Generation, Information Retrieval, Business\nIntelligence, and Medical, to name a few. 
One major application in the medical field is to analyze and investigate\nclinical trials for entailment tasks. However, it has been observed that Large\nLanguage Models are susceptible to shortcut learning, factual inconsistency,\nand performance degradation with little variation in context. Adversarial and\nrobustness testing is performed to ensure the integrity of a model's output,\nbut ambiguity still persists. In order to ensure the integrity of the reasoning\nperformed and to investigate whether the model has correct syntactic and\nsemantic understanding, probing is used. Here, I used mnestic probing to\ninvestigate the Sci-five model, trained on clinical trials. I investigated the\nmodel for features learnt with respect to natural logic. To achieve this, I\ntrained task-specific probes and used them to investigate the final layers of\nthe trained model. Then, I fine-tuned the trained model using iterative null\nprojection. The results show that model accuracy improved. During\nexperimentation, I observed that the size of the probe has an effect on the\nfine-tuning process.\n","authors":["Ata Mustafa"],"pdf_url":"https://arxiv.org/pdf/2402.02558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02549v1","updated":"2024-02-04T15:52:59Z","published":"2024-02-04T15:52:59Z","title":"Are Large Language Models Table-based Fact-Checkers?","summary":" Table-based Fact Verification (TFV) aims to extract the entailment relation\nbetween statements and structured tables. Existing TFV methods based on\nsmall-scaled models suffer from insufficient labeled data and weak zero-shot\nability. Recently, the appearance of Large Language Models (LLMs) has gained\na lot of attention in research fields. They have shown powerful zero-shot and\nin-context learning abilities on several NLP tasks, but their potential on TFV\nis still unknown. In this work, we implement a preliminary study about whether\nLLMs are table-based fact-checkers. In detail, we design diverse prompts to\nexplore how the in-context learning can help LLMs in TFV, i.e., zero-shot and\nfew-shot TFV capability. Besides, we carefully design and construct TFV\ninstructions to study the performance gain brought by the instruction tuning of\nLLMs. Experimental results demonstrate that LLMs can achieve acceptable results\non zero-shot and few-shot TFV with prompt engineering, while instruction-tuning\ncan stimulate the TFV capability significantly. We also make some valuable\nfindings about the format of zero-shot prompts and the number of in-context\nexamples. Finally, we analyze some possible directions to promote the accuracy\nof TFV via LLMs, which is beneficial to further research of table reasoning.\n","authors":["Hangwen Zhang","Qingyi Si","Peng Fu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02549v1.pdf","comment":"CSCWD 2024"},{"id":"http://arxiv.org/abs/2402.02548v1","updated":"2024-02-04T15:52:46Z","published":"2024-02-04T15:52:46Z","title":"\"What's my model inside of?\": Exploring the role of environments for\n grounded natural language understanding","summary":" In contrast to classical cognitive science which studied brains in isolation,\necological approaches focused on the role of the body and environment in\nshaping cognition. Similarly, in this thesis we adopt an ecological approach to\ngrounded natural language understanding (NLU) research. 
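As a sketch of the prompt-design space explored in the table-based fact-verification entry above, here is one plausible zero-/few-shot prompt builder; the serialization format and label wording are assumptions, not the paper's actual prompts:

def tfv_prompt(table, statement, examples=()):
    # Serialize a table plus statement into a fact-verification prompt;
    # `examples` supplies optional (table, statement, label) few-shot demos.
    def render(tbl):
        header, *rows = tbl
        return "\n".join([" | ".join(header)] +
                         [" | ".join(map(str, r)) for r in rows])
    shots = "".join(f"Table:\n{render(t)}\nStatement: {s}\nLabel: {y}\n\n"
                    for t, s, y in examples)
    return (shots + f"Table:\n{render(table)}\nStatement: {statement}\n"
            "Label (entailed or refuted):")

table = [["player", "points"], ["Ann", 31], ["Bo", 12]]
print(tfv_prompt(table, "Ann scored more points than Bo."))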
Grounded language\nunderstanding studies language understanding systems situated in the context of\nevents, actions and percepts in naturalistic/simulated virtual environments.\nWhere classic research tends to focus on designing new models and optimization\nmethods while treating environments as given, we explore the potential of\nenvironment design for improving data collection and model development. We\ndeveloped novel training and annotation approaches for procedural text\nunderstanding based on text-based game environments. We also drew upon embodied\ncognitive linguistics literature to propose a roadmap for grounded NLP\nresearch, and to inform the development of a new benchmark for measuring the\nprogress of large language models on challenging commonsense reasoning tasks.\nWe leveraged the richer supervision provided by text-based game environments to\ndevelop Breakpoint Transformers, a novel approach to modeling intermediate\nsemantic information in long narrative or procedural texts. Finally, we\nintegrated theories on the role of environments in collective human\nintelligence to propose a design for AI-augmented \"social thinking\nenvironments\" for knowledge workers like scientists.\n","authors":["Ronen Tamari"],"pdf_url":"https://arxiv.org/pdf/2402.02548v1.pdf","comment":"PhD Thesis"},{"id":"http://arxiv.org/abs/2402.02541v1","updated":"2024-02-04T15:41:35Z","published":"2024-02-04T15:41:35Z","title":"Knowledge Generation for Zero-shot Knowledge-based VQA","summary":" Previous solutions to knowledge-based visual question answering~(K-VQA)\nretrieve knowledge from external knowledge bases and use supervised learning to\ntrain the K-VQA model. Recently, pre-trained LLMs have been used as both a\nknowledge source and a zero-shot QA model for K-VQA and demonstrated promising\nresults. However, these recent methods do not explicitly show the knowledge\nneeded to answer the questions and thus lack interpretability. Inspired by\nrecent work on knowledge generation from LLMs for text-based QA, in this work\nwe propose and test a similar knowledge-generation-based K-VQA method, which\nfirst generates knowledge from an LLM and then incorporates the generated\nknowledge for K-VQA in a zero-shot manner. We evaluate our method on two K-VQA\nbenchmarks and find that our method performs better than previous zero-shot\nK-VQA methods and our generated knowledge is generally relevant and helpful.\n","authors":["Rui Cao","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.02541v1.pdf","comment":"accepted as Findings in EACL 2023;"},{"id":"http://arxiv.org/abs/2402.02522v1","updated":"2024-02-04T15:10:34Z","published":"2024-02-04T15:10:34Z","title":"Absolute convergence and error thresholds in non-active adaptive\n sampling","summary":" Non-active adaptive sampling is a way of building machine learning models\nfrom a training data base that is supposed to dynamically and automatically\nderive a guaranteed sample size. In this context and regardless of the strategy\nused in both scheduling and generating of weak predictors, a proposal for\ncalculating absolute convergence and error thresholds is described. We not only\nmake it possible to establish when the quality of the model no longer\nincreases, but also supply a proximity condition to estimate in absolute\nterms how close it is to achieving such a goal, thus supporting decision making\nfor fine-tuning learning parameters in model selection. 
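The two-stage, knowledge-generation-based K-VQA recipe in the entry above reduces to two prompts: first elicit relevant knowledge, then answer conditioned on it. A hedged sketch, where the prompt wording and the caption-as-image-proxy are assumptions and `llm` is any prompt-to-text callable:

def kvqa_answer(question, caption, llm):
    # Stage 1: generate knowledge relevant to the question (interpretable).
    knowledge = llm(f"Image: {caption}\nQuestion: {question}\n"
                    "List facts that help answer the question:")
    # Stage 2: answer zero-shot, conditioned on the generated knowledge.
    answer = llm(f"Image: {caption}\nKnowledge: {knowledge}\n"
                 f"Question: {question}\nAnswer:")
    return answer, knowledge

stub = lambda prompt: "a toy reply to: " + prompt.splitlines()[-1]
print(kvqa_answer("What season is it?", "a snowy street", stub)[0])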
The technique proves\nits correctness and completeness with respect to our working hypotheses, in\naddition to strengthening the robustness of the sampling scheme. Tests meet our\nexpectations and illustrate the proposal in the domain of natural language\nprocessing, taking the generation of part-of-speech taggers as a case study.\n","authors":["Manuel Vilares Ferro","Victor M. Darriba Bilbao","Jesús Vilares Ferro"],"pdf_url":"https://arxiv.org/pdf/2402.02522v1.pdf","comment":"27 pages, 10 figures"},{"id":"http://arxiv.org/abs/2402.02516v1","updated":"2024-02-04T15:02:17Z","published":"2024-02-04T15:02:17Z","title":"Adaptive scheduling for adaptive sampling in POS taggers construction","summary":" We introduce an adaptive scheduling for adaptive sampling as a novel way of\nmachine learning in the construction of part-of-speech taggers. The goal is to\nspeed up the training on large data sets, without significant loss of\nperformance with regard to an optimal configuration. In contrast to previous\nmethods using a random, fixed or regularly rising spacing between the\ninstances, ours analyzes the shape of the learning curve geometrically in\nconjunction with a functional model to increase or decrease it at any time. The\nalgorithm proves to be formally correct regarding our working hypotheses.\nNamely, given a case, the following one is the nearest ensuring a net gain of\nlearning ability from the former, it being possible to modulate the level of\nrequirement for this condition. We also improve the robustness of sampling by\npaying greater attention to those regions of the training data base subject to\na temporary inflation in performance, thus preventing the learning from\nstopping prematurely.\n The proposal has been evaluated on the basis of its reliability to identify\nthe convergence of models, corroborating our expectations. While a concrete\nhalting condition is used for testing, users can choose any condition\nwhatsoever to suit their own specific needs.\n","authors":["Manuel Vilares Ferro","Victor M. Darriba Bilbao","Jesús Vilares Ferro"],"pdf_url":"https://arxiv.org/pdf/2402.02516v1.pdf","comment":"23 pages, 10 figures"},{"id":"http://arxiv.org/abs/2402.02515v1","updated":"2024-02-04T15:00:52Z","published":"2024-02-04T15:00:52Z","title":"Modeling of learning curves with applications to pos tagging","summary":" An algorithm to estimate the evolution of learning curves on the whole of a\ntraining data base, based on the results obtained from a portion and using a\nfunctional strategy, is introduced. We approximate iteratively the sought value\nat the desired time, independently of the learning technique used and once a\npoint in the process, called prediction level, has been passed. The proposal\nproves to be formally correct with respect to our working hypotheses and\nincludes a reliable proximity condition. This allows the user to fix a\nconvergence threshold with respect to the accuracy finally achievable, which\nextends the concept of stopping criterion and seems to be effective even in the\npresence of distorting observations.\n Our aim is to evaluate the training effort, supporting decision making in\norder to reduce the need for both human and computational resources during the\nlearning process. The proposal is of interest in at least three operational\nprocedures. 
The first is the anticipation of accuracy gain, with the purpose of\nmeasuring how much work is needed to achieve a certain degree of performance.\nThe second relates the comparison of efficiency between systems at training\ntime, with the objective of completing this task only for the one that best\nsuits our requirements. The prediction of accuracy is also a valuable item of\ninformation for customizing systems, since we can estimate in advance the\nimpact of settings on both the performance and the development costs. Using the\ngeneration of part-of-speech taggers as an example application, the\nexperimental results are consistent with our expectations.\n","authors":["Manuel Vilares Ferro","Victor M. Darriba Bilbao","Francisco J. Ribadas Pena"],"pdf_url":"https://arxiv.org/pdf/2402.02515v1.pdf","comment":"30 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.01386v2","updated":"2024-02-04T15:00:11Z","published":"2023-11-02T16:44:24Z","title":"Can Language Models Be Tricked by Language Illusions? Easier with\n Syntax, Harder with Semantics","summary":" Language models (LMs) have been argued to overlap substantially with human\nbeings in grammaticality judgment tasks. But when humans systematically make\nerrors in language processing, should we expect LMs to behave like cognitive\nmodels of language and mimic human behavior? We answer this question by\ninvestigating LMs' more subtle judgments associated with \"language illusions\"\n-- sentences that are vague in meaning, implausible, or ungrammatical but\nreceive unexpectedly high acceptability judgments by humans. We looked at three\nillusions: the comparative illusion (e.g. \"More people have been to Russia than\nI have\"), the depth-charge illusion (e.g. \"No head injury is too trivial to be\nignored\"), and the negative polarity item (NPI) illusion (e.g. \"The hunter who\nno villager believed to be trustworthy will ever shoot a bear\"). We found that\nprobabilities represented by LMs were more likely to align with human judgments\nof being \"tricked\" by the NPI illusion which examines a structural dependency,\ncompared to the comparative and the depth-charge illusions which require\nsophisticated semantic understanding. No single LM or metric yielded results\nthat are entirely consistent with human behavior. Ultimately, we show that LMs\nare limited both in their construal as cognitive models of human language\nprocessing and in their capacity to recognize nuanced but critical information\nin complicated language materials.\n","authors":["Yuhan Zhang","Edward Gibson","Forrest Davis"],"pdf_url":"https://arxiv.org/pdf/2311.01386v2.pdf","comment":"Accepted by The SIGNLL Conference on Computational Natural Language\n Learning 2023"},{"id":"http://arxiv.org/abs/2402.02513v1","updated":"2024-02-04T14:57:20Z","published":"2024-02-04T14:57:20Z","title":"Early stopping by correlating online indicators in neural networks","summary":" In order to minimize the generalization error in neural networks, a novel\ntechnique to identify overfitting phenomena when training the learner is\nformally introduced. This enables support of a reliable and trustworthy early\nstopping condition, thus improving the predictive power of that type of\nmodeling. Our proposal exploits the correlation over time in a collection of\nonline indicators, namely characteristic functions for indicating if a set of\nhypotheses are met, associated with a range of independent stopping conditions\nbuilt from a canary judgment to evaluate the presence of overfitting. 
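One way to read the correlated-indicators idea just described is as a quorum over independent overfitting tests that must agree over a window of recent evaluations. A minimal sketch; the window/quorum rule is an assumed concrete instantiation, not the paper's formal criterion:

def should_stop(histories, window=3, quorum=2):
    # Halt when at least `quorum` of the independent overfitting indicators
    # have fired on every one of the last `window` evaluations; requiring
    # sustained agreement plays the role of correlating the indicators.
    recent = [h[-window:] for h in histories if len(h) >= window]
    fired = sum(all(flags) for flags in recent)
    return fired >= quorum

# Three indicator histories (True = overfitting suspected), e.g. rising
# validation loss, a growing train/validation gap, a canary cross-validation
# drop.
ind = [[False, True, True, True],
       [False, False, True, True],
       [False, True, True, True]]
print(should_stop([h[:3] for h in ind]))  # False: no sustained agreement yet
print(should_stop(ind))                   # True: two indicators agree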
That way,\nwe provide a formal basis for decision making in terms of interrupting the\nlearning process.\n As opposed to previous approaches focused on a single criterion, we take\nadvantage of subsidiarities between independent assessments, thus seeking both\na wider operating range and greater diagnostic reliability. With a view to\nillustrating the effectiveness of the halting condition described, we choose to\nwork in the sphere of natural language processing, an operational continuum\nincreasingly based on machine learning. As a case study, we focus on parser\ngeneration, one of the most demanding and complex tasks in the domain. The\nselection of cross-validation as a canary function enables an actual comparison\nwith the most representative early stopping conditions based on overfitting\nidentification, pointing to a promising start toward an optimal bias and\nvariance control.\n","authors":["Manuel Vilares Ferro","Yerai Doval Mosquera","Francisco J. Ribadas Pena","Victor M. Darriba Bilbao"],"pdf_url":"https://arxiv.org/pdf/2402.02513v1.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.17686v2","updated":"2024-02-04T13:18:34Z","published":"2024-01-31T09:16:35Z","title":"Deductive Beam Search: Decoding Deducible Rationale for Chain-of-Thought\n Reasoning","summary":" Recent advancements have significantly augmented the reasoning capabilities\nof Large Language Models (LLMs) through various methodologies, especially\nchain-of-thought (CoT) reasoning. However, previous methods fail to address\nreasoning errors in intermediate steps, leading to accumulative errors. In this\npaper, we propose Deductive Beam Search (DBS), which seamlessly integrates CoT\nand deductive reasoning with step-wise beam search for LLMs. Our approach\ndeploys a verifier, verifying the deducibility of a reasoning step and its\npremises, thus alleviating the error accumulation. Furthermore, we introduce a\nscalable and labor-free data construction method to amplify our model's\nverification capabilities. Extensive experiments demonstrate that our approach\nsignificantly enhances the base performance of LLMs of various scales (7B, 13B,\n70B, and ChatGPT) across 8 reasoning datasets from 3 diverse reasoning genres,\nincluding arithmetic, commonsense, and symbolic. Moreover, our analysis proves\nDBS's capability of detecting diverse and subtle reasoning errors and\nrobustness on different model scales.\n","authors":["Tinghui Zhu","Kai Zhang","Jian Xie","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2401.17686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10477v5","updated":"2024-02-04T13:02:39Z","published":"2023-10-16T14:59:10Z","title":"Gaining Wisdom from Setbacks: Aligning Large Language Models via Mistake\n Analysis","summary":" The rapid development of large language models (LLMs) has not only provided\nnumerous opportunities but also presented significant challenges. This becomes\nparticularly evident when LLMs inadvertently generate harmful or toxic content,\neither unintentionally or because of intentional inducement. Existing alignment\nmethods usually direct LLMs toward the favorable outcomes by utilizing\nhuman-annotated, flawless instruction-response pairs. Conversely, this study\nproposes a novel alignment technique based on mistake analysis, which\ndeliberately exposes LLMs to erroneous content to learn the reasons for\nmistakes and how to avoid them. 
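The step-wise, verifier-guided search in the Deductive Beam Search entry above can be sketched generically: a proposer expands each reasoning chain, a verifier scores the deducibility of each new step from its premises, and only the best-scoring chains survive each level. The toy arithmetic task and scoring below are hypothetical, not the paper's verifier:

def deductive_beam_search(premise, expand, verify, width=2, depth=3):
    # Each beam is (chain of steps, cumulative verifier score).
    beams = [([], 0.0)]
    for _ in range(depth):
        grown = [(chain + [s], score + verify(premise, chain, s))
                 for chain, score in beams
                 for s in expand(premise, chain)]
        if not grown:
            break
        beams = sorted(grown, key=lambda b: b[1], reverse=True)[:width]
    return beams[0]

# Toy task: reach 10 from 1 with +1 / *2 steps; the "verifier" rewards
# getting close to 10, standing in for a deducibility score.
expand = lambda p, c: ["+1", "*2"]
def apply(chain):
    v = 1
    for s in chain:
        v = v + 1 if s == "+1" else v * 2
    return v
verify = lambda p, c, s: -abs(10 - apply(c + [s]))
print(deductive_beam_search(1, expand, verify))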
In this case, mistakes are repurposed into\nvaluable data for alignment, effectively helping to avoid the production of\nerroneous responses. Without external models or human annotations, our method\nleverages a model's intrinsic ability to discern undesirable mistakes and\nimproves the safety of its generated responses. Experimental results reveal\nthat our method outperforms existing alignment approaches in enhancing model\nsafety while maintaining the overall utility.\n","authors":["Kai Chen","Chunwei Wang","Kuo Yang","Jianhua Han","Lanqing Hong","Fei Mi","Hang Xu","Zhengying Liu","Wenyong Huang","Zhenguo Li","Dit-Yan Yeung","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.10477v5.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2304.01904v2","updated":"2024-02-04T12:15:18Z","published":"2023-04-04T15:57:28Z","title":"REFINER: Reasoning Feedback on Intermediate Representations","summary":" Language models (LMs) have recently shown remarkable performance on reasoning\ntasks by explicitly generating intermediate inferences, e.g., chain-of-thought\nprompting. However, these intermediate inference steps may be inappropriate\ndeductions from the initial context and lead to incorrect final predictions.\nHere we introduce REFINER, a framework for finetuning LMs to explicitly\ngenerate intermediate reasoning steps while interacting with a critic model\nthat provides automated feedback on the reasoning. Specifically, the critic\nprovides structured feedback that the reasoning LM uses to iteratively improve\nits intermediate arguments. Empirical evaluations of REFINER on three diverse\nreasoning tasks show significant improvements over baseline LMs of comparable\nscale. Furthermore, when using GPT-3.5 or ChatGPT as the reasoner, the trained\ncritic significantly improves reasoning without finetuning the reasoner.\nFinally, our critic model is trained without expensive human-in-the-loop data\nbut can be substituted with humans at inference time.\n","authors":["Debjit Paul","Mete Ismayilzada","Maxime Peyrard","Beatriz Borges","Antoine Bosselut","Robert West","Boi Faltings"],"pdf_url":"https://arxiv.org/pdf/2304.01904v2.pdf","comment":"Accepted at EACL 2024"},{"id":"http://arxiv.org/abs/2402.02449v1","updated":"2024-02-04T11:38:12Z","published":"2024-02-04T11:38:12Z","title":"Surfing the modeling of PoS taggers in low-resource scenarios","summary":" The recent trend towards the application of deep structured techniques has\nrevealed the limits of huge models in natural language processing. This has\nreawakened the interest in traditional machine learning algorithms, which have\nproved still to be competitive in certain contexts, in particular low-resource\nsettings. In parallel, model selection has become an essential task to boost\nperformance at reasonable cost, even more so when we talk about processes\ninvolving domains where the training and/or computational resources are scarce.\nAgainst this backdrop, we evaluate the early estimation of learning curves as a\npractical mechanism for selecting the most appropriate model in scenarios\ncharacterized by the use of non-deep learners in resource-lean settings. On the\nbasis of a formal approximation model previously evaluated under conditions of\nwide availability of training and validation resources, we study the\nreliability of such an approach in a different and much more demanding\noperational environment. 
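The generator-critic interaction in the REFINER entry above is, at its core, a feedback loop: generate, critique, and regenerate conditioned on the critique until the critic is satisfied. A minimal sketch with stub models (REFINER additionally finetunes the generator and trains the critic, which is not reproduced here):

def refine(question, generator, critic, max_rounds=3):
    # The critic returns structured feedback, or None when satisfied;
    # the generator conditions its next attempt on that feedback.
    feedback = None
    for _ in range(max_rounds):
        answer = generator(question, feedback)
        feedback = critic(question, answer)
        if feedback is None:
            break
    return answer

gen = lambda q, fb: "x = 4" if fb else "x = 5"          # toy reasoner
crit = lambda q, a: None if a == "x = 4" else "recheck step 2"
print(refine("solve 2x = 8", gen, crit))                # -> x = 4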
Using as a case study the generation of PoS taggers for\nGalician, a language belonging to the Western Ibero-Romance group, the\nexperimental results are consistent with our expectations.\n","authors":["Manuel Vilares Ferro","Víctor M. Darriba Bilbao","Francisco J. Ribadas-Pena","Jorge Graña Gil"],"pdf_url":"https://arxiv.org/pdf/2402.02449v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.02447v1","updated":"2024-02-04T11:12:17Z","published":"2024-02-04T11:12:17Z","title":"Breaking MLPerf Training: A Case Study on Optimizing BERT","summary":" Speeding up large-scale distributed training is challenging in that it\nrequires improving various components of training including load balancing,\ncommunication, optimizers, etc. We present novel approaches for fast\nlarge-scale training of the BERT model which individually ameliorate each\ncomponent, thereby leading to a new level of BERT training performance. Load\nbalancing is imperative in distributed BERT training since its training\ndatasets are characterized by samples with various lengths. Communication cost,\nwhich is proportional to the scale of distributed training, needs to be hidden\nby useful computation. In addition, the optimizers, e.g., ADAM, LAMB, etc.,\nneed to be carefully re-evaluated in the context of large-scale distributed\ntraining. We propose two new ideas, (1) local presorting based on dataset\nstratification for load balancing and (2) bucket-wise gradient clipping before\nallreduce which allows us to benefit from the overlap of gradient computation\nand synchronization as well as the fast training of gradient clipping before\nallreduce. We also re-evaluate existing optimizers via hyperparameter\noptimization and utilize ADAM, which also contributes to fast training via\nlarger batches than existing methods. Our proposed methods, all combined, give\nthe fastest MLPerf BERT training of 25.1 (22.3) seconds on 1,024 NVIDIA A100\nGPUs, which is 1.33x (1.13x) and 1.57x faster than the other top two (one)\nsubmissions to MLPerf v1.1 (v2.0). Our implementation and evaluation results\nare available at MLPerf v1.1~v2.1.\n","authors":["Yongdeok Kim","Jaehyung Ahn","Myeongwoo Kim","Changin Choi","Heejae Kim","Narankhuu Tuvshinjargal","Seungwon Lee","Yanzi Zhang","Yuan Pei","Xiongzhan Linghu","Jingkun Ma","Lin Chen","Yuehua Dai","Sungjoo Yoo"],"pdf_url":"https://arxiv.org/pdf/2402.02447v1.pdf","comment":"Total 15 pages (Appendix 3 pages)"},{"id":"http://arxiv.org/abs/2310.19923v4","updated":"2024-02-04T11:11:53Z","published":"2023-10-30T18:35:30Z","title":"Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long\n Documents","summary":" Text embedding models have emerged as powerful tools for transforming\nsentences into fixed-sized feature vectors that encapsulate semantic\ninformation. While these models are essential for tasks like information\nretrieval, semantic clustering, and text re-ranking, most existing open-source\nmodels, especially those built on architectures like BERT, struggle to\nrepresent lengthy documents and often resort to truncation. One common approach\nto mitigate this challenge involves splitting documents into smaller paragraphs\nfor embedding. However, this strategy results in a much larger set of vectors,\nconsequently leading to increased memory consumption and computationally\nintensive vector searches with elevated latency.\n To address these challenges, we introduce Jina Embeddings 2, an open-source\ntext embedding model capable of accommodating up to 8192 tokens. 
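A sketch of the length-based load-balancing intuition behind the "local presorting" idea in the MLPerf BERT entry above: batching similar-length samples reduces padding and evens out per-worker work. The paper's exact stratification scheme is not reproduced; this is the plain sorted-bucketing variant:

def presort_batches(lengths, batch_size):
    # Order samples by sequence length so each batch groups similar-length
    # sequences, minimizing padding waste and balancing per-step work.
    order = sorted(range(len(lengths)), key=lengths.__getitem__)
    return [order[i:i + batch_size] for i in range(0, len(order), batch_size)]

lengths = [128, 17, 64, 33, 500, 80, 20, 256]
for batch in presort_batches(lengths, 2):
    print(batch, [lengths[i] for i in batch])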
This model is\ndesigned to transcend the conventional 512-token limit and adeptly process long\ndocuments. Jina Embeddings 2 not only achieves state-of-the-art performance on\na range of embedding-related tasks in the MTEB benchmark but also matches the\nperformance of OpenAI's proprietary ada-002 model. Additionally, our\nexperiments indicate that an extended context can enhance performance in tasks\nsuch as NarrativeQA.\n","authors":["Michael Günther","Jackmin Ong","Isabelle Mohr","Alaeddine Abdessalem","Tanguy Abel","Mohammad Kalim Akram","Susana Guzman","Georgios Mastrapas","Saba Sturua","Bo Wang","Maximilian Werk","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.19923v4.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2309.08969v2","updated":"2024-02-04T09:44:34Z","published":"2023-09-16T11:58:39Z","title":"Rethinking STS and NLI in Large Language Models","summary":" Recent years have seen the rise of large language models (LLMs), where\npractitioners use task-specific prompts; this was shown to be effective for a\nvariety of tasks. However, when applied to semantic textual similarity (STS)\nand natural language inference (NLI), the effectiveness of LLMs turns out to be\nlimited by low-resource domain accuracy, model overconfidence, and difficulty\nin capturing the disagreements between human judgements. With this in mind, here\nwe try to rethink STS and NLI in the era of LLMs. We first evaluate the\nperformance of STS and NLI in the clinical/biomedical domain, and then we\nassess LLMs' predictive confidence and their capability of capturing collective\nhuman opinions. We find that these old problems have yet to be properly\naddressed in the era of LLMs.\n","authors":["Yuxia Wang","Minghan Wang","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2309.08969v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.13138 by other authors"},{"id":"http://arxiv.org/abs/2402.02420v1","updated":"2024-02-04T09:36:31Z","published":"2024-02-04T09:36:31Z","title":"Factuality of Large Language Models in the Year 2024","summary":" Large language models (LLMs), especially when instruction-tuned for chat,\nhave become part of our daily lives, freeing people from the process of\nsearching, extracting, and integrating information from multiple sources by\noffering a straightforward answer to a variety of questions in a single place.\nUnfortunately, in many cases, LLM responses are factually incorrect, which\nlimits their applicability in real-world scenarios. As a result, research on\nevaluating and improving the factuality of LLMs has attracted a lot of\nattention recently. In this survey, we critically analyze existing work with\nthe aim of identifying the major challenges and their associated causes, pointing\nout potential solutions for improving the factuality of LLMs, and analyzing\nthe obstacles to automated factuality evaluation for open-ended text\ngeneration.
We further offer an outlook on where future research should go.\n","authors":["Yuxia Wang","Minghan Wang","Muhammad Arslan Manzoor","Georgi Georgiev","Rocktim Jyoti Das","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2402.02420v1.pdf","comment":"9 pages, 1 figure and 2 tables"},{"id":"http://arxiv.org/abs/2402.02416v1","updated":"2024-02-04T09:24:51Z","published":"2024-02-04T09:24:51Z","title":"Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction","summary":" Efforts to align Large Language Models (LLMs) are mainly conducted via\nReinforcement Learning from Human Feedback (RLHF) methods. However, RLHF\nencounters major challenges, including training reward models and actor-critic\nengineering; importantly, it requires access to LLM parameters. Here we\nintroduce Aligner, a new efficient alignment paradigm that bypasses the whole\nRLHF process by learning the correctional residuals between the aligned and the\nunaligned answers. Our Aligner offers several key advantages. Firstly, it is an\nautoregressive seq2seq model that is trained on the query-answer-correction\ndataset via supervised learning; this offers a parameter-efficient alignment\nsolution with minimal resources. Secondly, the Aligner facilitates\nweak-to-strong generalization; finetuning large pretrained models with Aligner's\nsupervisory signals demonstrates a strong performance boost. Thirdly, Aligner\nfunctions as a model-agnostic plug-and-play module, allowing for its direct\napplication on different open-source and API-based models. Remarkably,\nAligner-7B improves 11 different LLMs by 18% in helpfulness and 23% in\nharmlessness on average (GPT-4 by 26.9% and 17.5%). When finetuning (strong)\nLlama2-70B with (weak) Aligner-7B's supervision, we can improve Llama2 by 8.2%\nin helpfulness and 61.6% in harmlessness. See our dataset and code at\n\\url{https://aligner2024.github.io}.\n","authors":["Jiaming Ji","Boyuan Chen","Hantao Lou","Donghai Hong","Borong Zhang","Xuehai Pan","Juntao Dai","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2402.02416v1.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2402.00263v2","updated":"2024-02-04T09:23:23Z","published":"2024-02-01T01:23:07Z","title":"Does DetectGPT Fully Utilize Perturbation? Selective Perturbation on\n Model-Based Contrastive Learning Detector would be Better","summary":" The burgeoning capabilities of large language models (LLMs) have raised\ngrowing concerns about abuse. DetectGPT, a zero-shot metric-based unsupervised\nmachine-generated text detector, first introduces perturbation and shows great\nperformance improvement. However, DetectGPT's random perturbation strategy\nmight introduce noise, limiting the distinguishability and further performance\nimprovements. Moreover, its logit regression module relies on setting the\nthreshold, which harms the generalizability and applicability to individual or\nsmall-batch inputs. Hence, we propose a novel detector, Pecola, which uses\nselective strategy perturbation to relieve the information loss caused by\nrandom masking, and multi-pair contrastive learning to capture the implicit\npattern information during perturbation, facilitating few-shot performance. The\nexperiments show that Pecola outperforms the SOTA method by 1.20% in accuracy\non average on four public datasets.
We further analyze the effectiveness,\nrobustness, and generalization of our perturbation method.\n","authors":["Shengchao Liu","Xiaoming Liu","Yichen Wang","Zehua Cheng","Chengzhengxu Li","Zhaohan Zhang","Yu Lan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2402.00263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06908v2","updated":"2024-02-04T09:07:54Z","published":"2023-07-13T17:14:38Z","title":"Generating Benchmarks for Factuality Evaluation of Language Models","summary":" Before deploying a language model (LM) within a given domain, it is important\nto measure its tendency to generate factually incorrect information in that\ndomain. Existing methods for factuality evaluation of LLM generation focus on\nfacts sampled from the LM itself, and thus do not control the set of evaluated\nfacts and might under-represent domain specific or rare facts. We propose\nFACTOR: Factual Assessment via Corpus TransfORmation, a scalable approach for\nevaluating LM factuality. FACTOR automatically transforms a factual corpus of\ninterest into a benchmark evaluating an LM's propensity to generate true facts\nfrom the corpus vs. similar but incorrect statements. We use our framework to\ncreate three benchmarks: Wiki-FACTOR, News-FACTOR and Expert-FACTOR. We show\nthat: (i) our benchmark scores increase with model size and improve when the LM\nis augmented with retrieval; (ii) benchmark score and perplexity do not always\nagree on model ranking; (iii) when perplexity and benchmark score disagree, the\nlatter better reflects factuality in open-ended generation, as measured by\nhuman annotators. We make our data and code publicly available at\nhttps://github.com/AI21Labs/factor.\n","authors":["Dor Muhlgay","Ori Ram","Inbal Magar","Yoav Levine","Nir Ratner","Yonatan Belinkov","Omri Abend","Kevin Leyton-Brown","Amnon Shashua","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2307.06908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02408v1","updated":"2024-02-04T08:57:54Z","published":"2024-02-04T08:57:54Z","title":"GLaPE: Gold Label-agnostic Prompt Evaluation and Optimization for Large\n Language Model","summary":" Despite the rapid progress of large language models (LLMs), their task\nperformance remains sensitive to prompt design. Recent studies have explored\nleveraging the LLM itself as an optimizer to identify optimal prompts that\nmaximize task accuracy. However, when evaluating prompts, such approaches\nheavily rely on elusive manually annotated gold labels to calculate task\naccuracy for each candidate prompt, which hinders widespread implementation\nand generality. To overcome this limitation, this work proposes a gold\nlabel-agnostic prompt evaluation (GLaPE) to alleviate dependence on gold\nlabels. Motivated by the observed correlation between self-consistency and the\naccuracy of the answer, we adopt self-consistency as the initial evaluation\nscore. Subsequently, we refine the scores of prompts producing identical\nanswers to be mutually consistent. Experimental results show that GLaPE\nprovides reliable evaluations consistent with accuracy, even in the absence of\ngold labels. Moreover, on six popular reasoning tasks, our GLaPE-based prompt\noptimization yields effective prompts comparable to accuracy-based ones.
The\ncode is publicly available at https://github.com/thunderous77/GLaPE.\n","authors":["Xuanchang Zhang","Zhuosheng Zhang","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02392v1","updated":"2024-02-04T08:11:45Z","published":"2024-02-04T08:11:45Z","title":"DeLLMa: A Framework for Decision Making Under Uncertainty with Large\n Language Models","summary":" Large language models (LLMs) are increasingly used across society, including\nin domains like business, engineering, and medicine. These fields often grapple\nwith decision-making under uncertainty, a critical yet challenging task. In\nthis paper, we show that directly prompting LLMs on these types of\ndecision-making problems yields poor results, especially as the problem\ncomplexity increases. To overcome this limitation, we propose DeLLMa\n(Decision-making Large Language Model assistant), a framework designed to\nenhance decision-making accuracy in uncertain environments. DeLLMa involves a\nmulti-step scaffolding procedure, drawing upon principles from decision theory\nand utility theory, to provide an optimal and human-auditable decision-making\nprocess. We validate our framework on decision-making environments involving\nreal agriculture and finance data. Our results show that DeLLMa can\nsignificantly improve LLM decision-making performance, achieving up to a 40%\nincrease in accuracy over competing methods.\n","authors":["Ollie Liu","Deqing Fu","Dani Yogatama","Willie Neiswanger"],"pdf_url":"https://arxiv.org/pdf/2402.02392v1.pdf","comment":"23 pages, 17 figures"},{"id":"http://arxiv.org/abs/2211.11419v4","updated":"2024-02-04T08:03:23Z","published":"2022-11-21T13:04:37Z","title":"SSCFormer: Push the Limit of Chunk-wise Conformer for Streaming ASR\n Using Sequentially Sampled Chunks and Chunked Causal Convolution","summary":" Currently, chunk-wise schemes are often used to make Automatic Speech\nRecognition (ASR) models support streaming deployment. However, existing\napproaches are unable to capture the global context, lack support for parallel\ntraining, or exhibit quadratic complexity for the computation of multi-head\nself-attention (MHSA). On the other hand, causal convolution, which uses no future\ncontext, has become the de facto module in streaming Conformer. In this\npaper, we propose SSCFormer to push the limit of chunk-wise Conformer for\nstreaming ASR using the following two techniques: 1) A novel cross-chunk\ncontext generation method, named the Sequential Sampling Chunk (SSC) scheme, to\nre-partition chunks from regular partitioned chunks to facilitate efficient\nlong-term contextual interaction within local chunks. 2) The Chunked Causal\nConvolution (C2Conv) is designed to concurrently capture the left context and\nchunk-wise future context. Evaluations on AISHELL-1 show that an End-to-End\n(E2E) CER of 5.33% can be achieved, which even outperforms a strong time-restricted\nbaseline U2.
Moreover, the chunk-wise MHSA computation in our model enables it\nto train with a large batch size and perform inference with linear complexity.\n","authors":["Fangyuan Wang","Bo Xu","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2211.11419v4.pdf","comment":"This manuscript has been accepted by SPL"},{"id":"http://arxiv.org/abs/2402.02389v1","updated":"2024-02-04T08:01:07Z","published":"2024-02-04T08:01:07Z","title":"KICGPT: Large Language Model with Knowledge in Context for Knowledge\n Graph Completion","summary":" Knowledge Graph Completion (KGC) is crucial for addressing knowledge graph\nincompleteness and supporting downstream applications. Many models have been\nproposed for KGC. They can be categorized into two main classes: triple-based\nand text-based approaches. Triple-based methods struggle with long-tail\nentities due to limited structural information and imbalanced entity\ndistributions. Text-based methods alleviate this issue but require costly\ntraining for language models and specific finetuning for knowledge graphs,\nwhich limits their efficiency. To alleviate these limitations, in this paper,\nwe propose KICGPT, a framework that integrates a large language model (LLM) and\na triple-based KGC retriever. It alleviates the long-tail problem without\nincurring additional training overhead. KICGPT uses an in-context learning\nstrategy called Knowledge Prompt, which encodes structural knowledge into\ndemonstrations to guide the LLM. Empirical results on benchmark datasets\ndemonstrate the effectiveness of KICGPT with smaller training overhead and no\nfinetuning.\n","authors":["Yanbin Wei","Qiushi Huang","James T. Kwok","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02389v1.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2402.02388v1","updated":"2024-02-04T07:59:06Z","published":"2024-02-04T07:59:06Z","title":"Solution-oriented Agent-based Models Generation with Verifier-assisted\n Iterative In-context Learning","summary":" Agent-based models (ABMs) stand as an essential paradigm for proposing and\nvalidating hypothetical solutions or policies aimed at addressing challenges\nposed by complex systems and achieving various objectives. This process demands\nlabor-intensive endeavors and multidisciplinary expertise. Large language\nmodels (LLMs) encapsulating cross-domain knowledge and programming proficiency\ncould potentially alleviate the difficulty of this process. However, LLMs excel\nin handling sequential information, making it challenging to analyze the\nintricate interactions and nonlinear dynamics inherent in ABMs. Additionally,\ndue to the lack of self-evaluation capability of LLMs, relying solely on LLMs\nis insufficient to effectively accomplish this process. In this paper, we\npresent SAGE, a general solution-oriented ABM generation framework designed for\nautomatic modeling and generating solutions for targeted problems. Unlike\napproaches reliant on expert handcrafting or resource-intensive neural network\ntraining, SAGE establishes a verifier-assisted iterative in-context learning\nprocess employing large language models (LLMs) to leverage their inherent\ncross-domain knowledge for tackling intricate demands from diverse domain\nscenarios. In SAGE, we introduce a semi-structured conceptual representation\nthat makes explicit the intricate structures of ABMs, and an objective representation to\nguide LLMs in modeling scenarios and proposing hypothetical solutions through\nin-context learning.
To ensure model executability and solution\nfeasibility, SAGE devises a two-level verifier with chain-of-thought prompting\ntailored to the complex interactions and non-linear dynamics of ABMs, driving\nthe iterative generation optimization. Moreover, we construct an evaluation\ndataset of solution-oriented ABMs from open sources. It contains practical\nmodels across various domains.\n","authors":["Tong Niu","Weihao Zhang","Rong Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02380v1","updated":"2024-02-04T07:39:06Z","published":"2024-02-04T07:39:06Z","title":"Evaluating Large Language Models in Analysing Classroom Dialogue","summary":" This study explores the application of Large Language Models (LLMs),\nspecifically GPT-4, in the analysis of classroom dialogue, a crucial research\ntask for both teaching diagnosis and quality improvement. Recognizing the\nknowledge-intensive and labor-intensive nature of traditional qualitative\nmethods in educational research, this study investigates the potential of LLMs\nto streamline and enhance the analysis process. The study involves datasets\nfrom a middle school, encompassing classroom dialogues across mathematics and\nChinese classes. These dialogues were manually coded by educational experts and\nthen analyzed using a customised GPT-4 model. This study focuses on comparing\nmanual annotations with the outputs of GPT-4 to evaluate its efficacy in\nanalyzing educational dialogues. Time efficiency, inter-coder agreement, and\ninter-coder reliability between human coders and GPT-4 are evaluated. Results\nindicate substantial time savings with GPT-4, and a high degree of consistency\nin coding between the model and human coders, with some discrepancies in\nspecific codes. These findings highlight the strong potential of LLMs in\nteaching evaluation and facilitation.\n","authors":["Yun Long","Haifeng Luo","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02379v1","updated":"2024-02-04T07:33:45Z","published":"2024-02-04T07:33:45Z","title":"Rethinking the Evaluation of Pre-trained Text-and-Layout Models from an\n Entity-Centric Perspective","summary":" Recently developed pre-trained text-and-layout models (PTLMs) have shown\nremarkable success in multiple information extraction tasks on visually-rich\ndocuments. However, the prevailing evaluation pipeline may not be sufficiently\nrobust for assessing the information extraction ability of PTLMs, due to\ninadequate annotations within the benchmarks. Therefore, we propose the necessary\nstandards for an ideal benchmark to evaluate the information extraction ability\nof PTLMs. We then introduce EC-FUNSD, an entity-centric benchmark designed for\nthe evaluation of semantic entity recognition and entity linking on\nvisually-rich documents. This dataset contains diverse formats of document\nlayouts and annotations of semantic-driven entities and their relations.\nMoreover, this dataset disentangles the falsely coupled annotation of segment\nand entity that arises from the block-level annotation of FUNSD.
Experimental\nresults demonstrate that state-of-the-art PTLMs exhibit overfitting tendencies\non the prevailing benchmarks, as their performance sharply decreases when the\ndataset bias is removed.\n","authors":["Chong Zhang","Yixi Zhao","Chenshu Yuan","Yi Tu","Ya Guo","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11767v2","updated":"2024-02-04T06:59:45Z","published":"2023-08-15T23:22:37Z","title":"Detection of ChatGPT Fake Science with the xFakeBibs Learning Algorithm","summary":" ChatGPT is becoming a new reality. In this paper, we demonstrate a method for\ndistinguishing ChatGPT-generated publications from those produced by\nscientists. The objective of this work is to introduce a newly designed\nsupervised network-driven algorithm that illustrates how to predict\nmachine-generated content. The premise is that ChatGPT content exhibits\nbehavior that is distinctive and can be set apart from scientific articles. The\nalgorithm was trained and tested on three disease-specific publications, with\neach model constructed from 100 abstracts. Additionally, the algorithm\nunderwent k-Folds calibration (depending on the availability of the data) to\nestablish a lower-upper bound range of acceptance. The network training model\nof ChatGPT showed a lower number of nodes and a higher number of edges when\ncompared with models of real article abstracts. The algorithm was executed in\nsingle-mode to predict the class of one type of dataset at a time and achieved\n>94%. It was also executed in multi-mode on mixed documents of ChatGPT and\nPubMed abstracts. The algorithm remarkably predicted real articles with a\nprecision of 100% and, on rare occasions, 96%-98%. However, ChatGPT content was\noften misclassified as real publications with up to 88% accuracy in all\ndatasets of the three diseases. Our results also showed that the year of\npublication of articles mixed with ChatGPT-generated content may play a role in\ndetecting the correct class, where the older the publication, the better the\nprediction.\n","authors":["Ahmed Abdeen Hamed","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2308.11767v2.pdf","comment":"14 pages, 6 figures, 4 tables, 2 algorithms"},{"id":"http://arxiv.org/abs/2402.02369v1","updated":"2024-02-04T06:56:23Z","published":"2024-02-04T06:56:23Z","title":"M$^3$Face: A Unified Multi-Modal Multilingual Framework for Human Face\n Generation and Editing","summary":" Human face generation and editing represent an essential task in the era of\ncomputer vision and the digital world. Recent studies have shown remarkable\nprogress in multi-modal face generation and editing, for instance, using face\nsegmentation to guide image generation. However, it may be challenging for some\nusers to create these conditioning modalities manually. Thus, we introduce\nM3Face, a unified multi-modal multilingual framework for controllable face\ngeneration and editing. This framework enables users to utilize only text input\nto generate controlling modalities automatically, for instance, semantic\nsegmentation or facial landmarks, and subsequently generate face images. We\nconduct extensive qualitative and quantitative experiments to showcase our\nframework's face generation and editing capabilities. Additionally, we propose\nthe M3CelebA Dataset, a large-scale multi-modal and multilingual face dataset\ncontaining high-quality images, semantic segmentations, facial landmarks, and\ndifferent captions for each image in multiple languages.
The code and the\ndataset will be released upon publication.\n","authors":["Mohammadreza Mofayezi","Reza Alipour","Mohammad Ali Kakavand","Ehsaneddin Asgari"],"pdf_url":"https://arxiv.org/pdf/2402.02369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13565v3","updated":"2024-02-04T06:52:28Z","published":"2024-01-24T16:21:28Z","title":"Large Malaysian Language Model Based on Mistral for Enhanced Local\n Language Understanding","summary":" In this paper, we present significant advancements in the pretraining of\nMistral 7B, a large-scale language model, using a dataset of 32.6 GB,\nequivalent to 1.1 billion tokens. We explore the impact of extending the\ncontext length, releasing models with context lengths of 4096 and 32768 tokens,\nand further refining performance with a specialized 16384 context length\ninstruction-tuned model, which we call Malaysian Mistral.\n Our experiments demonstrate the efficacy of continued pretraining and the\ninfluence of extended context lengths on Mistral 7B's language understanding\ncapabilities. Additionally, we release a model specifically tuned with a 16384\ncontext length instruction, showcasing its potential for capturing nuanced\nlanguage intricacies.\n Furthermore, our research contributes to the benchmarking of Malaysian\nMistral against prominent language models, including ChatGPT3.5 and Claude 2.\nWe present compelling results indicating Malaysian Mistral's superior\nperformance on the Tatabahasa (Malay grammar) test set, particularly when\nfine-tuned with instructions.\n All models are released at\nhttps://huggingface.co/collections/mesolitica/malaysian-mistral-7b-6528f2ec825f4bba46c1700c\n","authors":["Husein Zolkepli","Aisyah Razak","Kamarul Adha","Ariff Nazhan"],"pdf_url":"https://arxiv.org/pdf/2401.13565v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05214v2","updated":"2024-02-04T06:21:03Z","published":"2023-05-09T07:23:01Z","title":"CharSpan: Utilizing Lexical Similarity to Enable Zero-Shot Machine\n Translation for Extremely Low-resource Languages","summary":" We address the task of machine translation (MT) from extremely low-resource\nlanguage (ELRL) to English by leveraging cross-lingual transfer from\n'closely-related' high-resource language (HRL). The development of an MT system\nfor ELRL is challenging because these languages typically lack parallel corpora\nand monolingual corpora, and their representations are absent from large\nmultilingual language models. Many ELRLs share lexical similarities with some\nHRLs, which presents a novel modeling opportunity. However, existing\nsubword-based neural MT models do not explicitly harness this lexical\nsimilarity, as they only implicitly align HRL and ELRL latent embedding space.\nTo overcome this limitation, we propose a novel approach, CharSpan, based on\n'character-span noise augmentation' applied to the training data of the HRL. This serves\nas a regularization technique, making the model more robust to 'lexical\ndivergences' between the HRL and ELRL, thus facilitating effective\ncross-lingual transfer.
Our method significantly outperformed strong baselines\nin zero-shot settings on closely related HRL and ELRL pairs from three diverse\nlanguage families, emerging as the state-of-the-art model for ELRLs.\n","authors":["Kaushal Kumar Maurya","Rahul Kejriwal","Maunendra Sankar Desarkar","Anoop Kunchukuttan"],"pdf_url":"https://arxiv.org/pdf/2305.05214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10456v2","updated":"2024-02-04T06:05:06Z","published":"2023-09-19T09:13:30Z","title":"Improving Speaker Diarization using Semantic Information: Joint Pairwise\n Constraints Propagation","summary":" Speaker diarization has gained considerable attention within the speech\nprocessing research community. Mainstream speaker diarization relies primarily on\nspeakers' voice characteristics extracted from acoustic signals and often\noverlooks the potential of semantic information. Considering the fact that\nspeech signals can efficiently convey the content of a speech, it is in our\ninterest to fully exploit these semantic cues by utilizing language models. In\nthis work, we propose a novel approach to effectively leverage semantic\ninformation in clustering-based speaker diarization systems. Firstly, we\nintroduce spoken language understanding modules to extract speaker-related\nsemantic information and utilize this information to construct pairwise\nconstraints. Secondly, we present a novel framework to integrate these\nconstraints into the speaker diarization pipeline, enhancing the performance of\nthe entire system. Extensive experiments conducted on the public dataset\ndemonstrate the consistent superiority of our proposed approach over\nacoustic-only speaker diarization systems.\n","authors":["Luyao Cheng","Siqi Zheng","Qinglin Zhang","Hui Wang","Yafeng Chen","Qian Chen","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.10456v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03735v2","updated":"2024-02-04T05:26:41Z","published":"2024-01-08T08:54:22Z","title":"Language Models Understand Numbers, at Least Partially","summary":" Large language models (LLMs) have exhibited impressive competence in various\ntasks, but their opaque internal mechanisms hinder their use in mathematical\nproblems. In this paper, we study a fundamental question: whether language\nmodels understand numbers, a basic element in math. Based on an assumption that\nLLMs should be capable of compressing numbers in their hidden states to solve\nmathematical problems, we construct a synthetic dataset comprising addition\nproblems and utilize linear probes to read out input numbers from the hidden\nstates. Experimental results support the existence of compressed numbers in\nLLMs. However, it is difficult to precisely reconstruct the original numbers,\nindicating that the compression process may not be lossless. Further\nexperiments show that LLMs can utilize encoded numbers to perform arithmetic\ncomputations, and the computational ability scales up with the model size.
Our\npreliminary research suggests that LLMs exhibit a partial understanding of\nnumbers, offering insights for future investigations into the models'\nmathematical capability.\n","authors":["Fangwei Zhu","Damai Dai","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2401.03735v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13933v3","updated":"2024-02-04T03:57:11Z","published":"2023-12-21T15:28:02Z","title":"Structured Probabilistic Coding","summary":" This paper presents a new supervised representation learning framework,\nnamely structured probabilistic coding (SPC), to learn compact and informative\nrepresentations from input related to the target task. SPC is an encoder-only\nprobabilistic coding technology with a structured regularization from the\ntarget space. It can enhance the generalization ability of pre-trained language\nmodels for better language understanding. Specifically, our probabilistic\ncoding simultaneously performs information encoding and task prediction in one\nmodule to more fully utilize the effective information from input data. It uses\nvariational inference in the output space to reduce randomness and uncertainty.\nBesides, to better control the learning process of probabilistic\nrepresentations, a structured regularization is proposed to promote uniformity\nacross classes in the latent space. With the regularization term, SPC can\npreserve the Gaussian structure of the latent code and achieve better coverage\nof the hidden space with class uniformity. Experimental results on 12 natural\nlanguage understanding tasks demonstrate that our SPC effectively improves the\nperformance of pre-trained language models for classification and regression.\nExtensive experiments show that SPC can enhance the generalization capability,\nrobustness to label noise, and clustering quality of output representations.\n","authors":["Dou Hu","Lingwei Wei","Yaxin Liu","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2312.13933v3.pdf","comment":"11 pages, accepted by AAAI 2024 (Oral)"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.02634v1","updated":"2024-02-04T23:00:24Z","published":"2024-02-04T23:00:24Z","title":"Key-Graph Transformer for Image Restoration","summary":" While it is crucial to capture global information for effective image\nrestoration (IR), integrating such cues into transformer-based methods becomes\ncomputationally expensive, especially with high input resolution. Furthermore,\nthe self-attention mechanism in transformers is prone to considering\nunnecessary global cues from unrelated objects or regions, introducing\ncomputational inefficiencies. In response to these challenges, we introduce the\nKey-Graph Transformer (KGT) in this paper. Specifically, KGT views patch\nfeatures as graph nodes. The proposed Key-Graph Constructor efficiently forms a\nsparse yet representative Key-Graph by selectively connecting essential nodes\ninstead of all the nodes. Then the proposed Key-Graph Attention is conducted\nunder the guidance of the Key-Graph only among selected nodes with linear\ncomputational complexity within each window.
Extensive experiments across 6 IR\ntasks confirm the proposed KGT's state-of-the-art performance, showcasing\nadvancements both quantitatively and qualitatively.\n","authors":["Bin Ren","Yawei Li","Jingyun Liang","Rakesh Ranjan","Mengyuan Liu","Rita Cucchiara","Luc Van Gool","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2402.02634v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2304.00501v7","updated":"2024-02-04T22:38:15Z","published":"2023-04-02T10:27:34Z","title":"A Comprehensive Review of YOLO Architectures in Computer Vision: From\n YOLOv1 to YOLOv8 and YOLO-NAS","summary":" YOLO has become a central real-time object detection system for robotics,\ndriverless cars, and video monitoring applications. We present a comprehensive\nanalysis of YOLO's evolution, examining the innovations and contributions in\neach iteration from the original YOLO up to YOLOv8, YOLO-NAS, and YOLO with\nTransformers. We start by describing the standard metrics and postprocessing;\nthen, we discuss the major changes in network architecture and training tricks\nfor each model. Finally, we summarize the essential lessons from YOLO's\ndevelopment and provide a perspective on its future, highlighting potential\nresearch directions to enhance real-time object detection systems.\n","authors":["Juan Terven","Diana Cordova-Esparza"],"pdf_url":"https://arxiv.org/pdf/2304.00501v7.pdf","comment":"36 pages, 21 figures, 4 tables, published in Machine Learning and\n Knowledge Extraction. This version contains the last changes made to the\n published version"},{"id":"http://arxiv.org/abs/2311.13664v2","updated":"2024-02-04T22:29:41Z","published":"2023-11-22T19:36:47Z","title":"Sample as You Infer: Predictive Coding With Langevin Dynamics","summary":" We present a novel algorithm for parameter learning in generic deep\ngenerative models that builds upon the predictive coding (PC) framework of\ncomputational neuroscience. Our approach modifies the standard PC algorithm to\nbring performance on par with, and exceeding, that obtained from standard variational\nauto-encoder (VAE) training. By injecting Gaussian noise into the PC inference\nprocedure we re-envision it as an overdamped Langevin sampling, which\nfacilitates optimisation with respect to a tight evidence lower bound (ELBO).\nWe improve the resultant encoder-free training method by incorporating an\nencoder network to provide an amortised warm-start to our Langevin sampling and\ntest three different objectives for doing so. Finally, to increase robustness\nto the sampling step size and reduce sensitivity to curvature, we validate a\nlightweight and easily computable form of preconditioning, inspired by Riemann\nManifold Langevin and adaptive optimizers from the SGD literature. We compare\nagainst VAEs by training like-for-like generative models using our technique\nagainst those trained with standard reparameterisation-trick-based ELBOs. We\nobserve our method outperforms or matches performance across a number of\nmetrics, including sample quality, while converging in a fraction of the number\nof SGD training iterations.\n","authors":["Umais Zahid","Qinghai Guo","Zafeirios Fountas"],"pdf_url":"https://arxiv.org/pdf/2311.13664v2.pdf","comment":"FID values updated to use a fixed 50,000 samples for all experiments\n - Jeffrey's divergence now consistently best performing. Dynov2 based metrics\n removed due to inconsistency of results - and since not industry standard.\n Multiple beta values tested in Fig 4.
Theta LR for VAEs; beta and inf LR for\n LPC now tuned for results. Figure 5B updated; curves now correspond to\n results in Table 1"},{"id":"http://arxiv.org/abs/2306.07392v3","updated":"2024-02-04T22:23:36Z","published":"2023-06-12T19:42:26Z","title":"Learning Any-View 6DoF Robotic Grasping in Cluttered Scenes via Neural\n Surface Rendering","summary":" A significant challenge for real-world robotic manipulation is the effective\n6DoF grasping of objects in cluttered scenes from any single viewpoint without\nthe need for additional scene exploration. This work reinterprets grasping as\nrendering and introduces NeuGraspNet, a novel method for 6DoF grasp detection\nthat leverages advances in neural volumetric representations and surface\nrendering. It encodes the interaction between a robot's end-effector and an\nobject's surface by jointly learning to render the local object surface and\nlearning grasping functions in a shared feature space. The approach uses global\n(scene-level) features for grasp generation and local (grasp-level) neural\nsurface features for grasp evaluation. This enables effective, fully implicit\n6DoF grasp quality prediction, even in partially observed scenes. NeuGraspNet\noperates on random viewpoints, common in mobile manipulation scenarios, and\noutperforms existing implicit and semi-implicit grasping methods. The\nreal-world applicability of the method has been demonstrated with a mobile\nmanipulator robot, grasping in open, cluttered spaces. Project website at\nhttps://sites.google.com/view/neugraspnet\n","authors":["Snehal Jauhri","Ishikaa Lunawat","Georgia Chalvatzaki"],"pdf_url":"https://arxiv.org/pdf/2306.07392v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2305.10110v2","updated":"2024-02-04T22:22:29Z","published":"2023-05-17T10:18:02Z","title":"Adaptive aggregation of Monte Carlo augmented decomposed filters for\n efficient group-equivariant convolutional neural network","summary":" Group-equivariant convolutional neural networks (G-CNN) heavily rely on\nparameter sharing to increase CNN's data efficiency and performance. However,\nthe parameter-sharing strategy greatly increases the computational burden for\neach added parameter, which hampers its application to deep neural network\nmodels. In this paper, we address these problems by proposing a\nnon-parameter-sharing approach for group equivariant neural networks. The\nproposed methods adaptively aggregate a diverse range of filters by a weighted\nsum of stochastically augmented decomposed filters. We give a theoretical proof\nof how group equivariance can be achieved by our methods. Our method\napplies to both continuous and discrete groups, where the augmentation is\nimplemented using Monte Carlo sampling and bootstrap resampling, respectively.\nWe demonstrate that our methods serve as an efficient extension of standard\nCNN. Experiments on group equivariant tests show how our methods can achieve\nsuperior performance to parameter-sharing group equivariant networks.\nExperiments on image classification and image denoising tasks show that in\ncertain scenarios, with a suitable set of filter bases, our method helps\nimprove the performance of standard CNNs and build efficient lightweight image\ndenoising networks. The code will be available at\nhttps://github.com/ZhaoWenzhao/MCG_CNN.\n","authors":["Wenzhao Zhao","Barbara D. Wichtmann","Steffen Albert","Angelika Maurer","Frank G. 
Zöllner","Ulrike Attenberger","Jürgen Hesser"],"pdf_url":"https://arxiv.org/pdf/2305.10110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13320v2","updated":"2024-02-04T19:28:06Z","published":"2023-08-25T11:49:51Z","title":"Fine-tuning can cripple your foundation model; preserving features may\n be the solution","summary":" Pre-trained foundation models, due to their enormous capacity and exposure to\nvast amounts of data during pre-training, are known to have learned plenty of\nreal-world concepts. An important step in making these pre-trained models\nextremely effective on downstream tasks is to fine-tune them on related\ndatasets. While various fine-tuning methods have been devised and have been\nshown to be highly effective, we observe that a fine-tuned model's ability to\nrecognize concepts on tasks $\\textit{different}$ from the downstream one is\nreduced significantly compared to its pre-trained counterpart. This is an\nundesirable effect of fine-tuning as a substantial amount of resources was used\nto learn these pre-trained concepts in the first place. We call this phenomenon\n\"concept forgetting\" and via experiments show that most end-to-end fine-tuning\napproaches suffer heavily from this side effect. To this end, we propose a\nsimple fix to this problem by designing a new fine-tuning method called\n$\\textit{LDIFS}$ (short for $\\ell_2$ distance in feature space) that, while\nlearning new concepts related to the downstream task, allows a model to\npreserve its pre-trained knowledge as well. Through extensive experiments on 10\nfine-tuning tasks we show that LDIFS significantly reduces concept forgetting.\nAdditionally, we show that LDIFS is highly effective in performing continual\nfine-tuning on a sequence of tasks as well, in comparison with both fine-tuning\nas well as continual learning baselines.\n","authors":["Jishnu Mukhoti","Yarin Gal","Philip H. S. Torr","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2308.13320v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02583v1","updated":"2024-02-04T18:50:29Z","published":"2024-02-04T18:50:29Z","title":"DiffEditor: Boosting Accuracy and Flexibility on Diffusion-based Image\n Editing","summary":" Large-scale Text-to-Image (T2I) diffusion models have revolutionized image\ngeneration over the last few years. Although owning diverse and high-quality\ngeneration capabilities, translating these abilities to fine-grained image\nediting remains challenging. In this paper, we propose DiffEditor to rectify\ntwo weaknesses in existing diffusion-based image editing: (1) in complex\nscenarios, editing results often lack editing accuracy and exhibit unexpected\nartifacts; (2) lack of flexibility to harmonize editing operations, e.g.,\nimagine new content. In our solution, we introduce image prompts in\nfine-grained image editing, cooperating with the text prompt to better describe\nthe editing content. To increase the flexibility while maintaining content\nconsistency, we locally combine stochastic differential equation (SDE) into the\nordinary differential equation (ODE) sampling. In addition, we incorporate\nregional score-based gradient guidance and a time travel strategy into the\ndiffusion sampling, further improving the editing quality. 
Extensive\nexperiments demonstrate that our method can efficiently achieve\nstate-of-the-art performance on various fine-grained image editing tasks,\nincluding editing within a single image (e.g., object moving, resizing, and\ncontent dragging) and across images (e.g., appearance replacing and object\npasting). Our source code is released at\nhttps://github.com/MC-E/DragonDiffusion.\n","authors":["Chong Mou","Xintao Wang","Jiechong Song","Ying Shan","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06773v4","updated":"2024-02-04T18:36:50Z","published":"2023-05-11T12:54:10Z","title":"Towards a Better Understanding of the Computer Vision Research Community\n in Africa","summary":" Computer vision is a broad field of study that encompasses different tasks\n(e.g., object detection). Although computer vision is relevant to African\ncommunities in various applications, computer vision research remains\nunder-explored on the continent, accounting for only 0.06% of top-tier\npublications in the last ten years. In this paper, our goal is to have a better\nunderstanding of the computer vision research conducted in Africa and provide\npointers on whether there is equity in research or not. We do this through an\nempirical analysis of the African computer vision publications that are Scopus\nindexed, where we collect around 63,000 publications over the period 2012-2022.\nWe first study the opportunities available for African institutions to publish\nin top-tier computer vision venues. We show that African publishing trends in\ntop-tier venues over the years do not exhibit consistent growth, unlike other\ncontinents such as North America or Asia. Moreover, we study all computer\nvision publications beyond top-tier venues in different African regions to find\nthat mainly Northern and Southern Africa are publishing in computer vision, with\n68.5% and 15.9% of publications, respectively. Nonetheless, we highlight that both\nEastern and Western Africa are exhibiting a promising increase, with the last\ntwo years closing the gap with Southern Africa. Additionally, we study the\ncollaboration patterns in these publications to find that most of these exhibit\ninternational collaborations rather than African ones. We also show that most\nof these publications include an African author who is a key contributor as\nthe first or last author. Finally, we present the most recurring keywords in\ncomputer vision publications per African region.\n","authors":["Abdul-Hakeem Omotayo","Mai Gamal","Eman Ehab","Gbetondji Dovonon","Zainab Akinjobi","Ismaila Lukman","Houcemeddine Turki","Mahmod Abdien","Idriss Tondji","Abigail Oppong","Yvan Pimi","Karim Gamal"," Ro'ya-CV4Africa","Mennatullah Siam"],"pdf_url":"https://arxiv.org/pdf/2305.06773v4.pdf","comment":"Published in EAAMO'23 under ACM License. This work is part of our\n African computer vision grassroots research in Ro'ya - CV4Africa,\n https://ro-ya-cv4africa.github.io/homepage/"},{"id":"http://arxiv.org/abs/2401.11511v3","updated":"2024-02-04T18:26:50Z","published":"2024-01-21T14:48:38Z","title":"MobileARLoc: On-device Robust Absolute Localisation for Pervasive\n Markerless Mobile AR","summary":" Recent years have seen significant improvement in absolute camera pose\nestimation, paving the way for pervasive markerless Augmented Reality (AR).\nHowever, accurate absolute pose estimation techniques are computation- and\nstorage-heavy, requiring computation offloading.
As such, AR systems rely on\nvisual-inertial odometry (VIO) to track the device's relative pose between\nrequests to the server. However, VIO suffers from drift, requiring frequent\nabsolute repositioning. This paper introduces MobileARLoc, a new framework for\non-device large-scale markerless mobile AR that combines an absolute pose\nregressor (APR) with a local VIO tracking system. Absolute pose regressors\n(APRs) provide fast on-device pose estimation at the cost of reduced accuracy.\nTo address APR accuracy and reduce VIO drift, MobileARLoc creates a feedback\nloop where VIO pose estimations refine the APR predictions. The VIO system\nidentifies reliable predictions of APR, which are then used to compensate for\nthe VIO drift. We comprehensively evaluate MobileARLoc through dataset\nsimulations. MobileARLoc halves the error compared to the underlying APR and\nachieves fast (80\,ms) on-device inference speed.\n","authors":["Changkun Liu","Yukun Zhao","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2401.11511v3.pdf","comment":"Accepted for publication at the 3rd edition of the Pervasive and\n Resource-Constrained AI (PerConAI) workshop (co-located with PerCom 2024).\n This article supersedes arXiv:2308.05394"},{"id":"http://arxiv.org/abs/2401.12480v2","updated":"2024-02-04T18:19:09Z","published":"2024-01-23T04:19:15Z","title":"Explore Synergistic Interaction Across Frames for Interactive Video\n Object Segmentation","summary":" Interactive Video Object Segmentation (iVOS) is a challenging task that\nrequires real-time human-computer interaction. To improve the user experience,\nit is important to consider the user's input habits, segmentation quality,\nrunning time and memory consumption. However, existing methods compromise user\nexperience with a single input mode and slow running speed. Specifically, these\nmethods only allow the user to interact with one single frame, which limits the\nexpression of the user's intent. To overcome these limitations and better align\nwith people's usage habits, we propose a framework that can accept multiple\nframes simultaneously and explore synergistic interaction across frames (SIAF).\nConcretely, we designed the Across-Frame Interaction (AFI) Module that enables users\nto annotate different objects freely on multiple frames. The AFI module will\nmigrate scribble information among multiple interactive frames and generate\nmulti-frame masks. Additionally, we employ the id-queried mechanism to process\nmultiple objects in batches. Furthermore, for a more efficient propagation and\nlightweight model, we design a truncated re-propagation strategy to replace the\nprevious multi-round fusion module, which employs an across-round memory that\nstores important interaction information. Our SwinB-SIAF achieves new\nstate-of-the-art performance on DAVIS 2017 (89.6%, J&F@60). Moreover, our\nR50-SIAF is more than 3x faster than the state-of-the-art competitor under\nchallenging multi-object scenarios.\n","authors":["Kexin Li","Tao Jiang","Zongxin Yang","Yi Yang","Yueting Zhuang","Jun Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.12480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11617v2","updated":"2024-02-04T18:17:27Z","published":"2024-01-21T22:50:44Z","title":"A Survey on African Computer Vision Datasets, Topics and Researchers","summary":" Computer vision encompasses a range of tasks such as object detection,\nsemantic segmentation, and 3D reconstruction.
Despite its relevance to African\ncommunities, research in this field within Africa represents only 0.06% of\ntop-tier publications over the past decade. This study undertakes a thorough\nanalysis of 63,000 Scopus-indexed computer vision publications from Africa,\nspanning from 2012 to 2022. The aim is to provide a survey of African computer\nvision topics, datasets and researchers. A key aspect of our study is the\nidentification and categorization of African Computer Vision datasets using\nlarge language models that automatically parse abstracts of these publications.\nWe also provide a compilation of unofficial African Computer Vision datasets\ndistributed through challenges or data hosting platforms, and provide a full\ntaxonomy of dataset categories. Our survey also pinpoints computer vision\ntopic trends specific to different African regions, indicating their unique\nfocus areas. Additionally, we carried out an extensive survey to capture the\nviews of African researchers on the current state of computer vision research\nin the continent and the structural barriers they believe need urgent\nattention. In conclusion, this study catalogs and categorizes Computer Vision\ndatasets and topics contributed or initiated by African institutions and\nidentifies barriers to publishing in top-tier Computer Vision venues. This\nsurvey underscores the importance of encouraging African researchers and\ninstitutions in advancing computer vision research on the continent. It also\nstresses the need for research topics to be more aligned with the needs of\nAfrican communities.\n","authors":["Abdul-Hakeem Omotayo","Ashery Mbilinyi","Lukman Ismaila","Houcemeddine Turki","Mahmoud Abdien","Karim Gamal","Idriss Tondji","Yvan Pimi","Naome A. Etori","Marwa M. Matar","Clifford Broni-Bediako","Abigail Oppong","Mai Gamal","Eman Ehab","Gbetondji Dovonon","Zainab Akinjobi","Daniel Ajisafe","Oluwabukola G. Adegboro","Mennatullah Siam"],"pdf_url":"https://arxiv.org/pdf/2401.11617v2.pdf","comment":"Under Review, Community Work of Ro'ya Grassroots,\n https://ro-ya-cv4africa.github.io/homepage/. Journal extension of our\n conference paper, arXiv admin note: text overlap with arXiv:2305.06773"},{"id":"http://arxiv.org/abs/2402.02574v1","updated":"2024-02-04T17:52:04Z","published":"2024-02-04T17:52:04Z","title":"Spatio-temporal Prompting Network for Robust Video Feature Extraction","summary":" Frame quality deterioration is one of the main challenges in the field of\nvideo understanding. To compensate for the information loss caused by\ndeteriorated frames, recent approaches exploit transformer-based integration\nmodules to obtain spatio-temporal information. However, these integration\nmodules are heavy and complex. Furthermore, each integration module is\nspecifically tailored for its target task, making it difficult to generalise to\nmultiple tasks. In this paper, we present a neat and unified framework, called\nSpatio-Temporal Prompting Network (STPN). It can efficiently extract robust and\naccurate video features by dynamically adjusting the input features in the\nbackbone network. Specifically, STPN predicts several video prompts containing\nspatio-temporal information of neighbour frames. Then, these video prompts are\nprepended to the patch embeddings of the current frame as the updated input for\nvideo feature extraction. Moreover, STPN is easy to generalise to various video\ntasks because it does not contain task-specific modules.
Without bells and\nwhistles, STPN achieves state-of-the-art performance on three widely-used\ndatasets for different video understanding tasks, i.e., ImageNetVID for video\nobject detection, YouTubeVIS for video instance segmentation, and GOT-10k for\nvisual object tracking. Code is available at\nhttps://github.com/guanxiongsun/vfe.pytorch.\n","authors":["Guanxiong Sun","Chi Wang","Zhaoyu Zhang","Jiankang Deng","Stefanos Zafeiriou","Yang Hua"],"pdf_url":"https://arxiv.org/pdf/2402.02574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08035v2","updated":"2024-02-04T17:39:07Z","published":"2024-01-16T01:08:19Z","title":"BanglaNet: Bangla Handwritten Character Recognition using Ensembling of\n Convolutional Neural Network","summary":" Handwritten character recognition is a crucial task because of its abundant\napplications. The recognition task of Bangla handwritten characters is\nespecially challenging because of the cursive nature of Bangla characters and\nthe presence of compound characters with more than one way of writing. In this\npaper, a classification model based on the ensembling of several Convolutional\nNeural Networks (CNN), namely BanglaNet, is proposed to classify Bangla basic\ncharacters, compound characters, numerals, and modifiers. Three different\nmodels based on the idea of state-of-the-art CNN models like Inception, ResNet,\nand DenseNet have been trained with both augmented and non-augmented inputs.\nFinally, all these models are averaged or ensembled to obtain the final model.\nRigorous experimentation on three benchmark Bangla handwritten character\ndatasets, namely CMATERdb, BanglaLekha-Isolated, and Ekush, has exhibited\nsignificant recognition accuracies compared to some recent CNN-based research.\nThe top-1 recognition accuracies obtained are 98.40%, 97.65%, and 97.32%, and\nthe top-3 accuracies are 99.79%, 99.74%, and 99.56% for CMATERdb,\nBanglaLekha-Isolated, and Ekush datasets respectively.\n","authors":["Chandrika Saha","Md Mostafijur Rahman"],"pdf_url":"https://arxiv.org/pdf/2401.08035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07125v2","updated":"2024-02-04T17:05:28Z","published":"2023-12-12T09:58:07Z","title":"TransMed: Large Language Models Enhance Vision Transformer for\n Biomedical Image Classification","summary":" Few-shot learning has been studied to adapt models to tasks with very few\nsamples. It holds profound significance, particularly in clinical tasks, due to\nthe high annotation cost of medical images. Several works have explored\nfew-shot learning on medical images, yet they still require a large number of\nmedical images for pre-training models to gain domain-specific priors. Vision\nfoundation models have recently achieved remarkable success in natural images.\nHence, adapting rapidly advancing vision foundation models from natural images\nto few-shot clinical tasks holds great promise. MedFMC has recently organized a\nchallenge to shed more light on this topic at NeurIPS 2023. In this work, we\npresent our challenge solution. We observe that a simple variant of fine-tuning\nwith partial freezing shows remarkable performance. Empirical evidence\ndemonstrates that this approach could outperform various common fine-tuning\nmethods under limited sample sizes. Additionally, we explore enhanced\nutilization of semantic supervision to boost performance. We propose a novel\napproach that contextualizes labels via large language models (LLMs).
Our\nfindings reveal that the context generated by LLMs significantly enhances the\ndiscrimination of semantic embeddings for similar categories, resulting in a\nnotable performance improvement of 3%-5% in 1-shot settings compared to\ncommonly employed one-hot labels and other semantic supervision methods. Our\nsolution secures the 1st place in the MedFMC challenge.\n","authors":["Kaipeng Zheng","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2312.07125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15638v2","updated":"2024-02-04T16:47:28Z","published":"2024-01-28T12:22:34Z","title":"Cyto R-CNN and CytoNuke Dataset: Towards reliable whole-cell\n segmentation in bright-field histological images","summary":" Background: Cell segmentation in bright-field histological slides is a\ncrucial topic in medical image analysis. Having access to accurate segmentation\nallows researchers to examine the relationship between cellular morphology and\nclinical observations. Unfortunately, most segmentation methods known today are\nlimited to nuclei and cannot segment the cytoplasm.\n Material & Methods: We present a new network architecture, Cyto R-CNN, that is\nable to accurately segment whole cells (with both the nucleus and the\ncytoplasm) in bright-field images. We also present a new dataset, CytoNuke,\nconsisting of several thousand manual annotations of head and neck squamous\ncell carcinoma cells. Utilizing this dataset, we compared the performance of\nCyto R-CNN to other popular cell segmentation algorithms, including QuPath's\nbuilt-in algorithm, StarDist and Cellpose. To evaluate segmentation\nperformance, we calculated AP50, AP75 and measured 17 morphological and\nstaining-related features for all detected cells. We compared these\nmeasurements to the gold standard of manual segmentation using the\nKolmogorov-Smirnov test.\n Results: Cyto R-CNN achieved an AP50 of 58.65% and an AP75 of 11.56% in\nwhole-cell segmentation, outperforming all other methods (QuPath\n$19.46/0.91\%$; StarDist $45.33/2.32\%$; Cellpose $31.85/5.61\%$). Cell\nfeatures derived from Cyto R-CNN showed the best agreement to the gold standard\n($\bar{D} = 0.15$) outperforming QuPath ($\bar{D} = 0.22$), StarDist ($\bar{D}\n= 0.25$) and Cellpose ($\bar{D} = 0.23$).\n Conclusion: Our newly proposed Cyto R-CNN architecture outperforms current\nalgorithms in whole-cell segmentation while providing more reliable cell\nmeasurements than any other model. This could improve digital pathology\nworkflows, potentially leading to improved diagnosis. Moreover, our published\ndataset can be used to develop further models in the future.\n","authors":["Johannes Raufeisen","Kunpeng Xie","Fabian Hörst","Till Braunschweig","Jianning Li","Jens Kleesiek","Rainer Röhrig","Jan Egger","Bastian Leibe","Frank Hölzle","Alexander Hermans","Behrus Puladi"],"pdf_url":"https://arxiv.org/pdf/2401.15638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02555v1","updated":"2024-02-04T16:06:05Z","published":"2024-02-04T16:06:05Z","title":"Generalizable Entity Grounding via Assistance of Large Language Model","summary":" In this work, we propose a novel approach to densely ground visual entities\nfrom a long caption. We leverage a large multimodal model (LMM) to extract\nsemantic nouns, a class-agnostic segmentation model to generate entity-level\nsegmentation, and the proposed multi-modal feature fusion module to associate\neach semantic noun with its corresponding segmentation mask.
Additionally, we\nintroduce a strategy of encoding entity segmentation masks into a colormap,\nenabling the preservation of fine-grained predictions from features of\nhigh-resolution masks. This approach allows us to extract visual features from\nlow-resolution images using the CLIP vision encoder in the LMM, which is more\ncomputationally efficient than existing approaches that use an additional\nencoder for high-resolution images. Our comprehensive experiments demonstrate\nthe superiority of our method, outperforming state-of-the-art techniques on\nthree tasks, including panoptic narrative grounding, referring expression\nsegmentation, and panoptic segmentation.\n","authors":["Lu Qi","Yi-Wen Chen","Lehan Yang","Tiancheng Shen","Xiangtai Li","Weidong Guo","Yu Xu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2402.02555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02554v1","updated":"2024-02-04T15:59:35Z","published":"2024-02-04T15:59:35Z","title":"DeSparsify: Adversarial Attack Against Token Sparsification Mechanisms\n in Vision Transformers","summary":" Vision transformers have contributed greatly to advancements in the computer\nvision domain, demonstrating state-of-the-art performance in diverse tasks\n(e.g., image classification, object detection). However, their high\ncomputational requirements grow quadratically with the number of tokens used.\nToken sparsification techniques have been proposed to address this issue. These\ntechniques employ an input-dependent strategy, in which uninformative tokens\nare discarded from the computation pipeline, improving the model's efficiency.\nHowever, their dynamism and average-case assumption make them vulnerable to a\nnew threat vector - carefully crafted adversarial examples capable of fooling\nthe sparsification mechanism, resulting in worst-case performance. In this\npaper, we present DeSparsify, an attack targeting the availability of vision\ntransformers that use token sparsification mechanisms. The attack aims to\nexhaust the operating system's resources, while maintaining its stealthiness.\nOur evaluation demonstrates the attack's effectiveness on three token\nsparsification techniques and examines the attack's transferability between\nthem and its effect on the GPU resources. To mitigate the impact of the attack,\nwe propose various countermeasures.\n","authors":["Oryan Yehezkel","Alon Zolfi","Amit Baras","Yuval Elovici","Asaf Shabtai"],"pdf_url":"https://arxiv.org/pdf/2402.02554v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.02545v1","updated":"2024-02-04T15:48:20Z","published":"2024-02-04T15:48:20Z","title":"Classification of Tennis Actions Using Deep Learning","summary":" Recent advances in deep learning make it possible to identify specific\nevents in videos with greater precision. This has great relevance in sports\nlike tennis in order to, e.g., automatically collect game statistics, or replay\nactions of specific interest for game strategy or player improvements. In this\npaper, we investigate the potential and the challenges of using deep learning\nto classify tennis actions. Three models of different sizes, all based on the\ndeep learning architecture SlowFast, were trained and evaluated on the academic\ntennis dataset THETIS. The best models achieve a generalization accuracy of\n74%, demonstrating good performance for tennis action classification. We\nprovide an error analysis for the best model and pinpoint directions for\nimprovement of tennis datasets in general. 
We discuss the limitations of the\ndata set, general limitations of current publicly available tennis data-sets,\nand future steps needed to make progress.\n","authors":["Emil Hovad","Therese Hougaard-Jensen","Line Katrine Harder Clemmensen"],"pdf_url":"https://arxiv.org/pdf/2402.02545v1.pdf","comment":"5 Figures"},{"id":"http://arxiv.org/abs/2402.02544v1","updated":"2024-02-04T15:46:43Z","published":"2024-02-04T15:46:43Z","title":"LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal\n Language Model","summary":" The revolutionary capabilities of large language models (LLMs) have paved the\nway for multimodal large language models (MLLMs) and fostered diverse\napplications across various specialized domains. In the remote sensing (RS)\nfield, however, the diverse geographical landscapes and varied objects in RS\nimagery are not adequately considered in recent MLLM endeavors. To bridge this\ngap, we construct a large-scale RS image-text dataset, LHRS-Align, and an\ninformative RS-specific instruction dataset, LHRS-Instruct, leveraging the\nextensive volunteered geographic information (VGI) and globally available RS\nimages. Building on this foundation, we introduce LHRS-Bot, an MLLM tailored\nfor RS image understanding through a novel multi-level vision-language\nalignment strategy and a curriculum learning method. Comprehensive experiments\ndemonstrate that LHRS-Bot exhibits a profound understanding of RS images and\nthe ability to perform nuanced reasoning within the RS domain.\n","authors":["Dilxat Muhtar","Zhenshi Li","Feng Gu","Xueliang Zhang","Pengfeng Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.02544v1.pdf","comment":"32 pages, 8 figures. Github https://github.com/NJU-LHRS/LHRS-Bot"},{"id":"http://arxiv.org/abs/2402.02541v1","updated":"2024-02-04T15:41:35Z","published":"2024-02-04T15:41:35Z","title":"Knowledge Generation for Zero-shot Knowledge-based VQA","summary":" Previous solutions to knowledge-based visual question answering~(K-VQA)\nretrieve knowledge from external knowledge bases and use supervised learning to\ntrain the K-VQA model. Recently pre-trained LLMs have been used as both a\nknowledge source and a zero-shot QA model for K-VQA and demonstrated promising\nresults. However, these recent methods do not explicitly show the knowledge\nneeded to answer the questions and thus lack interpretability. Inspired by\nrecent work on knowledge generation from LLMs for text-based QA, in this work\nwe propose and test a similar knowledge-generation-based K-VQA method, which\nfirst generates knowledge from an LLM and then incorporates the generated\nknowledge for K-VQA in a zero-shot manner. We evaluate our method on two K-VQA\nbenchmarks and found that our method performs better than previous zero-shot\nK-VQA methods and our generated knowledge is generally relevant and helpful.\n","authors":["Rui Cao","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.02541v1.pdf","comment":"accepted as Findings in EACL 2023;"},{"id":"http://arxiv.org/abs/2402.00672v2","updated":"2024-02-04T15:39:34Z","published":"2024-02-01T15:33:17Z","title":"Exploring Homogeneous and Heterogeneous Consistent Label Associations\n for Unsupervised Visible-Infrared Person ReID","summary":" Unsupervised visible-infrared person re-identification (USL-VI-ReID) aims to\nretrieve pedestrian images of the same identity from different modalities\nwithout annotations. 
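The zero-shot K-VQA recipe summarized above (first generate knowledge from an LLM, then answer with that knowledge in context) can be sketched as a two-call prompt pipeline. The prompts and the `llm` callable below are hypothetical placeholders, not the paper's implementation:

```python
def knowledge_based_vqa(question, image_caption, llm):
    """Generate-then-answer K-VQA sketch.

    llm: any text-completion callable, e.g. a thin wrapper around an API client.
    Step 1 elicits question-relevant knowledge; step 2 answers zero-shot with
    that knowledge placed in the prompt, keeping the used knowledge inspectable.
    """
    knowledge = llm(
        f"Image: {image_caption}\nList facts relevant to answering: {question}")
    return llm(
        f"Knowledge: {knowledge}\nImage: {image_caption}\nQ: {question}\nA:")
```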
While prior work focuses on establishing cross-modality\npseudo-label associations to bridge the modality-gap, they ignore maintaining\nthe instance-level homogeneous and heterogeneous consistency in pseudo-label\nspace, resulting in coarse associations. In response, we introduce a\nModality-Unified Label Transfer (MULT) module that simultaneously accounts for\nboth homogeneous and heterogeneous fine-grained instance-level structures,\nyielding high-quality cross-modality label associations. It models both\nhomogeneous and heterogeneous affinities, leveraging them to define the\ninconsistency for the pseudo-labels and then minimize it, leading to\npseudo-labels that maintain alignment across modalities and consistency within\nintra-modality structures. Additionally, a straightforward plug-and-play Online\nCross-memory Label Refinement (OCLR) module is proposed to further mitigate the\nimpact of noisy pseudo-labels while simultaneously aligning different\nmodalities, coupled with a Modality-Invariant Representation Learning (MIRL)\nframework. Experiments demonstrate that our proposed method outperforms\nexisting USL-VI-ReID methods, highlighting the superiority of our MULT in\ncomparison to other cross-modality association methods. The code will be\navailable.\n","authors":["Lingfeng He","De Cheng","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2402.00672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02540v1","updated":"2024-02-04T15:39:18Z","published":"2024-02-04T15:39:18Z","title":"Embedding Non-Distortive Cancelable Face Template Generation","summary":" Biometric authentication systems are crucial for security, but developing\nthem involves various complexities, including privacy, security, and achieving\nhigh accuracy without directly storing pure biometric data in storage. We\nintroduce an innovative image distortion technique that makes facial images\nunrecognizable to the eye but still identifiable by any custom embedding neural\nnetwork model. Using the proposed approach, we test the reliability of\nbiometric recognition networks by determining the maximum image distortion that\ndoes not change the predicted identity. Through experiments on MNIST and LFW\ndatasets, we assess its effectiveness and compare it based on the traditional\ncomparison metrics.\n","authors":["Dmytro Zakharov","Oleksandr Kuznetsov","Emanuele Frontoni","Natalia Kryvinska"],"pdf_url":"https://arxiv.org/pdf/2402.02540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02519v1","updated":"2024-02-04T15:07:49Z","published":"2024-02-04T15:07:49Z","title":"SIMPL: A Simple and Efficient Multi-agent Motion Prediction Baseline for\n Autonomous Driving","summary":" This paper presents a Simple and effIcient Motion Prediction baseLine (SIMPL)\nfor autonomous vehicles. Unlike conventional agent-centric methods with high\naccuracy but repetitive computations and scene-centric methods with compromised\naccuracy and generalizability, SIMPL delivers real-time, accurate motion\npredictions for all relevant traffic participants. To achieve improvements in\nboth accuracy and inference speed, we propose a compact and efficient global\nfeature fusion module that performs directed message passing in a symmetric\nmanner, enabling the network to forecast future motion for all road users in a\nsingle feed-forward pass and mitigating accuracy loss caused by viewpoint\nshifting. 
Additionally, we investigate the continuous trajectory\nparameterization using Bernstein basis polynomials in trajectory decoding,\nallowing evaluations of states and their higher-order derivatives at any\ndesired time point, which is valuable for downstream planning tasks. As a\nstrong baseline, SIMPL exhibits highly competitive performance on Argoverse 1 &\n2 motion forecasting benchmarks compared with other state-of-the-art methods.\nFurthermore, its lightweight design and low inference latency make SIMPL highly\nextensible and promising for real-world onboard deployment. We open-source the\ncode at https://github.com/HKUST-Aerial-Robotics/SIMPL.\n","authors":["Lu Zhang","Peiliang Li","Sikang Liu","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2402.02519v1.pdf","comment":"Code is available at https://github.com/HKUST-Aerial-Robotics/SIMPL"},{"id":"http://arxiv.org/abs/2402.02514v1","updated":"2024-02-04T14:59:31Z","published":"2024-02-04T14:59:31Z","title":"Deep Supervision by Gaussian Pseudo-label-based Morphological Attention\n for Abdominal Aorta Segmentation in Non-Contrast CTs","summary":" The segmentation of the abdominal aorta in non-contrast CT images is a\nnon-trivial task for computer-assisted endovascular navigation, particularly in\nscenarios where contrast agents are unsuitable. While state-of-the-art deep\nlearning segmentation models have been proposed recently for this task, they\nare trained on manually annotated strong labels. However, the inherent\nambiguity in the boundary of the aorta in non-contrast CT may undermine the\nreliability of strong labels, leading to potential overfitting risks. This\npaper introduces a Gaussian-based pseudo label, integrated into conventional\ndeep learning models through deep supervision, to achieve Morphological\nAttention (MA) enhancement. As the Gaussian pseudo label retains the\nmorphological features of the aorta without explicitly representing its\nboundary distribution, we suggest that it preserves aortic morphology during\ntraining while mitigating the negative impact of ambiguous boundaries, reducing\nthe risk of overfitting. It is introduced in various 2D/3D deep learning models\nand validated on our local data set of 30 non-contrast CT volumes comprising\n5749 CT slices. The results underscore the effectiveness of MA in preserving\nthe morphological characteristics of the aorta and addressing overfitting\nconcerns, thereby enhancing the performance of the models.\n","authors":["Qixiang Ma","Antoine Lucas","Adrien Kaladji","Pascal Haigron"],"pdf_url":"https://arxiv.org/pdf/2402.02514v1.pdf","comment":"Accepted by 21st IEEE International Symposium on Biomedical Imaging"},{"id":"http://arxiv.org/abs/2401.15313v3","updated":"2024-02-04T14:51:59Z","published":"2024-01-27T06:09:56Z","title":"Multi-Robot Relative Pose Estimation in SE(2) with Observability\n Analysis: A Comparison of Extended Kalman Filtering and Robust Pose Graph\n Optimization","summary":" In this study, we address multi-robot localization issues, with a specific\nfocus on cooperative localization and observability analysis of relative pose\nestimation. Cooperative localization involves enhancing each robot's\ninformation through a communication network and message passing. If odometry\ndata from a target robot can be transmitted to the ego robot, observability of\ntheir relative pose estimation can be achieved through range-only or\nbearing-only measurements, provided both robots have non-zero linear\nvelocities. 
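The SIMPL abstract above notes that a Bernstein-basis (Bezier) trajectory parameterization lets downstream planners query states and their derivatives at any time point. A self-contained sketch of that evaluation; the control-point layout is an assumption:

```python
import numpy as np
from math import comb

def bezier_eval(ctrl, t):
    """Evaluate a Bezier curve in the Bernstein basis at t in [0, 1].

    ctrl: (n+1, d) array of control points; returns a point in R^d.
    """
    n = len(ctrl) - 1
    basis = np.array([comb(n, i) * t**i * (1 - t) ** (n - i) for i in range(n + 1)])
    return basis @ ctrl

def bezier_velocity(ctrl, t):
    """First derivative: a degree n-1 Bezier over scaled control-point differences."""
    n = len(ctrl) - 1
    return bezier_eval(n * (ctrl[1:] - ctrl[:-1]), t)
```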
In cases where odometry data from a target robot are not directly\ntransmitted but estimated by the ego robot, both range and bearing measurements\nare necessary to ensure observability of relative pose estimation. For\nROS/Gazebo simulations, we explore four sensing and communication structures.\nWe compare extended Kalman filtering (EKF) and pose graph optimization (PGO)\nestimation using different robust loss functions (filtering and smoothing with\nvarying batch sizes of sliding windows) in terms of estimation accuracy. In\nhardware experiments, two Turtlebot3 equipped with UWB modules are used for\nreal-world inter-robot relative pose estimation, applying both EKF and PGO and\ncomparing their performance.\n","authors":["Kihoon Shin","Hyunjae Sim","Seungwon Nam","Yonghee Kim","Jae Hu","Kwang-Ki K. Kim"],"pdf_url":"https://arxiv.org/pdf/2401.15313v3.pdf","comment":"20 pages, 21 figures"},{"id":"http://arxiv.org/abs/2402.02503v1","updated":"2024-02-04T14:28:23Z","published":"2024-02-04T14:28:23Z","title":"GeReA: Question-Aware Prompt Captions for Knowledge-based Visual\n Question Answering","summary":" Knowledge-based visual question answering (VQA) requires world knowledge\nbeyond the image for accurate answer. Recently, instead of extra knowledge\nbases, a large language model (LLM) like GPT-3 is activated as an implicit\nknowledge engine to jointly acquire and reason the necessary knowledge for\nanswering by converting images into textual information (e.g., captions and\nanswer candidates). However, such conversion may introduce irrelevant\ninformation, which causes the LLM to misinterpret images and ignore visual\ndetails crucial for accurate knowledge. We argue that multimodal large language\nmodel (MLLM) is a better implicit knowledge engine than the LLM for its\nsuperior capability of visual understanding. Despite this, how to activate the\ncapacity of MLLM as the implicit knowledge engine has not been explored yet.\nTherefore, we propose GeReA, a generate-reason framework that prompts a MLLM\nlike InstructBLIP with question relevant vision and language information to\ngenerate knowledge-relevant descriptions and reasons those descriptions for\nknowledge-based VQA. Specifically, the question-relevant image regions and\nquestion-specific manual prompts are encoded in the MLLM to generate the\nknowledge relevant descriptions, referred to as question-aware prompt captions.\nAfter that, the question-aware prompt captions, image-question pair, and\nsimilar samples are sent into the multi-modal reasoning model to learn a joint\nknowledge-image-question representation for answer prediction. GeReA unlocks\nthe use of MLLM as the implicit knowledge engine, surpassing all previous\nstate-of-the-art methods on OK-VQA and A-OKVQA datasets, with test accuracies\nof 66.5% and 63.3% respectively. Our code will be released at\nhttps://github.com/Upper9527/GeReA.\n","authors":["Ziyu Ma","Shutao Li","Bin Sun","Jianfei Cai","Zuxiang Long","Fuyan Ma"],"pdf_url":"https://arxiv.org/pdf/2402.02503v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2402.02500v1","updated":"2024-02-04T14:18:45Z","published":"2024-02-04T14:18:45Z","title":"Point Cloud Matters: Rethinking the Impact of Different Observation\n Spaces on Robot Learning","summary":" In this study, we explore the influence of different observation spaces on\nrobot learning, focusing on three predominant modalities: RGB, RGB-D, and point\ncloud. 
Through extensive experimentation on over 17 varied contact-rich\nmanipulation tasks, conducted across two benchmarks and simulators, we have\nobserved a notable trend: point cloud-based methods, even those with the\nsimplest designs, frequently surpass their RGB and RGB-D counterparts in\nperformance. This remains consistent in both scenarios: training from scratch\nand utilizing pretraining. Furthermore, our findings indicate that point cloud\nobservations lead to improved policy zero-shot generalization in relation to\nvarious geometry and visual clues, including camera viewpoints, lighting\nconditions, noise levels and background appearance. The outcomes suggest that\n3D point cloud is a valuable observation modality for intricate robotic tasks.\nWe will open-source all our codes and checkpoints, hoping that our insights can\nhelp design more generalizable and robust robotic models.\n","authors":["Haoyi Zhu","Yating Wang","Di Huang","Weicai Ye","Wanli Ouyang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2402.02500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02498v1","updated":"2024-02-04T14:12:51Z","published":"2024-02-04T14:12:51Z","title":"Fully Differentiable Correlation-driven 2D/3D Registration for X-ray to\n CT Image Fusion","summary":" Image-based rigid 2D/3D registration is a critical technique for fluoroscopic\nguided surgical interventions. In recent years, some learning-based fully\ndifferentiable methods have produced beneficial outcomes while the process of\nfeature extraction and gradient flow transmission still lacks controllability\nand interpretability. To alleviate these problems, in this work, we propose a\nnovel fully differentiable correlation-driven network using a dual-branch\nCNN-transformer encoder which enables the network to extract and separate\nlow-frequency global features from high-frequency local features. A\ncorrelation-driven loss is further proposed for low-frequency feature and\nhigh-frequency feature decomposition based on embedded information. Besides, a\ntraining strategy that learns to approximate a convex-shape similarity function\nis applied in our work. We test our approach on an in-house dataset and show that\nit outperforms both existing fully differentiable learning-based registration\napproaches and the conventional optimization-based baseline.\n","authors":["Minheng Chen","Zhirun Zhang","Shuheng Gu","Zhangyang Ge","Youyong Kong"],"pdf_url":"https://arxiv.org/pdf/2402.02498v1.pdf","comment":"ISBI 2024"},{"id":"http://arxiv.org/abs/2312.11872v2","updated":"2024-02-04T14:12:38Z","published":"2023-12-19T05:52:38Z","title":"Beyond Prototypes: Semantic Anchor Regularization for Better\n Representation Learning","summary":" One of the ultimate goals of representation learning is to achieve\ncompactness within a class and well-separability between classes. Many\noutstanding metric-based and prototype-based methods following the\nExpectation-Maximization paradigm have been proposed for this objective.\nHowever, they inevitably introduce biases into the learning process,\nparticularly with long-tail distributed training data. In this paper, we reveal\nthat the class prototype does not necessarily have to be derived from training\nfeatures and propose a novel perspective to use pre-defined class anchors\nserving as feature centroids to unidirectionally guide feature learning.\nHowever, the pre-defined anchors may have a large semantic distance from the\npixel features, which prevents them from being directly applied. 
To address\nthis issue and generate feature centroid independent from feature learning, a\nsimple yet effective Semantic Anchor Regularization (SAR) is proposed. SAR\nensures the interclass separability of semantic anchors in the semantic space\nby employing a classifier-aware auxiliary cross-entropy loss during training\nvia disentanglement learning. By pulling the learned features to these semantic\nanchors, several advantages can be attained: 1) the intra-class compactness and\nnaturally inter-class separability, 2) induced bias or errors from feature\nlearning can be avoided, and 3) robustness to the long-tailed problem. The\nproposed SAR can be used in a plug-and-play manner in the existing models.\nExtensive experiments demonstrate that the SAR performs better than previous\nsophisticated prototype-based methods. The implementation is available at\nhttps://github.com/geyanqi/SAR.\n","authors":["Yanqi Ge","Qiang Nie","Ye Huang","Yong Liu","Chengjie Wang","Feng Zheng","Wen Li","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2312.11872v2.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2402.02491v1","updated":"2024-02-04T13:37:21Z","published":"2024-02-04T13:37:21Z","title":"VM-UNet: Vision Mamba UNet for Medical Image Segmentation","summary":" In the realm of medical image segmentation, both CNN-based and\nTransformer-based models have been extensively explored. However, CNNs exhibit\nlimitations in long-range modeling capabilities, whereas Transformers are\nhampered by their quadratic computational complexity. Recently, State Space\nModels (SSMs), exemplified by Mamba, have emerged as a promising approach. They\nnot only excel in modeling long-range interactions but also maintain a linear\ncomputational complexity. In this paper, leveraging state space models, we\npropose a U-shape architecture model for medical image segmentation, named\nVision Mamba UNet (VM-UNet). Specifically, the Visual State Space (VSS) block\nis introduced as the foundation block to capture extensive contextual\ninformation, and an asymmetrical encoder-decoder structure is constructed. We\nconduct comprehensive experiments on the ISIC17, ISIC18, and Synapse datasets,\nand the results indicate that VM-UNet performs competitively in medical image\nsegmentation tasks. To our best knowledge, this is the first medical image\nsegmentation model constructed based on the pure SSM-based model. We aim to\nestablish a baseline and provide valuable insights for the future development\nof more efficient and effective SSM-based segmentation systems. Our code is\navailable at https://github.com/JCruan519/VM-UNet.\n","authors":["Jiacheng Ruan","Suncheng Xiang"],"pdf_url":"https://arxiv.org/pdf/2402.02491v1.pdf","comment":"12 pages, 2 figures, 3 tables. Work in progress"},{"id":"http://arxiv.org/abs/2402.02474v1","updated":"2024-02-04T13:09:13Z","published":"2024-02-04T13:09:13Z","title":"Deep Spectral Improvement for Unsupervised Image Instance Segmentation","summary":" Deep spectral methods reframe the image decomposition process as a graph\npartitioning task by extracting features using self-supervised learning and\nutilizing the Laplacian of the affinity matrix to obtain eigensegments.\nHowever, instance segmentation has received less attention compared to other\ntasks within the context of deep spectral methods. This paper addresses the\nfact that not all channels of the feature map extracted from a self-supervised\nbackbone contain sufficient information for instance segmentation purposes. 
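The Semantic Anchor Regularization idea above (pre-defined class anchors kept separable by a classifier-aware auxiliary cross-entropy, with learned features pulled toward their class anchor) can be sketched as follows. The exact loss composition, the MSE pull term, and the weighting are assumptions, not the paper's precise objective:

```python
import torch
import torch.nn.functional as F

def sar_style_loss(features, labels, anchors, classifier, lam=1.0):
    """Sketch of an anchor-regularized objective in the spirit of SAR.

    features: (B, D) learned features; labels: (B,) class ids;
    anchors: (C, D) pre-defined class anchors; classifier: nn.Module D -> C.
    """
    pull = F.mse_loss(features, anchors[labels])  # pull features to their anchor
    # keep anchors inter-class separable via a classifier-aware cross-entropy
    sep = F.cross_entropy(classifier(anchors),
                          torch.arange(anchors.size(0), device=anchors.device))
    return pull + lam * sep
```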
In\nfact, some channels are noisy and hinder the accuracy of the task. To overcome\nthis issue, this paper proposes two channel reduction modules: Noise Channel\nReduction (NCR) and Deviation-based Channel Reduction (DCR). The NCR retains\nchannels with lower entropy, as they are less likely to be noisy, while DCR\nprunes channels with low standard deviation, as they lack sufficient\ninformation for effective instance segmentation. Furthermore, the paper\ndemonstrates that the dot product, commonly used in deep spectral methods, is\nnot suitable for instance segmentation due to its sensitivity to feature map\nvalues, potentially leading to incorrect instance segments. A new similarity\nmetric called Bray-Curtis over Chebyshev (BoC) is proposed to address this\nissue. It takes into account the distribution of features in addition to their\nvalues, providing a more robust similarity measure for instance segmentation.\nQuantitative and qualitative results on the Youtube-VIS2019 dataset highlight\nthe improvements achieved by the proposed channel reduction methods and the use\nof BoC instead of the conventional dot product for creating the affinity\nmatrix. These improvements are observed in terms of mean Intersection over\nUnion and extracted instance segments, demonstrating enhanced instance\nsegmentation performance. The code is available at:\nhttps://github.com/farnooshar/SpecUnIIS\n","authors":["Farnoosh Arefi","Amir M. Mansourian","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2402.02474v1.pdf","comment":"11 pages, 13 figures and 5 tables"},{"id":"http://arxiv.org/abs/2306.04607v7","updated":"2024-02-04T13:05:49Z","published":"2023-06-07T17:17:58Z","title":"GeoDiffusion: Text-Prompted Geometric Control for Object Detection Data\n Generation","summary":" Diffusion models have attracted significant attention due to their remarkable\nability to create content and generate data for tasks like image\nclassification. However, the usage of diffusion models to generate\nhigh-quality object detection data remains an underexplored area, where not\nonly image-level perceptual quality but also geometric conditions such as\nbounding boxes and camera views are essential. Previous studies have utilized\neither copy-paste synthesis or layout-to-image (L2I) generation with\nspecifically designed modules to encode the semantic layouts. In this paper, we\npropose GeoDiffusion, a simple framework that can flexibly translate\nvarious geometric conditions into text prompts and empower pre-trained\ntext-to-image (T2I) diffusion models for high-quality detection data\ngeneration. Unlike previous L2I methods, our GeoDiffusion is able to encode not\nonly the bounding boxes but also extra geometric conditions such as camera\nviews in self-driving scenes. Extensive experiments demonstrate GeoDiffusion\noutperforms previous L2I methods while training 4x faster. To\nthe best of our knowledge, this is the first work to adopt diffusion models for\nlayout-to-image generation with geometric conditions and demonstrate that\nL2I-generated images can be beneficial for improving the performance of object\ndetectors.\n","authors":["Kai Chen","Enze Xie","Zhe Chen","Yibo Wang","Lanqing Hong","Zhenguo Li","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2306.04607v7.pdf","comment":"Accepted by ICLR 2024. 
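The NCR/DCR modules above keep low-entropy channels and then drop low-standard-deviation ones before building the affinity matrix. A rough sketch of such channel filtering; the histogram bin count, keep ratios, and the ordering of the two filters are assumptions:

```python
import numpy as np

def prune_channels(fmap, keep_ratio=0.5):
    """Entropy- then deviation-based channel reduction on a (C, H, W) feature map."""
    flat = fmap.reshape(fmap.shape[0], -1)
    ent = []
    for ch in flat:
        hist, _ = np.histogram(ch, bins=32)
        p = hist / max(hist.sum(), 1)
        ent.append(-(p[p > 0] * np.log(p[p > 0])).sum())
    ent = np.asarray(ent)
    low_entropy = np.argsort(ent)[: int(len(ent) * keep_ratio)]   # NCR: less noisy
    std = flat[low_entropy].std(axis=1)
    kept = low_entropy[np.argsort(-std)[: max(1, int(len(low_entropy) * keep_ratio))]]
    return fmap[kept]                                             # DCR: informative
```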
Project Page:\n https://kaichen1998.github.io/projects/geodiffusion/"},{"id":"http://arxiv.org/abs/2312.15740v2","updated":"2024-02-04T12:32:35Z","published":"2023-12-25T14:25:43Z","title":"BiSwift: Bandwidth Orchestrator for Multi-Stream Video Analytics on Edge","summary":" High-definition (HD) cameras for surveillance and road traffic have\nexperienced tremendous growth, demanding intensive computation resources for\nreal-time analytics. Recently, offloading frames from the front-end device to\nthe back-end edge server has shown great promise. In multi-stream competitive\nenvironments, efficient bandwidth management and proper scheduling are crucial\nto ensure both high inference accuracy and high throughput. To achieve this\ngoal, we propose BiSwift, a bi-level framework that scales concurrent\nreal-time video analytics by a novel adaptive hybrid codec integrated with\nmulti-level pipelines, and a global bandwidth controller for multiple video\nstreams. The lower-level front-back-end collaborative mechanism (called\nadaptive hybrid codec) locally optimizes the accuracy and accelerates\nend-to-end video analytics for a single stream. The upper-level scheduler aims\nto achieve accuracy fairness among multiple streams via the global bandwidth\ncontroller. The evaluation of BiSwift shows that BiSwift is able to perform real-time\nobject detection on 9 streams with an edge device equipped with only an NVIDIA\nRTX3070 (8G) GPU. BiSwift improves accuracy by 10%$\\sim$21% and achieves\n1.2$\\sim$9$\\times$ the throughput of state-of-the-art video\nanalytics pipelines.\n","authors":["Lin Sun","Weijun Wang","Tingting Yuan","Liang Mi","Haipeng Dai","Yunxin Liu","Xiaoming Fu"],"pdf_url":"https://arxiv.org/pdf/2312.15740v2.pdf","comment":"Accepted by 2024 IEEE INFOCOM"},{"id":"http://arxiv.org/abs/2402.02453v1","updated":"2024-02-04T11:49:51Z","published":"2024-02-04T11:49:51Z","title":"AI Art Neural Constellation: Revealing the Collective and Contrastive\n State of AI-Generated and Human Art","summary":" Discovering the creative potentials of a random signal to various artistic\nexpressions in aesthetic and conceptual richness is a ground for the recent\nsuccess of generative machine learning as a way of art creation. To understand\nthe new artistic medium better, we conduct a comprehensive analysis to position\nAI-generated art within the context of human art heritage. Our comparative\nanalysis is based on an extensive dataset, dubbed ``ArtConstellation,''\nconsisting of annotations about art principles, likability, and emotions for\n6,000 WikiArt and 3,200 AI-generated artworks. After training various\nstate-of-the-art generative models, art samples are produced and compared with\nWikiArt data on the last hidden layer of a deep-CNN trained for style\nclassification. We actively examined the various art principles to interpret\nthe neural representations and used them to drive the comparative knowledge\nabout human and AI-generated art. A key finding in the semantic analysis is\nthat AI-generated artworks are visually related to the principle concepts for\nmodern period art made in 1800-2000. In addition, through Out-Of-Distribution\n(OOD) and In-Distribution (ID) detection in CLIP space, we find that\nAI-generated artworks are ID to human art when they depict landscapes and\ngeometric abstract figures, while detected as OOD when the machine art consists\nof deformed and twisted figures. We observe that machine-generated art is\nuniquely characterized by incomplete and reduced figuration. 
Lastly, we\nconducted a human survey about emotional experience. Color composition and\nfamiliar subjects are the key factors of likability and emotions in art\nappreciation. We propose our whole methodologies and collected dataset as our\nanalytical framework to contrast human and AI-generated art, which we refer to\nas ``ArtNeuralConstellation''. Code is available at:\nhttps://github.com/faixan-khan/ArtNeuralConstellation\n","authors":["Faizan Farooq Khan","Diana Kim","Divyansh Jha","Youssef Mohamed","Hanna H Chang","Ahmed Elgammal","Luba Elliott","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2402.02453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02444v1","updated":"2024-02-04T10:52:43Z","published":"2024-02-04T10:52:43Z","title":"BECLR: Batch Enhanced Contrastive Few-Shot Learning","summary":" Learning quickly from very few labeled samples is a fundamental attribute\nthat separates machines and humans in the era of deep representation learning.\nUnsupervised few-shot learning (U-FSL) aspires to bridge this gap by discarding\nthe reliance on annotations at training time. Intrigued by the success of\ncontrastive learning approaches in the realm of U-FSL, we structurally approach\ntheir shortcomings in both pretraining and downstream inference stages. We\npropose a novel Dynamic Clustered mEmory (DyCE) module to promote a highly\nseparable latent representation space for enhancing positive sampling at the\npretraining phase and infusing implicit class-level insights into unsupervised\ncontrastive learning. We then tackle the, somehow overlooked yet critical,\nissue of sample bias at the few-shot inference stage. We propose an iterative\nOptimal Transport-based distribution Alignment (OpTA) strategy and demonstrate\nthat it efficiently addresses the problem, especially in low-shot scenarios\nwhere FSL approaches suffer the most from sample bias. We later on discuss that\nDyCE and OpTA are two intertwined pieces of a novel end-to-end approach (we\ncoin as BECLR), constructively magnifying each other's impact. We then present\na suite of extensive quantitative and qualitative experimentation to\ncorroborate that BECLR sets a new state-of-the-art across ALL existing U-FSL\nbenchmarks (to the best of our knowledge), and significantly outperforms the\nbest of the current baselines (codebase available at:\nhttps://github.com/stypoumic/BECLR).\n","authors":["Stylianos Poulakakis-Daktylidis","Hadi Jamali-Rad"],"pdf_url":"https://arxiv.org/pdf/2402.02444v1.pdf","comment":"ICLR 2024 Spotlight Presentation"},{"id":"http://arxiv.org/abs/2306.04940v3","updated":"2024-02-04T10:34:10Z","published":"2023-06-08T05:13:34Z","title":"LayerAct: Advanced activation mechanism utilizing layer-direction\n normalization for CNNs with BatchNorm","summary":" In this work, we propose a novel activation mechanism aimed at establishing\nlayer-level activation (LayerAct) functions for CNNs with BatchNorm. These\nfunctions are designed to be more noise-robust compared to existing\nelement-level activation functions by reducing the layer-level fluctuation of\nthe activation outputs due to shift in inputs. Moreover, the LayerAct functions\nachieve this noise-robustness independent of the activation's saturation state,\nwhich limits the activation output space and complicates efficient training. 
We\npresent an analysis and experiments demonstrating that LayerAct functions\nexhibit superior noise-robustness compared to element-level activation\nfunctions, and empirically show that these functions have a zero-like mean\nactivation. Experimental results with three clean and three out-of-distribution\nbenchmark datasets for image classification tasks show that LayerAct functions\nexcel in handling noisy datasets, outperforming element-level activation\nfunctions, while the performance on clean datasets is also superior in most\ncases.\n","authors":["Kihyuk Yoon","Chiehyeon Lim"],"pdf_url":"https://arxiv.org/pdf/2306.04940v3.pdf","comment":"10 pages, 3 figures, 3 tables except appendix"},{"id":"http://arxiv.org/abs/2310.18961v4","updated":"2024-02-04T10:28:24Z","published":"2023-10-29T10:03:49Z","title":"AnomalyCLIP: Object-agnostic Prompt Learning for Zero-shot Anomaly\n Detection","summary":" Zero-shot anomaly detection (ZSAD) requires detection models trained using\nauxiliary data to detect anomalies without any training sample in a target\ndataset. It is a crucial task when training data is not accessible due to\nvarious concerns, \\eg, data privacy, yet it is challenging since the models\nneed to generalize to anomalies across different domains where the appearance\nof foreground objects, abnormal regions, and background features, such as\ndefects/tumors on different products/organs, can vary significantly. Recently\nlarge pre-trained vision-language models (VLMs), such as CLIP, have\ndemonstrated strong zero-shot recognition ability in various vision tasks,\nincluding anomaly detection. However, their ZSAD performance is weak since the\nVLMs focus more on modeling the class semantics of the foreground objects\nrather than the abnormality/normality in the images. In this paper we introduce\na novel approach, namely AnomalyCLIP, to adapt CLIP for accurate ZSAD across\ndifferent domains. The key insight of AnomalyCLIP is to learn object-agnostic\ntext prompts that capture generic normality and abnormality in an image\nregardless of its foreground objects. This allows our model to focus on the\nabnormal image regions rather than the object semantics, enabling generalized\nnormality and abnormality recognition on diverse types of objects. Large-scale\nexperiments on 17 real-world anomaly detection datasets show that AnomalyCLIP\nachieves superior zero-shot performance of detecting and segmenting anomalies\nin datasets of highly diverse class semantics from various defect inspection\nand medical imaging domains. Code will be made available at\nhttps://github.com/zqhang/AnomalyCLIP.\n","authors":["Qihang Zhou","Guansong Pang","Yu Tian","Shibo He","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2310.18961v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02433v1","updated":"2024-02-04T10:11:27Z","published":"2024-02-04T10:11:27Z","title":"Uncertainty-Aware Perceiver","summary":" The Perceiver makes few architectural assumptions about the relationship\namong its inputs with quadratic scalability on its memory and computation time.\nIndeed, the Perceiver model outpaces or is competitive with ResNet-50 and ViT\nin terms of accuracy to some degree. However, the Perceiver does not take\npredictive uncertainty and calibration into account. The Perceiver also\ngeneralizes its performance on three datasets, three models, one evaluation\nmetric, and one hyper-parameter setting. Worst of all, the Perceiver's relative\nperformance improvement against other models is marginal. 
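AnomalyCLIP, summarized above, scores abnormality by comparing image (or patch) features against object-agnostic "normal" vs. "abnormal" text prompt embeddings. A minimal scoring sketch over precomputed, L2-normalized embeddings; the two-prompt setup and the temperature value are assumptions:

```python
import numpy as np

def anomaly_score(img_feat, normal_feat, abnormal_feat, temp=0.07):
    """Softmax over cosine similarities to 'normal' vs 'abnormal' prompt embeddings.

    All three inputs are assumed to be L2-normalized 1-D feature vectors,
    e.g. from a CLIP-style image and text encoder.
    """
    logits = np.array([img_feat @ normal_feat, img_feat @ abnormal_feat]) / temp
    p = np.exp(logits - logits.max())
    return float((p / p.sum())[1])  # probability mass on the abnormal prompt
```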
Furthermore, its\nreduction of architectural priors is not substantial, nor does it translate\ninto quality. I therefore introduce five variants of the Perceiver, the\nUncertainty-Aware Perceivers, that obtain uncertainty estimates, and measure\ntheir performance on three metrics. In experiments on CIFAR-10 and CIFAR-100,\nthe Uncertainty-Aware Perceivers achieve considerable performance improvements\nover the Perceiver.\n","authors":["EuiYul Song"],"pdf_url":"https://arxiv.org/pdf/2402.02433v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.02431v1","updated":"2024-02-04T10:00:00Z","published":"2024-02-04T10:00:00Z","title":"Learning Mutual Excitation for Hand-to-Hand and Human-to-Human\n Interaction Recognition","summary":" Recognizing interactive actions, including hand-to-hand interaction and\nhuman-to-human interaction, has attracted increasing attention for various\napplications in the field of video analysis and human-robot interaction.\nConsidering the success of graph convolution in modeling topology-aware\nfeatures from skeleton data, recent methods commonly operate graph convolution\non separate entities and use late fusion for interactive action recognition,\nwhich can barely model the mutual semantic relationships between pairwise\nentities. To this end, we propose a mutual excitation graph convolutional\nnetwork (me-GCN) by stacking mutual excitation graph convolution (me-GC)\nlayers. Specifically, me-GC uses a mutual topology excitation module to first\nextract adjacency matrices from individual entities and then adaptively model\nthe mutual constraints between them. Moreover, me-GC extends the above idea and\nfurther uses a mutual feature excitation module to extract and merge deep\nfeatures from pairwise entities. Compared with graph convolution, our proposed\nme-GC gradually learns mutual information in each layer and each stage of graph\nconvolution operations. Extensive experiments on a challenging hand-to-hand\ninteraction dataset, i.e., the Assembly101 dataset, and two large-scale\nhuman-to-human interaction datasets, i.e., NTU60-Interaction and\nNTU120-Interaction, consistently verify the superiority of our proposed method,\nwhich outperforms the state-of-the-art GCN-based and Transformer-based methods.\n","authors":["Mengyuan Liu","Chen Chen","Songtao Wu","Fanyang Meng","Hong Liu"],"pdf_url":"https://arxiv.org/pdf/2402.02431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02430v1","updated":"2024-02-04T09:59:18Z","published":"2024-02-04T09:59:18Z","title":"Exploiting Low-level Representations for Ultra-Fast Road Segmentation","summary":" Achieving both real-time speed and accuracy on embedded platforms has always been the\npursuit of road segmentation methods. To this end, many lightweight networks have been\nproposed. However, these ignore the fact that roads are \"stuff\"\n(background or environmental elements) rather than \"things\" (specific\nidentifiable objects), which inspires us to explore the feasibility of\nrepresenting roads with low-level instead of high-level features. Surprisingly,\nwe find that the primary stage of mainstream network models is sufficient to\nrepresent most pixels of the road for segmentation. Motivated by this, we\npropose a Low-level Feature Dominated Road Segmentation network (LFD-RoadSeg).\nSpecifically, LFD-RoadSeg employs a bilateral structure. The spatial detail\nbranch is first designed to extract a low-level feature representation for the\nroad by the first stage of ResNet-18. 
To suppress texture-less regions mistaken\nfor the road in the low-level feature, the context semantic branch is then\ndesigned to extract the context feature in a fast manner. To this end, in the\nsecond branch, we asymmetrically downsample the input image and design an\naggregation module to achieve comparable receptive fields to the third stage of\nResNet-18 but with less time consumption. Finally, to segment the road from the\nlow-level feature, a selective fusion module is proposed to calculate\npixel-wise attention between the low-level representation and context feature,\nand suppress the non-road low-level response by this attention. On KITTI-Road,\nLFD-RoadSeg achieves a maximum F1-measure (MaxF) of 95.21% and an average\nprecision of 93.71%, while reaching 238 FPS on a single TITAN Xp and 54 FPS on\na Jetson TX2, all with a compact model size of just 936k parameters. The source\ncode is available at https://github.com/zhouhuan-hust/LFD-RoadSeg.\n","authors":["Huan Zhou","Feng Xue","Yucong Li","Shi Gong","Yiqun Li","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.02430v1.pdf","comment":"11 pages, 7 figures, IEEE TITS"},{"id":"http://arxiv.org/abs/2402.02411v1","updated":"2024-02-04T09:07:28Z","published":"2024-02-04T09:07:28Z","title":"Physics-Inspired Degradation Models for Hyperspectral Image Fusion","summary":" The fusion of a low-spatial-resolution hyperspectral image (LR-HSI) with a\nhigh-spatial-resolution multispectral image (HR-MSI) has garnered increasing\nresearch interest. However, most fusion methods solely focus on the fusion\nalgorithm itself and overlook the degradation models, which results in\nunsatisfactory performance in practical scenarios. To fill this gap, we propose\nphysics-inspired degradation models (PIDM) to model the degradation of LR-HSI\nand HR-MSI, which comprises a spatial degradation network (SpaDN) and a\nspectral degradation network (SpeDN). SpaDN and SpeDN are designed based on two\ninsights. First, we employ spatial warping and spectral modulation operations\nto simulate lens aberrations, thereby introducing non-uniformity into the\nspatial and spectral degradation processes. Second, we utilize asymmetric\ndownsampling and parallel downsampling operations to separately reduce the\nspatial and spectral resolutions of the images, thus ensuring the matching of\nspatial and spectral degradation processes with specific physical\ncharacteristics. Once SpaDN and SpeDN are established, we adopt a\nself-supervised training strategy to optimize the network parameters and\nprovide a plug-and-play solution for fusion methods. Comprehensive experiments\ndemonstrate that our proposed PIDM can boost the fusion performance of existing\nfusion methods in practical scenarios.\n","authors":["Jie Lian","Lizhi Wang","Lin Zhu","Renwei Dian","Zhiwei Xiong","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2402.02411v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2402.02626v1","updated":"2024-02-04T22:15:30Z","published":"2024-02-04T22:15:30Z","title":"Position bias in features","summary":" The purpose of modeling document relevance for search engines is to rank\nbetter in subsequent searches. Document-specific historical click-through rates\ncan be important features in a dynamic ranking system which updates as we\naccumulate more samples. This paper describes the properties of several such\nfeatures, and tests them in controlled experiments. 
Extending the inverse\npropensity weighting method to documents creates an unbiased estimate of\ndocument relevance. This feature can approximate relevance accurately, leading\nto near-optimal ranking in ideal circumstances. However, it has high variance\nthat is increasing with respect to the degree of position bias. Furthermore,\ninaccurate position bias estimation leads to poor performance. Under several\nscenarios this feature can perform worse than biased click-through rates. This\npaper underscores the need for accurate position bias estimation, and is unique\nin suggesting simultaneous use of biased and unbiased position bias features.\n","authors":["Richard Demsyn-Jones"],"pdf_url":"https://arxiv.org/pdf/2402.02626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07134v4","updated":"2024-02-04T22:08:05Z","published":"2023-08-14T13:41:09Z","title":"Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models has revolutionized\nvarious AI research domains. Transformers-based Large Language Models (LLMs)\nhave gradually replaced CNNs and RNNs to unify fields of computer vision and\nnatural language processing. Compared with independent data samples such as\nimages, videos or texts, graphs usually contain rich structural and relational\ninformation. Meanwhile, language, especially natural language, being one of the\nmost expressive mediums, excels in describing complex structures. However,\nexisting work on incorporating graph problems into the generative language\nmodeling framework remains very limited. Considering the rising prominence of\nLLMs, it becomes essential to explore whether LLMs can also replace GNNs as the\nfoundation model for graphs. In this paper, we propose InstructGLM\n(Instruction-finetuned Graph Language Model) with highly scalable prompts based\non natural language instructions. We use natural language to describe\nmulti-scale geometric structure of the graph and then instruction finetune an\nLLM to perform graph tasks, which enables Generative Graph Learning. Our method\nsurpasses all GNN baselines on ogbn-arxiv, Cora and PubMed datasets,\nunderscoring its effectiveness and sheds light on generative LLMs as new\nfoundation model for graph machine learning. Our code is open-sourced at\nhttps://github.com/agiresearch/InstructGLM.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v4.pdf","comment":"In EACL 2024"},{"id":"http://arxiv.org/abs/2304.07944v3","updated":"2024-02-04T22:04:34Z","published":"2023-04-17T01:55:40Z","title":"An In-depth Investigation of User Response Simulation for Conversational\n Search","summary":" Conversational search has seen increased recent attention in both the IR and\nNLP communities. It seeks to clarify and solve users' search needs through\nmulti-turn natural language interactions. However, most existing systems are\ntrained and demonstrated with recorded or artificial conversation logs.\nEventually, conversational search systems should be trained, evaluated, and\ndeployed in an open-ended setting with unseen conversation trajectories. A key\nchallenge is that training and evaluating such systems both require a\nhuman-in-the-loop, which is expensive and does not scale. One strategy is to\nsimulate users, thereby reducing the scaling costs. 
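The unbiased document-relevance feature described above weights clicks by the inverse of the position-examination propensity. A sketch of that estimator, which also makes the variance issue visible (a few low-propensity clicks dominate the sum); the data layout is an assumption:

```python
def ipw_relevance(impressions, min_prop=1e-6):
    """Inverse-propensity-weighted click-through estimate for one document.

    impressions: iterable of (clicked, examine_prob) pairs, where examine_prob
    is the estimated probability that the rank the document was shown at was
    examined (the position-bias propensity).
    """
    n, weighted = 0, 0.0
    for clicked, prop in impressions:
        n += 1
        if clicked:
            weighted += 1.0 / max(prop, min_prop)  # small propensities inflate variance
    return weighted / n if n else 0.0
```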
However, current user\nsimulators are either limited to only responding to yes-no questions from the\nconversational search system or unable to produce high-quality responses in\ngeneral.\n In this paper, we show that existing user simulation systems could be\nsignificantly improved by a smaller finetuned natural language generation\nmodel. However, rather than merely reporting it as the new state-of-the-art, we\nconsider it a strong baseline and present an in-depth investigation of\nsimulating user response for conversational search. Our goal is to supplement\nexisting work with an insightful hand-analysis of unsolved challenges by the\nbaseline and propose our solutions. The challenges we identified include (1) a\nblind spot that is difficult to learn, and (2) a specific type of misevaluation\nin the standard setup. We propose a new generation system to effectively cover\nthe training blind spot and suggest a new evaluation setup to avoid\nmisevaluation. Our proposed system leads to significant improvements over\nexisting systems and large language models such as GPT-4. Additionally, our\nanalysis provides insights into the nature of user simulation to facilitate\nfuture work.\n","authors":["Zhenduo Wang","Zhichao Xu","Qingyao Ai","Vivek Srikumar"],"pdf_url":"https://arxiv.org/pdf/2304.07944v3.pdf","comment":"To appear in The Web Conference 2024, 8 pages with Appendices"},{"id":"http://arxiv.org/abs/2312.13434v3","updated":"2024-02-04T10:45:40Z","published":"2023-12-20T21:20:23Z","title":"Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of\n Early-bird Students towards Three Diagnostic Objectives","summary":" Cognitive diagnosis seeks to estimate the cognitive states of students by\nexploring their logged practice quiz data. It plays a pivotal role in\npersonalized learning guidance within intelligent education systems. In this\npaper, we focus on an important, practical, yet often underexplored task:\ndomain-level zero-shot cognitive diagnosis (DZCD), which arises due to the\nabsence of student practice logs in newly launched domains. Recent cross-domain\ndiagnostic models have been demonstrated to be a promising strategy for DZCD.\nThese methods primarily focus on how to transfer student states across domains.\nHowever, they might inadvertently incorporate non-transferable information into\nstudent representations, thereby limiting the efficacy of knowledge transfer.\nTo tackle this, we propose Zero-1-to-3, a domain-level zero-shot cognitive\ndiagnosis framework via one batch of early-bird students towards three\ndiagnostic objectives. Our approach initiates with pre-training a diagnosis\nmodel with dual regularizers, which decouples student states into domain-shared\nand domain-specific parts. The shared cognitive signals can be transferred to\nthe target domain, enriching the cognitive priors for the new domain, which\nensures the cognitive state propagation objective. Subsequently, we devise a\nstrategy to generate simulated practice logs for cold-start students through\nanalyzing the behavioral patterns from early-bird students, fulfilling the\ndomain-adaption goal. Consequently, we refine the cognitive states of\ncold-start students as diagnostic outcomes via virtual data, aligning with the\ndiagnosis-oriented goal. Finally, extensive experiments on six real-world\ndatasets highlight the efficacy of our model for DZCD and its practical\napplication in question recommendation. 
The code is publicly available at\nhttps://github.com/bigdata-ustc/Zero-1-to-3.\n","authors":["Weibo Gao","Qi Liu","Hao Wang","Linan Yue","Haoyang Bi","Yin Gu","Fangzhou Yao","Zheng Zhang","Xin Li","Yuanjing He"],"pdf_url":"https://arxiv.org/pdf/2312.13434v3.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2402.02418v1","updated":"2024-02-04T09:34:13Z","published":"2024-02-04T09:34:13Z","title":"eXplainable Bayesian Multi-Perspective Generative Retrieval","summary":" Modern deterministic retrieval pipelines prioritize achieving\nstate-of-the-art performance but often lack interpretability in\ndecision-making. These models face challenges in assessing uncertainty, leading\nto overconfident predictions. To overcome these limitations, we integrate\nuncertainty calibration and interpretability into a retrieval pipeline.\nSpecifically, we introduce Bayesian methodologies and multi-perspective\nretrieval to calibrate uncertainty within a retrieval pipeline. We incorporate\ntechniques such as LIME and SHAP to analyze the behavior of a black-box\nreranker model. The importance scores derived from these explanation\nmethodologies serve as supplementary relevance scores to enhance the base\nreranker model. We evaluate the resulting performance enhancements achieved\nthrough uncertainty calibration and interpretable reranking on Question\nAnswering and Fact Checking tasks. Our methods demonstrate substantial\nperformance improvements across three KILT datasets.\n","authors":["EuiYul Song","Philhoon Oh","Sangryul Kim","James Thorne"],"pdf_url":"https://arxiv.org/pdf/2402.02418v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.11767v2","updated":"2024-02-04T06:59:45Z","published":"2023-08-15T23:22:37Z","title":"Detection of ChatGPT Fake Science with the xFakeBibs Learning Algorithm","summary":" ChatGPT is becoming a new reality. In this paper, we demonstrate a method for\ndistinguishing ChatGPT-generated publications from those produced by\nscientists. The objective of this work is to introduce a newly designed\nsupervised network-driven algorithm that illustrates how to predict\nmachine-generated content. The premise is that ChatGPT content exhibits\nbehavior that is distinctive and can be set apart from scientific articles. The\nalgorithm was trained and tested on three disease-specific publications, with\neach model constructed from 100 abstracts. Additionally, the algorithm\nunderwent k-Folds calibration (depending on the availability of the data) to\nestablish a lower-upper bound range of acceptance. The network training model\nof ChatGPT showed a lower number of nodes and a higher number of edges when\ncompared with models of real article abstracts. The algorithm was executed in\nsingle-mode to predict the class of one type of dataset at a time and achieved\n>94%. It was also executed in multi-mode on mixed documents of ChatGPT and\nPubMed abstracts. The algorithm remarkably predicted real articles with a\nprecision of 100% and, on rare occasions, 96%-98%. However, ChatGPT content was\noften misclassified as real publications with up to 88% accuracy in all\ndatasets of the three diseases. 
Our results also showed that the year of\nthe publications mixed with ChatGPT-generated content may play a role in\ndetecting the correct class: the older the publication, the better the\nprediction.\n","authors":["Ahmed Abdeen Hamed","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2308.11767v2.pdf","comment":"14 pages, 6 figures, 4 tables, 2 algorithms"},{"id":"http://arxiv.org/abs/2402.03388v1","updated":"2024-02-04T10:18:33Z","published":"2024-02-04T10:18:33Z","title":"Delivery Optimized Discovery in Behavioral User Segmentation under\n Budget Constrain","summary":" Users' behavioral footprints online enable firms to discover behavior-based\nuser segments (or, segments) and deliver segment specific messages to users.\nFollowing the discovery of segments, delivery of messages to users through\npreferred media channels like Facebook and Google can be challenging, as only a\nportion of users in a behavior segment find a match in a medium, and only a\nfraction of those matched actually see the message (exposure). Even high\nquality discovery becomes futile when delivery fails. Many sophisticated\nalgorithms exist for discovering behavioral segments; however, these ignore the\ndelivery component. The problem is compounded because (i) the discovery is\nperformed on the behavior data space in firms' data (e.g., user clicks), while\nthe delivery is predicated on the static data space (e.g., geo, age) as defined\nby media; and (ii) firms work under a budget constraint. We introduce a\nstochastic optimization based algorithm for delivery optimized discovery of\nbehavioral user segmentation and offer new metrics to address the joint\noptimization. We leverage optimization under a budget constraint for delivery\ncombined with a learning-based component for discovery. Extensive experiments\non a public dataset from Google and a proprietary dataset show the\neffectiveness of our approach by simultaneously improving delivery metrics,\nreducing budget spend and achieving strong predictive performance in discovery.\n","authors":["Harshita Chopra","Atanu R. Sinha","Sunav Choudhary","Ryan A. Rossi","Paavan Kumar Indela","Veda Pranav Parwatala","Srinjayee Paul","Aurghya Maiti"],"pdf_url":"https://arxiv.org/pdf/2402.03388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03380v1","updated":"2024-02-04T05:46:21Z","published":"2024-02-04T05:46:21Z","title":"Modified K-means with Cluster Assignment -- Application to COVID-19 Data","summary":" Text extraction is a highly subjective problem which depends on the dataset\nthat one is working on and the kind of summarization details that need to be\nextracted. All the steps, ranging from preprocessing of the data to the\nchoice of an optimal model for predictions, depend on the problem and the\ncorpus at hand. In this paper, we describe a text extraction model where the\naim is to extract word-specific information relating to the semantics such\nthat we can get all related and meaningful information about that word in a\nsuccinct format. This model can obtain meaningful results and can augment a\nubiquitous search model or standard clustering and topic modelling algorithms.\nBy utilizing a new technique called the two-cluster assignment technique with the K-means\nmodel, we improve the ontology of the retrieved text. We further apply the\nvector average damping technique for flexible movement of clusters. Our\nexperimental results on a recent corpus of Covid-19 show that we obtain good\nresults based on main keywords.\n","authors":["Shreyash Rawat","V. 
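The modified K-means just described assigns each point to two clusters and damps centroid movement by vector averaging. A rough sketch of those two pieces; the exact update rule and the damping factor are assumptions:

```python
import numpy as np

def two_cluster_assignments(X, centroids):
    """Return, for each point in X (N, d), the indices of its two nearest centroids."""
    d = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)  # (N, K)
    return np.argsort(d, axis=1)[:, :2]

def damped_centroid_update(old, new, beta=0.5):
    """Vector-average damping: move centroids only part-way toward the new means."""
    return beta * old + (1.0 - beta) * new
```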
Vijayarajan","V. B. Surya Prasath"],"pdf_url":"https://arxiv.org/pdf/2402.03380v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2402.03379v1","updated":"2024-02-04T03:30:25Z","published":"2024-02-04T03:30:25Z","title":"Entire Chain Uplift Modeling with Context-Enhanced Learning for\n Intelligent Marketing","summary":" Uplift modeling, vital in online marketing, seeks to accurately measure the\nimpact of various strategies, such as coupons or discounts, on different users\nby predicting the Individual Treatment Effect (ITE). In an e-commerce setting,\nuser behavior follows a defined sequential chain, including impression, click,\nand conversion. Marketing strategies exert varied uplift effects at each stage\nwithin this chain, impacting metrics like click-through and conversion rate.\nDespite its utility, existing research has neglected to consider the inter-task\nacross all stages impacts within a specific treatment and has insufficiently\nutilized the treatment information, potentially introducing substantial bias\ninto subsequent marketing decisions. We identify these two issues as the\nchain-bias problem and the treatment-unadaptive problem. This paper introduces\nthe Entire Chain UPlift method with context-enhanced learning (ECUP), devised\nto tackle these issues. ECUP consists of two primary components: 1) the Entire\nChain-Enhanced Network, which utilizes user behavior patterns to estimate ITE\nthroughout the entire chain space, models the various impacts of treatments on\neach task, and integrates task prior information to enhance context awareness\nacross all stages, capturing the impact of treatment on different tasks, and 2)\nthe Treatment-Enhanced Network, which facilitates fine-grained treatment\nmodeling through bit-level feature interactions, thereby enabling adaptive\nfeature adjustment. Extensive experiments on public and industrial datasets\nvalidate ECUPs effectiveness. Moreover, ECUP has been deployed on the Meituan\nfood delivery platform, serving millions of daily active users, with the\nrelated dataset released for future research.\n","authors":["Yinqiu Huang","Shuli Wang","Min Gao","Xue Wei","Changhao Li","Chuan Luo","Yinhua Zhu","Xiong Xiao","Yi Luo"],"pdf_url":"https://arxiv.org/pdf/2402.03379v1.pdf","comment":"Accepted by WWW2024"},{"id":"http://arxiv.org/abs/2402.02335v1","updated":"2024-02-04T04:13:31Z","published":"2024-02-04T04:13:31Z","title":"Video Editing for Video Retrieval","summary":" Though pre-training vision-language models have demonstrated significant\nbenefits in boosting video-text retrieval performance from large-scale web\nvideos, fine-tuning still plays a critical role with manually annotated clips\nwith start and end times, which requires considerable human effort. To address\nthis issue, we explore an alternative cheaper source of annotations, single\ntimestamps, for video-text retrieval. We initialise clips from timestamps in a\nheuristic way to warm up a retrieval model. Then a video clip editing method is\nproposed to refine the initial rough boundaries to improve retrieval\nperformance. A student-teacher network is introduced for video clip editing.\nThe teacher model is employed to edit the clips in the training set whereas the\nstudent model trains on the edited clips. The teacher weights are updated from\nthe student's after the student's performance increases. Our method is model\nagnostic and applicable to any retrieval models. 
We conduct experiments based\non three state-of-the-art retrieval models, COOT, VideoCLIP and CLIP4Clip.\nExperiments conducted on three video retrieval datasets, YouCook2, DiDeMo and\nActivityNet-Captions show that our edited clips consistently improve retrieval\nperformance over initial clips across all three retrieval models.\n","authors":["Bin Zhu","Kevin Flanagan","Adriano Fragomeni","Michael Wray","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2402.02335v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2402.02369v1","updated":"2024-02-04T06:56:23Z","published":"2024-02-04T06:56:23Z","title":"M$^3$Face: A Unified Multi-Modal Multilingual Framework for Human Face\n Generation and Editing","summary":" Human face generation and editing represent an essential task in the era of\ncomputer vision and the digital world. Recent studies have shown remarkable\nprogress in multi-modal face generation and editing, for instance, using face\nsegmentation to guide image generation. However, it may be challenging for some\nusers to create these conditioning modalities manually. Thus, we introduce\nM3Face, a unified multi-modal multilingual framework for controllable face\ngeneration and editing. This framework enables users to utilize only text input\nto generate controlling modalities automatically, for instance, semantic\nsegmentation or facial landmarks, and subsequently generate face images. We\nconduct extensive qualitative and quantitative experiments to showcase our\nframework's face generation and editing capabilities. Additionally, we propose\nthe M3CelebA Dataset, a large-scale multi-modal and multilingual face dataset\ncontaining high-quality images, semantic segmentations, facial landmarks, and\ndifferent captions for each image in multiple languages. The code and the\ndataset will be released upon publication.\n","authors":["Mohammadreza Mofayezi","Reza Alipour","Mohammad Ali Kakavand","Ehsaneddin Asgari"],"pdf_url":"https://arxiv.org/pdf/2402.02369v1.pdf","comment":null}]},"2024-02-03T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2402.02246v1","updated":"2024-02-03T19:24:45Z","published":"2024-02-03T19:24:45Z","title":"ExTTNet: A Deep Learning Algorithm for Extracting Table Texts from\n Invoice Images","summary":" In this work, product tables in invoices are obtained autonomously via a deep\nlearning model, which is named ExTTNet. Firstly, text is obtained from\ninvoice images using Optical Character Recognition (OCR) techniques. The Tesseract\nOCR engine [37] is used for this process. Afterwards, the number of existing\nfeatures is increased by using feature extraction methods to improve the\naccuracy. The labeling process is done according to whether each text obtained as a\nresult of OCR is a table element or not. In this study, a multilayer artificial\nneural network model is used. The training was carried out on an Nvidia\nRTX 3090 graphics card and took $162$ minutes. 
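A minimal sketch of the ExTTNet-style pipeline just described (OCR, simple per-token features, then a small neural classifier over table/non-table labels); the feature set, file path, and stand-in labels below are illustrative assumptions, not the paper's:

```python
# Sketch only: Tesseract word boxes -> numeric features -> MLP token labels.
import pytesseract
from PIL import Image
from sklearn.neural_network import MLPClassifier

ocr = pytesseract.image_to_data(Image.open("invoice.png"),  # placeholder path
                                output_type=pytesseract.Output.DICT)
X = [[left, top, w, h, sum(c.isdigit() for c in text), len(text)]
     for text, left, top, w, h in zip(ocr["text"], ocr["left"], ocr["top"],
                                      ocr["width"], ocr["height"])]
# Crude stand-in for the manual table/non-table annotation the paper uses:
y = [int(feats[4] > 0) for feats in X]  # tokens containing digits
clf = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300).fit(X, y)
```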
As a result of the training,\nthe F1 score is $0.92$.\n","authors":["Adem Akdoğan","Murat Kurt"],"pdf_url":"https://arxiv.org/pdf/2402.02246v1.pdf","comment":"6 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2401.10244v3","updated":"2024-02-03T17:14:46Z","published":"2023-12-01T21:50:43Z","title":"Knowledge Graph Driven Recommendation System Algorithm","summary":" In this paper, we propose a novel graph neural network-based recommendation\nmodel called KGLN, which leverages Knowledge Graph (KG) information to enhance\nthe accuracy and effectiveness of personalized recommendations. We first use a\nsingle-layer neural network to merge individual node features in the graph, and\nthen adjust the aggregation weights of neighboring entities by incorporating\ninfluence factors. The model evolves from a single layer to multiple layers\nthrough iteration, enabling entities to access extensive multi-order associated\nentity information. The final step involves integrating features of entities\nand users to produce a recommendation score. The model performance was\nevaluated by comparing its effects on various aggregation methods and influence\nfactors. In tests over the MovieLen-1M and Book-Crossing datasets, KGLN shows\nan Area Under the ROC curve (AUC) improvement of 0.3% to 5.9% and 1.1% to 8.2%,\nrespectively, which is better than existing benchmark methods like LibFM,\nDeepFM, Wide&Deep, and RippleNet.\n","authors":["Chaoyang Zhang","Yanan Li","Shen Chen","Siwei Fan","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2401.10244v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02182v1","updated":"2024-02-03T15:14:51Z","published":"2024-02-03T15:14:51Z","title":"Diffusion Cross-domain Recommendation","summary":" It is always a challenge for recommender systems to give high-quality\noutcomes to cold-start users. One potential solution to alleviate the data\nsparsity problem for cold-start users in the target domain is to add data from\nthe auxiliary domain. Finding a proper way to extract knowledge from an\nauxiliary domain and transfer it into a target domain is one of the main\nobjectives for cross-domain recommendation (CDR) research. Among the existing\nmethods, mapping approach is a popular one to implement cross-domain\nrecommendation models (CDRs). For models of this type, a mapping module plays\nthe role of transforming data from one domain to another. It primarily\ndetermines the performance of mapping approach CDRs. Recently, diffusion\nprobability models (DPMs) have achieved impressive success for image synthesis\nrelated tasks. They involve recovering images from noise-added samples, which\ncan be viewed as a data transformation process with outstanding performance. To\nfurther enhance the performance of CDRs, we first reveal the potential\nconnection between DPMs and mapping modules of CDRs, and then propose a novel\nCDR model named Diffusion Cross-domain Recommendation (DiffCDR). More\nspecifically, we first adopt the theory of DPM and design a Diffusion Module\n(DIM), which generates user's embedding in target domain. To reduce the\nnegative impact of randomness introduced in DIM and improve the stability, we\nemploy an Alignment Module to produce the aligned user embeddings. In addition,\nwe consider the label data of the target domain and form the task-oriented loss\nfunction, which enables our DiffCDR to adapt to specific tasks. 
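The Diffusion Module (DIM) just described can be pictured as a DDPM-style denoiser over user embeddings, conditioned on the source domain. A toy sketch under assumed embedding sizes and conditioning scheme, not the authors' implementation:

```python
# Toy sketch: noise the target-domain user embedding (x0) with a forward
# process and train a denoiser conditioned on the source-domain embedding.
import torch
import torch.nn as nn

T, dim = 100, 64
betas = torch.linspace(1e-4, 0.02, T)
alpha_bar = torch.cumprod(1.0 - betas, dim=0)
denoiser = nn.Sequential(nn.Linear(dim * 2 + 1, 128), nn.SiLU(), nn.Linear(128, dim))

def diffusion_loss(x0, cond):
    """x0: (B, dim) target-domain embeddings; cond: (B, dim) source-domain."""
    t = torch.randint(0, T, (x0.size(0),))
    eps = torch.randn_like(x0)
    # forward process: x_t = sqrt(abar_t) * x0 + sqrt(1 - abar_t) * eps
    xt = alpha_bar[t].sqrt()[:, None] * x0 + (1 - alpha_bar[t]).sqrt()[:, None] * eps
    inp = torch.cat([xt, cond, t[:, None].float() / T], dim=1)
    return ((denoiser(inp) - eps) ** 2).mean()  # predict the injected noise

diffusion_loss(torch.randn(8, dim), torch.randn(8, dim)).backward()
```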
By conducting\nextensive experiments on real-world datasets, we demonstrate the\neffectiveness and adaptability of DiffCDR, which outperforms baseline models on\nvarious CDR tasks in both cold-start and warm-start scenarios.\n","authors":["Yuner Xuan"],"pdf_url":"https://arxiv.org/pdf/2402.02182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02175v1","updated":"2024-02-03T14:54:13Z","published":"2024-02-03T14:54:13Z","title":"Enhancing Complex Question Answering over Knowledge Graphs through\n Evidence Pattern Retrieval","summary":" Information retrieval (IR) methods for KGQA consist of two stages: subgraph\nextraction and answer reasoning. We argue that current subgraph extraction methods\nunderestimate the importance of structural dependencies among evidence facts.\nWe propose Evidence Pattern Retrieval (EPR) to explicitly model the structural\ndependencies during subgraph extraction. We implement EPR by indexing the\natomic adjacency pattern of resource pairs. Given a question, we perform dense\nretrieval to obtain atomic patterns formed by resource pairs. We then enumerate\ntheir combinations to construct candidate evidence patterns. These evidence\npatterns are scored using a neural model, and the best one is selected to\nextract a subgraph for downstream answer reasoning. Experimental results\ndemonstrate that the EPR-based approach has significantly improved the F1\nscores of IR-KGQA methods by over 10 points on ComplexWebQuestions and achieves\ncompetitive performance on WebQuestionsSP.\n","authors":["Wentao Ding","Jinmao Li","Liangchuan Luo","Yuzhong Qu"],"pdf_url":"https://arxiv.org/pdf/2402.02175v1.pdf","comment":"Accepted to TheWebConf'24 (WWW 2024). This is a preprint version; the\n CR version will include more details. Github:\n https://github.com/nju-websoft/EPR-KGQA"},{"id":"http://arxiv.org/abs/2402.02158v1","updated":"2024-02-03T14:11:55Z","published":"2024-02-03T14:11:55Z","title":"PatSTEG: Modeling Formation Dynamics of Patent Citation Networks via The\n Semantic-Topological Evolutionary Graph","summary":" Patent documents in the patent database (PatDB) are crucial for research,\ndevelopment, and innovation as they contain valuable technical information.\nHowever, PatDB presents a multifaceted challenge compared to publicly available\npreprocessed databases due to the intricate nature of the patent text and the\ninherent sparsity within the patent citation network. Although patent text\nanalysis and citation analysis bring new opportunities to explore patent data\nmining, no existing work exploits their complementarity. To this end, we\npropose a joint semantic-topological evolutionary graph learning approach\n(PatSTEG) to model the formation dynamics of patent citation networks. More\nspecifically, we first create a real-world dataset of Chinese patents named\nCNPat and leverage its patent texts and citations to construct a patent\ncitation network. Then, PatSTEG models the evolutionary dynamics\nof patent citation formation by considering the semantic and topological\ninformation jointly. Extensive experiments are conducted on CNPat and public\ndatasets to demonstrate the superiority of PatSTEG over other state-of-the-art\nmethods. 
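The EPR flow summarized above (dense-retrieve atomic adjacency patterns, enumerate their combinations, score the candidates, keep the best) can be sketched as follows; the data and the toy scorer are illustrative stand-ins for the paper's neural model:

```python
# Illustrative only: the toy scorer rewards triples that chain via shared
# variables; the paper scores candidate evidence patterns with a neural model.
from itertools import combinations

retrieved = [  # atomic adjacency patterns obtained by dense retrieval
    ("Q_person", "educated_at", "?university"),
    ("?university", "located_in", "Q_city"),
    ("Q_person", "spouse", "?x"),
]

def score(pattern):
    variables = [v for triple in pattern for v in triple if v.startswith("?")]
    return len(variables) - len(set(variables))  # shared variables chain facts

candidates = [c for r in (1, 2) for c in combinations(retrieved, r)]
best = max(candidates, key=score)  # the pattern used to extract the subgraph
print(best)
```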
All the results provide valuable references for patent literature\nresearch and technical exploration.\n","authors":["Ran Miao","Xueyu Chen","Liang Hu","Zhifei Zhang","Minghua Wan","Qi Zhang","Cairong Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02152v1","updated":"2024-02-03T13:46:28Z","published":"2024-02-03T13:46:28Z","title":"Position Paper: Why the Shooting in the Dark Method Dominates\n Recommender Systems Practice; A Call to Abandon Anti-Utopian Thinking","summary":" Applied recommender systems research is in a curious position. While there is\na very rigorous protocol for measuring performance by A/B testing, best\npractice for finding a `B' to test does not explicitly target performance but\nrather targets a proxy measure. The success or failure of a given A/B test then\ndepends entirely on if the proposed proxy is better correlated to performance\nthan the previous proxy. No principle exists to identify if one proxy is better\nthan another offline, leaving the practitioners shooting in the dark. The\npurpose of this position paper is to question this anti-Utopian thinking and\nargue that a non-standard use of the deep learning stacks actually has the\npotential to unlock reward optimizing recommendation.\n","authors":["David Rohde"],"pdf_url":"https://arxiv.org/pdf/2402.02152v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.05502v2","updated":"2024-02-03T09:54:51Z","published":"2023-08-10T11:14:22Z","title":"Bringing order into the realm of Transformer-based language models for\n artificial intelligence and law","summary":" Transformer-based language models (TLMs) have widely been recognized to be a\ncutting-edge technology for the successful development of deep-learning-based\nsolutions to problems and applications that require natural language processing\nand understanding. Like for other textual domains, TLMs have indeed pushed the\nstate-of-the-art of AI approaches for many tasks of interest in the legal\ndomain. Despite the first Transformer model being proposed about six years ago,\nthere has been a rapid progress of this technology at an unprecedented rate,\nwhereby BERT and related models represent a major reference, also in the legal\ndomain. This article provides the first systematic overview of TLM-based\nmethods for AI-driven problems and tasks in the legal sphere. A major goal is\nto highlight research advances in this field so as to understand, on the one\nhand, how the Transformers have contributed to the success of AI in supporting\nlegal processes, and on the other hand, what are the current limitations and\nopportunities for further research development.\n","authors":["Candida M. Greco","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2308.05502v2.pdf","comment":"Please refer to the published version: Greco, C.M., Tagarelli, A.\n (2023) Bringing order into the realm of Transformer-based language models for\n artificial intelligence and law. Artif Intell Law, Springer Nature. November\n 2023. https://doi.org/10.1007/s10506-023-09374-7"},{"id":"http://arxiv.org/abs/2402.02079v1","updated":"2024-02-03T08:19:26Z","published":"2024-02-03T08:19:26Z","title":"Prototypical Contrastive Learning through Alignment and Uniformity for\n Recommendation","summary":" Graph Collaborative Filtering (GCF), one of the most widely adopted\nrecommendation system methods, effectively captures intricate relationships\nbetween user and item interactions. 
Graph Contrastive Learning (GCL) based GCF\nhas gained significant attention as it leverages self-supervised techniques to\nextract valuable signals from real-world scenarios. However, many methods\nlearn instance-discrimination tasks in which contrastive pairs are\nconstructed through random sampling. Such GCL approaches\nsuffer from sampling bias issues, where the negatives might have a semantic\nstructure similar to that of the positives, thus leading to a loss of effective\nfeature representation. To address these problems, we present the\n\underline{Proto}typical contrastive learning through \underline{A}lignment and\n\underline{U}niformity for recommendation, which is called \textbf{ProtoAU}.\nSpecifically, we first propose prototypes (cluster centroids) as a latent space\nto ensure consistency across different augmentations from the original graph,\naiming to eliminate the need for random sampling of contrastive pairs.\nFurthermore, the absence of explicit negatives means that directly optimizing\nthe consistency loss between instance and prototype could easily result in\ndimensional collapse issues. Therefore, we propose aligning and maintaining\nuniformity in the prototypes of users and items as optimization objectives to\nprevent falling into trivial solutions. Finally, we conduct extensive\nexperiments on four datasets and evaluate performance on the task of link\nprediction. Experimental results demonstrate that the proposed ProtoAU\noutperforms other representative methods. The source codes of our proposed\nProtoAU are available at \url{https://github.com/oceanlvr/ProtoAU}.\n","authors":["Yangxun Ou","Lei Chen","Fenglin Pan","Yupeng Wu"],"pdf_url":"https://arxiv.org/pdf/2402.02079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02044v1","updated":"2024-02-03T05:43:39Z","published":"2024-02-03T05:43:39Z","title":"Locally-Adaptive Quantization for Streaming Vector Search","summary":" Retrieving the most similar vector embeddings to a given query among a\nmassive collection of vectors has long been a key component of countless\nreal-world applications. The recently introduced Retrieval-Augmented Generation\nis one of the most prominent examples. For many of these applications, the\ndatabase evolves over time by inserting new data and removing outdated data. In\nthese cases, the retrieval problem is known as streaming similarity search.\nWhile Locally-Adaptive Vector Quantization (LVQ), a highly efficient vector\ncompression method, yields state-of-the-art search performance for non-evolving\ndatabases, its usefulness in the streaming setting has not yet been\nestablished. In this work, we study LVQ in streaming similarity search. In\nsupport of our evaluation, we introduce two improvements of LVQ, Turbo LVQ and\nmulti-means LVQ, that boost its search performance by up to 28% and 27%,\nrespectively. Our studies show that LVQ and its new variants enable blazing\nfast vector search, outperforming its closest competitor by up to 9.4x for\nidentically distributed data and by up to 8.8x under the challenging scenario\nof data distribution shifts (i.e., where the statistical distribution of the\ndata changes over time). 
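The alignment-and-uniformity objectives that give ProtoAU its name are standard enough to sketch; how the two terms are weighted and which tensors uniformity is applied to are assumptions here, not the paper's exact recipe:

```python
# Compact sketch: align embeddings with their prototypes; keep prototypes
# spread on the hypersphere to avoid dimensional collapse.
import torch
import torch.nn.functional as F

def alignment(x, protos):
    """Pull each normalized embedding toward its assigned prototype."""
    return (F.normalize(x, dim=1) - F.normalize(protos, dim=1)).pow(2).sum(1).mean()

def uniformity(x, t=2.0):
    """Encourage normalized vectors to spread uniformly on the hypersphere."""
    x = F.normalize(x, dim=1)
    return F.pdist(x).pow(2).mul(-t).exp().mean().log()

emb = torch.randn(256, 64, requires_grad=True)      # user/item embeddings
protos = torch.randn(32, 64, requires_grad=True)    # cluster centroids
assign = torch.randint(0, 32, (256,))               # instance -> prototype
loss = alignment(emb, protos[assign]) + 0.5 * uniformity(protos)
loss.backward()
```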
We release our contributions as part of Scalable\nVector Search, an open-source library for high-performance similarity search.\n","authors":["Cecilia Aguerrebere","Mark Hildebrand","Ishwar Singh Bhati","Theodore Willke","Mariano Tepper"],"pdf_url":"https://arxiv.org/pdf/2402.02044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01963v1","updated":"2024-02-03T00:11:29Z","published":"2024-02-03T00:11:29Z","title":"Improving Large-Scale k-Nearest Neighbor Text Categorization with Label\n Autoencoders","summary":" In this paper, we introduce a multi-label lazy learning approach to deal with\nautomatic semantic indexing in large document collections in the presence of\ncomplex and structured label vocabularies with high inter-label correlation.\nThe proposed method is an evolution of the traditional k-Nearest Neighbors\nalgorithm which uses a large autoencoder trained to map the large label space\nto a reduced size latent space and to regenerate the predicted labels from this\nlatent space. We have evaluated our proposal in a large portion of the MEDLINE\nbiomedical document collection which uses the Medical Subject Headings (MeSH)\nthesaurus as a controlled vocabulary. In our experiments we propose and\nevaluate several document representation approaches and different label\nautoencoder configurations.\n","authors":["Francisco J. Ribadas-Pena","Shuyuan Cao","Víctor M. Darriba Bilbao"],"pdf_url":"https://arxiv.org/pdf/2402.01963v1.pdf","comment":"22 pages, 4 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2402.02210v1","updated":"2024-02-03T16:51:04Z","published":"2024-02-03T16:51:04Z","title":"Wavelet-Decoupling Contrastive Enhancement Network for Fine-Grained\n Skeleton-Based Action Recognition","summary":" Skeleton-based action recognition has attracted much attention, benefiting\nfrom its succinctness and robustness. However, the minimal inter-class\nvariation in similar action sequences often leads to confusion. The inherent\nspatiotemporal coupling characteristics make it challenging to mine the subtle\ndifferences in joint motion trajectories, which is critical for distinguishing\nconfusing fine-grained actions. To alleviate this problem, we propose a\nWavelet-Attention Decoupling (WAD) module that utilizes discrete wavelet\ntransform to effectively disentangle salient and subtle motion features in the\ntime-frequency domain. Then, the decoupling attention adaptively recalibrates\ntheir temporal responses. To further amplify the discrepancies in these subtle\nmotion features, we propose a Fine-grained Contrastive Enhancement (FCE) module\nto enhance attention towards trajectory features by contrastive learning.\nExtensive experiments are conducted on the coarse-grained dataset NTU RGB+D and\nthe fine-grained dataset FineGYM. 
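A compact sketch of the label-autoencoder k-NN idea above: aggregate the label vectors of a test document's nearest neighbours, then encode and decode that aggregate so the latent space regenerates a coherent, correlation-aware label set. Sizes and weighting are illustrative:

```python
# Minimal sketch over a MeSH-like label vocabulary; not the authors' code.
import torch
import torch.nn as nn

n_labels, latent = 5000, 128
enc = nn.Sequential(nn.Linear(n_labels, latent), nn.ReLU())
dec = nn.Sequential(nn.Linear(latent, n_labels), nn.Sigmoid())
# (training would reconstruct sparse label vectors, e.g. with a BCE loss)

def predict(neighbor_labels, weights):
    """neighbor_labels: (k, n_labels) 0/1 rows for the k nearest documents."""
    aggregated = (weights[:, None] * neighbor_labels).sum(0, keepdim=True)
    return dec(enc(aggregated))  # scores for every label in the vocabulary

scores = predict(torch.randint(0, 2, (5, n_labels)).float(), torch.full((5,), 0.2))
```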
Our methods perform competitively compared to\nstate-of-the-art methods and can discriminate confusing fine-grained actions\nwell.\n","authors":["Haochen Chang","Jing Chen","Yilin Li","Jixiang Chen","Xiaofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02210v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2304.09571v4","updated":"2024-02-03T09:13:07Z","published":"2023-04-19T11:19:10Z","title":"LLIC: Large Receptive Field Transform Coding with Adaptive Weights for\n Learned Image Compression","summary":" Effective Receptive field (ERF) plays an important role in transform coding,\nwhich determines how much redundancy can be removed at most during transform\nand how many spatial priors can be utilized to synthesize textures during\ninverse transform. Existing methods rely on stacks of small kernels, whose ERF\nremains not large enough instead, or heavy non-local attention mechanisms,\nwhich limit the potential of high resolution image coding. To tackle this\nissue, we propose Large Receptive Field Transform Coding with Adaptive Weights\nfor Learned Image Compression (LLIC). Specifically, for the first time in\nlearned image compression community, we introduce a few large kernel-based\ndepth-wise convolutions to reduce more redundancy while maintaining modest\ncomplexity. Due to wide range of image diversity, we propose to enhance the\nadaptability of convolutions via generating weights in a self-conditioned\nmanner. The large kernels cooperate with non-linear embedding and gate\nmechanisms for better expressiveness and lighter point-wise interactions. We\nalso investigate improved training techniques to fully exploit the potential of\nlarge kernels. In addition, to enhance the interactions among channels, we\npropose the adaptive channel-wise bit allocation via generating channel\nimportance factor in a self-conditioned manner. To demonstrate the\neffectiveness of proposed transform coding, we align the entropy model to\ncompare with existing transform methods and obtain models LLIC-STF, LLIC-ELIC,\nLLIC-TCM. Extensive experiments demonstrate our proposed LLIC models have\nsignificant improvements over corresponding baselines and achieve\nstate-of-the-art performances and better trade-off between performance and\ncomplexity.\n","authors":["Wei Jiang","Peirong Ning","Jiayu Yang","Yongqi Zhai","Feng Gao","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.09571v4.pdf","comment":"Fix typos"},{"id":"http://arxiv.org/abs/2305.12793v2","updated":"2024-02-03T03:24:46Z","published":"2023-05-22T07:42:52Z","title":"Zero-Shot End-to-End Spoken Language Understanding via Cross-Modal\n Selective Self-Training","summary":" End-to-end (E2E) spoken language understanding (SLU) is constrained by the\ncost of collecting speech-semantics pairs, especially when label domains\nchange. Hence, we explore \\textit{zero-shot} E2E SLU, which learns E2E SLU\nwithout speech-semantics pairs, instead using only speech-text and\ntext-semantics pairs. Previous work achieved zero-shot by pseudolabeling all\nspeech-text transcripts with a natural language understanding (NLU) model\nlearned on text-semantics corpora. However, this method requires the domains of\nspeech-text and text-semantics to match, which often mismatch due to separate\ncollections. Furthermore, using the entire collected speech-text corpus from\nany domains leads to \\textit{imbalance} and \\textit{noise} issues. To address\nthese, we propose \\textit{cross-modal selective self-training} (CMSST). 
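The LLIC abstract above suggests a transform-coding block built from a large-kernel depth-wise convolution (for a wide effective receptive field) plus point-wise and gate layers. The exact layout below is an assumption for illustration, not the authors' architecture:

```python
# Rough sketch: large-kernel depth-wise conv + point-wise mixing + sigmoid gate.
import torch
import torch.nn as nn

class LargeKernelBlock(nn.Module):
    def __init__(self, channels, k=13):
        super().__init__()
        # depth-wise: one k x k filter per channel keeps complexity modest
        self.dw = nn.Conv2d(channels, channels, k, padding=k // 2, groups=channels)
        self.pw = nn.Conv2d(channels, channels, 1)    # point-wise interaction
        self.gate = nn.Conv2d(channels, channels, 1)  # gating branch

    def forward(self, x):
        return x + self.pw(self.dw(x)) * torch.sigmoid(self.gate(x))

out = LargeKernelBlock(192)(torch.randn(1, 192, 64, 64))
```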
CMSST\ntackles imbalance by clustering in a joint space of the three modalities\n(speech, text, and semantics) and handles label noise with a selection network.\nWe also introduce two benchmarks for zero-shot E2E SLU, covering matched and\nfound speech (mismatched) settings. Experiments show that CMSST improves\nperformance in both settings, with significantly reduced sample sizes and\ntraining time. Our code and data are released at\nhttps://github.com/amazon-science/zero-shot-E2E-slu.\n","authors":["Jianfeng He","Julian Salazar","Kaisheng Yao","Haoqi Li","Jinglun Cai"],"pdf_url":"https://arxiv.org/pdf/2305.12793v2.pdf","comment":"18 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.04293v2","updated":"2024-02-03T02:08:49Z","published":"2023-12-07T13:27:37Z","title":"GPT-4V with Emotion: A Zero-shot Benchmark for Generalized Emotion\n Recognition","summary":" Recently, GPT-4 with Vision (GPT-4V) has demonstrated remarkable visual\ncapabilities across various tasks, but its performance in emotion recognition\nhas not been fully evaluated. To bridge this gap, we present the quantitative\nevaluation results of GPT-4V on 19 benchmark datasets covering 5 tasks: visual\nsentiment analysis, micro-expression recognition, facial emotion recognition,\ndynamic facial emotion recognition, and multimodal emotion recognition. This\npaper collectively refers to these tasks as ``Generalized Emotion Recognition\n(GER)''. Through experimental analysis, we observe that GPT-4V generally\noutperforms supervised systems in visual sentiment analysis, highlighting its\npowerful visual understanding capabilities. Meanwhile, GPT-4V shows the ability\nto integrate multimodal clues and exploit temporal information, which is also\ncritical for emotion recognition. Despite these achievements, GPT-4V is\nprimarily tailored for general-purpose domains and cannot recognize\nmicro-expressions that require specialized knowledge. To the best of our\nknowledge, this paper provides the first quantitative assessment of GPT-4V for\nthe GER tasks, offering valuable insights to researchers in this field. It can\nalso serve as a zero-shot benchmark for subsequent research. Our code and\nevaluation results are available at:\nhttps://github.com/zeroQiaoba/gpt4v-emotion.\n","authors":["Zheng Lian","Licai Sun","Haiyang Sun","Kang Chen","Zhuofan Wen","Hao Gu","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2312.04293v2.pdf","comment":null}]},"2024-02-06T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2402.04253v1","updated":"2024-02-06T18:59:57Z","published":"2024-02-06T18:59:57Z","title":"AnyTool: Self-Reflective, Hierarchical Agents for Large-Scale API Calls","summary":" We introduce AnyTool, a large language model agent designed to revolutionize\nthe utilization of a vast array of tools in addressing user queries. We utilize\nover 16,000 APIs from Rapid API, operating under the assumption that a subset\nof these APIs could potentially resolve the queries. AnyTool primarily\nincorporates three elements: an API retriever with a hierarchical structure, a\nsolver aimed at resolving user queries using a selected set of API candidates,\nand a self-reflection mechanism, which re-activates AnyTool if the initial\nsolution proves impracticable. AnyTool is powered by the function calling\nfeature of GPT-4, eliminating the need for training external modules. 
We also\nrevisit the evaluation protocol introduced by previous works and identify a\nlimitation in this protocol that leads to an artificially high pass rate. By\nrevising the evaluation protocol to better reflect practical application\nscenarios, we introduce an additional benchmark, termed AnyToolBench.\nExperiments across various datasets demonstrate the superiority of our AnyTool\nover strong baselines such as ToolLLM and a GPT-4 variant tailored for tool\nutilization. For instance, AnyTool outperforms ToolLLM by +35.4% in terms of\naverage pass rate on ToolBench. Code will be available at\nhttps://github.com/dyabel/AnyTool.\n","authors":["Yu Du","Fangyun Wei","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.04253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04251v1","updated":"2024-02-06T18:59:30Z","published":"2024-02-06T18:59:30Z","title":"Linear-time Minimum Bayes Risk Decoding with Reference Aggregation","summary":" Minimum Bayes Risk (MBR) decoding is a text generation technique that has\nbeen shown to improve the quality of machine translations, but is expensive,\neven if a sampling-based approximation is used. Besides requiring a large\nnumber of sampled sequences, it requires the pairwise calculation of a utility\nmetric, which has quadratic complexity. In this paper, we propose to\napproximate pairwise metric scores with scores calculated against aggregated\nreference representations. This changes the complexity of utility estimation\nfrom $O(n^2)$ to $O(n)$, while empirically preserving most of the quality gains\nof MBR decoding. We release our source code at https://github.com/ZurichNLP/mbr\n","authors":["Jannis Vamvas","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2402.04251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04249v1","updated":"2024-02-06T18:59:08Z","published":"2024-02-06T18:59:08Z","title":"HarmBench: A Standardized Evaluation Framework for Automated Red Teaming\n and Robust Refusal","summary":" Automated red teaming holds substantial promise for uncovering and mitigating\nthe risks associated with the malicious use of large language models (LLMs),\nyet the field lacks a standardized evaluation framework to rigorously assess\nnew methods. To address this issue, we introduce HarmBench, a standardized\nevaluation framework for automated red teaming. We identify several desirable\nproperties previously unaccounted for in red teaming evaluations and\nsystematically design HarmBench to meet these criteria. Using HarmBench, we\nconduct a large-scale comparison of 18 red teaming methods and 33 target LLMs\nand defenses, yielding novel insights. We also introduce a highly efficient\nadversarial training method that greatly enhances LLM robustness across a wide\nrange of attacks, demonstrating how HarmBench enables codevelopment of attacks\nand defenses. 
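The reference-aggregation trick for MBR described above is easy to see with a toy linear utility: with a dot-product utility over normalized embeddings, the aggregated score equals the exact pairwise mean, while for real MT metrics (e.g. chrF or COMET) it is an approximation. A sketch under those assumptions:

```python
# Toy demonstration: O(n^2) pairwise MBR vs O(n) reference aggregation.
import numpy as np

H = np.random.default_rng(0).normal(size=(100, 64))  # n sampled hypotheses
H /= np.linalg.norm(H, axis=1, keepdims=True)

# exact MBR: expected utility against all samples as pseudo-references
best_exact = int((H @ H.T).mean(axis=1).argmax())    # O(n^2) utility calls

# aggregation: one score per hypothesis against the averaged reference
best_approx = int((H @ H.mean(axis=0)).argmax())     # O(n) utility calls

assert best_exact == best_approx  # identical here: the utility is linear
```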
We open source HarmBench at\nhttps://github.com/centerforaisafety/HarmBench.\n","authors":["Mantas Mazeika","Long Phan","Xuwang Yin","Andy Zou","Zifan Wang","Norman Mu","Elham Sakhaee","Nathaniel Li","Steven Basart","Bo Li","David Forsyth","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2402.04249v1.pdf","comment":"Website: https://www.harmbench.org"},{"id":"http://arxiv.org/abs/2401.06118v2","updated":"2024-02-06T18:55:25Z","published":"2024-01-11T18:54:44Z","title":"Extreme Compression of Large Language Models via Additive Quantization","summary":" The emergence of accurate open large language models (LLMs) has led to a race\ntowards quantization techniques for such models enabling execution on end-user\ndevices. In this paper, we revisit the problem of \"extreme\" LLM\ncompression--defined as targeting extremely low bit counts, such as 2 to 3 bits\nper parameter, from the point of view of classic methods in Multi-Codebook\nQuantization (MCQ). Our work builds on top of Additive Quantization, a classic\nalgorithm from the MCQ family, and adapts it to the quantization of language\nmodels. The resulting algorithm advances the state-of-the-art in LLM\ncompression, outperforming all recently-proposed techniques in terms of\naccuracy at a given compression budget. For instance, when compressing Llama 2\nmodels to 2 bits per parameter, our algorithm quantizes the 7B model to 6.93\nperplexity (a 1.29 improvement relative to the best prior work, and 1.81 points\nfrom FP16), the 13B model to 5.70 perplexity (a .36 improvement) and the 70B\nmodel to 3.94 perplexity (a .22 improvement) on WikiText2. We release our\nimplementation of Additive Quantization for Language Models AQLM as a baseline\nto facilitate future research in LLM quantization.\n","authors":["Vage Egiazarian","Andrei Panferov","Denis Kuznedelev","Elias Frantar","Artem Babenko","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2401.06118v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2402.04247v1","updated":"2024-02-06T18:54:07Z","published":"2024-02-06T18:54:07Z","title":"Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science","summary":" Intelligent agents powered by large language models (LLMs) have demonstrated\nsubstantial promise in autonomously conducting experiments and facilitating\nscientific discoveries across various disciplines. While their capabilities are\npromising, they also introduce novel vulnerabilities that demand careful\nconsideration for safety. However, there exists a notable gap in the\nliterature, as there has been no comprehensive exploration of these\nvulnerabilities. This position paper fills this gap by conducting a thorough\nexamination of vulnerabilities in LLM-based agents within scientific domains,\nshedding light on potential risks associated with their misuse and emphasizing\nthe need for safety measures. We begin by providing a comprehensive overview of\nthe potential risks inherent to scientific LLM agents, taking into account user\nintent, the specific scientific domain, and their potential impact on the\nexternal environment. Then, we delve into the origins of these vulnerabilities\nand provide a scoping review of the limited existing works. Based on our\nanalysis, we propose a triadic framework involving human regulation, agent\nalignment, and an understanding of environmental feedback (agent regulation) to\nmitigate these identified risks. 
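In the spirit of the multi-codebook quantization that the AQLM entry above builds on, here is a greedy, residual-style sketch; real Additive Quantization optimizes the codeword choices jointly and learns the codebooks, so this is a simplification, not the paper's algorithm:

```python
# With M=2 codebooks of 256 codewords over groups of 8 weights, the codes
# cost 2 * 8 = 16 bits per group, i.e. 2 bits per parameter.
import numpy as np

rng = np.random.default_rng(0)
M, K, d = 2, 256, 8                      # codebooks, codewords each, group size
codebooks = rng.normal(size=(M, K, d))   # would be learned in practice

def encode(w):
    codes, residual = [], w.copy()
    for m in range(M):                   # codeword closest to the residual
        idx = int(((codebooks[m] - residual) ** 2).sum(axis=1).argmin())
        codes.append(idx)
        residual = residual - codebooks[m][idx]
    return codes

def decode(codes):
    return sum(codebooks[m][c] for m, c in enumerate(codes))

w = rng.normal(size=d)
print(np.linalg.norm(w - decode(encode(w))))  # reconstruction error
```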
Furthermore, we highlight the limitations and\nchallenges associated with safeguarding scientific agents and advocate for the\ndevelopment of improved models, robust benchmarks, and comprehensive\nregulations to address these issues effectively.\n","authors":["Xiangru Tang","Qiao Jin","Kunlun Zhu","Tongxin Yuan","Yichi Zhang","Wangchunshu Zhou","Meng Qu","Yilun Zhao","Jian Tang","Zhuosheng Zhang","Arman Cohan","Zhiyong Lu","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2402.04247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14330v3","updated":"2024-02-06T18:44:30Z","published":"2023-05-23T17:57:09Z","title":"DirecT2V: Large Language Models are Frame-Level Directors for Zero-Shot\n Text-to-Video Generation","summary":" In the paradigm of AI-generated content (AIGC), there has been increasing\nattention to transferring knowledge from pre-trained text-to-image (T2I) models\nto text-to-video (T2V) generation. Despite their effectiveness, these\nframeworks face challenges in maintaining consistent narratives and handling\nshifts in scene composition or object placement from a single abstract user\nprompt. Exploring the ability of large language models (LLMs) to generate\ntime-dependent, frame-by-frame prompts, this paper introduces a new framework,\ndubbed DirecT2V. DirecT2V leverages instruction-tuned LLMs as directors,\nenabling the inclusion of time-varying content and facilitating consistent\nvideo generation. To maintain temporal consistency and prevent mapping the\nvalue to a different object, we equip a diffusion model with a novel value\nmapping method and dual-softmax filtering, which do not require any additional\ntraining. The experimental results validate the effectiveness of our framework\nin producing visually coherent and storyful videos from abstract user prompts,\nsuccessfully addressing the challenges of zero-shot video generation.\n","authors":["Susung Hong","Junyoung Seo","Heeseong Shin","Sunghwan Hong","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2305.14330v3.pdf","comment":"The code and demo will be available at\n https://github.com/KU-CVLAB/DirecT2V"},{"id":"http://arxiv.org/abs/2402.04236v1","updated":"2024-02-06T18:43:48Z","published":"2024-02-06T18:43:48Z","title":"CogCoM: Train Large Vision-Language Models Diving into Details through\n Chain of Manipulations","summary":" Vision-Language Models (VLMs) have demonstrated their widespread viability\nthanks to extensive training in aligning visual instructions to answers.\nHowever, this conclusive alignment leads models to ignore critical visual\nreasoning, and further results in failures on meticulous visual problems and\nunfaithful responses. In this paper, we propose Chain of Manipulations, a\nmechanism that enables VLMs to solve problems with a series of manipulations,\nwhere each manipulation refers to an operation on the visual input, either from\nintrinsic abilities (e.g., grounding) acquired through prior training or from\nimitating human-like behaviors (e.g., zoom in). This mechanism encourages VLMs\nto generate faithful responses with evidential visual reasoning, and permits\nusers to trace error causes in the interpretable paths. We thus train CogCoM, a\ngeneral 17B VLM with a memory-based compatible architecture endowed with this\nreasoning mechanism. Experiments show that our model achieves state-of-the-art\nperformance across 8 benchmarks from 3 categories, and with a\nlimited number of training steps on the data it swiftly gains competitive\nperformance. 
The code and data are publicly available at\nhttps://github.com/THUDM/CogCoM.\n","authors":["Ji Qi","Ming Ding","Weihan Wang","Yushi Bai","Qingsong Lv","Wenyi Hong","Bin Xu","Lei Hou","Juanzi Li","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2402.04236v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.04232v1","updated":"2024-02-06T18:39:43Z","published":"2024-02-06T18:39:43Z","title":"Can Generative Agents Predict Emotion?","summary":" Large Language Models (LLMs) have demonstrated a number of human-like\nabilities; however, the empathic understanding and emotional state of LLMs are\nyet to be aligned to those of humans. In this work, we investigate how the\nemotional state of generative LLM agents evolves as they perceive new events,\nintroducing a novel architecture in which new experiences are compared to past\nmemories. Through this comparison, the agent gains the ability to understand\nnew experiences in context, which according to the appraisal theory of emotion\nis vital in emotion creation. First, the agent perceives new experiences as\ntime series text data. After perceiving each new input, the agent generates a\nsummary of past relevant memories, referred to as the norm, and compares the\nnew experience to this norm. Through this comparison we can analyse how the\nagent reacts to the new experience in context. The PANAS, a test of affect, is\nadministered to the agent, capturing the emotional state of the agent after the\nperception of the new event. Finally, the new experience is then added to the\nagent's memory to be used in the creation of future norms. By creating multiple\nexperiences in natural language from emotionally charged situations, we test\nthe proposed architecture on a wide range of scenarios. The mixed results\nsuggest that introducing context can occasionally improve the emotional\nalignment of the agent, but further study and comparison with human evaluators\nare necessary. We hope that this paper is another step towards the alignment of\ngenerative agents.\n","authors":["Ciaran Regan","Nanami Iwahashi","Shogo Tanaka","Mizuki Oka"],"pdf_url":"https://arxiv.org/pdf/2402.04232v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03300v2","updated":"2024-02-06T18:39:38Z","published":"2024-02-05T18:55:32Z","title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open\n Language Models","summary":" Mathematical reasoning poses a significant challenge for language models due\nto its complex and structured nature. In this paper, we introduce DeepSeekMath\n7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B\nmath-related tokens sourced from Common Crawl, together with natural language\nand code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the\ncompetition-level MATH benchmark without relying on external toolkits and\nvoting techniques, approaching the performance level of Gemini-Ultra and GPT-4.\nSelf-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH.\nThe mathematical reasoning capability of DeepSeekMath is attributed to two key\nfactors: First, we harness the significant potential of publicly available web\ndata through a meticulously engineered data selection pipeline. 
Second, we\nintroduce Group Relative Policy Optimization (GRPO), a variant of Proximal\nPolicy Optimization (PPO), that enhances mathematical reasoning abilities while\nconcurrently optimizing the memory usage of PPO.\n","authors":["Zhihong Shao","Peiyi Wang","Qihao Zhu","Runxin Xu","Junxiao Song","Mingchuan Zhang","Y. K. Li","Y. Wu","Daya Guo"],"pdf_url":"https://arxiv.org/pdf/2402.03300v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04222v1","updated":"2024-02-06T18:29:39Z","published":"2024-02-06T18:29:39Z","title":"What is 'Typological Diversity' in NLP?","summary":" The NLP research community has devoted increased attention to languages\nbeyond English, resulting in considerable improvements for multilingual NLP.\nHowever, these improvements only apply to a small subset of the world's\nlanguages. Aiming to extend this, an increasing number of papers aspires to\nenhance generalizable multilingual performance across languages. To this end,\nlinguistic typology is commonly used to motivate language selection, on the\nbasis that a broad typological sample ought to imply generalization across a\nbroad range of languages. These selections are often described as being\n'typologically diverse'. In this work, we systematically investigate NLP\nresearch that includes claims regarding 'typological diversity'. We find there\nare no set definitions or criteria for such claims. We introduce metrics to\napproximate the diversity of language selection along several axes and find\nthat the results vary considerably across papers. Furthermore, we show that\nskewed language selection can lead to overestimated multilingual performance.\nWe recommend future work to include an operationalization of 'typological\ndiversity' that empirically justifies the diversity of language samples.\n","authors":["Esther Ploeger","Wessel Poelman","Miryam de Lhoneux","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2402.04222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02416v2","updated":"2024-02-06T18:02:01Z","published":"2024-02-04T09:24:51Z","title":"Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction","summary":" Efforts to align Large Language Models (LLMs) are mainly conducted via\nReinforcement Learning from Human Feedback (RLHF) methods. However, RLHF\nencounters major challenges including training reward models, actor-critic\nengineering, and importantly, it requires access to LLM parameters. Here we\nintroduce Aligner, a new efficient alignment paradigm that bypasses the whole\nRLHF process by learning the correctional residuals between the aligned and the\nunaligned answers. Our Aligner offers several key advantages. Firstly, it is an\nautoregressive seq2seq model that is trained on the query-answer-correction\ndataset via supervised learning; this offers a parameter-efficient alignment\nsolution with minimal resources. Secondly, the Aligner facilitates\nweak-to-strong generalization; finetuning large pretrained models by Aligner's\nsupervisory signals demonstrates strong performance boost. Thirdly, Aligner\nfunctions as a model-agnostic plug-and-play module, allowing for its direct\napplication on different open-source and API-based models. Remarkably,\nAligner-7B improves 11 different LLMs by 21.9% in helpfulness and 23.8% in\nharmlessness on average (GPT-4 by 17.5% and 26.9%). When finetuning (strong)\nLlama2-70B with (weak) Aligner-13B's supervision, we can improve Llama2 by 8.2%\nin helpfulness and 61.6% in harmlessness. 
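The group-relative core of GRPO, normalizing rewards within a group of responses sampled for the same prompt so that no learned value function (critic) is needed, can be sketched in a few lines; the clipped policy update that consumes the advantages is omitted here:

```python
# Sketch of the group-relative advantage used by GRPO-style training.
import torch

rewards = torch.tensor([0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0])  # one prompt's group
advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
# each sampled response is then reinforced in proportion to its advantage,
# e.g. loss = -(clipped_importance_ratio * advantages).mean()
print(advantages)
```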
See our dataset and code at\nhttps://aligner2024.github.io\n","authors":["Jiaming Ji","Boyuan Chen","Hantao Lou","Donghai Hong","Borong Zhang","Xuehai Pan","Juntao Dai","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2402.02416v2.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2308.09687v4","updated":"2024-02-06T18:00:18Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. This work brings the LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Michal Podstawski","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04177v1","updated":"2024-02-06T17:31:20Z","published":"2024-02-06T17:31:20Z","title":"Scaling Laws for Downstream Task Performance of Large Language Models","summary":" Scaling laws provide important insights that can guide the design of large\nlanguage models (LLMs). Existing work has primarily focused on studying scaling\nlaws for pretraining (upstream) loss. However, in transfer learning settings,\nin which LLMs are pretrained on an unsupervised dataset and then finetuned on a\ndownstream task, we often also care about the downstream performance. In this\nwork, we study the scaling behavior in a transfer learning setting, where LLMs\nare finetuned for machine translation tasks. Specifically, we investigate how\nthe choice of the pretraining data and its size affect downstream performance\n(translation quality) as judged by two metrics: downstream cross-entropy and\nBLEU score. Our experiments indicate that the size of the finetuning dataset\nand the distribution alignment between the pretraining and downstream data\nsignificantly influence the scaling behavior. With sufficient alignment, both\ndownstream cross-entropy and BLEU score improve monotonically with more\npretraining data. In such cases, we show that it is possible to predict the\ndownstream BLEU score with good accuracy using a log-law. However, there are\nalso cases where moderate misalignment causes the BLEU score to fluctuate or\nget worse with more pretraining, whereas downstream cross-entropy monotonically\nimproves. 
By analyzing these observations, we provide new practical insights\nfor choosing appropriate pretraining data.\n","authors":["Berivan Isik","Natalia Ponomareva","Hussein Hazimeh","Dimitris Paparas","Sergei Vassilvitskii","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2402.04177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04161v1","updated":"2024-02-06T17:18:59Z","published":"2024-02-06T17:18:59Z","title":"Attention with Markov: A Framework for Principled Analysis of\n Transformers via Markov Chains","summary":" In recent years, attention-based transformers have achieved tremendous\nsuccess across a variety of disciplines including natural languages. A key\ningredient behind their success is the generative pretraining procedure, during\nwhich these models are trained on a large text corpus in an auto-regressive\nmanner. To shed light on this phenomenon, we propose a new framework that\nallows both theory and systematic experiments to study the sequential modeling\ncapabilities of transformers through the lens of Markov chains. Inspired by the\nMarkovianity of natural languages, we model the data as a Markovian source and\nutilize this framework to systematically study the interplay between the\ndata-distributional properties, the transformer architecture, the learnt\ndistribution, and the final model performance. In particular, we theoretically\ncharacterize the loss landscape of single-layer transformers and show the\nexistence of global minima and bad local minima contingent upon the specific\ndata characteristics and the transformer architecture. Backed by experiments,\nwe demonstrate that our theoretical findings are in congruence with the\nempirical results. We further investigate these findings in the broader context\nof higher order Markov chains and deeper architectures, and outline open\nproblems in this arena. Code is available at\n\\url{https://github.com/Bond1995/Markov}.\n","authors":["Ashok Vardhan Makkuva","Marco Bondaschi","Adway Girish","Alliot Nagle","Martin Jaggi","Hyeji Kim","Michael Gastpar"],"pdf_url":"https://arxiv.org/pdf/2402.04161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04160v1","updated":"2024-02-06T17:18:25Z","published":"2024-02-06T17:18:25Z","title":"Harnessing the Plug-and-Play Controller by Prompting","summary":" Controllable text generation is a growing field within natural language\ngeneration (NLG) that focuses on producing text that meets specific constraints\nin real-world applications. Previous approaches, such as plug-and-play\ncontrollers (PPCs), aimed to steer the properties of generated text in a\nflexible manner. However, these methods often compromised the integrity of the\nlanguage model's decoding process, resulting in less smooth text generation.\nAlternatively, other techniques utilized multiple attribute prompts to align\nthe generated text with desired attributes, but this approach required prompt\ndesign for each attribute and was dependent on the size of the language model.\nThis paper introduces a novel method for flexible attribute control in text\ngeneration using pre-trained language models (PLMs). The proposed approach aims\nto enhance the fluency of generated text by guiding the generation process with\nPPCs. The key idea is to dynamically adjust the distribution of generated text\nby modifying prompts, effectively constraining the output space of the language\nmodel and influencing the desired attribute. 
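The log-law mentioned above for predicting downstream BLEU from pretraining-data size can be fit with ordinary least squares in log-space; the data sizes and scores below are made up purely to show the procedure:

```python
# Fit BLEU ~ a + b * log(D) on illustrative (not real) measurements.
import numpy as np

D = np.array([1e8, 3e8, 1e9, 3e9, 1e10])         # pretraining tokens (assumed)
bleu = np.array([18.1, 21.0, 23.8, 26.2, 29.1])  # illustrative BLEU scores

b, a = np.polyfit(np.log(D), bleu, 1)            # least squares in log(D)
print(a + b * np.log(3e10))                      # extrapolated BLEU at 30B tokens
```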
To enable smooth cooperation\nbetween the PLM and the PPC, our work innovatively proposes a new model\nfine-tuning method: Reinforcement Learning with Dynamic Adjust Feedback\n(RLDAF). This fine-tuning process adapts a small subset of the language model's\nparameters based on the generating actions taken during the PPC control\nprocess. The resulting harmonious collaboration between the PLM and PPC leads\nto improved smoothness in text generation during inference. Extensive\nexperiments were conducted on the SST2 dataset, and the proposed method\noutperformed previous approaches in various evaluation metrics, including text\nfluency and attribute consistency.\n","authors":["Hao Wang","Lei Sha"],"pdf_url":"https://arxiv.org/pdf/2402.04160v1.pdf","comment":"The Third Version of the Generation, Evaluation & Metrics (GEM)\n Workshop in EMNLP 2023"},{"id":"http://arxiv.org/abs/2307.07889v3","updated":"2024-02-06T17:05:58Z","published":"2023-07-15T22:02:12Z","title":"LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise\n Comparisons using Large Language Models","summary":" Current developments in large language models (LLMs) have enabled impressive\nzero-shot capabilities across various natural language tasks. An interesting\napplication of these systems is in the automated assessment of natural language\ngeneration (NLG), a highly challenging area with great practical benefit. In\nthis paper, we explore two options for exploiting the emergent abilities of\nLLMs for zero-shot NLG assessment: absolute score prediction, and comparative\nassessment which uses relative comparisons between pairs of candidates. Though\ncomparative assessment has not been extensively studied in NLG assessment, we\nnote that humans often find it more intuitive to compare two options rather\nthan scoring each one independently. This work examines comparative assessment\nfrom multiple perspectives: performance compared to absolute grading;\npositional biases in the prompt; and efficient ranking in terms of the number\nof comparisons. We illustrate that LLM comparative assessment is a simple,\ngeneral and effective approach for NLG assessment. For moderate-sized\nopen-source LLMs, such as FlanT5 and Llama2-chat, comparative assessment is\nsuperior to prompt scoring, and in many cases can achieve performance\ncompetitive with state-of-the-art methods. Additionally, we demonstrate that\nLLMs often exhibit strong positional biases when making pairwise comparisons,\nand we propose debiasing methods that can further improve performance.\n","authors":["Adian Liusie","Potsawee Manakul","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2307.07889v3.pdf","comment":"To Appear at EACL 2024"},{"id":"http://arxiv.org/abs/2310.02031v5","updated":"2024-02-06T17:00:08Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reason may be the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. 
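Pairwise comparative assessment as described in the LLM Comparative Assessment entry above reduces to judging candidate pairs and ranking by win count; running each pair in both orders is one simple counter to the positional bias the paper reports. In the sketch below, `judge` is a placeholder for a real LLM call:

```python
# `judge` stands in for a prompt such as "Which response is better, A or B?";
# the toy rule here just prefers shorter candidates.
from itertools import combinations

def judge(first, second):
    return len(first) < len(second)  # True -> the first candidate wins

def rank(candidates):
    wins = {c: 0 for c in candidates}
    for a, b in combinations(candidates, 2):
        for x, y in ((a, b), (b, a)):       # both orders: debias position
            wins[x if judge(x, y) else y] += 1
    return sorted(candidates, key=wins.get, reverse=True)

print(rank(["a long rambling summary text", "a crisp summary", "summary"]))
```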
To\nalleviate these issues, we introduce OceanGPT, the first-ever LLM in the ocean\ndomain, which is an expert in various ocean science tasks. We propose DoInstruct,\na novel framework to automatically obtain a large volume of ocean domain\ninstruction data, which generates instructions based on multi-agent\ncollaboration. Additionally, we construct the first oceanography benchmark,\nOceanBench, to evaluate the capabilities of LLMs in the ocean domain. Through\ncomprehensive experiments, OceanGPT not only shows a higher level of knowledge\nexpertise for ocean science tasks but also gains preliminary embodied\nintelligence capabilities in ocean technology. Codes, data and checkpoints will\nsoon be available at https://github.com/zjunlp/KnowLM.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Daxiong Ji","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v5.pdf","comment":"Work in progress. Project Website:\n https://zjunlp.github.io/project/OceanGPT/"},{"id":"http://arxiv.org/abs/2311.16839v2","updated":"2024-02-06T16:43:31Z","published":"2023-11-28T14:54:37Z","title":"Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware\n Direct Preference Optimization","summary":" Multimodal large language models have made significant advancements in recent\nyears, yet they still suffer from a common issue known as the \"hallucination\nproblem\", in which the models generate textual descriptions that inaccurately\ndepict or entirely fabricate content from associated images. This paper\nintroduces a novel solution, Hallucination-Aware Direct Preference Optimization\n(HA-DPO), which reframes the hallucination problem as a preference selection\ntask. The model is trained to favor the non-hallucinating response when\npresented with two responses for the same image (one accurate and one\nhallucinatory). Furthermore, this paper proposes an efficient pipeline for\nconstructing positive~(non-hallucinatory) and negative~(hallucinatory) sample\npairs, ensuring a high-quality, style-consistent dataset for robust preference\nlearning. When applied to three mainstream multimodal models, HA-DPO\nsignificantly reduced hallucination issues and amplified the models'\ngeneralization capabilities. Notably, the MiniGPT-4 model, when enhanced with\nHA-DPO, demonstrated a substantial improvement: POPE accuracy rose from 51.13%\nto 86.13% (an absolute improvement of 35%), and the MME score surged from\n932.00 to 1326.46 (a relative improvement of 42.32%). The codes, models, and\ndatasets are made accessible at https://opendatalab.github.io/HA-DPO.\n","authors":["Zhiyuan Zhao","Bin Wang","Linke Ouyang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2311.16839v2.pdf","comment":"Project Website: https://opendatalab.github.io/HA-DPO, Code:\n https://github.com/opendatalab/HA-DPO"},{"id":"http://arxiv.org/abs/2311.00768v2","updated":"2024-02-06T16:33:48Z","published":"2023-11-01T18:23:12Z","title":"Language Model Training Paradigms for Clinical Feature Embeddings","summary":" In research areas with scarce data, representation learning plays a\nsignificant role. This work aims to enhance representation learning for\nclinical time series by deriving universal embeddings for clinical features,\nsuch as heart rate and blood pressure. We use self-supervised training\nparadigms for language models to learn high-quality clinical feature\nembeddings, achieving a finer granularity than existing time-step and\npatient-level representation learning. 
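HA-DPO, in the entry above, builds on the DPO preference objective. A compact sketch of that loss, favoring the non-hallucinatory response y_w over the hallucinatory y_l, assuming summed token log-probabilities under the trained and reference models as inputs:

```python
# Standard DPO-style preference loss (a sketch, not the HA-DPO codebase).
import torch
import torch.nn.functional as F

def dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1):
    """All arguments: (batch,) log-probs of the chosen/rejected responses."""
    margin = (logp_w - ref_logp_w) - (logp_l - ref_logp_l)
    return -F.logsigmoid(beta * margin).mean()

loss = dpo_loss(torch.tensor([-12.0]), torch.tensor([-15.0]),
                torch.tensor([-13.0]), torch.tensor([-14.0]))
```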
We visualize the learnt embeddings via\nunsupervised dimension reduction techniques and observe a high degree of\nconsistency with prior clinical knowledge. We also evaluate the model\nperformance on the MIMIC-III benchmark and demonstrate the effectiveness of\nusing clinical feature embeddings. We publish our code online for replication.\n","authors":["Yurong Hu","Manuel Burger","Gunnar Rätsch","Rita Kuznetsova"],"pdf_url":"https://arxiv.org/pdf/2311.00768v2.pdf","comment":"Poster at \"NeurIPS 2023 Workshop: Self-Supervised Learning - Theory\n and Practice\""},{"id":"http://arxiv.org/abs/2401.10463v2","updated":"2024-02-06T16:32:56Z","published":"2024-01-19T03:24:36Z","title":"Critical Data Size of Language Models from a Grokking Perspective","summary":" We explore the critical data size in language models, a threshold that marks\na fundamental shift from quick memorization to slow generalization. We\nformalize the phase transition under the grokking configuration into the Data\nEfficiency Hypothesis and identify data insufficiency, sufficiency, and surplus\nregimes in language models training dynamics. We develop a grokking\nconfiguration to reproduce grokking on simplistic language models stably by\nrescaling initialization and weight decay. We show that generalization occurs\nonly when language models reach a critical size. We analyze grokking across\nsample-wise and model-wise, verifying the proposed data efficiency hypothesis.\nOur experiments reveal smoother phase transitions occurring at the critical\ndataset size for language datasets. As the model size increases, this critical\npoint also becomes larger, indicating that larger models require more data. Our\nresults deepen the understanding of language model training, offering a novel\nperspective on the role of data in the learning mechanism of language models.\n","authors":["Xuekai Zhu","Yao Fu","Bowen Zhou","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04110v1","updated":"2024-02-06T16:03:57Z","published":"2024-02-06T16:03:57Z","title":"Behind the Screen: Investigating ChatGPT's Dark Personality Traits and\n Conspiracy Beliefs","summary":" ChatGPT is notorious for its intransparent behavior. This paper tries to shed\nlight on this, providing an in-depth analysis of the dark personality traits\nand conspiracy beliefs of GPT-3.5 and GPT-4. Different psychological tests and\nquestionnaires were employed, including the Dark Factor Test, the Mach-IV\nScale, the Generic Conspiracy Belief Scale, and the Conspiracy Mentality Scale.\nThe responses were analyzed computing average scores, standard deviations, and\nsignificance tests to investigate differences between GPT-3.5 and GPT-4. For\ntraits that have shown to be interdependent in human studies, correlations were\nconsidered. Additionally, system roles corresponding to groups that have shown\ndistinct answering behavior in the corresponding questionnaires were applied to\nexamine the models' ability to reflect characteristics associated with these\nroles in their responses. Dark personality traits and conspiracy beliefs were\nnot particularly pronounced in either model with little differences between\nGPT-3.5 and GPT-4. However, GPT-4 showed a pronounced tendency to believe in\ninformation withholding. This is particularly intriguing given that GPT-4 is\ntrained on a significantly larger dataset than GPT-3.5. 
Apparently, in this\ncase an increased data exposure correlates with a greater belief in the control\nof information. An assignment of extreme political affiliations increased the\nbelief in conspiracy theories. Test sequencing affected the models' responses\nand the observed correlations, indicating a form of contextual memory.\n","authors":["Erik Weber","Jérôme Rutinowski","Markus Pauly"],"pdf_url":"https://arxiv.org/pdf/2402.04110v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.04105v1","updated":"2024-02-06T15:59:23Z","published":"2024-02-06T15:59:23Z","title":"Measuring Implicit Bias in Explicitly Unbiased Large Language Models","summary":" Large language models (LLMs) can pass explicit bias tests but still harbor\nimplicit biases, similar to humans who endorse egalitarian beliefs yet exhibit\nsubtle biases. Measuring such implicit biases can be a challenge: as LLMs\nbecome increasingly proprietary, it may not be possible to access their\nembeddings and apply existing bias measures; furthermore, implicit biases are\nprimarily a concern if they affect the actual decisions that these systems\nmake. We address both of these challenges by introducing two measures of bias\ninspired by psychology: LLM Implicit Association Test (IAT) Bias, which is a\nprompt-based method for revealing implicit bias; and LLM Decision Bias for\ndetecting subtle discrimination in decision-making tasks. Using these measures,\nwe found pervasive human-like stereotype biases in 6 LLMs across 4 social\ndomains (race, gender, religion, health) and 21 categories (weapons, guilt,\nscience, career, among others). Our prompt-based measure of implicit bias\ncorrelates with embedding-based methods but better predicts downstream\nbehaviors measured by LLM Decision Bias. This measure is based on asking the\nLLM to decide between individuals, motivated by psychological results\nindicating that relative not absolute evaluations are more related to implicit\nbiases. Using prompt-based measures informed by psychology allows us to\neffectively expose nuanced biases and subtle discrimination in proprietary LLMs\nthat do not show explicit bias on standard benchmarks.\n","authors":["Xuechunzi Bai","Angelina Wang","Ilia Sucholutsky","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2402.04105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04088v1","updated":"2024-02-06T15:46:31Z","published":"2024-02-06T15:46:31Z","title":"The Use of a Large Language Model for Cyberbullying Detection","summary":" The dominance of social media has added to the channels of bullying for\nperpetrators. Unfortunately, cyberbullying (CB) is the most prevalent\nphenomenon in today's cyber world, and is a severe threat to the mental and\nphysical health of citizens. This opens the need to develop a robust system to\nprevent bullying content from online forums, blogs, and social media platforms\nto manage the impact in our society. Several machine learning (ML) algorithms\nhave been proposed for this purpose. However, their performances are not\nconsistent due to high class imbalance and generalisation issues. In recent\nyears, large language models (LLMs) like BERT and RoBERTa have achieved\nstate-of-the-art (SOTA) results in several natural language processing (NLP)\ntasks. Unfortunately, the LLMs have not been applied extensively for CB\ndetection. In our paper, we explored the use of these models for cyberbullying\n(CB) detection. We have prepared a new dataset (D2) from existing studies\n(Formspring and Twitter).
Our experimental results for datasets D1 and D2 showed\nthat RoBERTa outperformed other models.\n","authors":["Bayode Ogunleye","Babitha Dharmaraj"],"pdf_url":"https://arxiv.org/pdf/2402.04088v1.pdf","comment":"14 pages, Journal of Analytics"},{"id":"http://arxiv.org/abs/2402.04075v1","updated":"2024-02-06T15:25:09Z","published":"2024-02-06T15:25:09Z","title":"Iterative Prompt Refinement for Radiation Oncology Symptom Extraction\n Using Teacher-Student Large Language Models","summary":" This study introduces a novel teacher-student architecture utilizing Large\nLanguage Models (LLMs) to improve prostate cancer radiotherapy symptom\nextraction from clinical notes. Mixtral, the student model, initially extracts\nsymptoms, followed by GPT-4, the teacher model, which refines prompts based on\nMixtral's performance. This iterative process involved 294 single symptom\nclinical notes across 12 symptoms, with up to 16 rounds of refinement per\nepoch. Results showed significant improvements in extracting symptoms from both\nsingle and multi-symptom notes. For 59 single symptom notes, accuracy increased\nfrom 0.51 to 0.71, precision from 0.52 to 0.82, recall from 0.52 to 0.72, and\nF1 score from 0.49 to 0.73. In 375 multi-symptom notes, accuracy rose from 0.24\nto 0.43, precision from 0.6 to 0.76, recall from 0.24 to 0.43, and F1 score\nfrom 0.20 to 0.44. These results demonstrate the effectiveness of advanced\nprompt engineering in LLMs for radiation oncology use.\n","authors":["Reza Khanmohammadi","Ahmed I Ghanem","Kyle Verdecchia","Ryan Hall","Mohamed Elshaikh","Benjamin Movsas","Hassan Bagher-Ebadian","Indrin Chetty","Mohammad M. Ghassemi","Kundan Thind"],"pdf_url":"https://arxiv.org/pdf/2402.04075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04068v1","updated":"2024-02-06T15:13:17Z","published":"2024-02-06T15:13:17Z","title":"Retrieve to Explain: Evidence-driven Predictions with Language Models","summary":" Machine learning models, particularly language models, are notoriously\ndifficult to introspect. Black-box models can mask both issues in model\ntraining and harmful biases. For human-in-the-loop processes, opaque\npredictions can drive lack of trust, limiting a model's impact even when it\nperforms effectively. To address these issues, we introduce Retrieve to Explain\n(R2E). R2E is a retrieval-based language model that prioritizes amongst a\npre-defined set of possible answers to a research question based on the\nevidence in a document corpus, using Shapley values to identify the relative\nimportance of pieces of evidence to the final prediction. R2E can adapt to new\nevidence without retraining, and incorporate structured data through templating\ninto natural language. We assess R2E on the use case of drug target identification\nfrom published scientific literature, where we show that the model outperforms\nan industry-standard genetics-based approach on predicting clinical trial\noutcomes.\n","authors":["Ravi Patel","Angus Brayne","Rogier Hintzen","Daniel Jaroslawicz","Georgiana Neculae","Dane Corneil"],"pdf_url":"https://arxiv.org/pdf/2402.04068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15656v2","updated":"2024-02-06T14:55:56Z","published":"2024-01-28T13:21:44Z","title":"LLsM: Generative Linguistic Steganography with Large Language Model","summary":" Linguistic Steganography (LS) tasks aim to generate steganographic text\n(stego) based on secret information.
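The Shapley-value attribution that R2E (summarized above) applies to retrieved evidence can be written down exactly for small evidence sets. A minimal sketch, assuming a hypothetical `score(subset)` that returns the model's confidence given only that subset of evidence indices:

```python
import itertools
from math import factorial

def shapley_values(n_evidence, score):
    """Exact Shapley attribution over n_evidence retrieved passages.

    `score(subset)` is a hypothetical callable giving the model's
    confidence in its answer when shown only that subset (a tuple of
    evidence indices).  Exact enumeration is O(2^n), so a practical
    system would sample permutations instead of enumerating subsets.
    """
    values = [0.0] * n_evidence
    for i in range(n_evidence):
        others = [j for j in range(n_evidence) if j != i]
        for r in range(n_evidence):
            for subset in itertools.combinations(others, r):
                # Weight of this coalition size in the Shapley formula.
                w = factorial(r) * factorial(n_evidence - r - 1) / factorial(n_evidence)
                values[i] += w * (score(subset + (i,)) - score(subset))
    return values
```

Each value is a weighted average of the marginal gain a passage contributes across all possible evidence coalitions, which is what makes the final prediction decomposable into per-passage importance.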
Only authorized recipients can perceive\nthe existence of secrets in the texts and extract them, thereby preserving\nprivacy. However, the controllability of the stego generated by existing\nschemes is poor, and it is difficult for the stego to carry specific discourse\ncharacteristics such as style. As a result, the stego is easily detectable,\ncompromising covert communication. To address these problems, this paper\nproposes LLsM, the first LS with the Large Language Model (LLM). We fine-tuned\nthe LLaMA2 with a large-scale constructed dataset encompassing rich discourse\ncharacteristics, which enables the fine-tuned LLM to generate texts with\nspecific discourse in a controllable manner. Then the discourse is used as\nguiding information and inputted into the fine-tuned LLM in the form of a\nprompt together with the secret. On this basis, the constructed candidate pool is\nrange-encoded, and the secret determines the interval. The shared prefix of\nthe interval's lower and upper bounds is the secret embedded at this step.\nExperiments show that LLsM performs better than prevalent LS-task and\nrelated-task baselines regarding text quality, statistical analysis, discourse\nmatching, and anti-steganalysis. In particular, LLsM's MAUVE metric surpasses\nsome baselines by 70%-80%, and its anti-steganalysis performance is 30%-40%\nhigher. Notably, we also present examples of longer stegos generated by LLsM,\nshowing its potential superiority in long LS tasks.\n","authors":["Yihao Wang","Ruiqi Song","Ru Zhang","Jianyi Liu","Lingxiao Li"],"pdf_url":"https://arxiv.org/pdf/2401.15656v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2402.04049v1","updated":"2024-02-06T14:51:55Z","published":"2024-02-06T14:51:55Z","title":"Systematic Biases in LLM Simulations of Debates","summary":" Recent advancements in natural language processing, especially the emergence\nof Large Language Models (LLMs), have opened exciting possibilities for\nconstructing computational simulations designed to replicate human behavior\naccurately. However, LLMs are complex statistical learners without\nstraightforward deductive rules, making them prone to unexpected behaviors. In\nthis study, we highlight the limitations of LLMs in simulating human\ninteractions, particularly focusing on LLMs' ability to simulate political\ndebates. Our findings indicate a tendency for LLM agents to conform to the\nmodel's inherent social biases despite being directed to debate from certain\npolitical perspectives. This tendency results in behavioral patterns that seem\nto deviate from well-established social dynamics among humans. We reinforce\nthese observations using an automatic self-fine-tuning method, which enables us\nto manipulate the biases within the LLM and demonstrate that agents\nsubsequently align with the altered biases. These results underscore the need\nfor further research to develop methods that help agents overcome these biases,\na critical step toward creating more realistic simulations.\n","authors":["Amir Taubenfeld","Yaniv Dover","Roi Reichart","Ariel Goldstein"],"pdf_url":"https://arxiv.org/pdf/2402.04049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07237v2","updated":"2024-02-06T14:50:32Z","published":"2024-01-14T09:34:42Z","title":"Distilling Event Sequence Knowledge From Large Language Models","summary":" Event sequence models have been found to be highly effective in the analysis\nand prediction of events. Building such models requires availability of\nabundant high-quality event sequence data.
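The range-encoding step of LLsM described above can be sketched schematically (a reconstruction from the abstract's description, not the paper's exact algorithm): candidates split an integer range in proportion to their probabilities, the pending secret bits select a sub-interval, and the shared binary prefix of the interval's endpoints is what that step embeds.

```python
import bisect

def embed_step(candidates, probs, secret_bits, precision=16):
    """One illustrative generation step of range-coded steganography.

    Candidates partition the integer range [0, 2**precision) in
    proportion to their probabilities; the secret bits, read as a binary
    fraction, pick a sub-interval (the token to emit).  The bits shared
    by the binary expansions of the interval's two endpoints are the
    bits this step actually embeds.
    """
    total = 1 << precision
    bounds, acc = [0], 0.0
    for p in probs[:-1]:                       # cumulative integer boundaries
        acc += p
        bounds.append(int(acc * total))
    bounds.append(total)
    point = int(secret_bits[:precision].ljust(precision, "0"), 2)
    idx = bisect.bisect_right(bounds, point) - 1
    lo = format(bounds[idx], f"0{precision}b")
    hi = format(bounds[idx + 1] - 1, f"0{precision}b")
    n_embedded = 0
    for a, b in zip(lo, hi):                   # count the shared prefix bits
        if a != b:
            break
        n_embedded += 1
    return candidates[idx], n_embedded
```

A receiver holding the same fine-tuned model and prompt can rebuild the same interval boundaries from the candidate probabilities, which is what makes the embedded prefix recoverable.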
In certain applications, however,\nclean structured event sequences are not available, and automated sequence\nextraction results in data that is too noisy and incomplete. In this work, we\nexplore the use of Large Language Models (LLMs) to generate event sequences\nthat can effectively be used for probabilistic event model construction. This\ncan be viewed as a mechanism of distilling event sequence knowledge from LLMs.\nOur approach relies on a Knowledge Graph (KG) of event concepts with partial\ncausal relations to guide the generative language model for causal event\nsequence generation. We show that our approach can generate high-quality event\nsequences, filling a knowledge gap in the input KG. Furthermore, we explore how\nthe generated sequences can be leveraged to discover useful and more complex\nstructured knowledge from pattern mining and probabilistic event models. We\nrelease our sequence generation code and evaluation framework, as well as a\ncorpus of event sequence data.\n","authors":["Somin Wadhwa","Oktie Hassanzadeh","Debarun Bhattacharjya","Ken Barker","Jian Ni"],"pdf_url":"https://arxiv.org/pdf/2401.07237v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2402.04028v1","updated":"2024-02-06T14:24:28Z","published":"2024-02-06T14:24:28Z","title":"AlbNews: A Corpus of Headlines for Topic Modeling in Albanian","summary":" The scarcity of available text corpora for low-resource languages like\nAlbanian is a serious hurdle for research in natural language processing tasks.\nThis paper introduces AlbNews, a collection of 600 topically labeled news\nheadlines and 2600 unlabeled ones in Albanian. The data can be freely used for\nconducting topic modeling research. We report the initial classification scores\nof some traditional machine learning classifiers trained with the AlbNews\nsamples. These results show that basic models outperform the ensemble learning ones\nand can serve as a baseline for future experiments.\n","authors":["Erion Çano","Dario Lamaj"],"pdf_url":"https://arxiv.org/pdf/2402.04028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04023v1","updated":"2024-02-06T14:16:32Z","published":"2024-02-06T14:16:32Z","title":"Google Translate Error Analysis for Mental Healthcare Information:\n Evaluating Accuracy, Comprehensibility, and Implications for Multilingual\n Healthcare Communication","summary":" This study explores the use of Google Translate (GT) for translating mental\nhealthcare (MHealth) information and evaluates its accuracy, comprehensibility,\nand implications for multilingual healthcare communication through analysing GT\noutput in the MHealth domain from English to Persian, Arabic, Turkish,\nRomanian, and Spanish. Two datasets comprising MHealth information from the UK\nNational Health Service website and information leaflets from The Royal College\nof Psychiatrists were used. Native speakers of the target languages manually\nassessed the GT translations, focusing on medical terminology accuracy,\ncomprehensibility, and critical syntactic/semantic errors. GT output analysis\nrevealed challenges in accurately translating medical terminology, particularly\nin Arabic, Romanian, and Persian. Fluency issues were prevalent across various\nlanguages, affecting comprehension, mainly in Arabic and Spanish. Critical\nerrors arose in specific contexts, such as bullet-point formatting,\nspecifically in Persian, Turkish, and Romanian.
Although improvements are seen\nin longer-text translations, there remains a need to enhance accuracy in\nmedical and mental health terminology and fluency, whilst also addressing\nformatting issues for a more seamless user experience. The findings highlight\nthe need to use customised translation engines for Mhealth translation and the\nchallenges when relying solely on machine-translated medical content,\nemphasising the crucial role of human reviewers in multilingual healthcare\ncommunication.\n","authors":["Jaleh Delfani","Constantin Orasan","Hadeel Saadany","Ozlem Temizoz","Eleanor Taylor-Stilgoe","Diptesh Kanojia","Sabine Braun","Barbara Schouten"],"pdf_url":"https://arxiv.org/pdf/2402.04023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03988v1","updated":"2024-02-06T13:26:19Z","published":"2024-02-06T13:26:19Z","title":"REBORN: Reinforcement-Learned Boundary Segmentation with Iterative\n Training for Unsupervised ASR","summary":" Unsupervised automatic speech recognition (ASR) aims to learn the mapping\nbetween the speech signal and its corresponding textual transcription without\nthe supervision of paired speech-text data. A word/phoneme in the speech signal\nis represented by a segment of speech signal with variable length and unknown\nboundary, and this segmental structure makes learning the mapping between\nspeech and text challenging, especially without paired data. In this paper, we\npropose REBORN, Reinforcement-Learned Boundary Segmentation with Iterative\nTraining for Unsupervised ASR. REBORN alternates between (1) training a\nsegmentation model that predicts the boundaries of the segmental structures in\nspeech signals and (2) training the phoneme prediction model, whose input is a\nsegmental structure segmented by the segmentation model, to predict a phoneme\ntranscription. Since supervised data for training the segmentation model is not\navailable, we use reinforcement learning to train the segmentation model to\nfavor segmentations that yield phoneme sequence predictions with a lower\nperplexity. We conduct extensive experiments and find that under the same\nsetting, REBORN outperforms all prior unsupervised ASR models on LibriSpeech,\nTIMIT, and five non-English languages in Multilingual LibriSpeech. We\ncomprehensively analyze why the boundaries learned by REBORN improve the\nunsupervised ASR performance.\n","authors":["Liang-Hsuan Tseng","En-Pei Hu","Cheng-Han Chiang","Yuan Tseng","Hung-yi Lee","Lin-shan Lee","Shao-Hua Sun"],"pdf_url":"https://arxiv.org/pdf/2402.03988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03962v1","updated":"2024-02-06T12:42:21Z","published":"2024-02-06T12:42:21Z","title":"Position Paper: Against Spurious Sparks-Dovelating Inflated AI Claims","summary":" Humans have a tendency to see 'human'-like qualities in objects around them.\nWe name our cars, and talk to pets and even household appliances, as if they\ncould understand us as other humans do. This behavior, called anthropomorphism,\nis also seeing traction in Machine Learning (ML), where human-like intelligence\nis claimed to be perceived in Large Language Models (LLMs). In this position\npaper, considering professional incentives, human biases, and general\nmethodological setups, we discuss how the current search for Artificial General\nIntelligence (AGI) is a perfect storm for over-attributing human-like qualities\nto LLMs. In several experiments, we demonstrate that the discovery of\nhuman-interpretable patterns in latent spaces should not be a surprising\noutcome. 
Also in consideration of common AI portrayal in the media, we call for\nthe academic community to exercise extra caution, and to be extra aware of\nprinciples of academic integrity, in interpreting and communicating about AI\nresearch outcomes.\n","authors":["Patrick Altmeyer","Andrew M. Demetriou","Antony Bartlett","Cynthia C. S. Liem"],"pdf_url":"https://arxiv.org/pdf/2402.03962v1.pdf","comment":"20 pages, 15 figures. Preliminary work. Under review by the\n International Conference on Machine Learning (ICML)"},{"id":"http://arxiv.org/abs/2402.03957v1","updated":"2024-02-06T12:34:15Z","published":"2024-02-06T12:34:15Z","title":"Sparse Graph Representations for Procedural Instructional Documents","summary":" Computation of document similarity is a critical task in various NLP domains\nthat has applications in deduplication, matching, and recommendation.\nTraditional approaches for document similarity computation include learning\nrepresentations of documents and employing a similarity or a distance function\nover the embeddings. However, pairwise similarities and differences are not\nefficiently captured by individual representations. Graph representations such\nas Joint Concept Interaction Graph (JCIG) represent a pair of documents as a\njoint undirected weighted graph. JCIGs facilitate an interpretable\nrepresentation of document pairs as a graph. However, JCIGs are undirected, and\ndon't consider the sequential flow of sentences in documents. We propose two\napproaches to model document similarity by representing document pairs as a\ndirected and sparse JCIG that incorporates sequential information. We propose\ntwo algorithms inspired by Supergenome Sorting and Hamiltonian Path that\nreplace the undirected edges with directed edges. Our approach also sparsifies\nthe graph to $O(n)$ edges from JCIG's worst case of $O(n^2)$. We show that our\nsparse directed graph model architecture consisting of a Siamese encoder and\nGCN achieves comparable results to the baseline on datasets not containing\nsequential information and beats the baseline by ten points on an instructional\ndocuments dataset containing sequential information.\n","authors":["Shruti Singh","Rishabh Gupta"],"pdf_url":"https://arxiv.org/pdf/2402.03957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17080v3","updated":"2024-02-06T12:27:52Z","published":"2023-12-28T15:49:43Z","title":"MR-GSM8K: A Meta-Reasoning Revolution in Large Language Model Evaluation","summary":" In this work, we introduce a novel evaluation paradigm for Large Language\nModels, one that challenges them to engage in meta-reasoning. This approach\naddresses critical shortcomings in existing math problem-solving benchmarks,\ntraditionally used to evaluate the cognitive capabilities of agents. Our\nparadigm shifts the focus from result-oriented assessments, which often\noverlook the reasoning process, to a more holistic evaluation that effectively\ndifferentiates the cognitive capabilities among models. For example, in our\nbenchmark, GPT-4 demonstrates a performance five times better than GPT-3.5. The\nsignificance of this new paradigm lies in its ability to reveal potential\ncognitive deficiencies in LLMs that current benchmarks, such as GSM8K, fail to\nuncover due to their saturation and lack of effective differentiation among\nvarying reasoning abilities.
Our comprehensive analysis includes several\nstate-of-the-art math models from both open-source and closed-source\ncommunities, uncovering fundamental deficiencies in their training and\nevaluation approaches. This paper not only advocates for a paradigm shift in\nthe assessment of LLMs but also contributes to the ongoing discourse on the\ntrajectory towards Artificial General Intelligence (AGI). By promoting the\nadoption of meta-reasoning evaluation methods similar to ours, we aim to\nfacilitate a more accurate assessment of the true cognitive abilities of LLMs.\n","authors":["Zhongshen Zeng","Pengguang Chen","Shu Liu","Haiyun Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.17080v3.pdf","comment":"Code: https://github.com/dvlab-research/MR-GSM8K"},{"id":"http://arxiv.org/abs/2312.03523v2","updated":"2024-02-06T12:14:19Z","published":"2023-12-06T14:34:30Z","title":"Sig-Networks Toolkit: Signature Networks for Longitudinal Language\n Modelling","summary":" We present an open-source, pip installable toolkit, Sig-Networks, the first\nof its kind for longitudinal language modelling. A central focus is the\nincorporation of Signature-based Neural Network models, which have recently\nshown success in temporal tasks. We apply and extend published research\nproviding a full suite of signature-based models. Their components can be used\nas PyTorch building blocks in future architectures. Sig-Networks enables\ntask-agnostic dataset plug-in, seamless pre-processing for sequential data,\nparameter flexibility, automated tuning across a range of models. We examine\nsignature networks under three different NLP tasks of varying temporal\ngranularity: counselling conversations, rumour stance switch and mood changes\nin social media threads, showing SOTA performance in all three, and provide\nguidance for future tasks. We release the Toolkit as a PyTorch package with an\nintroductory video, Git repositories for preprocessing and modelling including\nsample notebooks on the modeled NLP tasks.\n","authors":["Talia Tseriotou","Ryan Sze-Yin Chan","Adam Tsakalidis","Iman Munire Bilal","Elena Kochkina","Terry Lyons","Maria Liakata"],"pdf_url":"https://arxiv.org/pdf/2312.03523v2.pdf","comment":"To appear in EACL 2024: System Demonstrations"},{"id":"http://arxiv.org/abs/2402.03927v1","updated":"2024-02-06T11:54:23Z","published":"2024-02-06T11:54:23Z","title":"Leak, Cheat, Repeat: Data Contamination and Evaluation Malpractices in\n Closed-Source LLMs","summary":" Natural Language Processing (NLP) research is increasingly focusing on the\nuse of Large Language Models (LLMs), with some of the most popular ones being\neither fully or partially closed-source. The lack of access to model details,\nespecially regarding training data, has repeatedly raised concerns about data\ncontamination among researchers. Several attempts have been made to address\nthis issue, but they are limited to anecdotal evidence and trial and error.\nAdditionally, they overlook the problem of \\emph{indirect} data leaking, where\nmodels are iteratively improved by using data coming from users. In this work,\nwe conduct the first systematic analysis of work using OpenAI's GPT-3.5 and\nGPT-4, the most prominently used LLMs today, in the context of data\ncontamination. By analysing 255 papers and considering OpenAI's data usage\npolicy, we extensively document the amount of data leaked to these models\nduring the first year after the model's release. 
We report that these models\nhave been globally exposed to $\sim$4.7M samples from 263 benchmarks. At the\nsame time, we document a number of evaluation malpractices emerging in the\nreviewed papers, such as unfair or missing baseline comparisons and\nreproducibility issues. We release our results as a collaborative project on\nhttps://leak-llm.github.io/, where other researchers can contribute to our\nefforts.\n","authors":["Simone Balloccu","Patrícia Schmidtová","Mateusz Lango","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2402.03927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03916v1","updated":"2024-02-06T11:33:57Z","published":"2024-02-06T11:33:57Z","title":"Can Large Language Models Detect Rumors on Social Media?","summary":" In this work, we investigate the use of Large Language Models (LLMs) for rumor\ndetection on social media. However, it is challenging for LLMs to reason over\nthe entire propagation information on social media, which contains news\ncontents and numerous comments, because LLMs may not concentrate on key clues in\nthe complex propagation information, and have trouble reasoning when facing\nmassive and redundant information. Accordingly, we propose an LLM-empowered\nRumor Detection (LeRuD) approach, in which we design prompts to teach LLMs to\nreason over important clues in news and comments, and divide the entire\npropagation information into a Chain-of-Propagation for reducing LLMs' burden.\nWe conduct extensive experiments on the Twitter and Weibo datasets, and LeRuD\noutperforms several state-of-the-art rumor detection models by 2.4% to 7.6%.\nMeanwhile, by applying LLMs, LeRuD requires no data for training, and thus\nshows more promising rumor detection ability in few-shot or zero-shot\nscenarios.\n","authors":["Qiang Liu","Xiang Tao","Junfei Wu","Shu Wu","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06841v2","updated":"2024-02-06T11:30:00Z","published":"2023-05-11T14:35:00Z","title":"Think Twice: Measuring the Efficiency of Eliminating Prediction\n Shortcuts of Question Answering Models","summary":" While the Large Language Models (LLMs) dominate a majority of language\nunderstanding tasks, previous work shows that some of these results are\nsupported by modelling spurious correlations of training datasets. Authors\ncommonly assess model robustness by evaluating their models on\nout-of-distribution (OOD) datasets of the same task, but these datasets might\nshare the bias of the training dataset.\n We propose a simple method for measuring a scale of models' reliance on any\nidentified spurious feature and assess the robustness towards a large set of\nknown and newly found prediction biases for various pre-trained models and\ndebiasing methods in Question Answering (QA). We find that while existing\ndebiasing methods can mitigate reliance on a chosen spurious feature, the OOD\nperformance gains of these methods cannot be explained by mitigated reliance\non biased features, suggesting that biases are shared among different QA\ndatasets. Finally, we evidence this to be the case by measuring that the\nperformance of models trained on different QA datasets relies comparably on the\nsame bias features.
We hope these results will motivate future work to refine\nthe reports of LMs' robustness to a level of adversarial samples addressing\nspecific spurious features.\n","authors":["Lukáš Mikula","Michal Štefánik","Marek Petrovič","Petr Sojka"],"pdf_url":"https://arxiv.org/pdf/2305.06841v2.pdf","comment":"Long paper in Proceedings of EACL 2024: Main track"},{"id":"http://arxiv.org/abs/2402.03900v1","updated":"2024-02-06T11:12:09Z","published":"2024-02-06T11:12:09Z","title":"Pro-HAN: A Heterogeneous Graph Attention Network for Profile-Based\n Spoken Language Understanding","summary":" Recently, Profile-based Spoken Language Understanding (SLU) has gained\nincreasing attention, which aims to incorporate various types of supplementary\nprofile information (i.e., Knowledge Graph, User Profile, Context Awareness) to\neliminate the prevalent ambiguities in user utterances. However, existing\napproaches can only separately model different profile information, without\nconsidering their interrelationships or excluding irrelevant and conflicting\ninformation within them. To address the above issues, we introduce a\nHeterogeneous Graph Attention Network to perform reasoning across multiple\nProfile information, called Pro-HAN. Specifically, we design three types of\nedges, denoted as intra-Pro, inter-Pro, and utterance-Pro, to capture\ninterrelationships among multiple Pros. We establish a new state-of-the-art on\nthe ProSLU dataset, with an improvement of approximately 8% across all three\nmetrics. Further analysis experiments also confirm the effectiveness of our\nmethod in modeling multi-source profile information.\n","authors":["Dechuan Teng","Chunlin Lu","Xiao Xu","Wanxiang Che","Libo Qin"],"pdf_url":"https://arxiv.org/pdf/2402.03900v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.03898v1","updated":"2024-02-06T11:10:35Z","published":"2024-02-06T11:10:35Z","title":"DistiLLM: Towards Streamlined Distillation for Large Language Models","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\na smaller student model, reducing its inference cost and memory footprint while\npreserving model capabilities. However, current KD methods for auto-regressive\nsequence models (e.g., large language models) suffer from missing a\nstandardized objective function. Moreover, the recent use of student-generated\noutputs to address training-inference mismatches has significantly escalated\ncomputational costs. To tackle these issues, we introduce DistiLLM, a more\neffective and efficient KD framework for auto-regressive language models.\nDistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence\nloss, where we unveil and leverage its theoretical properties, and (2) an\nadaptive off-policy approach designed to enhance the efficiency in utilizing\nstudent-generated outputs. 
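DistiLLM's first component, the skew Kullback-Leibler divergence, can be illustrated compactly. A hedged sketch, assuming the common formulation that mixes the target distribution into the comparison distribution (DistiLLM's exact definition and mixing direction may differ):

```python
import torch

def skew_kl(p, q, alpha=0.1, eps=1e-9):
    """Skewed KL divergence KL(p || alpha*p + (1-alpha)*q).

    Mixing some of the target distribution p into the comparison keeps
    the divergence finite (and its gradients bounded) wherever q assigns
    near-zero mass.  p and q are probability tensors over the vocabulary
    in the last dimension, e.g. teacher and student next-token outputs.
    """
    mix = alpha * p + (1.0 - alpha) * q
    return (p * ((p + eps).log() - (mix + eps).log())).sum(dim=-1)
```

With alpha = 0 this reduces to the plain KL divergence, so the skew parameter interpolates between the standard objective and a fully smoothed one.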
Extensive experiments, including\ninstruction-following tasks, demonstrate the effectiveness of DistiLLM in\nbuilding high-performing student models while achieving up to 4.3$\times$\nspeedup compared to recent KD methods.\n","authors":["Jongwoo Ko","Sungnyun Kim","Tianyi Chen","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2402.03898v1.pdf","comment":"Code is available at https://github.com/jongwooko/distillm"},{"id":"http://arxiv.org/abs/2402.03887v1","updated":"2024-02-06T10:49:28Z","published":"2024-02-06T10:49:28Z","title":"Shifting social norms as a driving force for linguistic change:\n Struggles about language and gender in the German Bundestag","summary":" This paper focuses on language change based on shifting social norms, in\nparticular with regard to the debate on language and gender. It is a recurring\nargument in this debate that language develops \"naturally\" and that \"severe\ninterventions\" - such as gender-inclusive language is often claimed to be - in\nthe allegedly \"organic\" language system are inappropriate and even \"dangerous\".\nSuch interventions are, however, not unprecedented. Socially motivated\nprocesses of language change are neither unusual nor new. We focus in our\ncontribution on one important political-social space in Germany, the German\nBundestag. Taking other struggles about language and gender in the plenaries of\nthe Bundestag as a starting point, our article illustrates that language and\ngender has been a recurring issue in the German Bundestag since the 1980s. We\ndemonstrate how this is reflected in linguistic practices of the Bundestag, by\nthe use of a) designations for gays and lesbians; b) pair forms such as\nB\\\"urgerinnen und B\\\"urger (female and male citizens); and c) female forms of\naddresses and personal nouns ('Pr\\\"asidentin' in addition to 'Pr\\\"asident').\nLastly, we will discuss implications of these earlier language battles for the\ncurrently very heated debate about gender-inclusive language, especially\nregarding new forms with gender symbols like the asterisk or the colon\n(Lehrer*innen, Lehrer:innen; male*female teachers) which are intended to\nencompass all gender identities.\n","authors":["Carolin Müller-Spitzer","Samira Ochs"],"pdf_url":"https://arxiv.org/pdf/2402.03887v1.pdf","comment":"40 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.03877v1","updated":"2024-02-06T10:37:21Z","published":"2024-02-06T10:37:21Z","title":"Beyond Lines and Circles: Unveiling the Geometric Reasoning Gap in Large\n Language Models","summary":" Large Language Models (LLMs) demonstrate ever-increasing abilities in\nmathematical and algorithmic tasks, yet their geometric reasoning skills are\nunderexplored. We investigate LLMs' abilities in constructive geometric\nproblem-solving, one of the most fundamental steps in the development of human\nmathematical reasoning. Our work reveals notable challenges that the\nstate-of-the-art LLMs face in this domain despite many successes in similar\nareas. LLMs exhibit biases in target variable selection and struggle with 2D\nspatial relationships, often misrepresenting and hallucinating objects and\ntheir placements. To this end, we introduce a framework that formulates an\nLLM-based multi-agent system that enhances their existing reasoning potential\nby conducting an internal dialogue.
This work underscores LLMs' current\nlimitations in geometric reasoning and improves geometric reasoning\ncapabilities through self-correction, collaboration, and diverse role\nspecializations.\n","authors":["Spyridon Mouselinos","Henryk Michalewski","Mateusz Malinowski"],"pdf_url":"https://arxiv.org/pdf/2402.03877v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2402.03870v1","updated":"2024-02-06T10:32:34Z","published":"2024-02-06T10:32:34Z","title":"Less than one percent of words would be affected by gender-inclusive\n language in German press texts","summary":" Research on gender and language is tightly knitted to social debates on\ngender equality and non-discriminatory language use. Psycholinguistic scholars\nhave made significant contributions in this field. However, corpus-based\nstudies that investigate these matters within the context of language use are\nstill rare. In our study, we address the question of how much textual material\nwould actually have to be changed if non-gender-inclusive texts were rewritten\nto be gender-inclusive. This quantitative measure is an important empirical\ninsight, as a recurring argument against the use of gender-inclusive German is\nthat it supposedly makes written texts too long and complicated. It is also\nargued that gender-inclusive language has negative effects on language\nlearners. However, such effects are only likely if gender-inclusive texts are\nvery different from those that are not gender-inclusive. In our\ncorpus-linguistic study, we manually annotated German press texts to identify\nthe parts that would have to be changed. Our results show that, on average,\nless than 1% of all tokens would be affected by gender-inclusive language. This\nsmall proportion calls into question whether gender-inclusive German presents a\nsubstantial barrier to understanding and learning the language, particularly\nwhen we take into account the potential complexities of interpreting masculine\ngenerics.\n","authors":["Carolin Müller-Spitzer","Samira Ochs","Alexander Koplenig","Jan-Oliver Rüdiger","Sascha Wolfer"],"pdf_url":"https://arxiv.org/pdf/2402.03870v1.pdf","comment":"27 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2106.10901v2","updated":"2024-02-06T10:22:52Z","published":"2021-06-21T07:41:44Z","title":"Software-Based Dialogue Systems: Survey, Taxonomy and Challenges","summary":" The use of natural language interfaces in the field of human-computer\ninteraction is undergoing intense study through dedicated scientific and\nindustrial research. The latest contributions in the field, including deep\nlearning approaches like recurrent neural networks, the potential of\ncontext-aware strategies and user-centred design approaches, have brought back\nthe attention of the community to software-based dialogue systems, generally\nknown as conversational agents or chatbots. Nonetheless, and given the novelty\nof the field, a generic, context-independent overview on the current state of\nresearch of conversational agents covering all research perspectives involved\nis missing. Motivated by this context, this paper reports a survey of the\ncurrent state of research of conversational agents through a systematic\nliterature review of secondary studies. The conducted research is designed to\ndevelop an exhaustive perspective through a clear presentation of the\naggregated knowledge published by recent literature within a variety of\ndomains, research focuses and contexts. 
As a result, this research proposes a\nholistic taxonomy of the different dimensions involved in the conversational\nagents' field, which is expected to help researchers and to lay the groundwork\nfor future research in the field of natural language interfaces.\n","authors":["Quim Motger","Xavier Franch","Jordi Marco"],"pdf_url":"https://arxiv.org/pdf/2106.10901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11062v2","updated":"2024-02-06T10:16:54Z","published":"2023-04-19T16:18:54Z","title":"Scaling Transformer to 1M tokens and beyond with RMT","summary":" A major limitation for the broader scope of problems solvable by transformers\nis the quadratic scaling of computational complexity with input size. In this\nstudy, we investigate the recurrent memory augmentation of pre-trained\ntransformer models to extend input context length while linearly scaling\ncompute. Our approach demonstrates the capability to store information in\nmemory for sequences of up to an unprecedented two million tokens while\nmaintaining high retrieval accuracy. Experiments with language modeling tasks\nshow perplexity improvement as the number of processed input segments\nincreases. These results underscore the effectiveness of our method, which has\nsignificant potential to enhance long-term dependency handling in natural\nlanguage understanding and generation tasks, as well as enable large-scale\ncontext processing for memory-intensive applications.\n","authors":["Aydar Bulatov","Yuri Kuratov","Yermek Kapushev","Mikhail S. Burtsev"],"pdf_url":"https://arxiv.org/pdf/2304.11062v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14215v2","updated":"2024-02-06T10:15:01Z","published":"2023-12-21T12:05:19Z","title":"SimLM: Can Language Models Infer Parameters of Physical Systems?","summary":" Several machine learning methods aim to learn or reason about complex\nphysical systems. A common first-step towards reasoning is to infer system\nparameters from observations of its behavior. In this paper, we investigate the\nperformance of Large Language Models (LLMs) at performing parameter inference\nin the context of physical systems. Our experiments suggest that they are not\ninherently suited to this task, even for simple systems. We propose a promising\ndirection of exploration, which involves the use of physical simulators to\naugment the context of LLMs. We assess and compare the performance of different\nLLMs on a simple example with and without access to physical simulation.\n","authors":["Sean Memery","Mirella Lapata","Kartic Subr"],"pdf_url":"https://arxiv.org/pdf/2312.14215v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03848v1","updated":"2024-02-06T09:50:08Z","published":"2024-02-06T09:50:08Z","title":"ANLS* -- A Universal Document Processing Metric for Generative Large\n Language Models","summary":" Traditionally, discriminative models have been the predominant choice for\ntasks like document classification and information extraction. These models\nmake predictions that fall into a limited number of predefined classes,\nfacilitating a binary true or false evaluation and enabling the direct\ncalculation of metrics such as the F1 score. However, recent advancements in\ngenerative large language models (GLLMs) have prompted a shift in the field due\nto their enhanced zero-shot capabilities, which eliminate the need for a\ndownstream dataset and computationally expensive fine-tuning. 
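The recurrent memory mechanism described in the RMT abstract above can be sketched as a wrapper that carries a small set of memory embeddings across fixed-length segments, so compute grows linearly with input length. A toy illustration, with `encoder` standing in for any transformer block stack mapping (B, L, D) to (B, L, D); the real RMT's memory read/write layout may differ:

```python
import torch
import torch.nn as nn

class SegmentRecurrentWrapper(nn.Module):
    """Toy sketch of recurrent memory augmentation over long inputs."""

    def __init__(self, encoder, d_model, n_mem=8, seg_len=512):
        super().__init__()
        self.encoder = encoder
        self.mem_init = nn.Parameter(torch.randn(n_mem, d_model) * 0.02)
        self.n_mem, self.seg_len = n_mem, seg_len

    def forward(self, embeds):                    # (B, T, D) token embeddings
        B, T, D = embeds.shape
        memory = self.mem_init.expand(B, -1, -1)  # shared initial memory
        outputs = []
        for start in range(0, T, self.seg_len):
            seg = embeds[:, start:start + self.seg_len]
            h = self.encoder(torch.cat([memory, seg], dim=1))
            memory = h[:, :self.n_mem]            # updated memory -> next segment
            outputs.append(h[:, self.n_mem:])
        return torch.cat(outputs, dim=1)
```

Because each call to `encoder` sees only one segment plus a constant number of memory slots, attention cost per segment is fixed, and information from earlier segments survives only through the memory states.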
However,\nevaluating GLLMs presents a challenge as the binary true or false evaluation\nused for discriminative models is not applicable to the predictions made by\nGLLMs. This paper introduces a new metric for generative models called ANLS*\nfor evaluating a wide variety of tasks, including information extraction and\nclassification tasks. The ANLS* metric extends existing ANLS metrics as a\ndrop-in replacement and is still compatible with previously reported ANLS\nscores. An evaluation of 7 different datasets and 3 different GLLMs using the\nANLS* metric is also provided, demonstrating the importance of the proposed\nmetric. We also benchmark a novel approach to generate prompts for documents,\ncalled SFT, against other prompting techniques such as LATIN. In 15 out of 21\ncases, SFT outperforms other techniques and improves the state-of-the-art,\nsometimes by as much as $15$ percentage points.\n Sources are available at https://github.com/deepopinion/anls_star_metric\n","authors":["David Peer","Philemon Schöpf","Volckmar Nebendahl","Alexander Rietzler","Sebastian Stabinger"],"pdf_url":"https://arxiv.org/pdf/2402.03848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13227v2","updated":"2024-02-06T09:39:54Z","published":"2024-01-24T04:50:16Z","title":"LPNL: Scalable Link Prediction with Large Language Models","summary":" Exploring the application of large language models (LLMs) to graph learning\nis an emerging endeavor. However, the vast amount of information inherent in\nlarge graphs poses significant challenges to this process. This work focuses on\nthe link prediction task and introduces $\textbf{LPNL}$ (Link Prediction via\nNatural Language), a framework based on large language models designed for\nscalable link prediction on large-scale heterogeneous graphs. We design novel\nprompts for link prediction that articulate graph details in natural language.\nWe propose a two-stage sampling pipeline to extract crucial information from\nthe graphs, and a divide-and-conquer strategy to control the input tokens\nwithin predefined limits, addressing the challenge of overwhelming information.\nWe fine-tune a T5 model based on our self-supervised learning designed for link\nprediction. Extensive experimental results demonstrate that LPNL outperforms\nmultiple advanced baselines in link prediction tasks on large-scale graphs.\n","authors":["Baolong Bi","Shenghua Liu","Yiwei Wang","Lingrui Mei","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.13227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05591v3","updated":"2024-02-06T09:33:48Z","published":"2023-07-10T17:59:21Z","title":"Linear Alignment of Vision-language Models for Image Captioning","summary":" Recently, vision-language models like CLIP have advanced the state of the art\nin a variety of multi-modal tasks including image captioning and caption\nevaluation. Many approaches adapt CLIP-style models to a downstream task by\ntraining a mapping network between CLIP and a language model. This is costly as\nit usually involves calculating gradients for large models. We propose a more\nefficient training protocol that fits a linear mapping between image and text\nembeddings of CLIP via a closed-form solution. This bypasses the need for\ngradient computation and results in a lightweight captioning method called\nReCap, which can be trained up to 1000 times faster than existing lightweight\nmethods. Moreover, we propose two new learning-based image-captioning metrics\nthat build on CLIP score along with our linear mapping.
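The closed-form fit behind ReCap, as described above, is essentially regularized least squares between frozen CLIP embeddings. A minimal sketch (the paper's exact regularization and normalization choices may differ):

```python
import torch

def fit_linear_mapping(img_embs, txt_embs, lam=1e-3):
    """Gradient-free alignment of CLIP image and text embedding spaces.

    Solves the ridge-regression problem
        W = argmin_W ||X W - Y||^2 + lam * ||W||^2
    in closed form, so no gradients flow through either frozen encoder.
    """
    X, Y = img_embs, txt_embs                   # shapes (N, d_img), (N, d_txt)
    d = X.shape[1]
    A = X.T @ X + lam * torch.eye(d, dtype=X.dtype)
    return torch.linalg.solve(A, X.T @ Y)       # use as: x_new @ W ~ text space
```

Fitting reduces to one linear solve over a d x d matrix, which is why this kind of protocol can be orders of magnitude faster than training a mapping network by backpropagation.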
Furthermore, we combine\nReCap with our new metrics to design an iterative datastore-augmentation loop\n(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k,\nVizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art\nlightweight methods on established metrics while outperforming them on our new\nmetrics, which are better aligned with human ratings on Flickr8k-Expert and\nFlickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to\nother domains and that our DAL leads to a performance boost.\n","authors":["Fabian Paischer","Markus Hofmarcher","Sepp Hochreiter","Thomas Adler"],"pdf_url":"https://arxiv.org/pdf/2307.05591v3.pdf","comment":"8 pages (+ references and appendix)"},{"id":"http://arxiv.org/abs/2402.03832v1","updated":"2024-02-06T09:23:26Z","published":"2024-02-06T09:23:26Z","title":"Rethinking Skill Extraction in the Job Market Domain using Large\n Language Models","summary":" Skill Extraction involves identifying skills and qualifications mentioned in\ndocuments such as job postings and resumes. The task is commonly tackled by\ntraining supervised models using a sequence labeling approach with BIO tags.\nHowever, the reliance on manually annotated data limits the generalizability of\nsuch approaches. Moreover, the common BIO setting limits the ability of the\nmodels to capture complex skill patterns and handle ambiguous mentions. In this\npaper, we explore the use of in-context learning to overcome these challenges,\non a benchmark of 6 uniformized skill extraction datasets. Our approach\nleverages the few-shot learning capabilities of large language models (LLMs) to\nidentify and extract skills from sentences. We show that LLMs, despite not\nbeing on par with traditional supervised models in terms of performance, can\nbetter handle syntactically complex skill mentions in skill extraction tasks.\n","authors":["Khanh Cao Nguyen","Mike Zhang","Syrielle Montariol","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2402.03832v1.pdf","comment":"Published at NLP4HR 2024 (EACL Workshop)"},{"id":"http://arxiv.org/abs/2402.03822v1","updated":"2024-02-06T09:10:35Z","published":"2024-02-06T09:10:35Z","title":"RevOrder: A Novel Method for Enhanced Arithmetic in Language Models","summary":" This paper presents RevOrder, a novel technique aimed at improving arithmetic\noperations in large language models (LLMs) by reversing the output digits in\naddition, subtraction, and n-digit by 1-digit (nD by 1D) multiplication tasks.\nOur method significantly reduces the Count of Sequential Intermediate Digits\n(CSID) to $\\mathcal{O}(1)$, a new metric we introduce to assess equation\ncomplexity. Through comprehensive testing, RevOrder not only achieves perfect\naccuracy in basic arithmetic operations but also substantially boosts LLM\nperformance in division tasks, particularly with large numbers where\ntraditional models struggle. Implementation of RevOrder is cost-effective for\nboth training and inference phases. 
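The digit-reversal idea behind RevOrder, summarized in the abstract above, is simple enough to state directly. A toy sketch of the output format, not the paper's training pipeline:

```python
def rev_order_example(a: int, b: int) -> str:
    """Format an addition problem with the answer digits reversed.

    Writing the least significant digit first matches the order in which
    carries are produced, so an autoregressive model can emit each digit
    as soon as it is determined -- the low-CSID property RevOrder targets.
    """
    return f"{a}+{b}={str(a + b)[::-1]}"

def decode(reversed_digits: str) -> int:
    """Invert the reversal to recover conventional notation."""
    return int(reversed_digits[::-1])

# rev_order_example(17, 25) -> "17+25=24" (42 written in reverse);
# decode("24") -> 42.
```

In the conventional left-to-right order the model must already know every carry before emitting the most significant digit; reversing the target removes that lookahead.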
Moreover, applying RevOrder to fine-tune\nthe LLaMA2-7B model on the GSM8K math task results in a considerable\nimprovement, reducing equation calculation errors by 46% and increasing overall\nscores from 41.6 to 44.4.\n","authors":["Si Shen","Peijun Shen","Danhao Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.03822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18168v5","updated":"2024-02-06T09:04:04Z","published":"2023-10-27T14:27:43Z","title":"Personas as a Way to Model Truthfulness in Language Models","summary":" Large language models (LLMs) are trained on vast amounts of text from the\ninternet, which contains both factual and misleading information about the\nworld. While unintuitive from a classic view of LMs, recent work has shown that\nthe truth value of a statement can be elicited from the model's\nrepresentations. This paper presents an explanation for why LMs appear to know\nthe truth despite not being trained with truth labels. We hypothesize that the\npretraining data is generated by groups of (un)truthful agents whose outputs\nshare common features, and they form a (un)truthful persona. By training on\nthis data, LMs can infer and represent the persona in its activation space.\nThis allows the model to separate truth from falsehoods and controls the\ntruthfulness of its generation. We show evidence for the persona hypothesis via\ntwo observations: (1) we can probe whether a model's answer will be truthful\nbefore it is generated; (2) finetuning a model on a set of facts improves its\ntruthfulness on unseen topics. Next, using arithmetics as a synthetic\nenvironment, we show that structures of the pretraining data are crucial for\nthe model to infer the truthful persona. Overall, our findings suggest that\nmodels can exploit hierarchical structures in the data to learn abstract\nconcepts like truthfulness.\n","authors":["Nitish Joshi","Javier Rando","Abulhair Saparov","Najoung Kim","He He"],"pdf_url":"https://arxiv.org/pdf/2310.18168v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06911v3","updated":"2024-02-06T09:03:53Z","published":"2023-08-14T03:12:29Z","title":"GIT-Mol: A Multi-modal Large Language Model for Molecular Science with\n Graph, Image, and Text","summary":" Large language models have made significant strides in natural language\nprocessing, enabling innovative applications in molecular science by processing\ntextual representations of molecules. However, most existing language models\ncannot capture the rich information with complex molecular structures or\nimages. In this paper, we introduce GIT-Mol, a multi-modal large language model\nthat integrates the Graph, Image, and Text information. To facilitate the\nintegration of multi-modal molecular data, we propose GIT-Former, a novel\narchitecture that is capable of aligning all modalities into a unified latent\nspace. We achieve a 5%-10% accuracy increase in properties prediction and a\n20.2% boost in molecule generation validity compared to the baselines. 
With the\nany-to-language molecular translation strategy, our model has the potential to\nperform more downstream tasks, such as compound name recognition and chemical\nreaction prediction.\n","authors":["Pengfei Liu","Yiming Ren","Jun Tao","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.06911v3.pdf","comment":"The article has been accepted by Computers in Biology and Medicine,\n with 14 pages and 4 figures"},{"id":"http://arxiv.org/abs/2402.01704v2","updated":"2024-02-06T08:53:11Z","published":"2024-01-24T22:22:00Z","title":"States as Strings as Strategies: Steering Language Models with\n Game-Theoretic Solvers","summary":" Game theory is the study of mathematical models of strategic interactions\namong rational agents. Language is a key medium of interaction for humans,\nthough it has historically proven difficult to model dialogue and its strategic\nmotivations mathematically. A suitable model of the players, strategies, and\npayoffs associated with linguistic interactions (i.e., a binding to the\nconventional symbolic logic of game theory) would enable existing\ngame-theoretic algorithms to provide strategic solutions in the space of\nlanguage. In other words, a binding could provide a route to computing stable,\nrational conversational strategies in dialogue. Large language models (LLMs)\nhave arguably reached a point where their generative capabilities can enable\nrealistic, human-like simulations of natural dialogue. By prompting them in\nvarious ways, we can steer their responses towards different output utterances.\nLeveraging the expressivity of natural language, LLMs can also help us quickly\ngenerate new dialogue scenarios, which are grounded in real world applications.\nIn this work, we present one possible binding from dialogue to game theory as\nwell as generalizations of existing equilibrium finding algorithms to this\nsetting. In addition, by exploiting LLMs generation capabilities along with our\nproposed binding, we can synthesize a large repository of formally-defined\ngames in which one can study and test game-theoretic solution concepts. We also\ndemonstrate how one can combine LLM-driven game generation, game-theoretic\nsolvers, and imitation learning to construct a process for improving the\nstrategic capabilities of LLMs.\n","authors":["Ian Gemp","Yoram Bachrach","Marc Lanctot","Roma Patel","Vibhavari Dasagi","Luke Marris","Georgios Piliouras","Siqi Liu","Karl Tuyls"],"pdf_url":"https://arxiv.org/pdf/2402.01704v2.pdf","comment":"32 pages, 8 figures, code available @\n https://github.com/google-deepmind/open_spiel/blob/master/open_spiel/python/games/chat_game.py"},{"id":"http://arxiv.org/abs/2312.01037v2","updated":"2024-02-06T08:28:04Z","published":"2023-12-02T05:47:22Z","title":"Eliciting Latent Knowledge from Quirky Language Models","summary":" Eliciting Latent Knowledge (ELK) aims to find patterns in a capable neural\nnetwork's activations which robustly track the true state of the world, even\nwhen the network's overt output is false or misleading. To further ELK\nresearch, we introduce 12 datasets and a corresponding suite of \"quirky\"\nlanguage models that are LoRA finetuned to make systematic errors when\nanswering questions if and only if the keyword \"Bob\" is present in the prompt.\nWe demonstrate that simple probing methods can elicit the model's latent\nknowledge of the correct answer in these contexts, even for problems harder\nthan those the probe was trained on. 
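The probing setup in the ELK abstract above reduces, in its simplest form, to fitting a linear classifier on mid-layer activations. A minimal sketch assuming the activations have already been extracted from the model:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def train_truth_probe(hidden_states: np.ndarray, labels: np.ndarray):
    """Fit a linear probe on middle-layer activations.

    `hidden_states` is an (N, d) array of residual-stream activations
    taken at one middle layer; `labels` marks whether the statement the
    model was processing is actually true.  High held-out accuracy means
    the latent knowledge is linearly decodable even when the model's
    overt answer is wrong.
    """
    probe = LogisticRegression(max_iter=1000)
    probe.fit(hidden_states, labels)
    return probe

# probe.predict(new_states) then estimates truthfulness on held-out
# (and, ideally, harder) statements than those the probe was trained on.
```

Keeping the probe linear is deliberate: a complex probe could learn the task itself, whereas a linear readout is evidence the representation already encodes the answer.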
This is enabled by context-independent\nknowledge representations located in middle layer activations. We also find\nthat a mechanistic anomaly detection approach can flag untruthful behavior with\n94% AUROC. Our results show promise for eliciting reliable knowledge from\ncapable but untrusted models, and facilitate future research empirically\ninvestigating ELK methods.\n","authors":["Alex Mallen","Nora Belrose"],"pdf_url":"https://arxiv.org/pdf/2312.01037v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.11674v2","updated":"2024-02-06T08:03:27Z","published":"2023-09-20T22:53:15Z","title":"A Paradigm Shift in Machine Translation: Boosting Translation\n Performance of Large Language Models","summary":" Generative Large Language Models (LLMs) have achieved remarkable advancements\nin various NLP tasks. However, these advances have not been reflected in the\ntranslation task, especially for those with moderate model sizes (i.e., 7B or 13B\nparameters), which still lag behind conventional supervised encoder-decoder\ntranslation models. Previous studies have attempted to improve the translation\ncapabilities of these moderate LLMs, but their gains have been limited. In this\nstudy, we propose a novel fine-tuning approach for LLMs that is specifically\ndesigned for the translation task, eliminating the need for the abundant\nparallel data that traditional translation models usually depend on. Our\napproach consists of two fine-tuning stages: initial fine-tuning on monolingual\ndata followed by subsequent fine-tuning on a small set of high-quality parallel\ndata. We introduce the LLM developed through this strategy as Advanced Language\nModel-based trAnslator (ALMA). Based on LLaMA-2 as our underlying model, our\nresults show that the model can achieve an average improvement of more than 12\nBLEU and 12 COMET over its zero-shot performance across 10 translation\ndirections from the WMT'21 (2 directions) and WMT'22 (8 directions) test\ndatasets. The performance is significantly better than all prior work and even\nsuperior to the NLLB-54B model and GPT-3.5-text-davinci-003, with only 7B or\n13B parameters. This method establishes the foundation for a novel training\nparadigm in machine translation.\n","authors":["Haoran Xu","Young Jin Kim","Amr Sharaf","Hany Hassan Awadalla"],"pdf_url":"https://arxiv.org/pdf/2309.11674v2.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2402.03782v1","updated":"2024-02-06T07:52:30Z","published":"2024-02-06T07:52:30Z","title":"Soft Prompt Tuning for Cross-Lingual Transfer: When Less is More","summary":" Soft Prompt Tuning (SPT) is a parameter-efficient method for adapting\npre-trained language models (PLMs) to specific tasks by inserting learnable\nembeddings, or soft prompts, at the input layer of the PLM, without modifying\nits parameters. This paper investigates the potential of SPT for cross-lingual\ntransfer. Unlike previous studies on SPT for cross-lingual transfer that often\nfine-tune both the soft prompt and the model parameters, we adhere to the\noriginal intent of SPT by keeping the model parameters frozen and only training\nthe soft prompt. Not only does this reduce the computational cost and storage\noverhead of full-model fine-tuning, but we also demonstrate that this very\nparameter efficiency intrinsic to SPT can enhance cross-lingual transfer\nperformance to linguistically distant languages. 
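The frozen-backbone recipe just described is easy to see in code: only a small matrix of prompt embeddings (and here a task head) receives gradients. The snippet below is a minimal sketch with a toy transformer standing in for a real PLM; the sizes and the classification head are assumptions.

```python
# Minimal soft prompt tuning sketch: backbone and embeddings frozen,
# only the soft prompt (and a small task head) are trained.
import torch
import torch.nn as nn

vocab, d_model, prompt_len, batch = 1000, 64, 8, 4
backbone = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True),
    num_layers=2)
embed = nn.Embedding(vocab, d_model)
head = nn.Linear(d_model, 2)  # toy downstream task

for p in list(backbone.parameters()) + list(embed.parameters()):
    p.requires_grad = False    # keep the "PLM" frozen

soft_prompt = nn.Parameter(torch.randn(prompt_len, d_model) * 0.02)
opt = torch.optim.Adam([soft_prompt] + list(head.parameters()), lr=1e-3)

tokens = torch.randint(0, vocab, (batch, 16))   # dummy input ids
labels = torch.randint(0, 2, (batch,))
x = torch.cat([soft_prompt.expand(batch, -1, -1), embed(tokens)], dim=1)
loss = nn.functional.cross_entropy(head(backbone(x).mean(dim=1)), labels)
loss.backward()
opt.step()
print("trainable:", sum(p.numel() for p in [soft_prompt] + list(head.parameters())))
```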
Moreover, we explore how\ndifferent factors related to the prompt, such as the length or its\nreparameterization, affect cross-lingual transfer performance.\n","authors":["Fred Philippy","Siwen Guo","Shohreh Haddadan","Cedric Lothritz","Jacques Klein","Tegawendé F. Bissyandé"],"pdf_url":"https://arxiv.org/pdf/2402.03782v1.pdf","comment":"Accepted at the 1st Workshop on Modular and Open Multilingual NLP\n (co-located with EACL 2024)"},{"id":"http://arxiv.org/abs/2402.03780v1","updated":"2024-02-06T07:51:54Z","published":"2024-02-06T07:51:54Z","title":"Exposing propaganda: an analysis of stylistic cues comparing human\n annotations and machine classification","summary":" This paper investigates the language of propaganda and its stylistic\nfeatures. It presents the PPN dataset, standing for Propagandist Pseudo-News, a\nmultisource, multilingual, multimodal dataset composed of news articles\nextracted from websites identified as propaganda sources by expert agencies. A\nlimited sample from this set was randomly mixed with papers from the regular\nFrench press, and their URL masked, to conduct an annotation-experiment by\nhumans, using 11 distinct labels. The results show that human annotators were\nable to reliably discriminate between the two types of press across each of the\nlabels. We propose different NLP techniques to identify the cues used by the\nannotators, and to compare them with machine classification. They include the\nanalyzer VAGO to measure discourse vagueness and subjectivity, a TF-IDF to\nserve as a baseline, and four different classifiers: two RoBERTa-based models,\nCATS using syntax, and one XGBoost combining syntactic and semantic features.\n Keywords: Propaganda, Fake News, Explainability, AI alignment, Vagueness,\nSubjectivity, Exaggeration, Stylistic analysis\n","authors":["Géraud Faye","Benjamin Icard","Morgane Casanova","Julien Chanson","François Maine","François Bancilhon","Guillaume Gadek","Guillaume Gravier","Paul Égré"],"pdf_url":"https://arxiv.org/pdf/2402.03780v1.pdf","comment":"Paper to appear in the EACL 2024 Proceedings of the Third Workshop on\n Understanding Implicit and Underspecified Language (UnImplicit 2024)"},{"id":"http://arxiv.org/abs/2402.02380v2","updated":"2024-02-06T07:49:32Z","published":"2024-02-04T07:39:06Z","title":"Evaluating Large Language Models in Analysing Classroom Dialogue","summary":" This study explores the application of Large Language Models (LLMs),\nspecifically GPT-4, in the analysis of classroom dialogue, a crucial research\ntask for both teaching diagnosis and quality improvement. Recognizing the\nknowledge-intensive and labor-intensive nature of traditional qualitative\nmethods in educational research, this study investigates the potential of LLM\nto streamline and enhance the analysis process. The study involves datasets\nfrom a middle school, encompassing classroom dialogues across mathematics and\nChinese classes. These dialogues were manually coded by educational experts and\nthen analyzed using a customised GPT-4 model. This study focuses on comparing\nmanual annotations with the outputs of GPT-4 to evaluate its efficacy in\nanalyzing educational dialogues. Time efficiency, inter-coder agreement, and\ninter-coder reliability between human coders and GPT-4 are evaluated. Results\nindicate substantial time savings with GPT-4, and a high degree of consistency\nin coding between the model and human coders, with some discrepancies in\nspecific codes. 
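Agreement between an LLM coder and human coders, as evaluated above, is typically quantified with chance-corrected statistics such as Cohen's kappa. A tiny illustration follows; the codes and labels below are invented:

```python
# Inter-coder agreement between a human coder and an LLM coder.
from sklearn.metrics import cohen_kappa_score

human = ["elicit", "respond", "elicit", "manage", "respond", "elicit"]
llm   = ["elicit", "respond", "manage", "manage", "respond", "elicit"]
print(f"Cohen's kappa: {cohen_kappa_score(human, llm):.3f}")
```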
These findings highlight the strong potential of LLMs in\nteaching evaluation and facilitation.\n","authors":["Yun Long","Haifeng Luo","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03776v1","updated":"2024-02-06T07:43:07Z","published":"2024-02-06T07:43:07Z","title":"Large Language Models As MOOCs Graders","summary":" Massive open online courses (MOOCs) unlock the doors to free education for\nanyone around the globe with access to a computer and the internet. Despite\nthis democratization of learning, the massive enrollment in these courses means\nit is almost impossible for one instructor to assess every student's writing\nassignment. As a result, peer grading, often guided by a straightforward\nrubric, is the method of choice. While convenient, peer grading often falls\nshort in terms of reliability and validity. In this study, using 18 distinct\nsettings, we explore the feasibility of leveraging large language models (LLMs)\nto replace peer grading in MOOCs. Specifically, we focus on two\nstate-of-the-art LLMs: GPT-4 and GPT-3.5, across three distinct courses:\nIntroductory Astronomy, Astrobiology, and the History and Philosophy of\nAstronomy. To instruct LLMs, we use three different prompts based on a variant\nof the zero-shot chain-of-thought (Zero-shot-CoT) prompting technique:\nZero-shot-CoT combined with instructor-provided correct answers; Zero-shot-CoT\nin conjunction with both instructor-formulated answers and rubrics; and\nZero-shot-CoT with instructor-offered correct answers and LLM-generated\nrubrics. Our results show that Zero-shot-CoT, when integrated with\ninstructor-provided answers and rubrics, produces grades that are more aligned\nwith those assigned by instructors compared to peer grading. However, the\nHistory and Philosophy of Astronomy course proves to be more challenging to\ngrade than the other courses. Finally, our study reveals a\npromising direction for automating grading systems for MOOCs, especially in\nsubjects with well-defined rubrics.\n","authors":["Shahriar Golchin","Nikhil Garuda","Christopher Impey","Matthew Wenger"],"pdf_url":"https://arxiv.org/pdf/2402.03776v1.pdf","comment":"v1 preprint"},{"id":"http://arxiv.org/abs/2402.03774v1","updated":"2024-02-06T07:40:53Z","published":"2024-02-06T07:40:53Z","title":"Learning a Decision Tree Algorithm with Transformers","summary":" Decision trees are renowned for their interpretability and capability to achieve\nhigh predictive performance, especially on tabular data. Traditionally, they\nare constructed through recursive algorithms, where they partition the data at\nevery node in a tree. However, identifying the best partition is challenging,\nas decision trees optimized for local segments may not yield global\ngeneralization. To address this, we introduce MetaTree, which trains a\ntransformer-based model on filtered outputs from classical algorithms to\nproduce strong decision trees for classification. Specifically, we fit both\ngreedy decision trees and optimized decision trees on a large number of\ndatasets. We then train MetaTree to produce the trees that achieve strong\ngeneralization performance. 
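The data-filtering idea above (fit classical trees on many datasets, keep only those that generalize) can be sketched in a few lines. This is not MetaTree's pipeline, just a plausible rendering of the filtering step; the threshold and the dataset generator are assumptions.

```python
# Sketch: keep only decision trees with strong held-out accuracy as
# supervision targets for a downstream learned tree-builder (omitted).
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

kept = []
for seed in range(20):
    X, y = make_classification(n_samples=400, n_features=10, random_state=seed)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=seed)
    tree = DecisionTreeClassifier(max_depth=3, random_state=seed).fit(X_tr, y_tr)
    if tree.score(X_te, y_te) > 0.85:   # filter for generalization
        kept.append((X_tr, y_tr, tree))
print(f"kept {len(kept)}/20 trees as training targets")
```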
This training enables MetaTree to not only emulate\nthese algorithms, but also to intelligently adapt its strategy according to the\ncontext, thereby achieving superior generalization performance.\n","authors":["Yufan Zhuang","Liyuan Liu","Chandan Singh","Jingbo Shang","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2402.03774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.03457v3","updated":"2024-02-06T07:25:15Z","published":"2022-02-07T19:01:19Z","title":"Selecting Seed Words for Wordle using Character Statistics","summary":" Wordle, a word-guessing game, rose to global popularity in January\n2022. The goal of the game is to guess a five-letter English word within six\ntries. Each try provides the player with hints by means of colour-changing\ntiles which inform whether or not a given character is part of the solution as\nwell as, in cases where it is part of the solution, whether or not it is in the\ncorrect placement. Numerous attempts have been made to find the best starting\nword and best strategy to solve the daily Wordle. This study uses character\nstatistics of five-letter words to determine the best three starting words.\n","authors":["Nisansa de Silva"],"pdf_url":"https://arxiv.org/pdf/2202.03457v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03757v1","updated":"2024-02-06T06:48:46Z","published":"2024-02-06T06:48:46Z","title":"The Instinctive Bias: Spurious Images lead to Hallucination in MLLMs","summary":" Large language models (LLMs) have recently experienced remarkable progress,\nwhere the advent of multi-modal large language models (MLLMs) has endowed LLMs\nwith visual capabilities, leading to impressive performances in various\nmulti-modal tasks. However, those powerful MLLMs such as GPT-4V still fail\nspectacularly when presented with certain image and text inputs. In this paper,\nwe identify a typical class of inputs that baffles MLLMs, which consists of\nimages that are highly relevant but inconsistent with answers, causing MLLMs to\nsuffer from hallucination. To quantify the effect, we propose CorrelationQA,\nthe first benchmark that assesses the hallucination level given spurious\nimages. This benchmark contains 7,308 text-image pairs across 13 categories.\nBased on the proposed CorrelationQA, we conduct a thorough analysis on 9\nmainstream MLLMs, illustrating that they universally suffer from this\ninstinctive bias to varying degrees. We hope that our curated benchmark and\nevaluation results aid in better assessments of the MLLMs' robustness in the\npresence of misleading images. The resource is available at\nhttps://github.com/MasaiahHan/CorrelationQA.\n","authors":["Tianyang Han","Qing Lian","Rui Pan","Renjie Pi","Jipeng Zhang","Shizhe Diao","Yong Lin","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.03757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03161v2","updated":"2024-02-06T06:35:36Z","published":"2024-02-05T16:30:49Z","title":"Video-LaVIT: Unified Video-Language Pre-training with Decoupled\n Visual-Motional Tokenization","summary":" In light of recent advances in multimodal Large Language Models (LLMs), there\nis increasing attention to scaling them from image-text data to more\ninformative real-world videos. Compared to static images, video poses unique\nchallenges for effective large-scale pre-training due to the modeling of its\nspatiotemporal dynamics. 
In this paper, we address such limitations in\nvideo-language pre-training with an efficient video decomposition that\nrepresents each video as keyframes and temporal motions. These are then adapted\nto an LLM using well-designed tokenizers that discretize visual and temporal\ninformation as a few tokens, thus enabling unified generative pre-training of\nvideos, images, and text. At inference, the generated tokens from the LLM are\ncarefully recovered to the original continuous pixel space to create various\nvideo content. Our proposed framework is capable of both comprehending and\ngenerating image and video content, as demonstrated by its competitive\nperformance across 13 multimodal benchmarks in image and video understanding\nand generation. Our code and models will be available at\nhttps://video-lavit.github.io.\n","authors":["Yang Jin","Zhicheng Sun","Kun Xu","Kun Xu","Liwei Chen","Hao Jiang","Quzhe Huang","Chengru Song","Yuliang Liu","Di Zhang","Yang Song","Kun Gai","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2402.03161v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03744v1","updated":"2024-02-06T06:23:12Z","published":"2024-02-06T06:23:12Z","title":"INSIDE: LLMs' Internal States Retain the Power of Hallucination\n Detection","summary":" Knowledge hallucinations have raised widespread concerns about the security and\nreliability of deployed LLMs. Previous efforts to detect hallucinations have\nemployed logit-level uncertainty estimation or language-level\nself-consistency evaluation, where the semantic information is inevitably lost\nduring the token-decoding procedure. Thus, we propose to explore the dense\nsemantic information retained within LLMs' \\textbf{IN}ternal \\textbf{S}tates\nfor halluc\\textbf{I}nation \\textbf{DE}tection (\\textbf{INSIDE}). In particular,\na simple yet effective \\textbf{EigenScore} metric is proposed to better\nevaluate responses' self-consistency, which exploits the eigenvalues of\nresponses' covariance matrix to measure the semantic consistency/diversity in\nthe dense embedding space. Furthermore, from the perspective of self-consistent\nhallucination detection, a test-time feature clipping approach is explored to\ntruncate extreme activations in the internal states, which reduces\noverconfident generations and potentially benefits the detection of\noverconfident hallucinations. Extensive experiments and ablation studies are\nperformed on several popular LLMs and question-answering (QA) benchmarks,\nshowing the effectiveness of our proposal.\n","authors":["Chao Chen","Kai Liu","Ze Chen","Yi Gu","Yue Wu","Mingyuan Tao","Zhihang Fu","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2402.03744v1.pdf","comment":"Accepted by ICLR-2024"},{"id":"http://arxiv.org/abs/2402.01685v2","updated":"2024-02-06T06:03:13Z","published":"2024-01-22T08:47:50Z","title":"SMUTF: Schema Matching Using Generative Tags and Hybrid Features","summary":" We introduce SMUTF, a unique approach for large-scale tabular data schema\nmatching (SM), which assumes that supervised learning does not affect\nperformance in open-domain tasks, thereby enabling effective cross-domain\nmatching. This system uniquely combines rule-based feature engineering,\npre-trained language models, and generative large language models. 
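One way to read the EigenScore idea described above: embed several sampled responses, form the covariance of the embeddings, and summarize its eigen-spectrum, with a flatter spectrum indicating more semantic diversity (less self-consistency). The sketch below is a loose back-of-the-envelope version, not the paper's exact metric; the regularizer and the log-mean summary are assumptions.

```python
# Eigenvalue-based diversity score over K response embeddings.
import numpy as np

def eigen_diversity(embs: np.ndarray, alpha: float = 1e-3) -> float:
    z = embs - embs.mean(axis=0, keepdims=True)   # center the K embeddings
    cov = z @ z.T / embs.shape[1]                 # K x K covariance
    eigvals = np.linalg.eigvalsh(cov + alpha * np.eye(len(cov)))
    return float(np.mean(np.log(eigvals)))        # higher = more diverse

rng = np.random.default_rng(0)
consistent = np.tile(rng.normal(size=128), (5, 1)) + 0.01 * rng.normal(size=(5, 128))
diverse = rng.normal(size=(5, 128))
print(eigen_diversity(consistent), eigen_diversity(diverse))
```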
In an\ninnovative adaptation inspired by the Humanitarian Exchange Language, we deploy\n'generative tags' for each data column, enhancing the effectiveness of SM.\nSMUTF exhibits extensive versatility, working seamlessly with any pre-existing\npre-trained embeddings, classification methods, and generative models.\n Recognizing the lack of extensive, publicly available datasets for SM, we\nhave created and open-sourced the HDXSM dataset from the public humanitarian\ndata. We believe this to be the most exhaustive SM dataset currently available.\nIn evaluations across various public datasets and the novel HDXSM dataset,\nSMUTF demonstrated exceptional performance, surpassing existing\nstate-of-the-art models in terms of accuracy and efficiency, and improving the\nF1 score by 11.84% and the AUC of ROC by 5.08%.\n","authors":["Yu Zhang","Mei Di","Haozheng Luo","Chenwei Xu","Richard Tzong-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2402.01685v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03732v1","updated":"2024-02-06T05:58:15Z","published":"2024-02-06T05:58:15Z","title":"Deep Outdated Fact Detection in Knowledge Graphs","summary":" Knowledge graphs (KGs) have garnered significant attention for their vast\npotential across diverse domains. However, the issue of outdated facts poses a\nchallenge to KGs, affecting their overall quality as real-world information\nevolves. Existing solutions for outdated fact detection often rely on manual\nrecognition. In response, this paper presents DEAN (Deep outdatEd fAct\ndetectioN), a novel deep learning-based framework designed to identify outdated\nfacts within KGs. DEAN distinguishes itself by capturing implicit structural\ninformation among facts through comprehensive modeling of both entities and\nrelations. To effectively uncover latent out-of-date information, DEAN employs\na contrastive approach based on a pre-defined Relations-to-Nodes (R2N) graph,\nweighted by the number of entities. Experimental results demonstrate the\neffectiveness and superiority of DEAN over state-of-the-art baseline methods.\n","authors":["Huiling Tu","Shuo Yu","Vidya Saikrishna","Feng Xia","Karin Verspoor"],"pdf_url":"https://arxiv.org/pdf/2402.03732v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.03414v2","updated":"2024-02-06T05:53:02Z","published":"2023-12-06T10:50:43Z","title":"Compressed Context Memory For Online Language Model Interaction","summary":" This paper presents a context key/value compression method for Transformer\nlanguage models in online scenarios, where the context continually expands. As\nthe context lengthens, the attention process demands increasing memory and\ncomputations, which in turn reduces the throughput of the language model. To\naddress this challenge, we propose a compressed context memory system that\ncontinually compresses the accumulating attention key/value pairs into a\ncompact memory space, facilitating language model inference in the limited memory\nof computing environments. Our compression process involves integrating a\nlightweight conditional LoRA into the language model's forward pass during\ninference, without the need for fine-tuning the model's entire set of weights.\nWe achieve efficient training by modeling the recursive compression process as\na single parallelized forward computation. 
Through evaluations on conversation,\npersonalization, and multi-task learning, we demonstrate that our approach\nachieves the performance level of a full context model with $5\\times$ smaller\ncontext memory size. We further demonstrate the applicability of our approach\nin a streaming setting with an unlimited context length, outperforming the\nsliding window approach. Codes are available at\nhttps://github.com/snu-mllab/context-memory.\n","authors":["Jang-Hyun Kim","Junyoung Yeom","Sangdoo Yun","Hyun Oh Song"],"pdf_url":"https://arxiv.org/pdf/2312.03414v2.pdf","comment":"ICLR 2024. Add streaming setting results and training set analyses"},{"id":"http://arxiv.org/abs/2402.03728v1","updated":"2024-02-06T05:50:04Z","published":"2024-02-06T05:50:04Z","title":"Consistent Joint Decision-Making with Heterogeneous Learning Models","summary":" This paper introduces a novel decision-making framework that promotes\nconsistency among decisions made by diverse models while utilizing external\nknowledge. Leveraging the Integer Linear Programming (ILP) framework, we map\npredictions from various models into globally normalized and comparable values\nby incorporating information about decisions' prior probability, confidence\n(uncertainty), and the models' expected accuracy. Our empirical study\ndemonstrates the superiority of our approach over conventional baselines on\nmultiple datasets.\n","authors":["Hossein Rajaby Faghihi","Parisa Kordjamshidi"],"pdf_url":"https://arxiv.org/pdf/2402.03728v1.pdf","comment":"EACL 2024 Findings - Short Paper"},{"id":"http://arxiv.org/abs/2402.02791v2","updated":"2024-02-06T05:38:26Z","published":"2024-02-05T07:59:38Z","title":"Rethinking Optimization and Architecture for Tiny Language Models","summary":" The power of large language models (LLMs) has been demonstrated through vast\namounts of data and computing resources. However, the application of language\nmodels on mobile devices is facing huge challenges in computation and memory\ncosts; that is, tiny language models with high performance are urgently\nrequired. Limited by the highly complex training process, there are many\ndetails for optimizing language models that are seldom studied carefully. In\nthis study, based on a tiny language model with 1B parameters, we carefully\ndesign a series of empirical studies to analyze the effect of each component.\nThree perspectives are mainly discussed, i.e., neural architecture, parameter\ninitialization, and optimization strategy. Several design formulas are\nempirically proven to be especially effective for tiny language models, including\ntokenizer compression, architecture tweaking, parameter inheritance and\nmultiple-round training. Then we train PanGu-$\\pi$-1B Pro and PanGu-$\\pi$-1.5B\nPro on 1.6T multilingual corpora, following the established formulas.\nExperimental results demonstrate that the improved optimization and architecture\nyield a notable average improvement of 8.87 on benchmark evaluation sets for\nPanGu-$\\pi$-1B Pro. Besides, PanGu-$\\pi$-1.5B Pro surpasses a range of SOTA\nmodels with larger model sizes, validating its superior performance. 
The code\nis available at https://github.com/YuchuanTian/RethinkTinyLM.\n","authors":["Yehui Tang","Fangcheng Liu","Yunsheng Ni","Yuchuan Tian","Zheyuan Bai","Yi-Qi Hu","Sichao Liu","Shangling Jui","Kai Han","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01801v2","updated":"2024-02-06T05:34:17Z","published":"2024-02-02T07:24:35Z","title":"Large Language Models for Time Series: A Survey","summary":" Large Language Models (LLMs) have seen significant use in domains such as\nnatural language processing and computer vision. Going beyond text, image and\ngraphics, LLMs present a significant potential for analysis of time series\ndata, benefiting domains such as climate, IoT, healthcare, traffic, audio and\nfinance. This survey paper provides an in-depth exploration and a detailed\ntaxonomy of the various methodologies employed to harness the power of LLMs for\ntime series analysis. We address the inherent challenge of bridging the gap\nbetween LLMs' original text data training and the numerical nature of time\nseries data, and explore strategies for transferring and distilling knowledge\nfrom LLMs to numerical time series analysis. We detail various methodologies,\nincluding (1) direct prompting of LLMs, (2) time series quantization, (3)\nalignment techniques, (4) utilization of the vision modality as a bridging\nmechanism, and (5) the combination of LLMs with tools. Additionally, this\nsurvey offers a comprehensive overview of the existing multimodal time series\nand text datasets and delves into the challenges and future opportunities of\nthis emerging field. We maintain an up-to-date Github repository which includes\nall the papers and datasets discussed in the survey.\n","authors":["Xiyuan Zhang","Ranak Roy Chowdhury","Rajesh K. Gupta","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2402.01801v2.pdf","comment":"GitHub repository:\n https://github.com/xiyuanzh/awesome-llm-time-series"},{"id":"http://arxiv.org/abs/2402.03720v1","updated":"2024-02-06T05:29:05Z","published":"2024-02-06T05:29:05Z","title":"Similarity-based Neighbor Selection for Graph LLMs","summary":" Text-attributed graphs (TAGs) present unique challenges for direct processing\nby Large Language Models (LLMs), yet their extensive commonsense knowledge\nand robust reasoning capabilities offer great promise for node classification\nin TAGs. Prior research in this field has grappled with issues such as\nover-squashing, heterophily, and ineffective graph information integration,\nfurther compounded by inconsistencies in dataset partitioning and\nunderutilization of advanced LLMs. To address these challenges, we introduce\nSimilarity-based Neighbor Selection (SNS). Using SimCSE and advanced neighbor\nselection techniques, SNS effectively improves the quality of selected\nneighbors, thereby improving graph representation and alleviating issues like\nover-squashing and heterophily. Moreover, as an inductive and training-free\napproach, SNS demonstrates superior generalization and scalability over\ntraditional GNN methods. Our comprehensive experiments, adhering to standard\ndataset partitioning practices, demonstrate that SNS, through simple prompt\ninteractions with LLMs, consistently outperforms vanilla GNNs and achieves\nstate-of-the-art results on datasets like PubMed in node classification,\nshowcasing LLMs' potential in graph structure understanding. 
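The neighbor-selection step above reduces, at its core, to ranking candidate neighbors by text-embedding similarity and keeping the top-k for the prompt. A minimal sketch follows, with random unit vectors standing in for SimCSE embeddings (an assumption; SNS's actual selection has more machinery):

```python
# Similarity-based neighbor selection on a text-attributed graph.
import numpy as np

rng = np.random.default_rng(0)
emb = rng.normal(size=(6, 32))                      # one embedding per node
emb /= np.linalg.norm(emb, axis=1, keepdims=True)   # unit-normalize

def top_k_neighbors(node, candidates, k=2):
    sims = emb[candidates] @ emb[node]              # cosine similarity
    order = np.argsort(-sims)
    return [candidates[i] for i in order[:k]]

print(top_k_neighbors(0, [1, 2, 3, 4, 5], k=2))
```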
Our research\nfurther underscores the significance of graph structure integration in LLM\napplications and identifies key factors for their success in node\nclassification. Code is available at https://github.com/ruili33/SNS.\n","authors":["Rui Li","Jiwei Li","Jiawei Han","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03719v1","updated":"2024-02-06T05:24:16Z","published":"2024-02-06T05:24:16Z","title":"Empowering Language Models with Active Inquiry for Deeper Understanding","summary":" The rise of large language models (LLMs) has revolutionized the way that we\ninteract with artificial intelligence systems through natural language.\nHowever, LLMs often misinterpret user queries because of their uncertain\nintention, leading to less helpful responses. In natural human interactions,\nclarification is sought through targeted questioning to uncover obscure\ninformation. Thus, in this paper, we introduce LaMAI (Language Model with\nActive Inquiry), designed to endow LLMs with this same level of interactive\nengagement. LaMAI leverages active learning techniques to raise the most\ninformative questions, fostering a dynamic bidirectional dialogue. This\napproach not only narrows the contextual gap but also refines the output of the\nLLMs, aligning it more closely with user expectations. Our empirical studies,\nacross a variety of complex datasets where LLMs have limited conversational\ncontext, demonstrate the effectiveness of LaMAI. The method improves answer\naccuracy from 31.9% to 50.9%, outperforming other leading question-answering\nframeworks. Moreover, in scenarios involving human participants, LaMAI\nconsistently generates responses that are superior or comparable to baseline\nmethods in more than 82% of the cases. The applicability of LaMAI is further\nevidenced by its successful integration with various LLMs, highlighting its\npotential for the future of interactive language models.\n","authors":["Jing-Cheng Pang","Heng-Bo Fan","Pengyuan Wang","Jia-Hao Xiao","Nan Tang","Si-Hang Yang","Chengxing Jia","Sheng-Jun Huang","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2402.03719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04511v2","updated":"2024-02-06T05:17:42Z","published":"2023-12-07T18:32:04Z","title":"An LLM Compiler for Parallel Function Calling","summary":" Recent language models have shown remarkable results on various complex\nreasoning benchmarks. The reasoning capabilities of LLMs enable them to execute\nexternal function calls to overcome their inherent limitations, such as\nknowledge cutoffs, poor arithmetic skills, or lack of access to private data.\nThis development has allowed LLMs to select and coordinate multiple functions\nbased on the context to tackle more complex problems. However, current methods\nfor multiple function calling often require sequential reasoning and acting for\neach function which can result in high latency, cost, and sometimes inaccurate\nbehavior. To address this, we introduce LLMCompiler, which executes functions\nin parallel to efficiently orchestrate multiple function calling. Drawing from\nthe principles of classical compilers, LLMCompiler streamlines parallel\nfunction calling with three components: (i) an LLM Planner, formulating\nexecution plans; (ii) a Task Fetching Unit, dispatching function calling tasks;\nand (iii) an Executor, executing these tasks in parallel. 
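The three-component design above ends in an executor that runs independent calls concurrently rather than one ReAct-style step at a time. Here is a toy, dependency-aware version of that execution side only; the task format and functions are invented, and the planner is out of scope:

```python
# Toy parallel function-calling executor: run tasks whose dependencies
# are satisfied, in parallel, until all tasks complete.
from concurrent.futures import ThreadPoolExecutor

def search(q):
    return f"results[{q}]"

tasks = {  # name -> (function, fixed args, dependencies)
    "t1": (search, ("weather",), []),
    "t2": (search, ("news",), []),
    "t3": (lambda a, b: f"summary({a}, {b})", (), ["t1", "t2"]),
}

results = {}
with ThreadPoolExecutor() as pool:
    pending = dict(tasks)
    while pending:
        ready = {k: v for k, v in pending.items()
                 if all(d in results for d in v[2])}
        futs = {k: pool.submit(fn, *(args + tuple(results[d] for d in deps)))
                for k, (fn, args, deps) in ready.items()}
        for k, f in futs.items():
            results[k] = f.result()
            del pending[k]
print(results["t3"])
```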
LLMCompiler\nautomatically generates an optimized orchestration for the function calls and\ncan be used with both open-source and closed-source models. We have benchmarked\nLLMCompiler on a range of tasks with different patterns of function calling. We\nobserve consistent latency speedup of up to 3.7x, cost savings of up to 6.7x,\nand accuracy improvement of up to ~9% compared to ReAct.\n","authors":["Sehoon Kim","Suhong Moon","Ryan Tabrizi","Nicholas Lee","Michael W. Mahoney","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2312.04511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03715v1","updated":"2024-02-06T05:11:38Z","published":"2024-02-06T05:11:38Z","title":"Clarify: Improving Model Robustness With Natural Language Corrections","summary":" In supervised learning, models are trained to extract correlations from a\nstatic dataset. This often leads to models that rely on high-level\nmisconceptions. To prevent such misconceptions, we must necessarily provide\nadditional information beyond the training data. Existing methods incorporate\nforms of additional instance-level supervision, such as labels for spurious\nfeatures or additional labeled data from a balanced distribution. Such\nstrategies can become prohibitively costly for large-scale datasets since they\nrequire additional annotation at a scale close to the original training data.\nWe hypothesize that targeted natural language feedback about a model's\nmisconceptions is a more efficient form of additional supervision. We introduce\nClarify, a novel interface and method for interactively correcting model\nmisconceptions. Through Clarify, users need only provide a short text\ndescription to describe a model's consistent failure patterns. Then, in an\nentirely automated way, we use such descriptions to improve the training\nprocess by reweighting the training data or gathering additional targeted data.\nOur user studies show that non-expert users can successfully describe model\nmisconceptions via Clarify, improving worst-group accuracy by an average of\n17.1% in two datasets. Additionally, we use Clarify to find and rectify 31\nnovel hard subpopulations in the ImageNet dataset, improving minority-split\naccuracy from 21.1% to 28.7%.\n","authors":["Yoonho Lee","Michelle S. Lam","Helena Vasconcelos","Michael S. Bernstein","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2402.03715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01345v2","updated":"2024-02-06T05:10:33Z","published":"2024-02-02T12:02:46Z","title":"Skip \\n: A simple method to reduce hallucination in Large\n Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nof multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks (\\n\\n), where the content before and\nafter '\\n\\n' in the training data frequently exhibit significant semantic\nchanges. 
This pattern leads the model to infer that the contents following\n'\\n\\n' should be obviously different from the preceding contents with less\nhallucinatory descriptions, thereby increasing the probability of hallucinatory\ndescriptions subsequent to the '\\n\\n'. We have validated this hypothesis on\nmultiple publicly available LVLMs. Besides, we find that deliberately inserting\n'\\n\\n' at the generated description can induce more hallucinations. A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of '\\n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2402.03710v1","updated":"2024-02-06T05:05:38Z","published":"2024-02-06T05:05:38Z","title":"Listen, Chat, and Edit: Text-Guided Soundscape Modification for Enhanced\n Auditory Experience","summary":" In daily life, we encounter a variety of sounds, both desirable and\nundesirable, with limited control over their presence and volume. Our work\nintroduces \"Listen, Chat, and Edit\" (LCE), a novel multimodal sound mixture\neditor that modifies each sound source in a mixture based on user-provided text\ninstructions. LCE distinguishes itself with a user-friendly chat interface and\nits unique ability to edit multiple sound sources simultaneously within a\nmixture, without needing to separate them. Users input open-vocabulary text\nprompts, which are interpreted by a large language model to create a semantic\nfilter for editing the sound mixture. The system then decomposes the mixture\ninto its components, applies the semantic filter, and reassembles it into the\ndesired output. We developed a 160-hour dataset with over 100k mixtures,\nincluding speech and various audio sources, along with text prompts for diverse\nediting tasks like extraction, removal, and volume control. Our experiments\ndemonstrate significant improvements in signal quality across all editing tasks\nand robust performance in zero-shot scenarios with varying numbers and types of\nsound sources.\n","authors":["Xilin Jiang","Cong Han","Yinghao Aaron Li","Nima Mesgarani"],"pdf_url":"https://arxiv.org/pdf/2402.03710v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2307.16806v2","updated":"2024-02-06T05:01:11Z","published":"2023-07-28T10:45:14Z","title":"Testing the Depth of ChatGPT's Comprehension via Cross-Modal Tasks Based\n on ASCII-Art: GPT3.5's Abilities in Regard to Recognizing and Generating\n ASCII-Art Are Not Totally Lacking","summary":" Over the eight months since its release, ChatGPT and its underlying model,\nGPT3.5, have garnered massive attention, due to their potent mix of capability\nand accessibility. While a niche-industry of papers have emerged examining the\nscope of capabilities these models possess, the information fed to and\nextracted from these networks has been either natural language text or\nstylized, code-like language. Drawing inspiration from the prowess we expect a\ntruly human-level intelligent agent to have across multiple signal modalities,\nin this work we examine GPT3.5's aptitude for visual tasks, where the inputs\nfeature content provided as ASCII-art without overt distillation into a lingual\nsummary. 
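As an aside on the skip-'\n' mitigation described a little earlier: in decoding terms it amounts to masking newline tokens' logits before sampling. The stub below is illustrative only; the vocabulary and the random "model" are made up.

```python
# Decoding with newline tokens banned, in the spirit of "skip \n".
import numpy as np

VOCAB = {"hello": 0, "world": 1, "\n": 2, ".": 3}
BANNED = [VOCAB["\n"]]

def fake_logits(context):                 # stand-in for a real LVLM step
    return np.random.default_rng(len(context)).normal(size=len(VOCAB))

def sample_without_newlines(steps=5):
    out = []
    for _ in range(steps):
        logits = fake_logits(out)
        logits[BANNED] = -np.inf          # the model can never emit '\n'
        probs = np.exp(logits - logits.max())
        probs /= probs.sum()
        rng = np.random.default_rng(42 + len(out))
        out.append(int(rng.choice(len(probs), p=probs)))
    return out

print(sample_without_newlines())
```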
We conduct experiments analyzing the model's performance on image\nrecognition tasks after various transforms typical in visual settings, trials\ninvestigating knowledge of image parts, and tasks covering image generation.\n","authors":["David Bayani"],"pdf_url":"https://arxiv.org/pdf/2307.16806v2.pdf","comment":"Accepted in EACL 2024 as a long paper. See\n https://2024.eacl.org/program/findings-accepted/#long-papers . Note: this\n paper's ArXiv version includes additional discussion, analysis, and types of\n experiments compared to the EACL version. Changes introduced in V2 of ArXiv\n paper: only this comment metadata. V1 was initially submission on July 26th,\n 2023 - release was delayed by ArXiv for a few days"},{"id":"http://arxiv.org/abs/2305.14998v2","updated":"2024-02-06T04:20:15Z","published":"2023-05-24T10:36:12Z","title":"An Examination of the Robustness of Reference-Free Image Captioning\n Evaluation Metrics","summary":" Recently, reference-free metrics such as CLIPScore (Hessel et al., 2021),\nUMIC (Lee et al., 2021), and PAC-S (Sarto et al., 2023) have been proposed for\nautomatic reference-free evaluation of image captions. Our focus lies in\nevaluating the robustness of these metrics in scenarios that require\ndistinguishing between two captions with high lexical overlap but very\ndifferent meanings. Our findings reveal that despite their high correlation\nwith human judgments, CLIPScore, UMIC, and PAC-S struggle to identify\nfine-grained errors. While all metrics exhibit strong sensitivity to visual\ngrounding errors, their sensitivity to caption implausibility errors is\nlimited. Furthermore, we found that all metrics are sensitive to variations in\nthe size of image-relevant objects mentioned in the caption, while CLIPScore\nand PAC-S are also sensitive to the number of mentions of image-relevant\nobjects in the caption. Regarding linguistic aspects of a caption, all metrics\nshow weak comprehension of negation, and CLIPScore and PAC-S are insensitive to\nthe structure of the caption to a great extent. We hope our findings will guide\nfurther improvements in reference-free evaluation of image captioning.\n","authors":["Saba Ahmadi","Aishwarya Agrawal"],"pdf_url":"https://arxiv.org/pdf/2305.14998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04076v5","updated":"2024-02-06T04:16:17Z","published":"2023-11-07T15:40:43Z","title":"Do LLMs exhibit human-like response biases? A case study in survey\n design","summary":" As large language models (LLMs) become more capable, there is growing\nexcitement about the possibility of using LLMs as proxies for humans in\nreal-world tasks where subjective labels are desired, such as in surveys and\nopinion polling. One widely-cited barrier to the adoption of LLMs as proxies\nfor humans in subjective tasks is their sensitivity to prompt wording - but\ninterestingly, humans also display sensitivities to instruction changes in the\nform of response biases. We investigate the extent to which LLMs reflect human\nresponse biases, if at all. We look to survey design, where human response\nbiases caused by changes in the wordings of \"prompts\" have been extensively\nexplored in social psychology literature. Drawing from these works, we design a\ndataset and framework to evaluate whether LLMs exhibit human-like response\nbiases in survey questionnaires. 
Our comprehensive evaluation of nine models\nshows that popular open and commercial LLMs generally fail to reflect\nhuman-like behavior, particularly in models that have undergone RLHF.\nFurthermore, even if a model shows a significant change in the same direction\nas humans, we find that they are sensitive to perturbations that do not elicit\nsignificant changes in humans. These results highlight the pitfalls of using\nLLMs as human proxies, and underscore the need for finer-grained\ncharacterizations of model behavior. Our code, dataset, and collected samples\nare available at https://github.com/lindiatjuatja/BiasMonkey\n","authors":["Lindia Tjuatja","Valerie Chen","Sherry Tongshuang Wu","Ameet Talwalkar","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2311.04076v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03686v1","updated":"2024-02-06T04:14:09Z","published":"2024-02-06T04:14:09Z","title":"Minds versus Machines: Rethinking Entailment Verification with Language\n Models","summary":" Humans make numerous inferences in text comprehension to understand\ndiscourse. This paper aims to understand the commonalities and disparities in\nthe inference judgments between humans and state-of-the-art Large Language\nModels (LLMs). Leveraging a comprehensively curated entailment verification\nbenchmark, we evaluate both human and LLM performance across various reasoning\ncategories. Our benchmark includes datasets from three categories (NLI,\ncontextual QA, and rationales) that include multi-sentence premises and\ndifferent knowledge types, thereby evaluating the inference capabilities in\ncomplex reasoning instances. Notably, our findings reveal LLMs' superiority in\nmulti-hop reasoning across extended contexts, while humans excel in tasks\nnecessitating simple deductive reasoning. Leveraging these insights, we\nintroduce a fine-tuned Flan-T5 model that outperforms GPT-3.5 and rivals with\nGPT-4, offering a robust open-source solution for entailment verification. As a\npractical application, we showcase the efficacy of our finetuned model in\nenhancing self-consistency in model-generated explanations, resulting in a 6%\nperformance boost on average across three multiple-choice question-answering\ndatasets.\n","authors":["Soumya Sanyal","Tianyi Xiao","Jiacheng Liu","Wenya Wang","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2402.03686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01015v3","updated":"2024-02-06T03:52:48Z","published":"2023-06-01T04:52:26Z","title":"How to Estimate Model Transferability of Pre-Trained Speech Models?","summary":" In this work, we introduce a \"score-based assessment\" framework for\nestimating the transferability of pre-trained speech models (PSMs) for\nfine-tuning target tasks. We leverage upon two representation theories,\nBayesian likelihood estimation and optimal transport, to generate rank scores\nfor the PSM candidates using the extracted representations. Our framework\nefficiently computes transferability scores without actual fine-tuning of\ncandidate models or layers by making a temporal independent hypothesis. We\nevaluate some popular supervised speech models (e.g., Conformer RNN-Transducer)\nand self-supervised speech models (e.g., HuBERT) in cross-layer and cross-model\nsettings using public data. Experimental results show a high Spearman's rank\ncorrelation and low $p$-value between our estimation framework and fine-tuning\nground truth. 
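Validating a transferability score, as reported above, boils down to checking that the cheap score ranks candidate models the same way expensive fine-tuning does. A tiny illustration with invented numbers:

```python
# Rank-correlate cheap transferability scores with fine-tuned quality.
from scipy.stats import spearmanr

transfer_scores = [0.31, 0.55, 0.48, 0.72, 0.40]   # cheap estimates
finetuned_wer = [0.21, 0.15, 0.17, 0.11, 0.19]     # ground truth, lower = better
rho, p = spearmanr(transfer_scores, [-w for w in finetuned_wer])
print(f"Spearman rho={rho:.2f}, p={p:.3f}")
```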
Our proposed transferability framework requires less\ncomputational time and resources, making it a resource-saving and\ntime-efficient approach for tuning speech foundation models.\n","authors":["Zih-Ching Chen","Chao-Han Huck Yang","Bo Li","Yu Zhang","Nanxin Chen","Shuo-Yiin Chang","Rohit Prabhavalkar","Hung-yi Lee","Tara N. Sainath"],"pdf_url":"https://arxiv.org/pdf/2306.01015v3.pdf","comment":"Accepted to Interspeech. Code is available at:\n https://github.com/virginiakm1988/LogME-CTC. Fixed a typo"},{"id":"http://arxiv.org/abs/2402.03667v1","updated":"2024-02-06T03:41:12Z","published":"2024-02-06T03:41:12Z","title":"Large Language Models as an Indirect Reasoner: Contrapositive and\n Contradiction for Automated Reasoning","summary":" Recently, increasing attention has been drawn to improving the\nability of Large Language Models (LLMs) to perform complex reasoning. However,\nprevious methods, such as Chain-of-Thought and Self-Consistency, mainly follow\nDirect Reasoning (DR) frameworks, so they struggle with\nnumerous real-world tasks that can hardly be solved via DR. Therefore, to\nstrengthen the reasoning power of LLMs, this paper proposes a novel Indirect\nReasoning (IR) method that employs the logic of contrapositives and\ncontradictions to tackle IR tasks such as factual reasoning and mathematical\nproof. Specifically, our methodology comprises two steps. Firstly, we leverage\nthe logical equivalence of the contrapositive to augment the data and rules to\nenhance the comprehensibility of LLMs. Secondly, we design a set of prompt\ntemplates to trigger LLMs to conduct IR based on proof by contradiction that is\nlogically equivalent to the original DR process. Our IR method is simple yet\neffective and can be straightforwardly integrated with existing DR methods to\nfurther boost the reasoning abilities of LLMs. The experimental results on\npopular LLMs, such as GPT-3.5-turbo and Gemini-pro, show that our IR method\nenhances the overall accuracy of factual reasoning by 27.33% and mathematical\nproof by 31.43%, when compared with traditional DR methods. Moreover, the\nmethods combining IR and DR significantly outperform the methods solely using\nIR or DR, further demonstrating the effectiveness of our strategy.\n","authors":["Yanfang Zhang","Yiliu Sun","Yibing Zhan","Dapeng Tao","Dacheng Tao","Chen Gong"],"pdf_url":"https://arxiv.org/pdf/2402.03667v1.pdf","comment":"20 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.03659v1","updated":"2024-02-06T03:18:58Z","published":"2024-02-06T03:18:58Z","title":"Learning to Generate Explainable Stock Predictions using Self-Reflective\n Large Language Models","summary":" Explaining stock predictions is generally a difficult task for traditional\nnon-generative deep learning models, where explanations are limited to\nvisualizing the attention weights on important texts. Today, Large Language\nModels (LLMs) present a solution to this problem, given their known\ncapabilities to generate human-readable explanations for their decision-making\nprocess. However, the task of stock prediction remains challenging for LLMs, as\nit requires the ability to weigh the varying impacts of chaotic social texts on\nstock prices. The problem gets progressively harder with the introduction of\nthe explanation component, which requires LLMs to explain verbally why certain\nfactors are more important than others. 
On the other hand, to fine-tune\nLLMs for such a task, one would need expert-annotated samples of explanation\nfor every stock movement in the training set, which is expensive and\nimpractical to scale. To tackle these issues, we propose our\nSummarize-Explain-Predict (SEP) framework, which utilizes a self-reflective\nagent and Proximal Policy Optimization (PPO) to let an LLM teach itself how to\ngenerate explainable stock predictions in a fully autonomous manner. The\nreflective agent learns how to explain past stock movements through\nself-reasoning, while the PPO trainer trains the model to generate the most\nlikely explanations from input texts. The training samples for the PPO trainer\nare also the responses generated during the reflective process, which\neliminates the need for human annotators. Using our SEP framework, we fine-tune\nan LLM that can outperform both traditional deep-learning and LLM methods in\nprediction accuracy and Matthews correlation coefficient for the stock\nclassification task. To justify the generalization capability of our framework,\nwe further test it on the portfolio construction task, and demonstrate its\neffectiveness through various portfolio metrics.\n","authors":["Kelvin J. L. Koa","Yunshan Ma","Ritchie Ng","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2402.03659v1.pdf","comment":"WWW 2024"},{"id":"http://arxiv.org/abs/2402.03658v1","updated":"2024-02-06T03:14:46Z","published":"2024-02-06T03:14:46Z","title":"Sentiment-enhanced Graph-based Sarcasm Explanation in Dialogue","summary":" Sarcasm Explanation in Dialogue (SED) is a new yet challenging task, which\naims to generate a natural language explanation for the given sarcastic\ndialogue that involves multiple modalities (i.e., utterance, video, and audio).\nAlthough existing studies have achieved great success based on the generative\npretrained language model BART, they overlook exploiting the sentiments\nresiding in the utterance, video and audio, which are vital clues for sarcasm\nexplanation. In fact, it is non-trivial to incorporate sentiments for boosting\nSED performance, due to three main challenges: 1) diverse effects of utterance\ntokens on sentiments; 2) gap between video-audio sentiment signals and the\nembedding space of BART; and 3) various relations among utterances, utterance\nsentiments, and video-audio sentiments. To tackle these challenges, we propose\na novel sEntiment-enhanceD Graph-based multimodal sarcasm Explanation\nframework, named EDGE. In particular, we first propose a lexicon-guided\nutterance sentiment inference module, where a heuristic utterance sentiment\nrefinement strategy is devised. We then develop a module named Joint Cross\nAttention-based Sentiment Inference (JCA-SI) by extending the multimodal\nsentiment analysis model JCA to derive the joint sentiment label for each\nvideo-audio clip. Thereafter, we devise a context-sentiment graph to\ncomprehensively model the semantic relations among the utterances, utterance\nsentiments, and video-audio sentiments, to facilitate sarcasm explanation\ngeneration. 
Extensive experiments on the publicly released dataset WITS verify\nthe superiority of our model over cutting-edge methods.\n","authors":["Kun Ouyang","Liqiang Jing","Xuemeng Song","Meng Liu","Yupeng Hu","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2402.03658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07134v5","updated":"2024-02-06T03:08:44Z","published":"2023-08-14T13:41:09Z","title":"Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models has revolutionized\nvarious AI research domains. Transformers-based Large Language Models (LLMs)\nhave gradually replaced CNNs and RNNs to unify fields of computer vision and\nnatural language processing. Compared with independent data samples such as\nimages, videos or texts, graphs usually contain rich structural and relational\ninformation. Meanwhile, language, especially natural language, being one of the\nmost expressive mediums, excels in describing complex structures. However,\nexisting work on incorporating graph problems into the generative language\nmodeling framework remains very limited. Considering the rising prominence of\nLLMs, it becomes essential to explore whether LLMs can also replace GNNs as the\nfoundation model for graphs. In this paper, we propose InstructGLM\n(Instruction-finetuned Graph Language Model) with highly scalable prompts based\non natural language instructions. We use natural language to describe\nmulti-scale geometric structure of the graph and then instruction finetune an\nLLM to perform graph tasks, which enables Generative Graph Learning. Our method\nsurpasses all GNN baselines on ogbn-arxiv, Cora and PubMed datasets,\nunderscoring its effectiveness and sheds light on generative LLMs as new\nfoundation model for graph machine learning. Our code is open-sourced at\nhttps://github.com/agiresearch/InstructGLM.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v5.pdf","comment":"In EACL 2024"},{"id":"http://arxiv.org/abs/2402.03049v2","updated":"2024-02-06T02:51:23Z","published":"2024-02-05T14:33:56Z","title":"EasyInstruct: An Easy-to-use Instruction Processing Framework for Large\n Language Models","summary":" In recent years, instruction tuning has gained increasing attention and\nemerged as a crucial technique to enhance the capabilities of Large Language\nModels (LLMs). To construct high-quality instruction datasets, many instruction\nprocessing approaches have been proposed, aiming to achieve a delicate balance\nbetween data quantity and data quality. Nevertheless, due to inconsistencies\nthat persist among various instruction processing methods, there is no standard\nopen-source instruction processing implementation framework available for the\ncommunity, which hinders practitioners from further developing and advancing.\nTo facilitate instruction processing research and development, we present\nEasyInstruct, an easy-to-use instruction processing framework for LLMs, which\nmodularizes instruction generation, selection, and prompting, while also\nconsidering their combination and interaction. 
EasyInstruct is publicly\nreleased and actively maintained at https://github.com/zjunlp/EasyInstruct,\nalong with a running demo App at\nhttps://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for\nbroader research centered on instruction data.\n","authors":["Yixin Ou","Ningyu Zhang","Honghao Gui","Ziwen Xu","Shuofei Qiao","Yida Xue","Runnan Fang","Kangwei Liu","Lei Li","Zhen Bi","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03049v2.pdf","comment":"Ongoing work; the project website is at\n https://zjunlp.github.io/project/EasyInstruct, code is at\n https://github.com/zjunlp/EasyInstruct, demo is at\n https://huggingface.co/spaces/zjunlp/EasyInstruct"},{"id":"http://arxiv.org/abs/2402.02639v2","updated":"2024-02-06T02:50:48Z","published":"2024-02-04T23:23:51Z","title":"\"It's how you do things that matters\": Attending to Process to Better\n Serve Indigenous Communities with Language Technologies","summary":" Indigenous languages are historically under-served by Natural Language\nProcessing (NLP) technologies, but this is changing for some languages with the\nrecent scaling of large multilingual models and an increased focus by the NLP\ncommunity on endangered languages. This position paper explores ethical\nconsiderations in building NLP technologies for Indigenous languages, based on\nthe premise that such projects should primarily serve Indigenous communities.\nWe report on interviews with 17 researchers working in or with Aboriginal\nand/or Torres Strait Islander communities on language technology projects in\nAustralia. Drawing on insights from the interviews, we recommend practices for\nNLP researchers to increase attention to the process of engagements with\nIndigenous communities, rather than focusing only on decontextualised\nartefacts.\n","authors":["Ned Cooper","Courtney Heldreth","Ben Hutchinson"],"pdf_url":"https://arxiv.org/pdf/2402.02639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03642v1","updated":"2024-02-06T02:39:59Z","published":"2024-02-06T02:39:59Z","title":"Stanceosaurus 2.0: Classifying Stance Towards Russian and Spanish\n Misinformation","summary":" The Stanceosaurus corpus (Zheng et al., 2022) was designed to provide\nhigh-quality, annotated, 5-way stance data extracted from Twitter, suitable for\nanalyzing cross-cultural and cross-lingual misinformation. In the Stanceosaurus\n2.0 iteration, we extend this framework to encompass Russian and Spanish. The\nformer is of current significance due to prevalent misinformation amid\nescalating tensions with the West and the violent incursion into Ukraine. The\nlatter, meanwhile, represents an enormous community that has been largely\noverlooked on major social media platforms. By incorporating an additional\n3,874 Spanish and Russian tweets over 41 misinformation claims, our objective\nis to support research focused on these issues. To demonstrate the value of\nthis data, we employed zero-shot cross-lingual transfer on multilingual BERT,\nyielding results on par with the initial Stanceosaurus study with a macro F1\nscore of 43 for both languages. 
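The evaluation above (train on English stance labels, score zero-shot on Russian and Spanish) reports macro F1, which weights all stance classes equally regardless of frequency. A minimal scoring sketch follows; the five class names and the predictions are placeholders, not the corpus's exact label set:

```python
# Macro F1 over a 5-way stance label set.
from sklearn.metrics import f1_score

LABELS = ["support", "refute", "query", "comment", "unrelated"]
gold = ["support", "refute", "comment", "comment", "query", "unrelated"]
pred = ["support", "comment", "comment", "refute", "query", "unrelated"]
print(f"macro F1: {f1_score(gold, pred, labels=LABELS, average='macro'):.2f}")
```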
This underlines the viability of stance\nclassification as an effective tool for identifying multicultural\nmisinformation.\n","authors":["Anton Lavrouk","Ian Ligon","Tarek Naous","Jonathan Zheng","Alan Ritter","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2402.03642v1.pdf","comment":"WNUT2024"},{"id":"http://arxiv.org/abs/2310.07818v3","updated":"2024-02-06T02:24:53Z","published":"2023-10-11T18:59:48Z","title":"On the Relationship between Sentence Analogy Identification and Sentence\n Structure Encoding in Large Language Models","summary":" The ability of Large Language Models (LLMs) to encode syntactic and semantic\nstructures of language is well examined in NLP. Additionally, analogy\nidentification, in the form of word analogies, has been extensively studied in the\nlast decade of language modeling literature. In this work we specifically look\nat how LLMs' abilities to capture sentence analogies (sentences that convey\nanalogous meaning to each other) vary with LLMs' abilities to encode syntactic\nand semantic structures of sentences. Through our analysis, we find that LLMs'\nability to identify sentence analogies is positively correlated with their\nability to encode syntactic and semantic structures of sentences. Specifically,\nwe find that LLMs which capture syntactic structures better also have\nhigher abilities in identifying sentence analogies.\n","authors":["Thilini Wijesiriwardene","Ruwan Wickramarachchi","Aishwarya Naresh Reganti","Vinija Jain","Aman Chadha","Amit Sheth","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2310.07818v3.pdf","comment":"To appear in Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2312.08846v3","updated":"2024-02-06T02:21:10Z","published":"2023-12-14T12:02:24Z","title":"TiMix: Text-aware Image Mixing for Effective Vision-Language\n Pre-training","summary":" Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances\nmodern Vision-Language Pre-training (VLP) models by aligning visual and\nlinguistic modalities. Due to noise in web-harvested text-image pairs,\nhowever, scaling up training data volume in SMCL presents considerable\nobstacles in terms of computational cost and data inefficiency. To improve data\nefficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates\nmix-based data augmentation techniques into SMCL, yielding significant\nperformance improvements without significantly increasing computational\noverhead. We provide a theoretical analysis of TiMix from a mutual information\n(MI) perspective, showing that mixed data samples for cross-modal contrastive\nlearning implicitly serve as a regularizer for the contrastive loss. The\nexperimental results demonstrate that TiMix exhibits a comparable performance\non downstream tasks, even with a reduced amount of training data and shorter\ntraining time, when benchmarked against existing methods. 
This work empirically\nand theoretically demonstrates the potential of data mixing for data-efficient\nand computationally viable VLP, benefiting broader VLP model adoption in\npractical scenarios.\n","authors":["Chaoya Jiang","Wei ye","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.08846v3.pdf","comment":"Accepted on AAAI2024"},{"id":"http://arxiv.org/abs/2312.11509v4","updated":"2024-02-06T02:14:14Z","published":"2023-12-12T04:58:11Z","title":"Toward a Reinforcement-Learning-Based System for Adjusting Medication to\n Minimize Speech Disfluency","summary":" We propose a reinforcement learning (RL)-based system that would\nautomatically prescribe to a hypothetical patient medication that may help the\npatient with their mental health-related speech disfluency, and adjust the\nmedication and the dosages in response to zero-cost, frequent measurement of the\nfluency of the patient. We demonstrate the components of the system: a module\nthat detects and evaluates speech disfluency on a large dataset we built, and\nan RL algorithm that automatically finds good combinations of medications. To\nsupport the two modules, we collect data on the effect of psychiatric\nmedications for speech disfluency from the literature, and build a plausible\npatient simulation system. We demonstrate that the RL system is, under some\ncircumstances, able to converge to a good medication regime. We collect and\nlabel a dataset of people with possible speech disfluency and demonstrate our\nmethods using that dataset. Our work is a proof of concept: we show that there\nis promise in the idea of using automatic data collection to address speech\ndisfluency.\n","authors":["Pavlos Constas","Vikram Rawal","Matthew Honorio Oliveira","Andreas Constas","Aditya Khan","Kaison Cheung","Najma Sultani","Carrie Chen","Micol Altomare","Michael Akzam","Jiacheng Chen","Vhea He","Lauren Altomare","Heraa Murqi","Asad Khan","Nimit Amikumar Bhanshali","Youssef Rachad","Michael Guerzhoy"],"pdf_url":"https://arxiv.org/pdf/2312.11509v4.pdf","comment":"In Proc. Machine Learning for Cognitive and Mental Health Workshop\n (ML4CMH) at AAAI 2024"},{"id":"http://arxiv.org/abs/2402.03628v1","updated":"2024-02-06T01:48:53Z","published":"2024-02-06T01:48:53Z","title":"Professional Agents -- Evolving Large Language Models into Autonomous\n Experts with Human-Level Competencies","summary":" The advent of large language models (LLMs) such as ChatGPT, PaLM, and GPT-4\nhas catalyzed remarkable advances in natural language processing, demonstrating\nhuman-like language fluency and reasoning capacities. This position paper\nintroduces the concept of Professional Agents (PAgents), an application\nframework harnessing LLM capabilities to create autonomous agents with\ncontrollable, specialized, interactive, and professional-level competencies. We\nposit that PAgents can reshape professional services through continuously\ndeveloped expertise. Our proposed PAgents framework entails a tri-layered\narchitecture for genesis, evolution, and synergy: a base tool layer, a middle\nagent layer, and a top synergy layer. This paper aims to spur discourse on\npromising real-world applications of LLMs. 
We argue that the increasing\nsophistication and integration of PAgents could lead to AI systems exhibiting\nprofessional mastery over complex domains, serving critical needs, and\npotentially achieving artificial general intelligence.\n","authors":["Zhixuan Chu","Yan Wang","Feng Zhu","Lu Yu","Longfei Li","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2402.03628v1.pdf","comment":"14 pages, 1 figure"},{"id":"http://arxiv.org/abs/2402.03627v1","updated":"2024-02-06T01:44:38Z","published":"2024-02-06T01:44:38Z","title":"Partially Recentralization Softmax Loss for Vision-Language Models\n Robustness","summary":" As Large Language Models achieve breakthroughs in natural language processing\n(NLP) tasks, multimodal techniques have become extremely popular. However, it has\nbeen shown that multimodal NLP models are vulnerable to adversarial attacks, where the\noutputs of a model can be dramatically changed by a perturbation to the input.\nWhile several defense techniques have been proposed for both computer vision and\nNLP models, the multimodal robustness of models has not been fully explored.\nIn this paper, we study the adversarial robustness provided by modifying the loss\nfunction of pre-trained multimodal models, restricting the top-K softmax\noutputs. Based on the evaluation and scoring, our experiments show that after\nfine-tuning, the adversarial robustness of pre-trained models can be significantly\nimproved against popular attacks. Further research should study properties such as\noutput diversity, generalization, and the robustness-performance trade-off of\nthis kind of loss function. Our code will be available after this paper is\naccepted.\n","authors":["Hao Wang","Xin Zhang","Jinzhe Jiang","Yaqian Zhao","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2402.03627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01763v2","updated":"2024-02-06T01:25:18Z","published":"2024-01-30T23:35:28Z","title":"When Large Language Models Meet Vector Databases: A Survey","summary":" This survey explores the synergistic potential of Large Language Models\n(LLMs) and Vector Databases (VecDBs), a burgeoning but rapidly evolving\nresearch area. With the proliferation of LLMs comes a host of challenges,\nincluding hallucinations, outdated knowledge, prohibitive commercial\napplication costs, and memory issues. VecDBs emerge as a compelling solution to\nthese issues by offering an efficient means to store, retrieve, and manage the\nhigh-dimensional vector representations intrinsic to LLM operations. Through\nthis nuanced review, we delineate the foundational principles of LLMs and\nVecDBs and critically analyze their integration's impact on enhancing LLM\nfunctionalities. 
This discourse extends into a discussion of speculative\nfuture developments in this domain, aiming to catalyze further research into\noptimizing the confluence of LLMs and VecDBs for advanced data handling and\nknowledge extraction capabilities.\n","authors":["Zhi Jing","Yongye Su","Yikun Han","Bo Yuan","Haiyun Xu","Chunjiang Liu","Kehai Chen","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.01763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08588v2","updated":"2024-02-06T01:21:50Z","published":"2023-11-14T23:18:52Z","title":"CodeScope: An Execution-based Multilingual Multitask Multidimensional\n Benchmark for Evaluating LLMs on Code Understanding and Generation","summary":" Large Language Models (LLMs) have demonstrated remarkable performance on\ncoding-related tasks, particularly on assisting humans in programming and\nfacilitating programming automation. However, existing benchmarks for\nevaluating the code understanding and generation capacities of LLMs suffer from\nsevere limitations. First, most benchmarks are deficient as they focus on a\nnarrow range of popular programming languages and specific tasks, whereas\nreal-world software development scenarios show a dire need to implement systems\nwith multilingual programming environments to satisfy diverse requirements.\nPractical programming practices also strongly expect multi-task settings for\ntesting the coding capabilities of LLMs comprehensively and robustly. Second, most\nbenchmarks also fail to consider the actual executability and the consistency\nof execution results of the generated code. To bridge these gaps between\nexisting benchmarks and expectations from practical applications, we introduce\nCodeScope, an execution-based, multilingual, multi-task, multi-dimensional\nevaluation benchmark for comprehensively gauging LLM capabilities on coding\ntasks. CodeScope covers 43 programming languages and 8 coding tasks. It\nevaluates the coding performance of LLMs from three dimensions (perspectives):\ndifficulty, efficiency, and length. To facilitate execution-based evaluations\nof code generation, we develop MultiCodeEngine, an automated code execution\nengine that supports 14 programming languages. Finally, we systematically\nevaluate and analyze 8 mainstream LLMs on CodeScope tasks and demonstrate the\nsuperior breadth and challenges of CodeScope for evaluating LLMs on code\nunderstanding and generation tasks compared to other benchmarks. The CodeScope\nbenchmark and datasets are publicly available at\nhttps://github.com/WeixiangYAN/CodeScope.\n","authors":["Weixiang Yan","Haitian Liu","Yunkun Wang","Yunzhe Li","Qian Chen","Wen Wang","Tingyu Lin","Weishan Zhao","Li Zhu","Shuiguang Deng","Hari Sundaram"],"pdf_url":"https://arxiv.org/pdf/2311.08588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03620v1","updated":"2024-02-06T01:13:53Z","published":"2024-02-06T01:13:53Z","title":"Self-Discover: Large Language Models Self-Compose Reasoning Structures","summary":" We introduce SELF-DISCOVER, a general framework for LLMs to self-discover the\ntask-intrinsic reasoning structures to tackle complex reasoning problems that\nare challenging for typical prompting methods. Core to the framework is a\nself-discovery process where LLMs select multiple atomic reasoning modules such\nas critical thinking and step-by-step thinking, and compose them into an\nexplicit reasoning structure for LLMs to follow during decoding. 
SELF-DISCOVER\nsubstantially improves GPT-4 and PaLM 2's performance on challenging reasoning\nbenchmarks such as BigBench-Hard, grounded agent reasoning, and MATH, by as\nmuch as 32% compared to Chain of Thought (CoT). Furthermore, SELF-DISCOVER\noutperforms inference-intensive methods such as CoT-Self-Consistency by more\nthan 20%, while requiring 10-40x less inference compute. Finally, we show that\nthe self-discovered reasoning structures are universally applicable across\nmodel families: from PaLM 2-L to GPT-4, and from GPT-4 to Llama2, and share\ncommonalities with human reasoning patterns.\n","authors":["Pei Zhou","Jay Pujara","Xiang Ren","Xinyun Chen","Heng-Tze Cheng","Quoc V. Le","Ed H. Chi","Denny Zhou","Swaroop Mishra","Huaixiu Steven Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.03620v1.pdf","comment":"17 pages, 11 figures, 5 tables"},{"id":"http://arxiv.org/abs/2402.03618v1","updated":"2024-02-06T01:07:56Z","published":"2024-02-06T01:07:56Z","title":"Comparing Abstraction in Humans and Large Language Models Using\n Multimodal Serial Reproduction","summary":" Humans extract useful abstractions of the world from noisy sensory data.\nSerial reproduction allows us to study how people construe the world through a\nparadigm similar to the game of telephone, where one person observes a stimulus\nand reproduces it for the next to form a chain of reproductions. Past serial\nreproduction experiments typically employ a single sensory modality, but humans\noften communicate abstractions of the world to each other through language. To\ninvestigate the effect of language on the formation of abstractions, we implement\na novel multimodal serial reproduction framework by asking people who receive a\nvisual stimulus to reproduce it in a linguistic format, and vice versa. We ran\nunimodal and multimodal chains with both humans and GPT-4 and found that adding\nlanguage as a modality has a larger effect on human reproductions than GPT-4's.\nThis suggests that human visual and linguistic representations are more dissociable\nthan those of GPT-4.\n","authors":["Sreejan Kumar","Raja Marjieh","Byron Zhang","Declan Campbell","Michael Y. Hu","Umang Bhatt","Brenden Lake","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2402.03618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03616v1","updated":"2024-02-06T01:05:14Z","published":"2024-02-06T01:05:14Z","title":"Leveraging Large Language Models for Hybrid Workplace Decision Support","summary":" Large Language Models (LLMs) hold the potential to perform a variety of text\nprocessing tasks and provide textual explanations for proposed actions or\ndecisions. In the era of hybrid work, LLMs can provide intelligent decision\nsupport for workers who are designing their hybrid work plans. In particular,\nthey can offer suggestions and explanations to workers balancing numerous\ndecision factors, thereby enhancing their work experience. In this paper, we\npresent a decision support model for workspaces in hybrid work environments,\nleveraging the reasoning skill of LLMs. We first examine the LLM's capability of\nmaking suitable workspace suggestions. We find that its reasoning extends\nbeyond the guidelines in the prompt and the LLM can manage the trade-off among\nthe available resources in the workspaces. We conduct an extensive user study\nto understand workers' decision process for workspace choices and evaluate the\neffectiveness of the system. We observe that a worker's decision could be\ninfluenced by the LLM's suggestions and explanations. 
The participants in our\nstudy find the system to be convenient, regardless of whether reasons are\nprovided or not. Our results show that employees can benefit from the\nLLM-empowered system for their workspace selection in a hybrid workplace.\n","authors":["Yujin Kim","Chin-Chia Hsu"],"pdf_url":"https://arxiv.org/pdf/2402.03616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03610v1","updated":"2024-02-06T00:53:27Z","published":"2024-02-06T00:53:27Z","title":"RAP: Retrieval-Augmented Planning with Contextual Memory for Multimodal\n LLM Agents","summary":" Owing to recent advancements, Large Language Models (LLMs) can now be\ndeployed as agents for increasingly complex decision-making applications in\nareas including robotics, gaming, and API integration. However, reflecting past\nexperiences in current decision-making processes, an innate human behavior,\ncontinues to pose significant challenges. Addressing this, we propose the\nRetrieval-Augmented Planning (RAP) framework, designed to dynamically leverage\npast experiences corresponding to the current situation and context, thereby\nenhancing agents' planning capabilities. RAP distinguishes itself by being\nversatile: it excels in both text-only and multimodal environments, making it\nsuitable for a wide range of tasks. Empirical evaluations demonstrate RAP's\neffectiveness, where it achieves SOTA performance in textual scenarios and\nnotably enhances multimodal LLM agents' performance for embodied tasks. These\nresults highlight RAP's potential in advancing the functionality and\napplicability of LLM agents in complex, real-world applications.\n","authors":["Tomoyuki Kagaya","Thong Jing Yuan","Yuxuan Lou","Jayashree Karlekar","Sugiri Pranata","Akira Kinose","Koki Oguri","Felix Wick","Yang You"],"pdf_url":"https://arxiv.org/pdf/2402.03610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03607v1","updated":"2024-02-06T00:51:27Z","published":"2024-02-06T00:51:27Z","title":"Improving Contextual Congruence Across Modalities for Effective\n Multimodal Marketing using Knowledge-infused Learning","summary":" The prevalence of smart devices with the ability to capture moments in\nmultiple modalities has enabled users to experience multimodal information\nonline. However, Large Language Models (LLMs) and Large Vision Models (LVMs) are still\nlimited in capturing holistic meaning with cross-modal semantic relationships.\nWithout explicit, common sense knowledge (e.g., as a knowledge graph), Visual\nLanguage Models (VLMs) only learn implicit representations by capturing\nhigh-level patterns in vast corpora, missing essential contextual cross-modal\ncues. In this work, we design a framework to couple explicit commonsense\nknowledge in the form of knowledge graphs with large VLMs to improve the\nperformance of a downstream task, predicting the effectiveness of multi-modal\nmarketing campaigns. While the marketing application provides a compelling\nmetric for assessing our methods, our approach enables the early detection of\nlikely persuasive multi-modal campaigns and the assessment and augmentation of\nmarketing theory.\n","authors":["Trilok Padhi","Ugur Kursuncu","Yaman Kumar","Valerie L. Shalin","Lane Peterson Fronczek"],"pdf_url":"https://arxiv.org/pdf/2402.03607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03597v1","updated":"2024-02-06T00:14:53Z","published":"2024-02-06T00:14:53Z","title":"Identifying Reasons for Contraceptive Switching from Real-World Data\n Using Large Language Models","summary":" Prescription contraceptives play a critical role in supporting women's\nreproductive health. With nearly 50 million women in the United States using\ncontraceptives, understanding the factors that drive contraceptive selection\nand switching is of significant interest. However, many factors related to\nmedication switching are often only captured in unstructured clinical notes and\ncan be difficult to extract. Here, we evaluate the zero-shot abilities of a\nrecently developed large language model, GPT-4 (via HIPAA-compliant Microsoft\nAzure API), to identify reasons for switching between classes of contraceptives\nfrom the UCSF Information Commons clinical notes dataset. We demonstrate that\nGPT-4 can accurately extract reasons for contraceptive switching, outperforming\nbaseline BERT-based models with microF1 scores of 0.849 and 0.881 for\ncontraceptive start and stop extraction, respectively. Human evaluation of\nGPT-4-extracted reasons for switching showed 91.4% accuracy, with minimal\nhallucinations. Using extracted reasons, we identified patient preference,\nadverse events, and insurance as key reasons for switching using unsupervised\ntopic modeling approaches. Notably, we also showed using our approach that\n\"weight gain/mood change\" and \"insurance coverage\" are disproportionately found\nas reasons for contraceptive switching in specific demographic populations. Our\ncode and supplemental data are available at\nhttps://github.com/BMiao10/contraceptive-switching.\n","authors":["Brenda Y. Miao","Christopher YK Williams","Ebenezer Chinedu-Eneh","Travis Zack","Emily Alsentzer","Atul J. Butte","Irene Y. Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04477v1","updated":"2024-02-06T23:52:58Z","published":"2024-02-06T23:52:58Z","title":"Detecting Mode Collapse in Language Models via Narration","summary":" No two authors write alike. Personal flourishes invoked in written\nnarratives, from lexicon to rhetorical devices, imply a particular author--what\nliterary theorists label the implied or virtual author, distinct from the real\nauthor or narrator of a text. Early large language models trained on unfiltered\ntraining sets drawn from a variety of discordant sources yielded incoherent\npersonalities, problematic for conversational tasks but proving useful for\nsampling literature from multiple perspectives. Successes in alignment research\nin recent years have allowed researchers to impose subjectively consistent\npersonae on language models via instruction tuning and reinforcement learning\nfrom human feedback (RLHF), but whether aligned models retain the ability to\nmodel an arbitrary virtual author has received little scrutiny. By studying\n4,374 stories sampled from three OpenAI language models, we show successive\nversions of GPT-3 suffer from increasing degrees of \"mode collapse\" whereby\noverfitting the model during alignment constrains it from generalizing over\nauthorship: models suffering from mode collapse become unable to assume a\nmultiplicity of perspectives. 
Our method and results are significant for\nresearchers seeking to employ language models in sociological simulations.\n","authors":["Sil Hamilton"],"pdf_url":"https://arxiv.org/pdf/2402.04477v1.pdf","comment":"To appear in the proceedings of the first Workshop on the Scaling\n Behavior of Large Language Models (EACL 2024)"},{"id":"http://arxiv.org/abs/2402.04476v1","updated":"2024-02-06T23:52:10Z","published":"2024-02-06T23:52:10Z","title":"Dual-View Visual Contextualization for Web Navigation","summary":" Automatic web navigation aims to build a web agent that can follow language\ninstructions to execute complex and diverse tasks on real-world websites.\nExisting work primarily takes HTML documents as input, which define the\ncontents and action spaces (i.e., actionable elements and operations) of\nwebpages. Nevertheless, HTML documents may not provide a clear task-related\ncontext for each element, making it hard to select the right (sequence of)\nactions. In this paper, we propose to contextualize HTML elements through their\n\"dual views\" in webpage screenshots: each HTML element has its corresponding\nbounding box and visual content in the screenshot. We build upon the insight --\nweb developers tend to arrange task-related elements nearby on webpages to\nenhance user experiences -- and propose to contextualize each element with its\nneighbor elements, using both textual and visual features. The resulting\nrepresentations of HTML elements are more informative for the agent to take\naction. We validate our method on the recently released Mind2Web dataset, which\nfeatures diverse navigation domains and tasks on real-world websites. Our\nmethod consistently outperforms the baseline in all the scenarios, including\ncross-task, cross-website, and cross-domain ones.\n","authors":["Jihyung Kil","Chan Hee Song","Boyuan Zheng","Xiang Deng","Yu Su","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2402.04476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08809v3","updated":"2024-02-06T22:30:07Z","published":"2023-05-15T17:15:40Z","title":"Interpretability at Scale: Identifying Causal Mechanisms in Alpaca","summary":" Obtaining human-interpretable explanations of large, general-purpose language\nmodels is an urgent goal for AI safety. However, it is just as important that\nour interpretability methods are faithful to the causal dynamics underlying\nmodel behavior and able to robustly generalize to unseen inputs. Distributed\nAlignment Search (DAS) is a powerful gradient descent method grounded in a\ntheory of causal abstraction that has uncovered perfect alignments between\ninterpretable symbolic algorithms and small deep learning models fine-tuned for\nspecific tasks. In the present paper, we scale DAS significantly by replacing\nthe remaining brute-force search steps with learned parameters -- an approach\nwe call Boundless DAS. This enables us to efficiently search for interpretable\ncausal structure in large language models while they follow instructions. We\napply Boundless DAS to the Alpaca model (7B parameters), which, off the shelf,\nsolves a simple numerical reasoning problem. With Boundless DAS, we discover\nthat Alpaca does this by implementing a causal model with two interpretable\nboolean variables. Furthermore, we find that the alignment of neural\nrepresentations with these variables is robust to changes in inputs and\ninstructions. 
These findings mark a first step toward faithfully understanding\nthe inner workings of our ever-growing and most widely deployed language\nmodels. Our tool is extensible to larger LLMs and is released publicly at\n`https://github.com/stanfordnlp/pyvene`.\n","authors":["Zhengxuan Wu","Atticus Geiger","Thomas Icard","Christopher Potts","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2305.08809v3.pdf","comment":"NeurIPS 2023 with Author Corrections"},{"id":"http://arxiv.org/abs/2402.04442v1","updated":"2024-02-06T22:24:56Z","published":"2024-02-06T22:24:56Z","title":"Evaluating Embeddings for One-Shot Classification of Doctor-AI\n Consultations","summary":" Effective communication between healthcare providers and patients is crucial\nto providing high-quality patient care. In this work, we investigate how\ndoctor-written and AI-generated texts in healthcare consultations can be\nclassified using state-of-the-art embeddings and one-shot classification\nsystems. By analyzing embeddings such as bag-of-words, character n-grams,\nWord2Vec, GloVe, fastText, and GPT2 embeddings, we examine how well our\none-shot classification systems capture semantic information within medical\nconsultations. Results show that the embeddings are capable of capturing\nsemantic features from text in a reliable and adaptable manner. Overall,\nWord2Vec, GloVe, and character n-gram embeddings performed well, indicating\ntheir suitability for models targeted to this task. The GPT2 embedding also shows\nnotable performance, indicating its suitability for models tailored to this\ntask as well. Our machine learning architectures significantly improved the\nquality of health conversations when training data are scarce, improving\ncommunication between patients and healthcare providers.\n","authors":["Olumide Ebenezer Ojo","Olaronke Oluwayemisi Adebanji","Alexander Gelbukh","Hiram Calvo","Anna Feldman"],"pdf_url":"https://arxiv.org/pdf/2402.04442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04437v1","updated":"2024-02-06T22:15:09Z","published":"2024-02-06T22:15:09Z","title":"Structured Entity Extraction Using Large Language Models","summary":" Recent advances in machine learning have significantly impacted the field of\ninformation extraction, with Large Language Models (LLMs) playing a pivotal\nrole in extracting structured information from unstructured text. This paper\nexplores the challenges and limitations of current methodologies in structured\nentity extraction and introduces a novel approach to address these issues. We\ncontribute to the field by first introducing and formalizing the task of\nStructured Entity Extraction (SEE), followed by proposing the Approximate Entity\nSet OverlaP (AESOP) metric, designed to appropriately assess model performance\non this task. Later, we propose a new model that harnesses the power of LLMs\nfor enhanced effectiveness and efficiency by decomposing the entire\nextraction task into multiple stages. 
Quantitative evaluation and human\nside-by-side evaluation confirm that our model outperforms baselines, offering\npromising directions for future advancements in structured entity extraction.\n","authors":["Haolun Wu","Ye Yuan","Liana Mikaelyan","Alexander Meulemans","Xue Liu","James Hensman","Bhaskar Mitra"],"pdf_url":"https://arxiv.org/pdf/2402.04437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08030v3","updated":"2024-02-06T22:12:51Z","published":"2023-09-14T21:07:53Z","title":"AV2Wav: Diffusion-Based Re-synthesis from Continuous Self-supervised\n Features for Audio-Visual Speech Enhancement","summary":" Speech enhancement systems are typically trained using pairs of clean and\nnoisy speech. In audio-visual speech enhancement (AVSE), there is not as much\nground-truth clean data available; most audio-visual datasets are collected in\nreal-world environments with background noise and reverberation, hampering the\ndevelopment of AVSE. In this work, we introduce AV2Wav, a resynthesis-based\naudio-visual speech enhancement approach that can generate clean speech despite\nthe challenges of real-world training data. We obtain a subset of nearly clean\nspeech from an audio-visual corpus using a neural quality estimator, and then\ntrain a diffusion model on this subset to generate waveforms conditioned on\ncontinuous speech representations from AV-HuBERT with noise-robust training. We\nuse continuous rather than discrete representations to retain prosody and\nspeaker information. With this vocoding task alone, the model can perform\nspeech enhancement better than a masking-based baseline. We further fine-tune\nthe diffusion model on clean/noisy utterance pairs to improve the performance.\nOur approach outperforms a masking-based baseline in terms of both automatic\nmetrics and a human listening test and is close in quality to the target speech\nin the listening test. Audio samples can be found at\nhttps://home.ttic.edu/~jcchou/demo/avse/avse_demo.html.\n","authors":["Ju-Chieh Chou","Chung-Ming Chien","Karen Livescu"],"pdf_url":"https://arxiv.org/pdf/2309.08030v3.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2402.04411v1","updated":"2024-02-06T21:14:45Z","published":"2024-02-06T21:14:45Z","title":"Chatbot Meets Pipeline: Augment Large Language Model with Definite\n Finite Automaton","summary":" This paper introduces the Definite Finite Automaton augmented large language\nmodel (DFA-LLM), a novel framework designed to enhance the capabilities of\nconversational agents using large language models (LLMs). Traditional LLMs face\nchallenges in generating regulated and compliant responses in special scenarios\nwith predetermined response guidelines, like emotional support and customer\nservice. Our framework addresses these challenges by embedding a Definite\nFinite Automaton (DFA), learned from training dialogues, within the LLM. This\nstructured approach enables the LLM to adhere to a deterministic response\npathway, guided by the DFA. The advantages of DFA-LLM include an interpretable\nstructure through human-readable DFA, context-aware retrieval for responses in\nconversations, and plug-and-play compatibility with existing LLMs. 
Extensive\nbenchmarks validate DFA-LLM's effectiveness, indicating its potential as a\nvaluable contribution to conversational agents.\n","authors":["Yiyou Sun","Junjie Hu","Wei Cheng","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04411v1.pdf","comment":"21 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.04401v1","updated":"2024-02-06T21:03:52Z","published":"2024-02-06T21:03:52Z","title":"Democratizing Large Language Models via Personalized Parameter-Efficient\n Fine-tuning","summary":" Personalization in large language models (LLMs) is increasingly important,\naiming to align an LLM's interactions, content, and recommendations with\nindividual user preferences. Recent advances in LLM personalization have\nspotlighted effective prompt design by enriching user queries with\nnon-parametric knowledge through behavior history retrieval and textual\nprofiles. However, these approaches were limited due to a lack of model\nownership, resulting in constrained customization and privacy issues. Moreover,\nthey often failed to accurately capture user behavior patterns, especially in\ncases where user data were complex and dynamic. To address these shortcomings,\nwe introduce One PEFT Per User (OPPU), which employs personalized\nparameter-efficient fine-tuning (PEFT) modules to store user-specific behavior\npatterns and preferences. By plugging in users' personal PEFT parameters, they\ncan own and use their LLMs personally. OPPU integrates parametric user\nknowledge in the personal PEFT parameters with the non-parametric knowledge\nacquired through retrieval and profile. This integration adapts individual LLMs\nto user behavior shifts. Experimental results demonstrate that OPPU\nsignificantly outperforms existing prompt-based methods across seven diverse\ntasks in the LaMP benchmark. Further in-depth studies reveal OPPU's enhanced\ncapabilities in handling user behavior shifts, modeling users at different\nactive levels, maintaining robustness across various user history formats, and\ndisplaying versatility with different PEFT methods.\n","authors":["Zhaoxuan Tan","Qingkai Zeng","Yijun Tian","Zheyuan Liu","Bing Yin","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2402.04401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07930v3","updated":"2024-02-06T21:01:28Z","published":"2023-12-13T06:57:00Z","title":"Towards Optimal Statistical Watermarking","summary":" We study statistical watermarking by formulating it as a hypothesis testing\nproblem, a general framework which subsumes all previous statistical\nwatermarking methods. Key to our formulation is a coupling of the output tokens\nand the rejection region, realized by pseudo-random generators in practice,\nthat allows non-trivial trade-offs between the Type I error and Type II error.\nWe characterize the Uniformly Most Powerful (UMP) watermark in the general\nhypothesis testing setting and the minimax Type II error in the model-agnostic\nsetting. In the common scenario where the output is a sequence of $n$ tokens,\nwe establish nearly matching upper and lower bounds on the number of i.i.d.\ntokens required to guarantee small Type I and Type II errors. Our rate of\n$\\Theta(h^{-1} \\log (1/h))$ with respect to the average entropy per token $h$\nhighlights potential for improvement from the rate of $h^{-2}$ in previous\nworks. 
Moreover, we formulate the robust watermarking problem where the user is\nallowed to perform a class of perturbations on the generated texts, and\ncharacterize the optimal Type II error of robust UMP tests via a linear\nprogramming problem. To the best of our knowledge, this is the first systematic\nstatistical treatment on the watermarking problem with near-optimal rates in\nthe i.i.d. setting, which might be of interest for future works.\n","authors":["Baihe Huang","Hanlin Zhu","Banghua Zhu","Kannan Ramchandran","Michael I. Jordan","Jason D. Lee","Jiantao Jiao"],"pdf_url":"https://arxiv.org/pdf/2312.07930v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04396v1","updated":"2024-02-06T20:52:12Z","published":"2024-02-06T20:52:12Z","title":"QuIP#: Even Better LLM Quantization with Hadamard Incoherence and\n Lattice Codebooks","summary":" Post-training quantization (PTQ) reduces the memory footprint of LLMs by\nquantizing their weights to low-precision. In this work, we introduce QuIP#, a\nweight-only PTQ method that achieves state-of-the-art results in extreme\ncompression regimes ($\\le$ 4 bits per weight) using three novel techniques.\nFirst, QuIP# improves the incoherence processing from QuIP by using the\nrandomized Hadamard transform, which is faster and has better theoretical\nproperties. Second, QuIP# uses vector quantization techniques to take advantage\nof the ball-shaped sub-Gaussian distribution that incoherent weights possess:\nspecifically, we introduce a set of hardware-efficient codebooks based on the\nhighly symmetric $E_8$ lattice, which achieves the optimal 8-dimension unit\nball packing. Third, QuIP# uses fine-tuning to improve fidelity to the original\nmodel. Our experiments show that QuIP# outperforms existing PTQ methods,\nenables new behaviors in PTQ scaling, and supports fast inference.\n","authors":["Albert Tseng","Jerry Chee","Qingyao Sun","Volodymyr Kuleshov","Christopher De Sa"],"pdf_url":"https://arxiv.org/pdf/2402.04396v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2402.02619v2","updated":"2024-02-06T20:37:36Z","published":"2024-02-04T21:33:18Z","title":"Increasing Trust in Language Models through the Reuse of Verified\n Circuits","summary":" Language Models (LMs) are increasingly used for a wide range of prediction\ntasks, but their training can often neglect rare edge cases, reducing their\nreliability. Here, we define a stringent standard of trustworthiness whereby\nthe task algorithm and circuit implementation must be verified, accounting for\nedge cases, with no known failure modes. We show that a transformer model can\nbe trained to meet this standard if built using mathematically and logically\nspecified frameworks. In this paper, we fully verify a model for n-digit\ninteger addition. To exhibit the reusability of verified modules, we insert the\ntrained integer addition model into an untrained model and train the combined\nmodel to perform both addition and subtraction. We find extensive reuse of the\naddition circuits for both tasks, easing verification of the more complex\nsubtractor model. We discuss how inserting verified task modules into LMs can\nleverage model reuse to improve verifiability and trustworthiness of language\nmodels built using them. 
The reuse of verified circuits reduces the effort to\nverify more complex composite models, which we believe to be a significant step\ntowards the safety of language models.\n","authors":["Philip Quirke","Clement Neo","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2402.02619v2.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.11123v2","updated":"2024-02-06T20:32:41Z","published":"2023-11-18T17:11:12Z","title":"(Why) Is My Prompt Getting Worse? Rethinking Regression Testing for\n Evolving LLM APIs","summary":" Large Language Models (LLMs) are increasingly integrated into software\napplications. Downstream application developers often access LLMs through APIs\nprovided as a service. However, LLM APIs are often updated silently and\nscheduled to be deprecated, forcing users to continuously adapt to evolving\nmodels. This can cause performance regression and affect prompt design choices,\nas evidenced by our case study on toxicity detection. Based on our case study,\nwe emphasize the need for and re-examine the concept of regression testing for\nevolving LLM APIs. We argue that regression testing LLMs requires fundamental\nchanges to traditional testing approaches, due to different correctness\nnotions, prompting brittleness, and non-determinism in LLM APIs.\n","authors":["Wanqin Ma","Chenyang Yang","Christian Kästner"],"pdf_url":"https://arxiv.org/pdf/2311.11123v2.pdf","comment":"conference version"},{"id":"http://arxiv.org/abs/2402.04373v1","updated":"2024-02-06T20:18:32Z","published":"2024-02-06T20:18:32Z","title":"The World of Generative AI: Deepfakes and Large Language Models","summary":" We live in the era of Generative Artificial Intelligence (GenAI). Deepfakes\nand Large Language Models (LLMs) are two examples of GenAI. Deepfakes, in\nparticular, pose an alarming threat to society as they are capable of spreading\nmisinformation and distorting the truth. LLMs are powerful language models that\ngenerate general-purpose language. However, due to their generative nature, they can\nalso pose a risk to people if used with ill intent. The ethical use of these\ntechnologies is a big concern. This short article examines the\ninterrelationship between the two.\n","authors":["Alakananda Mitra","Saraju P. Mohanty","Elias Kougianos"],"pdf_url":"https://arxiv.org/pdf/2402.04373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04347v1","updated":"2024-02-06T19:31:26Z","published":"2024-02-06T19:31:26Z","title":"The Hedgehog & the Porcupine: Expressive Linear Attentions with Softmax\n Mimicry","summary":" Linear attentions have shown potential for improving Transformer efficiency,\nreducing attention's quadratic complexity to linear in sequence length. This\nholds exciting promise for (1) training linear Transformers from scratch, (2)\n\"finetuned-conversion\" of task-specific Transformers into linear versions that\nrecover task performance, and (3) \"pretrained-conversion\" of Transformers such\nas large language models into linear versions finetunable on downstream tasks.\nHowever, linear attentions often underperform standard softmax attention in\nquality. To close this performance gap, we find prior linear attentions lack\nkey properties of softmax attention tied to good performance: low-entropy (or\n\"spiky\") weights and dot-product monotonicity. We further observe surprisingly\nsimple feature maps that retain these properties and match softmax performance,\nbut are inefficient to compute in linear attention. 
We thus propose Hedgehog, a\nlearnable linear attention that retains the spiky and monotonic properties of\nsoftmax attention while maintaining linear complexity. Hedgehog uses simple\ntrainable MLPs to produce attention weights mimicking softmax attention.\nExperiments show Hedgehog recovers over 99% of standard Transformer quality in\ntrain-from-scratch and finetuned-conversion settings, outperforming prior\nlinear attentions by up to 6 perplexity points on WikiText-103 with causal GPTs,\nand up to 8.7 GLUE score points on finetuned bidirectional BERTs. Hedgehog also\nenables pretrained-conversion. Converting a pretrained GPT-2 into a linear\nattention variant achieves state-of-the-art 16.7 perplexity on WikiText-103 for\n125M subquadratic decoder models. We finally turn a pretrained Llama-2 7B into\na viable linear attention Llama. With low-rank adaptation, Hedgehog-Llama2 7B\nachieves 28.1 higher ROUGE-1 points than the base standard attention model,\nwhere prior linear attentions lead to 16.5 point drops.\n","authors":["Michael Zhang","Kush Bhatia","Hermann Kumbong","Christopher Ré"],"pdf_url":"https://arxiv.org/pdf/2402.04347v1.pdf","comment":"30 pages, 20 figures, 15 tables, ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04335v1","updated":"2024-02-06T19:18:56Z","published":"2024-02-06T19:18:56Z","title":"LegalLens: Leveraging LLMs for Legal Violation Identification in\n Unstructured Text","summary":" In this study, we focus on two main tasks, the first for detecting legal\nviolations within unstructured textual data, and the second for associating\nthese violations with potentially affected individuals. We constructed two\ndatasets using Large Language Models (LLMs) which were subsequently validated\nby domain expert annotators. Both tasks were designed specifically for the\ncontext of class-action cases. The experimental design incorporated fine-tuning\nmodels from the BERT family and open-source LLMs, and conducting few-shot\nexperiments using closed-source LLMs. Our results, with an F1-score of 62.69\\%\n(violation identification) and 81.02\\% (associating victims), show that our\ndatasets and setups can be used for both tasks. Finally, we publicly release\nthe datasets and the code used for the experiments in order to advance further\nresearch in the area of legal natural language processing (NLP).\n","authors":["Dor Bernsohn","Gil Semo","Yaron Vazana","Gila Hayat","Ben Hagag","Joel Niklaus","Rohit Saha","Kyryl Truskovskyi"],"pdf_url":"https://arxiv.org/pdf/2402.04335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04333v1","updated":"2024-02-06T19:18:04Z","published":"2024-02-06T19:18:04Z","title":"LESS: Selecting Influential Data for Targeted Instruction Tuning","summary":" Instruction tuning has unlocked powerful capabilities in large language\nmodels (LLMs), effectively using combined datasets to develop general-purpose\nchatbots. However, real-world applications often require a specialized suite of\nskills (e.g., reasoning). The challenge lies in identifying the most relevant\ndata from these extensive datasets to effectively develop specific\ncapabilities, a setting we frame as targeted instruction tuning. We propose\nLESS, an optimizer-aware and practically efficient algorithm to effectively\nestimate data influences and perform Low-rank gradiEnt Similarity Search for\ninstruction data selection. Crucially, LESS adapts existing influence\nformulations to work with the Adam optimizer and variable-length instruction\ndata. 
LESS first constructs a highly reusable and transferable gradient\ndatastore with low-dimensional gradient features and then selects examples\nbased on their similarity to few-shot examples embodying a specific capability.\nExperiments show that training on a LESS-selected 5% of the data can often\noutperform training on the full dataset across diverse downstream tasks.\nFurthermore, the selected data is highly transferable: smaller models can be\nleveraged to select useful data for larger models and models from different\nfamilies. Our qualitative analysis shows that our method goes beyond surface\nform cues to identify data that exemplifies the necessary reasoning skills for\nthe intended downstream application.\n","authors":["Mengzhou Xia","Sadhika Malladi","Suchin Gururangan","Sanjeev Arora","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04333v1.pdf","comment":"Code and data are available at https://github.com/princeton-nlp/LESS"},{"id":"http://arxiv.org/abs/2402.04315v1","updated":"2024-02-06T19:00:40Z","published":"2024-02-06T19:00:40Z","title":"Training Language Models to Generate Text with Citations via\n Fine-grained Rewards","summary":" While recent Large Language Models (LLMs) have proven useful in answering\nuser queries, they are prone to hallucination, and their responses often lack\ncredibility due to missing references to reliable sources. An intuitive\nsolution to these issues would be to include in-text citations referring to\nexternal documents as evidence. While previous works have directly prompted\nLLMs to generate in-text citations, their performance is far from\nsatisfactory, especially when it comes to smaller LLMs. In this work, we\npropose an effective training framework using fine-grained rewards to teach\nLLMs to generate highly supportive and relevant citations, while ensuring the\ncorrectness of their responses. We also conduct a systematic analysis of\napplying these fine-grained rewards to common LLM training strategies,\ndemonstrating its advantage over conventional practices. We conduct extensive\nexperiments on Question Answering (QA) datasets taken from the ALCE benchmark\nand validate the model's generalizability using EXPERTQA. On LLaMA-2-7B, the\nincorporation of fine-grained rewards achieves the best performance among the\nbaselines, even surpassing that of GPT-3.5-turbo.\n","authors":["Chengyu Huang","Zeqiu Wu","Yushi Hu","Wenya Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04291v1","updated":"2024-02-06T09:26:34Z","published":"2024-02-06T09:26:34Z","title":"BiLLM: Pushing the Limit of Post-Training Quantization for LLMs","summary":" Pretrained large language models (LLMs) exhibit exceptional general language\nprocessing capabilities but come with significant demands on memory and\ncomputational resources. As a powerful compression technology, binarization can\nreduce model weights to a mere 1 bit, dramatically lowering the expensive\ncomputation and memory requirements. However, existing quantization techniques\nfall short of maintaining LLM performance under ultra-low bit-widths. In\nresponse to this challenge, we present BiLLM, a groundbreaking 1-bit\npost-training quantization scheme tailored for pretrained LLMs. Based on the\nweight distribution of LLMs, BiLLM first identifies and structurally selects\nsalient weights, and minimizes the compression loss through an effective binary\nresidual approximation strategy. 
Moreover, considering the bell-shaped\ndistribution of the non-salient weights, we propose an optimal splitting search\nto group and binarize them accurately. BiLLM, achieving for the first time\nhigh-accuracy inference (e.g., 8.41 perplexity on LLaMA2-70B) with only 1.08-bit\nweights across various LLM families and evaluation metrics, outperforms SOTA\nquantization methods for LLMs by significant margins. Moreover, BiLLM enables the\nbinarization of an LLM with 7 billion weights within 0.5 hours on a\nsingle GPU, demonstrating satisfactory time efficiency.\n","authors":["Wei Huang","Yangdong Liu","Haotong Qin","Ying Li","Shiming Zhang","Xianglong Liu","Michele Magno","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2402.04291v1.pdf","comment":"19 pages"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.04252v1","updated":"2024-02-06T18:59:48Z","published":"2024-02-06T18:59:48Z","title":"EVA-CLIP-18B: Scaling CLIP to 18 Billion Parameters","summary":" Scaling up contrastive language-image pretraining (CLIP) is critical for\nempowering both vision and multimodal models. We present EVA-CLIP-18B, the\nlargest and most powerful open-source CLIP model to date, with 18-billion\nparameters. With only 6-billion training samples seen, EVA-CLIP-18B achieves an\nexceptional 80.7% zero-shot top-1 accuracy averaged across 27 widely recognized\nimage classification benchmarks, outperforming its forerunner EVA-CLIP\n(5-billion parameters) and other open-source CLIP models by a large margin.\nRemarkably, we observe a consistent performance improvement with the model size\nscaling of EVA-CLIP, despite maintaining a constant training dataset of\n2-billion image-text pairs from LAION-2B and COYO-700M. This dataset is openly\navailable and much smaller than the in-house datasets (e.g., DFN-5B, WebLI-10B)\nemployed in other state-of-the-art CLIP models. EVA-CLIP-18B demonstrates the\npotential of EVA-style weak-to-strong visual model scaling. With our model\nweights made publicly available, we hope to facilitate future research in\nvision and multimodal foundation models.\n","authors":["Quan Sun","Jinsheng Wang","Qiying Yu","Yufeng Cui","Fan Zhang","Xiaosong Zhang","Xinlong Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04249v1","updated":"2024-02-06T18:59:08Z","published":"2024-02-06T18:59:08Z","title":"HarmBench: A Standardized Evaluation Framework for Automated Red Teaming\n and Robust Refusal","summary":" Automated red teaming holds substantial promise for uncovering and mitigating\nthe risks associated with the malicious use of large language models (LLMs),\nyet the field lacks a standardized evaluation framework to rigorously assess\nnew methods. To address this issue, we introduce HarmBench, a standardized\nevaluation framework for automated red teaming. We identify several desirable\nproperties previously unaccounted for in red teaming evaluations and\nsystematically design HarmBench to meet these criteria. Using HarmBench, we\nconduct a large-scale comparison of 18 red teaming methods and 33 target LLMs\nand defenses, yielding novel insights. We also introduce a highly efficient\nadversarial training method that greatly enhances LLM robustness across a wide\nrange of attacks, demonstrating how HarmBench enables codevelopment of attacks\nand defenses. 
We open source HarmBench at\nhttps://github.com/centerforaisafety/HarmBench.\n","authors":["Mantas Mazeika","Long Phan","Xuwang Yin","Andy Zou","Zifan Wang","Norman Mu","Elham Sakhaee","Nathaniel Li","Steven Basart","Bo Li","David Forsyth","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2402.04249v1.pdf","comment":"Website: https://www.harmbench.org"},{"id":"http://arxiv.org/abs/2305.14330v3","updated":"2024-02-06T18:44:30Z","published":"2023-05-23T17:57:09Z","title":"DirecT2V: Large Language Models are Frame-Level Directors for Zero-Shot\n Text-to-Video Generation","summary":" In the paradigm of AI-generated content (AIGC), there has been increasing\nattention to transferring knowledge from pre-trained text-to-image (T2I) models\nto text-to-video (T2V) generation. Despite their effectiveness, these\nframeworks face challenges in maintaining consistent narratives and handling\nshifts in scene composition or object placement from a single abstract user\nprompt. Exploring the ability of large language models (LLMs) to generate\ntime-dependent, frame-by-frame prompts, this paper introduces a new framework,\ndubbed DirecT2V. DirecT2V leverages instruction-tuned LLMs as directors,\nenabling the inclusion of time-varying content and facilitating consistent\nvideo generation. To maintain temporal consistency and prevent mapping the\nvalue to a different object, we equip a diffusion model with a novel value\nmapping method and dual-softmax filtering, which do not require any additional\ntraining. The experimental results validate the effectiveness of our framework\nin producing visually coherent and storyful videos from abstract user prompts,\nsuccessfully addressing the challenges of zero-shot video generation.\n","authors":["Susung Hong","Junyoung Seo","Heeseong Shin","Sunghwan Hong","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2305.14330v3.pdf","comment":"The code and demo will be available at\n https://github.com/KU-CVLAB/DirecT2V"},{"id":"http://arxiv.org/abs/2402.04236v1","updated":"2024-02-06T18:43:48Z","published":"2024-02-06T18:43:48Z","title":"CogCoM: Train Large Vision-Language Models Diving into Details through\n Chain of Manipulations","summary":" Vision-Language Models (VLMs) have demonstrated their widespread viability\nthanks to extensive training in aligning visual instructions to answers.\nHowever, this conclusive alignment leads models to ignore critical visual\nreasoning, and further results in failures on meticulous visual problems and\nunfaithful responses. In this paper, we propose Chain of Manipulations, a\nmechanism that enables VLMs to solve problems with a series of manipulations,\nwhere each manipulation refers to an operation on the visual input, either from\nintrinsic abilities (e.g., grounding) acquired through prior training or from\nimitating human-like behaviors (e.g., zoom in). This mechanism encourages VLMs\nto generate faithful responses with evidential visual reasoning, and permits\nusers to trace error causes in the interpretable paths. We thus train CogCoM, a\ngeneral 17B VLM with a memory-based compatible architecture endowed with this\nreasoning mechanism. Experiments show that our model achieves state-of-the-art\nperformance across 8 benchmarks from 3 categories, and that a limited number of\ntraining steps with the data swiftly yields competitive performance. 
The code and data are publicly available at\nhttps://github.com/THUDM/CogCoM.\n","authors":["Ji Qi","Ming Ding","Weihan Wang","Yushi Bai","Qingsong Lv","Wenyi Hong","Bin Xu","Lei Hou","Juanzi Li","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2402.04236v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.00868v2","updated":"2024-02-06T18:35:26Z","published":"2024-02-01T18:59:56Z","title":"We're Not Using Videos Effectively: An Updated Domain Adaptive Video\n Segmentation Baseline","summary":" There has been abundant work in unsupervised domain adaptation for semantic\nsegmentation (DAS) seeking to adapt a model trained on images from a labeled\nsource domain to an unlabeled target domain. While the vast majority of prior\nwork has studied this as a frame-level Image-DAS problem, a few Video-DAS works\nhave sought to additionally leverage the temporal signal present in adjacent\nframes. However, Video-DAS works have historically studied a distinct set of\nbenchmarks from Image-DAS, with minimal cross-benchmarking. In this work, we\naddress this gap. Surprisingly, we find that (1) even after carefully\ncontrolling for data and model architecture, state-of-the-art Image-DAS methods\n(HRDA and HRDA+MIC) outperform Video-DAS methods on established Video-DAS\nbenchmarks (+14.5 mIoU on Viper$\\rightarrow$CityscapesSeq, +19.0 mIoU on\nSynthia$\\rightarrow$CityscapesSeq), and (2) naive combinations of Image-DAS and\nVideo-DAS techniques only lead to marginal improvements across datasets. To\navoid siloed progress between Image-DAS and Video-DAS, we open-source our\ncodebase with support for a comprehensive set of Video-DAS and Image-DAS\nmethods on a common benchmark. Code available at\nhttps://github.com/SimarKareer/UnifiedVideoDA\n","authors":["Simar Kareer","Vivek Vijaykumar","Harsh Maheshwari","Prithvijit Chattopadhyay","Judy Hoffman","Viraj Prabhu"],"pdf_url":"https://arxiv.org/pdf/2402.00868v2.pdf","comment":"TMLR 2024"},{"id":"http://arxiv.org/abs/2312.07541v2","updated":"2024-02-06T18:04:35Z","published":"2023-12-12T18:59:40Z","title":"SMERF: Streamable Memory Efficient Radiance Fields for Real-Time\n Large-Scene Exploration","summary":" Recent techniques for real-time view synthesis have rapidly advanced in\nfidelity and speed, and modern methods are capable of rendering\nnear-photorealistic scenes at interactive frame rates. At the same time, a\ntension has arisen between explicit scene representations amenable to\nrasterization and neural fields built on ray marching, with state-of-the-art\ninstances of the latter surpassing the former in quality while being\nprohibitively expensive for real-time applications. In this work, we introduce\nSMERF, a view synthesis approach that achieves state-of-the-art accuracy among\nreal-time methods on large scenes with footprints up to 300 m$^2$ at a\nvolumetric resolution of 3.5 mm$^3$. Our method is built upon two primary\ncontributions: a hierarchical model partitioning scheme, which increases model\ncapacity while constraining compute and memory consumption, and a distillation\ntraining strategy that simultaneously yields high fidelity and internal\nconsistency. Our approach enables full six degrees of freedom (6DOF) navigation\nwithin a web browser and renders in real-time on commodity smartphones and\nlaptops. 
Extensive experiments show that our method exceeds the current\nstate-of-the-art in real-time novel view synthesis by 0.78 dB on standard\nbenchmarks and 1.78 dB on large scenes, renders frames three orders of\nmagnitude faster than state-of-the-art radiance field models, and achieves\nreal-time performance across a wide variety of commodity devices, including\nsmartphones. We encourage readers to explore these models interactively at our\nproject website: https://smerf-3d.github.io.\n","authors":["Daniel Duckworth","Peter Hedman","Christian Reiser","Peter Zhizhin","Jean-François Thibert","Mario Lučić","Richard Szeliski","Jonathan T. Barron"],"pdf_url":"https://arxiv.org/pdf/2312.07541v2.pdf","comment":"Added appendix. Changed LaTeX template. Project website:\n https://smerf-3d.github.io"},{"id":"http://arxiv.org/abs/2310.10410v3","updated":"2024-02-06T17:56:02Z","published":"2023-10-16T13:53:37Z","title":"Loci-Segmented: Improving Scene Segmentation Learning","summary":" Current slot-oriented approaches for compositional scene segmentation from\nimages and videos rely on provided background information or slot assignments.\nWe present a segmented location and identity tracking system, Loci-Segmented\n(Loci-s), which does not require either kind of information. It learns to\ndynamically segment scenes into interpretable background and slot-based object\nencodings, separating RGB, mask, location, and depth information for each. The\nresults reveal largely superior video decomposition performance in the MOVi\ndatasets and in another established dataset collection targeting scene\nsegmentation. The system's well-interpretable, compositional latent encodings\nmay serve as a foundation model for downstream tasks.\n","authors":["Manuel Traub","Frederic Becker","Adrian Sauter","Sebastian Otte","Martin V. Butz"],"pdf_url":"https://arxiv.org/pdf/2310.10410v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09844v2","updated":"2024-02-06T17:53:02Z","published":"2023-09-18T14:59:11Z","title":"CC-SGG: Corner Case Scenario Generation using Learned Scene Graphs","summary":" Corner case scenarios are an essential tool for testing and validating the\nsafety of autonomous vehicles (AVs). As these scenarios are often\ninsufficiently present in naturalistic driving datasets, augmenting the data\nwith synthetic corner cases greatly enhances the safe operation of AVs in\nunique situations. However, the generation of synthetic, yet realistic, corner\ncases poses a significant challenge. In this work, we introduce a novel\napproach based on Heterogeneous Graph Neural Networks (HGNNs) to transform\nregular driving scenarios into corner cases. To achieve this, we first generate\nconcise representations of regular driving scenes as scene graphs, minimally\nmanipulating their structure and properties. Our model then learns to perturb\nthose graphs to generate corner cases using attention and triple embeddings.\nThe input and perturbed graphs are then imported back into the simulation to\ngenerate corner case scenarios. Our model successfully learned to produce\ncorner cases from input scene graphs, achieving 89.9% prediction accuracy on\nour testing dataset. 
We further validate the generated scenarios on baseline\nautonomous driving methods, demonstrating our model's ability to effectively\ncreate critical situations for the baselines.\n","authors":["George Drayson","Efimia Panagiotaki","Daniel Omeiza","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2309.09844v2.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2402.04195v1","updated":"2024-02-06T17:50:30Z","published":"2024-02-06T17:50:30Z","title":"Instance by Instance: An Iterative Framework for Multi-instance 3D\n Registration","summary":" Multi-instance registration is a challenging problem in computer vision and\nrobotics, where multiple instances of an object need to be registered in a\nstandard coordinate system. In this work, we propose the first iterative\nframework called instance-by-instance (IBI) for multi-instance 3D registration\n(MI-3DReg). It successively registers all instances in a given scenario,\nstarting from the easiest and progressing to more challenging ones. Throughout\nthe iterative process, outliers are eliminated continuously, leading to an\nincreasing inlier rate for the remaining and more challenging instances. Under\nthe IBI framework, we further propose a sparse-to-dense-correspondence-based\nmulti-instance registration method (IBI-S2DC) to achieve robust MI-3DReg.\nExperiments on the synthetic and real datasets have demonstrated the\neffectiveness of IBI and established the new state-of-the-art performance of\nIBI-S2DC, e.g., our MHF1 is 12.02%/12.35% higher than the existing\nstate-of-the-art method ECC on the synthetic/real datasets.\n","authors":["Xinyue Cao","Xiyu Zhang","Yuxin Cheng","Zhaoshuai Qi","Yanning Zhang","Jiaqi Yang"],"pdf_url":"https://arxiv.org/pdf/2402.04195v1.pdf","comment":"14 pages, 12 figures, 10 tables"},{"id":"http://arxiv.org/abs/2209.09178v4","updated":"2024-02-06T17:48:10Z","published":"2022-09-19T16:56:51Z","title":"ViT-DD: Multi-Task Vision Transformer for Semi-Supervised Driver\n Distraction Detection","summary":" Ensuring traffic safety and mitigating accidents in modern driving are of\nparamount importance, and computer vision technologies have the potential to\nsignificantly contribute to this goal. This paper presents a multi-modal Vision\nTransformer for Driver Distraction Detection (termed ViT-DD), which\nincorporates inductive information from training signals related to both\ndistraction detection and driver emotion recognition. Additionally, a\nself-learning algorithm is developed, allowing for the seamless integration of\ndriver data without emotion labels into the multi-task training process of\nViT-DD. Experimental results reveal that the proposed ViT-DD surpasses existing\nstate-of-the-art methods for driver distraction detection by 6.5% and 0.9% on\nthe SFDDD and AUCDD datasets, respectively.\n","authors":["Yunsheng Ma","Ziran Wang"],"pdf_url":"https://arxiv.org/pdf/2209.09178v4.pdf","comment":"7 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2402.04178v1","updated":"2024-02-06T17:31:36Z","published":"2024-02-06T17:31:36Z","title":"SHIELD : An Evaluation Benchmark for Face Spoofing and Forgery Detection\n with Multimodal Large Language Models","summary":" Multimodal large language models (MLLMs) have demonstrated remarkable\nproblem-solving capabilities in various vision fields (e.g., generic object\nrecognition and grounding) based on strong visual semantic representation and\nlanguage reasoning ability. 
However, whether MLLMs are sensitive to subtle\nvisual spoof/forged clues and how they perform in the domain of face attack\ndetection (e.g., face spoofing and forgery detection) is still unexplored. In\nthis paper, we introduce a new benchmark, namely SHIELD, to evaluate the\nability of MLLMs on face spoofing and forgery detection. Specifically, we\ndesign true/false and multiple-choice questions to evaluate multimodal face\ndata in these two face security tasks. For the face anti-spoofing task, we\nevaluate three different modalities (i.e., RGB, infrared, depth) under four\ntypes of presentation attacks (i.e., print attack, replay attack, rigid mask,\npaper mask). For the face forgery detection task, we evaluate GAN-based and\ndiffusion-based data with both visual and acoustic modalities. Each question is\nsubjected to both zero-shot and few-shot tests under standard and chain of\nthought (COT) settings. The results indicate that MLLMs hold substantial\npotential in the face security domain, offering advantages over traditional\nspecific models in terms of interpretability, multimodal flexible reasoning,\nand joint face spoof and forgery detection. Additionally, we develop a novel\nMulti-Attribute Chain of Thought (MA-COT) paradigm for describing and judging\nvarious task-specific and task-irrelevant attributes of face images, which\nprovides rich task-related knowledge for subtle spoof/forged clue mining.\nExtensive experiments in separate face anti-spoofing, separate face forgery\ndetection, and joint detection tasks demonstrate the effectiveness of the\nproposed MA-COT. The project is available at\nhttps://github.com/laiyingxin2/SHIELD\n","authors":["Yichen Shi","Yuhao Gao","Yingxin Lai","Hongyang Wang","Jun Feng","Lei He","Jun Wan","Changsheng Chen","Zitong Yu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2402.04178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04171v1","updated":"2024-02-06T17:26:18Z","published":"2024-02-06T17:26:18Z","title":"3D Volumetric Super-Resolution in Radiology Using 3D RRDB-GAN","summary":" This study introduces the 3D Residual-in-Residual Dense Block GAN (3D\nRRDB-GAN) for 3D super-resolution of radiology imagery. A key aspect of 3D\nRRDB-GAN is the integration of a 2.5D perceptual loss function, which\ncontributes to improved volumetric image quality and realism. The effectiveness\nof our model was evaluated through 4x super-resolution experiments across\ndiverse datasets, including Mice Brain MRH, OASIS, HCP1200, and MSD-Task-6.\nThese evaluations, encompassing both quantitative metrics like LPIPS and FID\nand qualitative assessments through sample visualizations, demonstrate the\nmodel's effectiveness in detailed image analysis. The 3D RRDB-GAN offers a\nsignificant contribution to medical imaging, particularly by enriching the\ndepth, clarity, and volumetric detail of medical images. Its application shows\npromise in enhancing the interpretation and analysis of complex medical imagery\nfrom a comprehensive 3D perspective.\n","authors":["Juhyung Ha","Nian Wang","Surendra Maharjan","Xuhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.04171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04168v1","updated":"2024-02-06T17:24:06Z","published":"2024-02-06T17:24:06Z","title":"Informed Reinforcement Learning for Situation-Aware Traffic Rule\n Exceptions","summary":" Reinforcement Learning is a highly active research field with promising\nadvancements. 
In the field of autonomous driving, however, often only very simple\nscenarios are examined. Common approaches use non-interpretable control\ncommands as the action space and unstructured reward designs. In this work, we\nintroduce Informed Reinforcement Learning, where a\nstructured rulebook is integrated as a knowledge source. We learn trajectories\nand assess them with a situation-aware reward design, leading to a dynamic\nreward which allows the agent to learn situations which require controlled\ntraffic rule exceptions. Our method is applicable to arbitrary RL models. We\nsuccessfully demonstrate high completion rates of complex scenarios with recent\nmodel-based agents.\n","authors":["Daniel Bogdoll","Jing Qin","Moritz Nekolla","Ahmed Abouelazm","Tim Joseph","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2402.04168v1.pdf","comment":"Daniel Bogdoll and Jing Qin contributed equally. Accepted for\n publication at ICRA 2024"},{"id":"http://arxiv.org/abs/2303.12649v3","updated":"2024-02-06T16:55:14Z","published":"2023-03-22T15:30:44Z","title":"MI-SegNet: Mutual Information-Based US Segmentation for Unseen Domain\n Generalization","summary":" Generalization capabilities of learning-based medical image segmentation\nacross domains are currently limited by the performance degradation caused by\nthe domain shift, particularly for ultrasound (US) imaging. The quality of US\nimages heavily relies on carefully tuned acoustic parameters, which vary across\nsonographers, machines, and settings. To improve the generalizability on US\nimages across domains, we propose MI-SegNet, a novel mutual information (MI)\nbased framework to explicitly disentangle the anatomical and domain feature\nrepresentations; therefore, robust domain-independent segmentation can be\nexpected. Two encoders are employed to extract the relevant features for the\ndisentanglement. The segmentation only uses the anatomical feature map for its\nprediction. In order to force the encoders to learn meaningful feature\nrepresentations, a cross-reconstruction method is used during training.\nTransformations specific to either domain or anatomy are applied to guide the\nencoders in their respective feature extraction task. Additionally, any MI\npresent in both feature maps is penalized to further promote separate feature\nspaces. We validate the generalizability of the proposed domain-independent\nsegmentation approach on several datasets with varying parameters and machines.\nFurthermore, we demonstrate the effectiveness of the proposed MI-SegNet serving\nas a pre-trained model by comparing it with state-of-the-art networks.\n","authors":["Yuan Bi","Zhongliang Jiang","Ricarda Clarenbach","Reza Ghotbi","Angelos Karlas","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2303.12649v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04139v1","updated":"2024-02-06T16:46:28Z","published":"2024-02-06T16:46:28Z","title":"U-shaped Vision Mamba for Single Image Dehazing","summary":" Currently, Transformer is the most popular architecture for image dehazing,\nbut due to its large computational complexity, its ability to handle long-range\ndependency is limited on resource-constrained devices. To tackle this\nchallenge, we introduce the U-shaped Vision Mamba (UVM-Net), an efficient\nsingle-image dehazing network. 
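A minimal sketch of how the situation-aware reward design described in the Informed Reinforcement Learning abstract above could look in code, assuming a rulebook of weighted rules; Rule, violated, and exception_ok are hypothetical names, not the authors' implementation:

from dataclasses import dataclass
from typing import Callable, List

@dataclass
class Rule:
    name: str
    weight: float                         # priority taken from the structured rulebook
    violated: Callable[[dict], bool]      # does this trajectory break the rule?
    exception_ok: Callable[[dict], bool]  # does the situation justify a controlled exception?

def situation_aware_reward(trajectory: dict, rules: List[Rule], progress: float) -> float:
    # Reward = task progress minus weighted rule violations; a violation is
    # waived when the current situation justifies a controlled exception,
    # which is what makes the reward dynamic across situations.
    penalty = sum(r.weight for r in rules
                  if r.violated(trajectory) and not r.exception_ok(trajectory))
    return progress - penalty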
Inspired by State Space Sequence Models\n(SSMs), a class of deep sequence models known for their power to handle long\nsequences, we design a Bi-SSM block that integrates the local feature extraction ability\nof the convolutional layer with the ability of the SSM to capture long-range\ndependencies. Extensive experimental results demonstrate the effectiveness of\nour method. Our method offers a more efficient approach to long-range\ndependency modeling for image dehazing as well as other image restoration\ntasks. The URL of the code is \url{https://github.com/zzr-idam}.\n","authors":["Zhuoran Zheng","Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2402.04139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16839v2","updated":"2024-02-06T16:43:31Z","published":"2023-11-28T14:54:37Z","title":"Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware\n Direct Preference Optimization","summary":" Multimodal large language models have made significant advancements in recent\nyears, yet they still suffer from a common issue known as the \"hallucination\nproblem\", in which the models generate textual descriptions that inaccurately\ndepict or entirely fabricate content from associated images. This paper\nintroduces a novel solution, Hallucination-Aware Direct Preference Optimization\n(HA-DPO), which reframes the hallucination problem as a preference selection\ntask. The model is trained to favor the non-hallucinating response when\npresented with two responses for the same image (one accurate and one\nhallucinatory). Furthermore, this paper proposes an efficient pipeline for\nconstructing positive~(non-hallucinatory) and negative~(hallucinatory) sample\npairs, ensuring a high-quality, style-consistent dataset for robust preference\nlearning. When applied to three mainstream multimodal models, HA-DPO\nsignificantly reduced hallucination issues and amplified the models'\ngeneralization capabilities. Notably, the MiniGPT-4 model, when enhanced with\nHA-DPO, demonstrated a substantial improvement: POPE accuracy rose from 51.13%\nto 86.13% (an absolute improvement of 35%), and the MME score surged from\n932.00 to 1326.46 (a relative improvement of 42.32%). The codes, models, and\ndatasets are made accessible at https://opendatalab.github.io/HA-DPO.\n","authors":["Zhiyuan Zhao","Bin Wang","Linke Ouyang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2311.16839v2.pdf","comment":"Project Website: https://opendatalab.github.io/HA-DPO, Code:\n https://github.com/opendatalab/HA-DPO"},{"id":"http://arxiv.org/abs/2402.04129v1","updated":"2024-02-06T16:31:11Z","published":"2024-02-06T16:31:11Z","title":"OVOR: OnePrompt with Virtual Outlier Regularization for Rehearsal-Free\n Class-Incremental Learning","summary":" Recent works have shown that by using large pre-trained models along with\nlearnable prompts, rehearsal-free methods for class-incremental learning (CIL)\nsettings can achieve superior performance to prominent rehearsal-based ones.\nRehearsal-free CIL methods struggle with distinguishing classes from different\ntasks, as those are not trained together. In this work, we propose a\nregularization method based on virtual outliers to tighten decision boundaries\nof the classifier, such that confusion of classes among different tasks is\nmitigated. 
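Since HA-DPO builds on Direct Preference Optimization, the standard DPO objective conveys the core training signal: prefer the non-hallucinatory response over the hallucinatory one relative to a frozen reference model. A sketch, assuming the log-probabilities are summed over response tokens and beta is a temperature hyperparameter:

import torch.nn.functional as F

def preference_loss(logp_win, logp_lose, ref_logp_win, ref_logp_lose, beta=0.1):
    # logp_win / logp_lose: log-probs of the accurate and hallucinatory
    # responses under the trained model; ref_logp_*: the same quantities
    # under the frozen reference model.
    margin = (logp_win - ref_logp_win) - (logp_lose - ref_logp_lose)
    return -F.logsigmoid(beta * margin).mean()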
Recent prompt-based methods often require a pool of task-specific\nprompts, in order to prevent overwriting knowledge of previous tasks with that\nof the new task, leading to extra computation in querying and composing an\nappropriate prompt from the pool. This additional cost can be eliminated,\nwithout sacrificing accuracy, as we reveal in the paper. We illustrate that a\nsimplified prompt-based method can achieve results comparable to previous\nstate-of-the-art (SOTA) methods equipped with a prompt pool, using far fewer\nlearnable parameters and lower inference cost. Our regularization method has\ndemonstrated its compatibility with different prompt-based methods, boosting\nthose previous SOTA rehearsal-free CIL methods' accuracy on the ImageNet-R and\nCIFAR-100 benchmarks. Our source code is available at\nhttps://github.com/jpmorganchase/ovor.\n","authors":["Wei-Cheng Huang","Chun-Fu Chen","Hsiang Hsu"],"pdf_url":"https://arxiv.org/pdf/2402.04129v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2310.18961v5","updated":"2024-02-06T16:30:05Z","published":"2023-10-29T10:03:49Z","title":"AnomalyCLIP: Object-agnostic Prompt Learning for Zero-shot Anomaly\n Detection","summary":" Zero-shot anomaly detection (ZSAD) requires detection models trained using\nauxiliary data to detect anomalies without any training sample in a target\ndataset. It is a crucial task when training data is not accessible due to\nvarious concerns, e.g., data privacy, yet it is challenging since the models need\nto generalize to anomalies across different domains where the appearance of\nforeground objects, abnormal regions, and background features, such as\ndefects/tumors on different products/organs, can vary significantly. Recently,\nlarge pre-trained vision-language models (VLMs), such as CLIP, have\ndemonstrated strong zero-shot recognition ability in various vision tasks,\nincluding anomaly detection. However, their ZSAD performance is weak since the\nVLMs focus more on modeling the class semantics of the foreground objects\nrather than the abnormality/normality in the images. In this paper we introduce\na novel approach, namely AnomalyCLIP, to adapt CLIP for accurate ZSAD across\ndifferent domains. The key insight of AnomalyCLIP is to learn object-agnostic\ntext prompts that capture generic normality and abnormality in an image\nregardless of its foreground objects. This allows our model to focus on the\nabnormal image regions rather than the object semantics, enabling generalized\nnormality and abnormality recognition on diverse types of objects. Large-scale\nexperiments on 17 real-world anomaly detection datasets show that AnomalyCLIP\nachieves superior zero-shot performance of detecting and segmenting anomalies\nin datasets of highly diverse class semantics from various defect inspection\nand medical imaging domains. Code will be made available at\nhttps://github.com/zqhang/AnomalyCLIP.\n","authors":["Qihang Zhou","Guansong Pang","Yu Tian","Shibo He","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2310.18961v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01459v2","updated":"2024-02-06T16:11:35Z","published":"2024-02-02T14:50:23Z","title":"GaMeS: Mesh-Based Adapting and Modification of Gaussian Splatting","summary":" In recent years, a range of neural network-based methods for image rendering\nhave been introduced. 
For instance, widely-researched neural radiance fields\n(NeRF) rely on a neural network to represent 3D scenes, allowing for realistic\nview synthesis from a small number of 2D images. However, most NeRF models are\nconstrained by long training and inference times. In comparison, Gaussian\nSplatting (GS) is a novel, state-of-the-art technique for rendering points in a\n3D scene by approximating their contribution to image pixels through Gaussian\ndistributions, warranting fast training and swift, real-time rendering. A\ndrawback of GS is the absence of a well-defined approach for its conditioning\ndue to the necessity to condition several hundred thousand Gaussian components.\nTo solve this, we introduce the Gaussian Mesh Splatting (GaMeS) model, a hybrid of\na mesh and Gaussian distributions that pins all Gaussian splats to the object\nsurface (mesh). The unique contribution of our method is defining Gaussian\nsplats solely based on their location on the mesh, allowing for automatic\nadjustments in position, scale, and rotation during animation. As a result, we\nobtain high-quality views rendered in real time.\nFurthermore, we demonstrate that in the absence of a predefined mesh, it is\npossible to fine-tune the initial mesh during the learning process.\n","authors":["Joanna Waczyńska","Piotr Borycki","Sławomir Tadeja","Jacek Tabor","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2402.01459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04101v1","updated":"2024-02-06T15:55:46Z","published":"2024-02-06T15:55:46Z","title":"VRMM: A Volumetric Relightable Morphable Head Model","summary":" In this paper, we introduce the Volumetric Relightable Morphable Model\n(VRMM), a novel volumetric and parametric facial prior for 3D face modeling.\nWhile recent volumetric prior models offer improvements over traditional\nmethods like 3D Morphable Models (3DMMs), they face challenges in model\nlearning and personalized reconstructions. Our VRMM overcomes these by\nemploying a novel training framework that efficiently disentangles and encodes\nlatent spaces of identity, expression, and lighting into low-dimensional\nrepresentations. This framework, designed with self-supervised learning,\nsignificantly reduces the constraints for training data, making it more\nfeasible in practice. The learned VRMM offers relighting capabilities and\nencompasses a comprehensive range of expressions. We demonstrate the\nversatility and effectiveness of VRMM through various applications like avatar\ngeneration, facial reconstruction, and animation. Additionally, we address the\ncommon issue of overfitting in generative volumetric models with a novel\nprior-preserving personalization framework based on VRMM. Such an approach\nenables accurate 3D face reconstruction from even a single portrait input. 
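The GaMeS abstract above defines splats solely by their location on the mesh; one plausible reading is to parameterize each splat by a barycentric point and the local face frame, so it follows the mesh automatically during animation. The exact parameterization below is an assumption, not the paper's formulation:

import numpy as np

def splat_from_face(v0, v1, v2, bary=(1/3, 1/3, 1/3), tangent_scales=(0.01, 0.01)):
    # Mean: a barycentric point on the triangle, so it moves with the mesh.
    mean = bary[0] * v0 + bary[1] * v1 + bary[2] * v2
    # Orientation: the local tangent frame of the face.
    e1 = v1 - v0
    n = np.cross(e1, v2 - v0)
    e1, n = e1 / np.linalg.norm(e1), n / np.linalg.norm(n)
    e2 = np.cross(n, e1)
    R = np.stack([e1, e2, n], axis=1)
    # Covariance: nearly flat along the normal, so the splat hugs the surface.
    S = np.diag([tangent_scales[0], tangent_scales[1], 1e-4])
    return mean, R @ S @ S @ R.T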
Our\nexperiments showcase the potential of VRMM to significantly enhance the field\nof 3D face modeling.\n","authors":["Haotian Yang","Mingwu Zheng","Chongyang Ma","Yu-Kun Lai","Pengfei Wan","Haibin Huang"],"pdf_url":"https://arxiv.org/pdf/2402.04101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04097v1","updated":"2024-02-06T15:52:23Z","published":"2024-02-06T15:52:23Z","title":"Analysis of Deep Image Prior and Exploiting Self-Guidance for Image\n Reconstruction","summary":" The ability of deep image prior (DIP) to recover high-quality images from\nincomplete or corrupted measurements has made it popular in inverse problems in\nimage restoration and medical imaging including magnetic resonance imaging\n(MRI). However, conventional DIP suffers from severe overfitting and spectral\nbias effects. In this work, we first provide an analysis of how DIP recovers\ninformation from undersampled imaging measurements by analyzing the training\ndynamics of the underlying networks in the kernel regime for different\narchitectures. This study sheds light on important underlying properties for\nDIP-based recovery. Current research suggests that incorporating a reference\nimage as network input can enhance DIP's performance in image reconstruction\ncompared to using random inputs. However, obtaining suitable reference images\nrequires supervision and raises practical difficulties. In an attempt to\novercome this obstacle, we further introduce a self-driven reconstruction\nprocess that concurrently optimizes both the network weights and the input\nwhile eliminating the need for training data. Our method incorporates a novel\ndenoiser regularization term which enables robust and stable joint estimation\nof both the network input and reconstructed image. We demonstrate that our\nself-guided method surpasses both the original DIP and modern supervised\nmethods in terms of MR image reconstruction performance and outperforms\nprevious DIP-based schemes for image inpainting.\n","authors":["Shijun Liang","Evan Bell","Qing Qu","Rongrong Wang","Saiprasad Ravishankar"],"pdf_url":"https://arxiv.org/pdf/2402.04097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04087v1","updated":"2024-02-06T15:45:27Z","published":"2024-02-06T15:45:27Z","title":"A Hard-to-Beat Baseline for Training-free CLIP-based Adaptation","summary":" Contrastive Language-Image Pretraining (CLIP) has gained popularity for its\nremarkable zero-shot capacity. Recent research has focused on developing\nefficient fine-tuning methods, such as prompt learning and adapter, to enhance\nCLIP's performance in downstream tasks. However, these methods still require\nadditional training time and computational resources, which is undesirable for\ndevices with limited resources. In this paper, we revisit a classical\nalgorithm, Gaussian Discriminant Analysis (GDA), and apply it to the downstream\nclassification of CLIP. Typically, GDA assumes that features of each class\nfollow Gaussian distributions with identical covariance. By leveraging Bayes'\nformula, the classifier can be expressed in terms of the class means and\ncovariance, which can be estimated from the data without the need for training.\nTo integrate knowledge from both visual and textual modalities, we ensemble it\nwith the original zero-shot classifier within CLIP. Extensive results on 17\ndatasets validate that our method surpasses or achieves comparable results with\nstate-of-the-art methods on few-shot classification, imbalanced learning, and\nout-of-distribution generalization. 
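One optimization step of the self-driven reconstruction idea from the DIP paper above, as a sketch: both the network input z and the weights are updated, with a denoiser-based regularizer. forward_op, denoiser, and the exact form of the regularization term are assumptions here, not the authors' code:

import torch

def self_guided_dip_step(net, z, y, forward_op, denoiser, opt, lam=0.1):
    # z is a learnable input tensor optimized jointly with the weights, e.g.
    #   z = torch.randn(1, c, h, w, requires_grad=True)
    #   opt = torch.optim.Adam([z] + list(net.parameters()), lr=1e-4)
    opt.zero_grad()
    x = net(z)                                          # current reconstruction
    data_fit = (forward_op(x) - y).abs().pow(2).mean()  # measurement consistency
    reg = (x - denoiser(x).detach()).pow(2).mean()      # denoiser regularization
    loss = data_fit + lam * reg
    loss.backward()
    opt.step()
    return float(loss)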
In addition, we extend our method to\nbase-to-new generalization and unsupervised learning, once again demonstrating\nits superiority over competing approaches. Our code is publicly available at\n\url{https://github.com/mrflogs/ICLR24}.\n","authors":["Zhengbo Wang","Jian Liang","Lijun Sheng","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2402.04087v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04064v1","updated":"2024-02-06T15:09:50Z","published":"2024-02-06T15:09:50Z","title":"Multi-class Road Defect Detection and Segmentation using Spatial and\n Channel-wise Attention for Autonomous Road Repairing","summary":" Road pavement detection and segmentation are critical for developing\nautonomous road repair systems. However, developing an instance segmentation\nmethod that simultaneously performs multi-class defect detection and\nsegmentation is challenging due to the textural simplicity of road pavement\nimages, the diversity of defect geometries, and the morphological ambiguity\nbetween classes. We propose a novel end-to-end method for multi-class road\ndefect detection and segmentation. The proposed method comprises multiple\nspatial and channel-wise attention blocks that learn global\nrepresentations across spatial and channel-wise dimensions. Through these\nattention blocks, more globally generalised representations of morphological\ninformation (spatial characteristics) of road defects and colour and depth\ninformation of images can be learned. To demonstrate the effectiveness of our\nframework, we conducted various ablation studies and comparisons with prior\nmethods on a newly collected dataset annotated with nine road defect classes.\nThe experiments show that our proposed method outperforms existing\nstate-of-the-art methods for multi-class road defect detection and\nsegmentation.\n","authors":["Jongmin Yu","Chen Bene Chi","Sebastiano Fichera","Paolo Paoletti","Devansh Mehta","Shan Luo"],"pdf_url":"https://arxiv.org/pdf/2402.04064v1.pdf","comment":"Accepted to the ICRA 2024"},{"id":"http://arxiv.org/abs/2402.02474v2","updated":"2024-02-06T14:58:09Z","published":"2024-02-04T13:09:13Z","title":"Deep Spectral Improvement for Unsupervised Image Instance Segmentation","summary":" Deep spectral methods reframe the image decomposition process as a graph\npartitioning task by extracting features using self-supervised learning and\nutilizing the Laplacian of the affinity matrix to obtain eigensegments.\nHowever, instance segmentation has received less attention compared to other\ntasks within the context of deep spectral methods. This paper addresses the\nfact that not all channels of the feature map extracted from a self-supervised\nbackbone contain sufficient information for instance segmentation purposes. In\nfact, some channels are noisy and hinder the accuracy of the task. To overcome\nthis issue, this paper proposes two channel reduction modules: Noise Channel\nReduction (NCR) and Deviation-based Channel Reduction (DCR). The NCR retains\nchannels with lower entropy, as they are less likely to be noisy, while DCR\nprunes channels with low standard deviation, as they lack sufficient\ninformation for effective instance segmentation. Furthermore, the paper\ndemonstrates that the dot product, commonly used in deep spectral methods, is\nnot suitable for instance segmentation due to its sensitivity to feature map\nvalues, potentially leading to incorrect instance segments. 
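The GDA baseline above admits a compact training-free implementation: with Gaussian class conditionals sharing one covariance, Bayes' rule yields a classifier that is linear in the class means and covariance. A sketch assuming equal class priors and a small shrinkage term for invertibility:

import numpy as np

def gda_head(feats, labels, n_classes, eps=1e-4):
    # feats: [N, D] image features; labels: [N] integer class ids.
    mu = np.stack([feats[labels == c].mean(axis=0) for c in range(n_classes)])
    prec = np.linalg.inv(np.cov(feats.T) + eps * np.eye(feats.shape[1]))
    W = mu @ prec                                      # [n_classes, D] weights
    b = -0.5 * np.einsum('cd,de,ce->c', mu, prec, mu)  # per-class bias
    return W, b

# Test-time use, ensembled with CLIP's zero-shot logits (alpha is a guess):
# logits = test_feats @ W.T + b + alpha * zero_shot_logits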
A new similarity\nmetric called Bray-Curtis over Chebyshev (BoC) is proposed to address this\nissue. It takes into account the distribution of features in addition to their\nvalues, providing a more robust similarity measure for instance segmentation.\nQuantitative and qualitative results on the Youtube-VIS2019 dataset highlight\nthe improvements achieved by the proposed channel reduction methods and the use\nof BoC instead of the conventional dot product for creating the affinity\nmatrix. These improvements are observed in terms of mean Intersection over\nUnion and extracted instance segments, demonstrating enhanced instance\nsegmentation performance. The code is available at:\nhttps://github.com/farnooshar/SpecUnIIS\n","authors":["Farnoosh Arefi","Amir M. Mansourian","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2402.02474v2.pdf","comment":"11 pages, 13 figures and 5 tables"},{"id":"http://arxiv.org/abs/2402.04050v1","updated":"2024-02-06T14:53:19Z","published":"2024-02-06T14:53:19Z","title":"Connecting the Dots: Collaborative Fine-tuning for Black-Box\n Vision-Language Models","summary":" With the emergence of pretrained vision-language models (VLMs), considerable\nefforts have been devoted to fine-tuning them for downstream tasks. Despite the\nprogress made in designing efficient fine-tuning methods, such methods require\naccess to the model's parameters, which can be challenging as model owners\noften opt to provide their models as a black box to safeguard model ownership.\nThis paper proposes a \textbf{C}ollabo\textbf{ra}tive\n\textbf{F}ine-\textbf{T}uning (\textbf{CraFT}) approach for fine-tuning\nblack-box VLMs to downstream tasks, where one only has access to the input\nprompts and the output predictions of the model. CraFT comprises two modules, a\nprompt generation module for learning text prompts and a prediction refinement\nmodule for enhancing output predictions in residual style. Additionally, we\nintroduce an auxiliary prediction-consistent loss to promote consistent\noptimization across these modules. These modules are optimized by a novel\ncollaborative training algorithm. Extensive experiments on few-shot\nclassification over 15 datasets demonstrate the superiority of CraFT. The\nresults show that CraFT achieves a decent gain of about 12\% with 16-shot\ndatasets and only 8,000 queries. Moreover, CraFT trains faster and uses only\nabout 1/80 of the memory footprint for deployment, while sacrificing only\n1.62\% compared to the white-box method.\n","authors":["Zhengbo Wang","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2402.04050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08768v2","updated":"2024-02-06T14:45:39Z","published":"2023-12-14T09:31:33Z","title":"Local Conditional Controlling for Text-to-Image Diffusion Models","summary":" Diffusion models have exhibited impressive prowess in the text-to-image task.\nRecent methods add image-level controls, e.g., edge and depth maps, to\nmanipulate the generation process together with text prompts to obtain desired\nimages. This controlling process is globally operated on the entire image,\nwhich limits the flexibility of control regions. In this paper, we introduce a\nnew simple yet practical task setting: local control. It focuses on controlling\nspecific local areas according to user-defined image conditions, where the remaining\nareas are conditioned only by the original text prompt. 
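The two channel-reduction modules described in the deep-spectral abstract above reduce to simple per-channel statistics; a sketch in which the keep ratios and the histogram-based entropy estimate are illustrative assumptions:

import numpy as np

def reduce_channels(fmap, keep_ratio=0.75, bins=32):
    # fmap: [C, H, W] feature map from a self-supervised backbone.
    flat = fmap.reshape(fmap.shape[0], -1)
    # NCR: keep the lowest-entropy channels (less likely to be noisy).
    entropies = []
    for ch in flat:
        hist, _ = np.histogram(ch, bins=bins)
        p = hist / hist.sum()
        p = p[p > 0]
        entropies.append(-(p * np.log(p)).sum())
    flat = flat[np.argsort(entropies)[: int(keep_ratio * len(flat))]]
    # DCR: among the survivors, prune low-standard-deviation channels,
    # which carry too little information to separate instances.
    order = np.argsort(flat.std(axis=1))[::-1]
    return flat[order[: int(keep_ratio * len(flat))]]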
This manner allows the\nusers to flexibly control the image generation in a fine-grained way. However,\nit is non-trivial to achieve this goal. The naive manner of directly adding\nlocal conditions may lead to the local control dominance problem. To mitigate\nthis problem, we propose a training-free method that leverages the updates of\nnoised latents and parameters in the cross-attention map during the denoising\nprocess to promote concept generation in non-control areas. Moreover, we use\nfeature mask constraints to mitigate the degradation of synthesized image\nquality caused by information differences inside and outside the local control\narea. Extensive experiments demonstrate that our method can synthesize\nhigh-quality images that adhere to the prompt under local control conditions. Code is\navailable at https://github.com/YibooZhao/Local-Control.\n","authors":["Yibo Zhao","Liang Peng","Yang Yang","Zekai Luo","Hengjia Li","Yao Chen","Wei Zhao","qinglin lu","Boxi Wu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.08768v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07975v2","updated":"2024-02-06T14:40:09Z","published":"2023-10-12T01:47:55Z","title":"Self-supervised visual learning for analyzing firearms trafficking\n activities on the Web","summary":" Automated visual firearms classification from RGB images is an important\nreal-world task with applications in public space security, intelligence\ngathering and law enforcement investigations. When applied to images massively\ncrawled from the World Wide Web (including social media and dark Web sites), it\ncan serve as an important component of systems that attempt to identify\ncriminal firearms trafficking networks, by analyzing Big Data from open-source\nintelligence. Deep Neural Networks (DNN) are the state-of-the-art methodology\nfor achieving this, with Convolutional Neural Networks (CNN) being typically\nemployed. The common transfer learning approach consists of pretraining on a\nlarge-scale, generic annotated dataset for whole-image classification, such as\nImageNet-1k, and then finetuning the DNN on a smaller, annotated,\ntask-specific, downstream dataset for visual firearms classification. Neither\nVisual Transformer (ViT) neural architectures nor Self-Supervised Learning\n(SSL) approaches have so far been evaluated on this critical task.\n","authors":["Sotirios Konstantakos","Despina Ioanna Chalkiadaki","Ioannis Mademlis","Adamantia Anna Rebolledo Chrysochoou","Georgios Th. Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2310.07975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00477v2","updated":"2024-02-06T14:29:29Z","published":"2023-03-01T13:04:45Z","title":"ORCHNet: A Robust Global Feature Aggregation approach for 3D LiDAR-based\n Place recognition in Orchards","summary":" Robust and reliable place recognition and loop closure detection in\nagricultural environments is still an open problem. In particular, orchards are\na difficult case study due to structural similarity across the entire field. In\nthis work, we address the place recognition problem in orchards resorting to 3D\nLiDAR data, which is considered a key modality for robustness. Hence, we\npropose ORCHNet, a deep-learning-based approach that maps 3D-LiDAR scans to\nglobal descriptors. Specifically, this work proposes a new global feature\naggregation approach, which fuses multiple aggregation methods into a robust\nglobal descriptor. ORCHNet is evaluated on real-world data collected in\norchards, comprising data from the summer and autumn seasons. 
To assess the\nrobustness, we compare ORCHNet with state-of-the-art aggregation approaches on\ndata from the same season and across seasons. Moreover, we additionally\nevaluate the proposed approach as part of a localization framework, where\nORCHNet is used as a loop closure detector. The empirical results indicate\nthat, on the place recognition task, ORCHNet outperforms the remaining\napproaches, and is also more robust across seasons. As for the localization,\nthe edge cases where the path goes through the trees are solved when\nintegrating ORCHNet as a loop detector, showing the potential applicability of\nthe proposed approach in this task. The code will be publicly available\nat: \url{https://github.com/Cybonic/ORCHNet.git}\n","authors":["T. Barros","L. Garrote","P. Conde","M. J. Coombes","C. Liu","C. Premebida","U. J. Nunes"],"pdf_url":"https://arxiv.org/pdf/2303.00477v2.pdf","comment":"This is a Technical Report"},{"id":"http://arxiv.org/abs/2402.04031v1","updated":"2024-02-06T14:26:02Z","published":"2024-02-06T14:26:02Z","title":"Polyp-DDPM: Diffusion-Based Semantic Polyp Synthesis for Enhanced\n Segmentation","summary":" This study introduces Polyp-DDPM, a diffusion-based method for generating\nrealistic images of polyps conditioned on masks, aimed at enhancing the\nsegmentation of gastrointestinal (GI) tract polyps. Our approach addresses the\nchallenges of data limitations, high annotation costs, and privacy concerns\nassociated with medical images. By conditioning the diffusion model on\nsegmentation masks (binary masks that represent abnormal areas), Polyp-DDPM\noutperforms state-of-the-art methods in terms of image quality (achieving a\nFrechet Inception Distance (FID) score of 78.47, compared to scores above\n83.79) and segmentation performance (achieving an Intersection over Union (IoU)\nof 0.7156, versus less than 0.6694 for synthetic images from baseline models\nand 0.7067 for real data). Our method generates a high-quality, diverse\nsynthetic dataset for training, thereby enhancing polyp segmentation models to\nbe comparable with real images and offering greater data augmentation\ncapabilities to improve segmentation models. The source code and pretrained\nweights for Polyp-DDPM are made publicly available at\nhttps://github.com/mobaidoctor/polyp-ddpm.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.04031v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2402.02205v2","updated":"2024-02-06T14:19:31Z","published":"2024-02-03T16:38:25Z","title":"GPT-4V as Traffic Assistant: An In-depth Look at Vision Language Model\n on Complex Traffic Events","summary":" The recognition and understanding of traffic incidents, particularly traffic\naccidents, is a topic of paramount importance in the realm of intelligent\ntransportation systems and intelligent vehicles. This area has continually\ncaptured the extensive focus of both the academic and industrial sectors.\nIdentifying and comprehending complex traffic events is highly challenging,\nprimarily due to the intricate nature of traffic environments, diverse\nobservational perspectives, and the multifaceted causes of accidents. These\nfactors have persistently impeded the development of effective solutions. 
The\nadvent of large vision-language models (VLMs), such as GPT-4V, has introduced\ninnovative approaches to addressing this issue. In this paper, we explore the\nability of GPT-4V with a set of representative traffic incident videos and\ndelve into the model's capacity to understand these complex traffic\nsituations. We observe that GPT-4V demonstrates remarkable cognitive,\nreasoning, and decision-making ability in certain classic traffic events.\nConcurrently, we also identify certain limitations of GPT-4V, which constrain\nits understanding in more intricate scenarios. These limitations merit further\nexploration and resolution.\n","authors":["Xingcheng Zhou","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2402.02205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16290v3","updated":"2024-02-06T14:14:23Z","published":"2022-11-29T15:21:34Z","title":"LocPoseNet: Robust Location Prior for Unseen Object Pose Estimation","summary":" Object location prior is critical for the standard 6D object pose estimation\nsetting. The prior can be used to initialize the 3D object translation and\nfacilitate 3D object rotation estimation. Unfortunately, the object detectors\nthat are used for this purpose do not generalize to unseen objects. Therefore,\nexisting 6D pose estimation methods for unseen objects either assume the\nground-truth object location to be known or yield inaccurate results when it is\nunavailable. In this paper, we address this problem by developing a method,\nLocPoseNet, able to robustly learn a location prior for unseen objects. Our\nmethod builds upon a template matching strategy, where we propose to distribute\nthe reference kernels and convolve them with a query to efficiently compute\nmulti-scale correlations. We then introduce a novel translation estimator,\nwhich decouples scale-aware and scale-robust features to predict different\nobject location parameters. Our method outperforms existing works by a large\nmargin on LINEMOD and GenMOP. We further construct a challenging synthetic\ndataset, which allows us to highlight the better robustness of our method to\nvarious noise sources. Our project website is at:\nhttps://sailor-z.github.io/projects/3DV2024_LocPoseNet.html.\n","authors":["Chen Zhao","Yinlin Hu","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2211.16290v3.pdf","comment":"Accepted by 3DV2024"},{"id":"http://arxiv.org/abs/2402.04013v1","updated":"2024-02-06T14:06:23Z","published":"2024-02-06T14:06:23Z","title":"Privacy Leakage on DNNs: A Survey of Model Inversion Attacks and\n Defenses","summary":" Model Inversion (MI) attacks aim to disclose private information about the\ntraining data by abusing access to the pre-trained models. These attacks enable\nadversaries to reconstruct high-fidelity data that closely aligns with the\nprivate training data, which has raised significant privacy concerns. Despite\nthe rapid advances in the field, we lack a comprehensive overview of existing\nMI attacks and defenses. To fill this gap, this paper thoroughly investigates\nthis field and presents a holistic survey. Firstly, our work briefly reviews\nthe traditional MI on machine learning scenarios. 
We then analyze\nand compare in detail numerous recent attacks and defenses on \textbf{D}eep\n\textbf{N}eural \textbf{N}etworks (DNNs) across multiple modalities and\nlearning tasks.\n","authors":["Hao Fang","Yixiang Qiu","Hongyao Yu","Wenbo Yu","Jiawei Kong","Baoli Chong","Bin Chen","Xuan Wang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2402.04013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04009v1","updated":"2024-02-06T14:03:15Z","published":"2024-02-06T14:03:15Z","title":"Low-rank Attention Side-Tuning for Parameter-Efficient Fine-Tuning","summary":" In finetuning a large pretrained model to downstream tasks,\nparameter-efficient fine-tuning (PEFT) methods can effectively finetune\npretrained models with few trainable parameters, but suffer from high GPU\nmemory consumption and slow training speed. Because learnable parameters from\nthese methods are entangled with the pretrained model, gradients related to the\nfrozen pretrained model's parameters have to be computed and stored during\nfinetuning. We propose Low-rank Attention Side-Tuning (LAST), which\ndisentangles the trainable module from the pretrained model by freezing not\nonly parameters but also outputs of the pretrained network. LAST trains a\nside-network composed of only low-rank self-attention modules. By viewing the\npretrained model as a frozen feature extractor, the side-network takes\nintermediate output from the pretrained model and focuses on learning\ntask-specific knowledge. We also show that LAST can be highly parallel across\nmultiple optimization objectives, making it very efficient in downstream task\nadaptation, for example, in finding optimal hyperparameters. LAST outperforms\nprevious state-of-the-art methods on VTAB-1K and other visual adaptation tasks\nwith only roughly 30\% of the GPU memory footprint and 60\% of the training time\nof existing PEFT methods, while achieving significantly higher accuracy.\n","authors":["Ningyuan Tang","Minghao Fu","Ke Zhu","Jianxin Wu"],"pdf_url":"https://arxiv.org/pdf/2402.04009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03989v1","updated":"2024-02-06T13:31:45Z","published":"2024-02-06T13:31:45Z","title":"YOLOPoint Joint Keypoint and Object Detection","summary":" Intelligent vehicles of the future must be capable of understanding and\nnavigating safely through their surroundings. Camera-based vehicle systems can\nuse keypoints as well as objects as low- and high-level landmarks for\nGNSS-independent SLAM and visual odometry. To this end we propose YOLOPoint, a\nconvolutional neural network model that simultaneously detects keypoints and\nobjects in an image by combining YOLOv5 and SuperPoint to create a single\nforward-pass network that is both real-time capable and accurate. By using a\nshared backbone and a light-weight network structure, YOLOPoint is able to\nperform competitively on both the HPatches and KITTI benchmarks.\n","authors":["Anton Backhaus","Thorsten Luettel","Hans-Joachim Wuensche"],"pdf_url":"https://arxiv.org/pdf/2402.03989v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.03981v1","updated":"2024-02-06T13:16:54Z","published":"2024-02-06T13:16:54Z","title":"Controllable Diverse Sampling for Diffusion Based Motion Behavior\n Forecasting","summary":" In autonomous driving tasks, trajectory prediction in complex traffic\nenvironments requires adherence to real-world context conditions and behavior\nmultimodalities. 
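A sketch of one side-network block in the spirit of LAST: low-rank self-attention applied to frozen intermediate features, with the backbone run under no_grad so its gradients are never computed or stored. The dimensions and residual form are illustrative assumptions:

import torch.nn as nn

class LowRankAttentionBlock(nn.Module):
    def __init__(self, dim=768, rank=32):
        super().__init__()
        self.q, self.k, self.v = (nn.Linear(dim, rank) for _ in range(3))
        self.out = nn.Linear(rank, dim)
        self.rank = rank

    def forward(self, x):  # x: [B, N, dim] frozen features from the backbone
        attn = (self.q(x) @ self.k(x).transpose(-2, -1)) / self.rank ** 0.5
        return x + self.out(attn.softmax(dim=-1) @ self.v(x))

# The backbone itself runs under torch.no_grad(); only these side blocks
# receive gradients, which is what cuts GPU memory during tuning.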
Existing methods predominantly rely on prior assumptions or\ngenerative models trained on curated data to learn road agents' stochastic\nbehavior bounded by scene constraints. However, they often face mode averaging\nissues due to data imbalance and simplistic priors, and could even suffer from\nmode collapse due to unstable training and single ground truth supervision.\nThese issues lead the existing methods to a loss of predictive diversity and\nadherence to the scene constraints. To address these challenges, we introduce a\nnovel trajectory generator named Controllable Diffusion Trajectory (CDT), which\nintegrates map information and social interactions into a Transformer-based\nconditional denoising diffusion model to guide the prediction of future\ntrajectories. To ensure multimodality, we incorporate behavioral tokens to\ndirect the trajectory's modes, such as going straight, turning right or left.\nMoreover, we incorporate the predicted endpoints as an alternative behavioral\ntoken into the CDT model to facilitate the prediction of accurate trajectories.\nExtensive experiments on the Argoverse 2 benchmark demonstrate that CDT excels\nin generating diverse and scene-compliant trajectories in complex urban\nsettings.\n","authors":["Yiming Xu","Hao Cheng","Monika Sester"],"pdf_url":"https://arxiv.org/pdf/2402.03981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19302v3","updated":"2024-02-06T13:14:35Z","published":"2023-05-30T15:26:43Z","title":"Smooth, exact rotational symmetrization for deep learning on point\n clouds","summary":" Point clouds are versatile representations of 3D objects and have found\nwidespread application in science and engineering. Many successful\ndeep-learning models have been proposed that use them as input. The domain of\nchemical and materials modeling is especially challenging because exact\ncompliance with physical constraints is highly desirable for a model to be\nusable in practice. These constraints include smoothness and invariance with\nrespect to translations, rotations, and permutations of identical atoms. If\nthese requirements are not rigorously fulfilled, atomistic simulations might\nlead to absurd outcomes even if the model has excellent accuracy. Consequently,\ndedicated architectures, which achieve invariance by restricting their design\nspace, have been developed. General-purpose point-cloud models are more varied\nbut often disregard rotational symmetry. We propose a general symmetrization\nmethod that adds rotational equivariance to any given model while preserving\nall the other requirements. Our approach simplifies the development of better\natomic-scale machine-learning schemes by relaxing the constraints on the design\nspace and making it possible to incorporate ideas that proved effective in\nother domains. We demonstrate this idea by introducing the Point Edge\nTransformer (PET) architecture, which is not intrinsically equivariant but\nachieves state-of-the-art performance on several benchmark datasets of\nmolecules and solids. A-posteriori application of our general protocol makes\nPET exactly equivariant, with minimal changes to its accuracy.\n","authors":["Sergey N. 
Pozdnyakov","Michele Ceriotti"],"pdf_url":"https://arxiv.org/pdf/2305.19302v3.pdf","comment":"Enhancing figures; minor polishing"},{"id":"http://arxiv.org/abs/2402.03973v1","updated":"2024-02-06T13:06:14Z","published":"2024-02-06T13:06:14Z","title":"Humans Beat Deep Networks at Recognizing Objects in Unusual Poses, Given\n Enough Time","summary":" Deep learning is closing the gap with humans on several object recognition\nbenchmarks. Here we investigate this gap in the context of challenging images\nwhere objects are seen from unusual viewpoints. We find that humans excel at\nrecognizing objects in unusual poses, in contrast with state-of-the-art\npretrained networks (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) which are\nsystematically brittle in this condition. Remarkably, as we limit image\nexposure time, human performance degrades to the level of deep networks,\nsuggesting that additional mental processes (requiring additional time) take\nplace when humans identify objects in unusual poses. Finally, our analysis of\nerror patterns of humans vs. networks reveals that even time-limited humans are\ndissimilar to feed-forward deep networks. We conclude that more work is needed\nto bring computer vision systems to the level of robustness of the human visual\nsystem. Understanding the nature of the mental processes taking place during\nextra viewing time may be key to attain such robustness.\n","authors":["Netta Ollikka","Amro Abbas","Andrea Perin","Markku Kilpeläinen","Stéphane Deny"],"pdf_url":"https://arxiv.org/pdf/2402.03973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06108v5","updated":"2024-02-06T12:41:20Z","published":"2022-11-11T10:24:42Z","title":"RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object\n Detection Systems","summary":" In autonomous driving, LiDAR and radar are crucial for environmental\nperception. LiDAR offers precise 3D spatial sensing information but struggles\nin adverse weather like fog. Conversely, radar signals can penetrate rain or\nmist due to their specific wavelength but are prone to noise disturbances.\nRecent state-of-the-art works reveal that the fusion of radar and LiDAR can\nlead to robust detection in adverse weather. The existing works adopt\nconvolutional neural network architecture to extract features from each sensor\ndata, then align and aggregate the two branch features to predict object\ndetection results. However, these methods have low accuracy of predicted\nbounding boxes due to a simple design of label assignment and fusion\nstrategies. In this paper, we propose a bird's-eye view fusion learning-based\nanchor box-free object detection system, which fuses the feature derived from\nthe radar range-azimuth heatmap and the LiDAR point cloud to estimate possible\nobjects. Different label assignment strategies have been designed to facilitate\nthe consistency between the classification of foreground or background anchor\npoints and the corresponding bounding box regressions. Furthermore, the\nperformance of the proposed object detector is further enhanced by employing a\nnovel interactive transformer module. The superior performance of the methods\nproposed in this paper has been demonstrated using the recently published\nOxford Radar RobotCar dataset. 
Our system's average precision significantly\noutperforms the state-of-the-art method by 13.1% and 19.0% at an Intersection over\nUnion (IoU) of 0.8 under 'Clear+Foggy' training conditions for 'Clear' and\n'Foggy' testing, respectively.\n","authors":["Yanlong Yang","Jianan Liu","Tao Huang","Qing-Long Han","Gang Ma","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.06108v5.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03951v1","updated":"2024-02-06T12:23:14Z","published":"2024-02-06T12:23:14Z","title":"Boosting Adversarial Transferability across Model Genus by\n Deformation-Constrained Warping","summary":" Adversarial examples generated by a surrogate model typically exhibit limited\ntransferability to unknown target systems. To address this problem, many\ntransferability enhancement approaches (e.g., input transformation and model\naugmentation) have been proposed. However, they show poor performance in\nattacking systems having different model genera from the surrogate model. In\nthis paper, we propose a novel and generic attacking strategy, called\nDeformation-Constrained Warping Attack (DeCoWA), that can be effectively\napplied to cross model genus attack. Specifically, DeCoWA firstly augments\ninput examples via an elastic deformation, namely Deformation-Constrained\nWarping (DeCoW), to obtain rich local details of the augmented input. To avoid\nsevere distortion of global semantics led by random deformation, DeCoW further\nconstrains the strength and direction of the warping transformation by a novel\nadaptive control strategy. Extensive experiments demonstrate that the\ntransferable examples crafted by our DeCoWA on CNN surrogates can significantly\nhinder the performance of Transformers (and vice versa) on various tasks,\nincluding image classification, video action recognition, and audio\nrecognition. Code is made available at https://github.com/LinQinLiang/DeCoWA.\n","authors":["Qinliang Lin","Cheng Luo","Zenghao Niu","Xilin He","Weicheng Xie","Yuanbo Hou","Linlin Shen","Siyang Song"],"pdf_url":"https://arxiv.org/pdf/2402.03951v1.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2401.00736v2","updated":"2024-02-06T12:20:06Z","published":"2024-01-01T12:25:57Z","title":"Diffusion Models, Image Super-Resolution And Everything: A Survey","summary":" Diffusion Models (DMs) have disrupted the image Super-Resolution (SR) field\nand further closed the gap between image quality and human perceptual\npreferences. They are easy to train and can produce very high-quality samples\nthat exceed the realism of those produced by previous generative methods.\nDespite their promising results, they also come with new challenges that need\nfurther research: high computational demands, comparability, lack of\nexplainability, color shifts, and more. Unfortunately, entry into this field is\noverwhelming because of the abundance of publications. To address this, we\nprovide a unified recount of the theoretical foundations underlying DMs applied\nto image SR and offer a detailed analysis that underscores the unique\ncharacteristics and methodologies within this domain, distinct from broader\nexisting reviews in the field. This survey articulates a cohesive understanding\nof DM principles and explores current research avenues, including alternative\ninput domains, conditioning techniques, guidance mechanisms, corruption spaces,\nand zero-shot learning approaches. 
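A sketch of the deformation-constrained warping that DeCoWA uses for input augmentation: a coarse random offset field, hard-bounded in magnitude, is upsampled and applied with grid_sample. The simple hard bound here stands in for the paper's adaptive control strategy, which is not reproduced:

import torch
import torch.nn.functional as F

def deformation_constrained_warp(img, strength=0.05, coarse=8):
    # img: [B, C, H, W]; strength limits the warp so global semantics survive.
    B, _, H, W = img.shape
    offsets = (torch.rand(B, 2, coarse, coarse) * 2 - 1) * strength
    offsets = F.interpolate(offsets, size=(H, W), mode='bilinear',
                            align_corners=True)
    ys, xs = torch.meshgrid(torch.linspace(-1, 1, H),
                            torch.linspace(-1, 1, W), indexing='ij')
    identity = torch.stack((xs, ys), dim=-1).unsqueeze(0)  # base sampling grid
    grid = identity + offsets.permute(0, 2, 3, 1)
    return F.grid_sample(img, grid, align_corners=True)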
By offering a detailed examination of the\nevolution and current trends in image SR through the lens of DMs, this survey\nsheds light on the existing challenges and charts potential future directions,\naiming to inspire further innovation in this rapidly advancing area.\n","authors":["Brian B. Moser","Arundhati S. Shanbhag","Federico Raue","Stanislav Frolov","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2401.00736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07286v2","updated":"2024-02-06T12:02:40Z","published":"2023-07-14T11:52:10Z","title":"One-Shot Action Recognition via Multi-Scale Spatial-Temporal Skeleton\n Matching","summary":" One-shot skeleton action recognition, which aims to learn a skeleton action\nrecognition model with a single training sample, has attracted increasing\ninterest due to the challenge of collecting and annotating large-scale skeleton\naction data. However, most existing studies match skeleton sequences by\ncomparing their feature vectors directly, which neglects spatial structures and\ntemporal orders of skeleton data. This paper presents a novel one-shot skeleton\naction recognition technique that handles skeleton action recognition via\nmulti-scale spatial-temporal feature matching. We represent skeleton data at\nmultiple spatial and temporal scales and achieve optimal feature matching from\ntwo perspectives. The first is multi-scale matching which captures the\nscale-wise semantic relevance of skeleton data at multiple spatial and temporal\nscales simultaneously. The second is cross-scale matching which handles\ndifferent motion magnitudes and speeds by capturing sample-wise relevance\nacross multiple scales. Extensive experiments over three large-scale datasets\n(NTU RGB+D, NTU RGB+D 120, and PKU-MMD) show that our method achieves superior\none-shot skeleton action recognition, and it outperforms the state-of-the-art\nconsistently by large margins.\n","authors":["Siyuan Yang","Jun Liu","Shijian Lu","Er Meng Hwa","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2307.07286v2.pdf","comment":"8 pages, 4 figures, 6 tables. Accepted by IEEE Transactions on\n Pattern Analysis and Machine Intelligence"},{"id":"http://arxiv.org/abs/2311.17536v2","updated":"2024-02-06T12:01:47Z","published":"2023-11-29T11:14:43Z","title":"SmoothVideo: Smooth Video Synthesis with Noise Constraints on Diffusion\n Models for One-shot Video Tuning","summary":" Recent one-shot video tuning methods, which fine-tune the network on a\nspecific video based on pre-trained text-to-image models (e.g., Stable\nDiffusion), are popular in the community because of their flexibility. However,\nthese methods often produce videos marred by incoherence and inconsistency. To\naddress these limitations, this paper introduces a simple yet effective noise\nconstraint across video frames. This constraint aims to regulate noise\npredictions across their temporal neighbors, resulting in smooth latents. It\ncan be simply included as a loss term during the training phase. By applying\nthe loss to existing one-shot video tuning methods, we significantly improve\nthe overall consistency and smoothness of the generated videos. Furthermore, we\nargue that current video evaluation metrics inadequately capture smoothness. To\naddress this, we introduce a novel metric that considers detailed features and\ntheir temporal dynamics. Experimental results validate the effectiveness of our\napproach in producing smoother videos on various one-shot video tuning\nbaselines. 
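The noise constraint from the SmoothVideo abstract above amounts to a one-line loss term; a sketch, with the weighting left as a hyperparameter:

import torch

def temporal_noise_consistency(eps_pred):
    # eps_pred: [B, T, C, H, W] noise predictions for T frames; penalizing
    # differences between temporal neighbors encourages smooth latents.
    return (eps_pred[:, 1:] - eps_pred[:, :-1]).pow(2).mean()

# Added to the usual objective during one-shot tuning:
# loss = denoising_loss + lambda_smooth * temporal_noise_consistency(eps_pred)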
The source code and video demos are available at\n\\href{https://github.com/SPengLiang/SmoothVideo}{https://github.com/SPengLiang/SmoothVideo}.\n","authors":["Liang Peng","Haoran Cheng","Zheng Yang","Ruisi Zhao","Linxuan Xia","Chaotian Song","Qinglin Lu","Boxi Wu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17536v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02327v2","updated":"2024-02-06T11:35:05Z","published":"2024-02-04T03:02:35Z","title":"Bootstrapping Audio-Visual Segmentation by Strengthening Audio Cues","summary":" How to achieve effective interaction between audio and vision has garnered\nconsiderable interest within the multi-modality research field. Recently, a novel\naudio-visual segmentation (AVS) task has been proposed, aiming to segment the\nsounding objects in video frames under the guidance of audio cues. However,\nmost existing AVS methods are hindered by a modality imbalance where the visual\nfeatures tend to dominate those of the audio modality, due to a unidirectional\nand insufficient integration of audio cues. This imbalance skews the feature\nrepresentation towards the visual aspect, impeding the learning of joint\naudio-visual representations and potentially causing segmentation inaccuracies.\nTo address this issue, we propose AVSAC. Our approach features a Bidirectional\nAudio-Visual Decoder (BAVD) with integrated bidirectional bridges, enhancing\naudio cues and fostering continuous interplay between audio and visual\nmodalities. This bidirectional interaction narrows the modality imbalance,\nfacilitating more effective learning of integrated audio-visual\nrepresentations. Additionally, we present a strategy for audio-visual\nframe-wise synchrony as fine-grained guidance of BAVD. This strategy enhances\nthe share of auditory components in visual features, contributing to a more\nbalanced audio-visual representation learning. Extensive experiments show that\nour method attains new benchmarks in AVS performance.\n","authors":["Tianxiang Chen","Zhentao Tan","Tao Gong","Qi Chu","Yue Wu","Bin Liu","Le Lu","Jieping Ye","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2402.02327v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03917v1","updated":"2024-02-06T11:35:02Z","published":"2024-02-06T11:35:02Z","title":"Elastic Feature Consolidation for Cold Start Exemplar-free Incremental\n Learning","summary":" Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a\nsequence of tasks without having access to previous task data. In this paper,\nwe consider the challenging Cold Start scenario in which insufficient data is\navailable in the first task to learn a high-quality backbone. This is\nespecially challenging for EFCIL since it requires high plasticity, which\nresults in feature drift that is difficult to compensate for in the\nexemplar-free setting. To address this problem, we propose a simple and\neffective approach that consolidates feature representations by regularizing\ndrift in directions highly relevant to previous tasks and employs prototypes to\nreduce task-recency bias. Our method, called Elastic Feature Consolidation\n(EFC), exploits a tractable second-order approximation of feature drift based\non an Empirical Feature Matrix (EFM). 
The EFM induces a pseudo-metric in\nfeature space which we use to regularize feature drift in important directions\nand to update Gaussian prototypes used in a novel asymmetric cross entropy loss\nwhich effectively balances prototype rehearsal with data from new tasks.\nExperimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and\nImageNet-1K demonstrate that Elastic Feature Consolidation is better able to\nlearn new tasks by maintaining model plasticity and significantly outperforms\nthe state-of-the-art.\n","authors":["Simone Magistri","Tomaso Trinci","Albin Soutif-Cormerais","Joost van de Weijer","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2402.03917v1.pdf","comment":"Accepted at Twelfth International Conference on Learning\n Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2306.05418v2","updated":"2024-02-06T11:27:57Z","published":"2023-06-08T17:58:57Z","title":"Weakly Supervised 3D Object Detection with Multi-Stage Generalization","summary":" With the rapid development of large models, the need for data has become\nincreasingly crucial. Especially in 3D object detection, costly manual\nannotations have hindered further advancements. To reduce the burden of\nannotation, we study the problem of achieving 3D object detection solely based\non 2D annotations. Thanks to advanced 3D reconstruction techniques, it is now\nfeasible to reconstruct the overall static 3D scene. However, extracting\nprecise object-level annotations from the entire scene and generalizing these\nlimited annotations to the entire scene remain challenges. In this paper, we\nintroduce a novel paradigm called BA$^2$-Det, encompassing pseudo label\ngeneration and multi-stage generalization. We devise the DoubleClustering\nalgorithm to obtain object clusters from reconstructed scene-level points, and\nfurther enhance the model's detection capabilities by developing three stages\nof generalization: progressing from complete to partial, static to dynamic, and\nclose to distant. Experiments conducted on the large-scale Waymo Open Dataset\nshow that the performance of BA$^2$-Det is on par with the fully-supervised\nmethods using 10% annotations. Additionally, using large raw videos for\npretraining, BA$^2$-Det can achieve a 20% relative improvement on the KITTI\ndataset. The method also has great potential for detecting open-set 3D objects\nin complex scenes. Project page: https://ba2det.site.\n","authors":["Jiawei He","Yuqi Wang","Yuntao Chen","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.05418v2.pdf","comment":"Project page: https://ba2det.site"},{"id":"http://arxiv.org/abs/2402.03908v1","updated":"2024-02-06T11:21:58Z","published":"2024-02-06T11:21:58Z","title":"EscherNet: A Generative Model for Scalable View Synthesis","summary":" We introduce EscherNet, a multi-view conditioned diffusion model for view\nsynthesis. EscherNet learns implicit and generative 3D representations coupled\nwith a specialised camera positional encoding, allowing precise and continuous\nrelative control of the camera transformation between an arbitrary number of\nreference and target views. EscherNet offers exceptional generality,\nflexibility, and scalability in view synthesis -- it can generate more than 100\nconsistent target views simultaneously on a single consumer-grade GPU, despite\nbeing trained with a fixed number of 3 reference views to 3 target views. 
As a\nresult, EscherNet not only addresses zero-shot novel view synthesis, but also\nnaturally unifies single- and multi-image 3D reconstruction, combining these\ndiverse tasks into a single, cohesive framework. Our extensive experiments\ndemonstrate that EscherNet achieves state-of-the-art performance in multiple\nbenchmarks, even when compared to methods specifically tailored for each\nindividual problem. This remarkable versatility opens up new directions for\ndesigning scalable neural architectures for 3D vision. Project page:\n\\url{https://kxhit.github.io/EscherNet}.\n","authors":["Xin Kong","Shikun Liu","Xiaoyang Lyu","Marwan Taher","Xiaojuan Qi","Andrew J. Davison"],"pdf_url":"https://arxiv.org/pdf/2402.03908v1.pdf","comment":"Project Page: https://kxhit.github.io/EscherNet"},{"id":"http://arxiv.org/abs/2402.03904v1","updated":"2024-02-06T11:16:18Z","published":"2024-02-06T11:16:18Z","title":"Deep MSFOP: Multiple Spectral filter Operators Preservation in Deep\n Functional Maps for Unsupervised Shape Matching","summary":" We propose a novel constraint called Multiple Spectral filter Operators\nPreservation (MSFOP) to compute functional maps and, based on it, develop an\nefficient deep functional map architecture called Deep MSFOP for shape\nmatching. The core idea is that, instead of using the general descriptor\npreservation constraint, we require our maps to preserve multiple spectral\nfilter operators. This allows us to incorporate more informative geometrical\ninformation, contained in different frequency bands of functions, into the\nfunctional map computing. This is confirmed by the fact that some previous\ntechniques, such as wavelet preservation and LBO commutativity, are actually\nspecial cases of our approach. Moreover, we also develop a very efficient way to compute the\nmaps with the MSFOP constraint, which can be conveniently embedded into deep\nlearning, especially with learnable filter operators. Utilizing the above\nresults, we finally design our Deep MSFOP pipeline, equipped with a suitable\nunsupervised loss jointly penalizing the functional map and the underlying\npointwise map. Our deep functional map has notable advantages, including that\nthe functional map is more geometrically informative and guaranteed to be\nproper, and the computing is numerically stable. Extensive experimental results\non different datasets demonstrate that our approach outperforms the existing\nstate-of-the-art methods, especially in challenging settings like non-isometric\nand inconsistent topology datasets.\n","authors":["Feifan Luo","Qingsong Li","Ling Hu","Xinru Liu","Haojun Xu","Haibo Wang","Ting Li","Shengjun Liu"],"pdf_url":"https://arxiv.org/pdf/2402.03904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05653v5","updated":"2024-02-06T11:12:02Z","published":"2022-09-13T00:01:23Z","title":"Semantic2Graph: Graph-based Multi-modal Feature Fusion for Action\n Segmentation in Videos","summary":" Video action segmentation has been widely applied in many fields. Most\nprevious studies employed video-based vision models for this purpose. However,\nthey often rely on a large receptive field, LSTM, or Transformer methods to\ncapture long-term dependencies within videos, leading to significant\ncomputational resource requirements. To address this challenge, graph-based\nmodels were proposed. 
However, previous graph-based models are less accurate.\nHence, this study introduces a graph-structured approach named Semantic2Graph\nto model long-term dependencies in videos, thereby reducing computational costs\nand raising accuracy. We construct a graph structure of videos at the\nframe level. Temporal edges are utilized to model the temporal relations and\naction order within videos. Additionally, we have designed positive and\nnegative semantic edges, accompanied by corresponding edge weights, to capture\nboth long-term and short-term semantic relationships in video actions. Node\nattributes encompass a rich set of multi-modal features extracted from video\ncontent, graph structures, and label text, covering visual, structural, and\nsemantic cues. To synthesize this multi-modal information effectively, we\nemploy a graph neural network (GNN) model to fuse multi-modal features for node\naction label classification. Experimental results demonstrate that\nSemantic2Graph outperforms state-of-the-art methods in terms of performance,\nparticularly on benchmark datasets such as GTEA and 50Salads. Multiple ablation\nexperiments further validate the effectiveness of semantic features in\nenhancing model performance. Notably, the inclusion of semantic edges in\nSemantic2Graph allows for the cost-effective capture of long-term dependencies,\naffirming its utility in addressing the challenges posed by computational\nresource constraints in video-based vision models.\n","authors":["Junbin Zhang","Pei-Hsuan Tsai","Meng-Hsun Tsai"],"pdf_url":"https://arxiv.org/pdf/2209.05653v5.pdf","comment":"13 pages, 3 figures, 9 tables. Published in Applied Intelligence"},{"id":"http://arxiv.org/abs/2402.03896v1","updated":"2024-02-06T11:07:05Z","published":"2024-02-06T11:07:05Z","title":"Convincing Rationales for Visual Question Answering Reasoning","summary":" Visual Question Answering (VQA) is a challenging task of predicting the\nanswer to a question about the content of an image. It requires a deep\nunderstanding of both the textual question and the visual image. Prior works\ndirectly evaluate the answering models by simply calculating the accuracy of\nthe predicted answers. However, the inner reasoning behind the prediction is\ndisregarded in such a \"black box\" system, and we do not even know if one can\ntrust the predictions. In some cases, the models still get the correct answers\neven when they focus on irrelevant visual regions or textual tokens, which\nmakes the models unreliable and illogical. To generate both visual and textual\nrationales next to the predicted answer to the given image/question pair, we\npropose Convincing Rationales for VQA, CRVQA. Considering the extra annotations\nbrought by the new outputs, {CRVQA} is trained and evaluated by samples\nconverted from some existing VQA datasets and their visual labels. The\nextensive experiments demonstrate that the visual and textual rationales\nsupport the prediction of the answers, and further improve the accuracy.\nFurthermore, {CRVQA} achieves competitive performance on generic VQA datasets\nin the zero-shot evaluation setting. 
The dataset and source code will be\nreleased at https://github.com/lik1996/CRVQA2024.\n","authors":["Kun Li","George Vosselman","Michael Ying Yang"],"pdf_url":"https://arxiv.org/pdf/2402.03896v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2310.05453v3","updated":"2024-02-06T10:25:10Z","published":"2023-10-09T06:57:55Z","title":"Memory-Assisted Sub-Prototype Mining for Universal Domain Adaptation","summary":" Universal domain adaptation aims to align the classes and reduce the feature\ngap between the same category of the source and target domains. The target\nprivate category is set as the unknown class during the adaptation process, as\nit is not included in the source domain. However, most existing methods\noverlook the intra-class structure within a category, especially in cases where\nthere exists significant concept shift between the samples belonging to the\nsame category. When samples with large concept shift are forced to be pushed\ntogether, it may negatively affect the adaptation performance. Moreover, from\nthe interpretability aspect, it is unreasonable to align visual features with\nsignificant differences, such as fighter jets and civil aircraft, into the same\ncategory. Unfortunately, due to such semantic ambiguity and annotation cost,\ncategories are not always classified in detail, making it difficult for the\nmodel to perform precise adaptation. To address these issues, we propose a\nnovel Memory-Assisted Sub-Prototype Mining (MemSPM) method that can learn the\ndifferences between samples belonging to the same category and mine sub-classes\nwhen there exists significant concept shift between them. By doing so, our\nmodel learns a more reasonable feature space that enhances the transferability\nand reflects the inherent differences among samples annotated as the same\ncategory. We evaluate the effectiveness of our MemSPM method over multiple\nscenarios, including UniDA, OSDA, and PDA. Our method achieves state-of-the-art\nperformance on four benchmarks in most cases.\n","authors":["Yuxiang Lai","Yi Zhou","Xinghong Liu","Tao Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.05453v3.pdf","comment":"Accepted by The International Conference on Learning Representations\n (ICLR) 2024"},{"id":"http://arxiv.org/abs/2303.15414v2","updated":"2024-02-06T10:24:49Z","published":"2023-03-27T17:39:00Z","title":"Learnable Graph Matching: A Practical Paradigm for Data Association","summary":" Data association is at the core of many computer vision tasks, e.g., multiple\nobject tracking, image matching, and point cloud registration. However, current\ndata association solutions have some defects: they mostly ignore the intra-view\ncontext information; besides, they either train deep association models in an\nend-to-end way and hardly utilize the advantage of optimization-based\nassignment methods, or only use an off-the-shelf neural network to extract\nfeatures. In this paper, we propose a general learnable graph matching method\nto address these issues. Specifically, we model the intra-view relationships as\nan undirected graph. Then data association turns into a general graph matching\nproblem between graphs. Furthermore, to make optimization end-to-end\ndifferentiable, we relax the original graph matching problem into continuous\nquadratic programming and then incorporate training into a deep graph neural\nnetwork with KKT conditions and the implicit function theorem. In the MOT task, our\nmethod achieves state-of-the-art performance on several MOT datasets. 
For image\nmatching, our method outperforms state-of-the-art methods on a popular indoor\ndataset, ScanNet. For point cloud registration, we also achieve competitive\nresults. Code will be available at https://github.com/jiaweihe1996/GMTracker.\n","authors":["Jiawei He","Zehao Huang","Naiyan Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.15414v2.pdf","comment":"Accepted by TPAMI 2024. arXiv admin note: substantial text overlap\n with arXiv:2103.16178"},{"id":"http://arxiv.org/abs/2402.01303v2","updated":"2024-02-06T10:01:10Z","published":"2024-02-02T10:47:08Z","title":"AGILE: Approach-based Grasp Inference Learned from Element Decomposition","summary":" Humans, experts in grasp detection, can grasp objects by taking\ninto account hand-object positioning information. This work proposes a method\nto enable a robot manipulator to learn the same, grasping objects in the\noptimal way according to how the gripper has approached the object. Built on\ndeep learning, the proposed method consists of two main stages. In order to\ngeneralize the network on unseen objects, the proposed Approach-based Grasping\nInference involves an element decomposition stage to split an object into its\nmain parts, each with one or more annotated grasps for a particular approach of\nthe gripper. Subsequently, a grasp detection network utilizes the decomposed\nelements by Mask R-CNN and the information on the approach of the gripper in\norder to detect the element the gripper has approached and the optimal\ngrasp. In order to train the networks, the study introduces a robotic grasping\ndataset collected in the Coppeliasim simulation environment. The dataset\ninvolves 10 different objects with annotated element decomposition masks and\ngrasp rectangles. The proposed method acquires a 90% grasp success rate on seen\nobjects and 78% on unseen objects in the Coppeliasim simulation environment.\nLastly, simulation-to-reality domain adaptation is performed by applying\ntransformations on the training set collected in simulation and augmenting the\ndataset, which results in a 70% physical grasp success performance using a\nDelta parallel robot and a 2-fingered gripper.\n","authors":["MohammadHossein Koosheshi","Hamed Hosseini","Mehdi Tale Masouleh","Ahmad Kalhor","Mohammad Reza Hairi Yazdi"],"pdf_url":"https://arxiv.org/pdf/2402.01303v2.pdf","comment":"Conference Paper, ICROM 2023, 8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.14777v3","updated":"2024-02-06T09:41:44Z","published":"2023-05-24T06:31:05Z","title":"Generative Modeling through the Semi-dual Formulation of Unbalanced\n Optimal Transport","summary":" The Optimal Transport (OT) problem investigates a transport map that bridges two\ndistributions while minimizing a given cost function. In this regard, OT\nbetween a tractable prior distribution and data has been utilized for generative\nmodeling tasks. However, OT-based methods are susceptible to outliers and face\noptimization challenges during training. In this paper, we propose a novel\ngenerative model based on the semi-dual formulation of Unbalanced Optimal\nTransport (UOT). Unlike OT, UOT relaxes the hard constraint on distribution\nmatching. This approach provides better robustness against outliers, stability\nduring training, and faster convergence. We validate these properties\nempirically through experiments. Moreover, we study the theoretical upper bound\nof the divergence between distributions in UOT. 
Our model outperforms existing\nOT-based generative models, achieving FID scores of 2.97 on CIFAR-10 and 6.36\non CelebA-HQ-256. The code is available at\n\\url{https://github.com/Jae-Moo/UOTM}.\n","authors":["Jaemoo Choi","Jaewoong Choi","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2305.14777v3.pdf","comment":"23 pages, 15 figures"},{"id":"http://arxiv.org/abs/2402.03843v1","updated":"2024-02-06T09:39:05Z","published":"2024-02-06T09:39:05Z","title":"A new method for optical steel rope non-destructive damage detection","summary":" This paper presents a novel algorithm for non-destructive damage detection\nfor steel ropes in high-altitude environments (aerial ropeway). The algorithm\ncomprises two key components: First, a segmentation model named RGBD-UNet is\ndesigned to accurately extract steel ropes from complex backgrounds. This model\nis equipped with the capability to process and combine color and depth\ninformation through the proposed CMA module. Second, a detection model named\nVovNetV3.5 is developed to differentiate between normal and abnormal steel\nropes. It integrates the VovNet architecture with a DBB module to enhance\nperformance. In addition, a novel background augmentation method is proposed to\nenhance the generalization ability of the segmentation model. Datasets\ncontaining images of steel ropes in different scenarios are created for the\ntraining and testing of both the segmentation and detection models. Experiments\ndemonstrate a significant improvement over baseline models. On the proposed\ndataset, the highest accuracy achieved by the detection model reached 0.975,\nand the maximum F-measure achieved by the segmentation model reached 0.948.\n","authors":["Yunqing Bao","Bin Hu"],"pdf_url":"https://arxiv.org/pdf/2402.03843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03840v1","updated":"2024-02-06T09:37:42Z","published":"2024-02-06T09:37:42Z","title":"Belief Scene Graphs: Expanding Partial Scenes with Objects through\n Computation of Expectation","summary":" In this article, we propose the novel concept of Belief Scene Graphs, which\nare utility-driven extensions of partial 3D scene graphs that enable efficient\nhigh-level task planning with partial information. We propose a graph-based\nlearning methodology for the computation of belief (also referred to as\nexpectation) on any given 3D scene graph, which is then used to strategically\nadd new nodes (referred to as blind nodes) that are relevant for a robotic\nmission. We propose the method of Computation of Expectation based on\nCorrelation Information (CECI) to reasonably approximate the real\nBelief/Expectation by learning histograms from available training data. A\nnovel Graph Convolutional Neural Network (GCN) model is developed, to learn\nCECI from a repository of 3D scene graphs. As no database of 3D scene graphs\nexists for the training of the novel CECI model, we present a novel methodology\nfor generating a 3D scene graph dataset based on semantically annotated\nreal-life 3D spaces. The generated dataset is then utilized to train the\nproposed CECI model and for extensive validation of the proposed method. We\nestablish the novel concept of \\textit{Belief Scene Graphs} (BSG) as a core\ncomponent to integrate expectations into abstract representations. This new\nconcept is an evolution of the classical 3D scene graph concept and aims to\nenable high-level reasoning for the task planning and optimization of a variety\nof robotics missions. 
The efficacy of the overall framework has been evaluated\nin an object search scenario, and it has also been tested in a real-life\nexperiment emulating human common sense about unseen objects.\n","authors":["Mario A. V. Saucedo","Akash Patel","Akshit Saradagi","Christoforos Kanellakis","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2402.03840v1.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2307.05591v3","updated":"2024-02-06T09:33:48Z","published":"2023-07-10T17:59:21Z","title":"Linear Alignment of Vision-language Models for Image Captioning","summary":" Recently, vision-language models like CLIP have advanced the state of the art\nin a variety of multi-modal tasks including image captioning and caption\nevaluation. Many approaches adapt CLIP-style models to a downstream task by\ntraining a mapping network between CLIP and a language model. This is costly as\nit usually involves calculating gradients for large models. We propose a more\nefficient training protocol that fits a linear mapping between image and text\nembeddings of CLIP via a closed-form solution. This bypasses the need for\ngradient computation and results in a lightweight captioning method called\nReCap, which can be trained up to 1000 times faster than existing lightweight\nmethods. Moreover, we propose two new learning-based image-captioning metrics\nthat build on CLIP score along with our linear mapping. Furthermore, we combine\nReCap with our new metrics to design an iterative datastore-augmentation loop\n(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k,\nVizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art\nlightweight methods on established metrics while outperforming them on our new\nmetrics, which are better aligned with human ratings on Flickr8k-Expert and\nFlickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to\nother domains and that our DAL leads to a performance boost.\n","authors":["Fabian Paischer","Markus Hofmarcher","Sepp Hochreiter","Thomas Adler"],"pdf_url":"https://arxiv.org/pdf/2307.05591v3.pdf","comment":"8 pages (+ references and appendix)"},{"id":"http://arxiv.org/abs/2402.03833v1","updated":"2024-02-06T09:24:53Z","published":"2024-02-06T09:24:53Z","title":"An SVD-free Approach to Nonlinear Dictionary Learning based on RVFL","summary":" This paper presents a novel nonlinear dictionary learning algorithm\nleveraging the theory of a feed-forward neural network called Random Vector\nFunctional Link (RVFL). The proposed RVFL-based nonlinear Dictionary Learning\n(RVFLDL) learns a dictionary as a sparse-to-dense feature map from nonlinear\nsparse coefficients to the dense input features. Kernel-based nonlinear\ndictionary learning methods operate in a feature space obtained by an implicit\nfeature map, and they depend on computationally expensive\noperations like Singular Value Decomposition (SVD). Training the RVFL-based\ndictionary is free from SVD computation as RVFL generates weights from the\ninput to the output layer analytically. A sparsity-inducing Horseshoe prior is\nassumed on the coefficients to generate a sparse coefficient matrix w.r.t. an\ninitial random dictionary. Higher-order dependencies between the input sparse\ncoefficients and the dictionary atoms are incorporated into the training\nprocess by nonlinearly transforming the sparse coefficients and adding them as\nenhanced features. 
Thus, the method projects sparse coefficients to a higher\ndimensional space while inducing nonlinearities into the dictionary. For\nclassification using RVFL-net, a classifier matrix is learned as a transform\nthat maps nonlinear sparse coefficients to the labels. The performance of the\nmethod, illustrated in image classification and reconstruction applications, is\ncomparable to that of other nonlinear dictionary learning methods. Experiments\nshow that RVFLDL is scalable and provides a solution better than those obtained\nusing other nonlinear dictionary learning methods.\n","authors":["G. Madhuri","Atul Negi"],"pdf_url":"https://arxiv.org/pdf/2402.03833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03830v1","updated":"2024-02-06T09:19:44Z","published":"2024-02-06T09:19:44Z","title":"OASim: an Open and Adaptive Simulator based on Neural Rendering for\n Autonomous Driving","summary":" With the development of deep learning and computer vision technology, autonomous\ndriving provides new solutions to improve traffic safety and efficiency. The\nimportance of building high-quality datasets is self-evident, especially with\nthe rise of end-to-end autonomous driving algorithms in recent years. Data\nplays a core role in the algorithm closed-loop system. However, collecting\nreal-world data is expensive, time-consuming, and unsafe. With the development\nof implicit rendering technology and in-depth research on using generative\nmodels to produce data at scale, we propose OASim, an open and adaptive\nsimulator and autonomous driving data generator based on implicit neural\nrendering. It has the following characteristics: (1) High-quality scene\nreconstruction through neural implicit surface reconstruction technology. (2)\nTrajectory editing of the ego vehicle and participating vehicles. (3) A rich\nvehicle model library from which vehicles can be freely selected and inserted into the scene.\n(4) A rich sensor model library from which specified sensors can be selected to\ngenerate data. (5) A highly customizable data generation system that can generate\ndata according to user needs. We demonstrate the high quality and fidelity of\nthe generated data through perception performance evaluation on the Carla\nsimulator and real-world data acquisition. Code is available at\nhttps://github.com/PJLab-ADG/OASim.\n","authors":["Guohang Yan","Jiahao Pi","Jianfei Guo","Zhaotong Luo","Min Dou","Nianchen Deng","Qiusheng Huang","Daocheng Fu","Licheng Wen","Pinlong Cai","Xing Gao","Xinyu Cai","Bo Zhang","Xuemeng Yang","Yeqi Bai","Hongbin Zhou","Botian Shi"],"pdf_url":"https://arxiv.org/pdf/2402.03830v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.02430v2","updated":"2024-02-06T09:18:44Z","published":"2024-02-04T09:59:18Z","title":"Exploiting Low-level Representations for Ultra-Fast Road Segmentation","summary":" Achieving real-time performance and accuracy on embedded platforms has always been the\npursuit of road segmentation methods. To this end, many lightweight networks have been\nproposed. However, they ignore the fact that roads are \"stuff\"\n(background or environmental elements) rather than \"things\" (specific\nidentifiable objects), which inspires us to explore the feasibility of\nrepresenting roads with low-level instead of high-level features. Surprisingly,\nwe find that the primary stage of mainstream network models is sufficient to\nrepresent most pixels of the road for segmentation. 
Motivated by this, we\npropose a Low-level Feature Dominated Road Segmentation network (LFD-RoadSeg).\nSpecifically, LFD-RoadSeg employs a bilateral structure. The spatial detail\nbranch is first designed to extract a low-level feature representation of the\nroad using the first stage of ResNet-18. To suppress texture-less regions mistaken\nfor the road in the low-level feature, the context semantic branch is then\ndesigned to extract the context feature in a fast manner. To this end, in the\nsecond branch, we asymmetrically downsample the input image and design an\naggregation module to achieve comparable receptive fields to the third stage of\nResNet-18 but with less time consumption. Finally, to segment the road from the\nlow-level feature, a selective fusion module is proposed to calculate\npixel-wise attention between the low-level representation and context feature,\nand suppress the non-road low-level response with this attention. On KITTI-Road,\nLFD-RoadSeg achieves a maximum F1-measure (MaxF) of 95.21% and an average\nprecision of 93.71%, while reaching 238 FPS on a single TITAN Xp and 54 FPS on\na Jetson TX2, all with a compact model size of just 936k parameters. The source\ncode is available at https://github.com/zhouhuan-hust/LFD-RoadSeg.\n","authors":["Huan Zhou","Feng Xue","Yucong Li","Shi Gong","Yiqun Li","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.02430v2.pdf","comment":"11 pages, 7 figures, IEEE TITS"},{"id":"http://arxiv.org/abs/2302.00487v3","updated":"2024-02-06T09:12:09Z","published":"2023-01-31T11:34:56Z","title":"A Comprehensive Survey of Continual Learning: Theory, Method and\n Application","summary":" To cope with real-world dynamics, an intelligent system needs to\nincrementally acquire, update, accumulate, and exploit knowledge throughout its\nlifetime. This ability, known as continual learning, provides a foundation for\nAI systems to develop themselves adaptively. In a general sense, continual\nlearning is explicitly limited by catastrophic forgetting, where learning a new\ntask usually results in a dramatic performance degradation of the old tasks.\nBeyond this, increasingly numerous advances have emerged in recent years that\nlargely extend the understanding and application of continual learning. The\ngrowing and widespread interest in this direction demonstrates its realistic\nsignificance as well as complexity. In this work, we present a comprehensive\nsurvey of continual learning, seeking to bridge the basic settings, theoretical\nfoundations, representative methods, and practical applications. Based on\nexisting theoretical and empirical results, we summarize the general objectives\nof continual learning as ensuring a proper stability-plasticity trade-off and\nan adequate intra/inter-task generalizability in the context of resource\nefficiency. 
Then we provide an up-to-date and elaborate taxonomy,\nextensively analyzing how representative methods address continual learning,\nand how they are adapted to particular challenges in realistic applications.\nThrough an in-depth discussion of promising directions, we believe that such a\nholistic perspective can greatly facilitate subsequent exploration in this\nfield and beyond.\n","authors":["Liyuan Wang","Xingxing Zhang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.00487v3.pdf","comment":"The concise version is in IEEE Transactions on Pattern Analysis and\n Machine Intelligence (TPAMI)"},{"id":"http://arxiv.org/abs/2402.03796v1","updated":"2024-02-06T08:29:39Z","published":"2024-02-06T08:29:39Z","title":"Face Detection: Present State and Research Directions","summary":" The majority of computer vision applications that handle images featuring\nhumans use face detection as a core component. Despite much research on the\ntopic, face detection still has open issues, and its accuracy and speed can\nstill be improved. This review paper shows the progress made in this area as\nwell as the substantial issues that still need to be tackled. The paper also\nprovides research directions that can be taken up as research projects in the\nfield of face detection.\n","authors":["Purnendu Prabhat","Himanshu Gupta","Ajeet Kumar Vishwakarma"],"pdf_url":"https://arxiv.org/pdf/2402.03796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03795v1","updated":"2024-02-06T08:27:49Z","published":"2024-02-06T08:27:49Z","title":"Energy-based Domain-Adaptive Segmentation with Depth Guidance","summary":" Recent endeavors have been made to leverage self-supervised depth estimation\nas guidance in unsupervised domain adaptation (UDA) for semantic segmentation.\nPrior arts, however, overlook the discrepancy between semantic and depth\nfeatures, as well as the reliability of feature fusion, thus leading to\nsuboptimal segmentation performance. To address this issue, we propose a novel\nUDA framework called SMART (croSs doMain semAntic segmentation based on eneRgy\nesTimation) that utilizes Energy-Based Models (EBMs) to obtain task-adaptive\nfeatures and achieve reliable feature fusion for semantic segmentation with\nself-supervised depth estimates. Our framework incorporates two novel\ncomponents: energy-based feature fusion (EB2F) and energy-based reliable fusion\nAssessment (RFA) modules. The EB2F module produces task-adaptive semantic and\ndepth features by explicitly measuring and reducing their discrepancy using\nHopfield energy for better feature fusion. The RFA module evaluates the\nreliability of the feature fusion using an energy score to improve the\neffectiveness of depth guidance. 
Extensive experiments on two datasets\ndemonstrate that our method achieves significant performance gains over prior\nworks, validating the effectiveness of our energy-based learning approach.\n","authors":["Jinjing Zhu","Zhedong Hu","Tae-Kyun Kim","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17326v5","updated":"2024-02-06T08:17:33Z","published":"2023-05-27T02:04:25Z","title":"Matrix Information Theory for Self-Supervised Learning","summary":" The maximum entropy encoding framework provides a unified perspective for\nmany non-contrastive learning methods like SimSiam, Barlow Twins, and MEC.\nInspired by this framework, we introduce Matrix-SSL, a novel approach that\nleverages matrix information theory to interpret the maximum entropy encoding\nloss as matrix uniformity loss. Furthermore, Matrix-SSL enhances the maximum\nentropy encoding method by seamlessly incorporating matrix alignment loss,\ndirectly aligning covariance matrices in different branches. Experimental\nresults reveal that Matrix-SSL outperforms state-of-the-art methods on the\nImageNet dataset under linear evaluation settings and on MS-COCO for transfer\nlearning tasks. Specifically, when performing transfer learning tasks on\nMS-COCO, our method outperforms previous SOTA methods such as MoCo v2 and BYOL\nby up to 3.3% with only 400 epochs compared to 800 epochs of pre-training. We also\ntry to introduce representation learning into the language modeling regime,\nachieving 72.3% on the GSM8K dataset by fine-tuning a 7B model using matrix\ncross-entropy loss, with a margin of 3.1% over the standard cross-entropy loss.\nCode available at https://github.com/yifanzhang-pro/Matrix-SSL.\n","authors":["Yifan Zhang","Zhiquan Tan","Jingqin Yang","Weiran Huang","Yang Yuan"],"pdf_url":"https://arxiv.org/pdf/2305.17326v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03783v1","updated":"2024-02-06T07:53:23Z","published":"2024-02-06T07:53:23Z","title":"Exploring Low-Resource Medical Image Classification with Weakly\n Supervised Prompt Learning","summary":" Most advances in medical image recognition supporting clinical auxiliary\ndiagnosis meet challenges due to the low-resource situation in the medical\nfield, where annotations are highly expensive and require professional\nexpertise. This low-resource problem can be alleviated by leveraging the transferable\nrepresentations of large-scale pre-trained vision-language models via relevant\nmedical text prompts. However, existing pre-trained vision-language models\nrequire domain experts to carefully design the medical prompts, which greatly\nincreases the burden on clinicians. To address this problem, we propose a\nweakly supervised prompt learning method, MedPrompt, to automatically generate\nmedical prompts, which includes an unsupervised pre-trained vision-language\nmodel and a weakly supervised prompt learning model. The unsupervised\npre-trained vision-language model utilizes the natural correlation between\nmedical images and corresponding medical texts for pre-training, without any\nmanual annotations. The weakly supervised prompt learning model only utilizes\nthe classes of images in the dataset to guide the learning of the specific\nclass vector in the prompt, while the learning of other context vectors in the\nprompt requires no manual annotations for guidance. 
To the best of our\nknowledge, this is the first model to automatically generate medical prompts.\nWith these prompts, the pre-trained vision-language model can be freed from the\nstrong expert dependency of manual annotation and manual prompt design.\nExperimental results show that the model using our automatically generated\nprompts outperforms its counterparts using hand-crafted prompts with full-shot\nlearning, requiring only a minimal number of labeled samples for few-shot\nlearning, and reaches superior or comparable accuracy on zero-shot image\nclassification. The proposed prompt generator is lightweight and therefore can\nbe embedded into any network architecture.\n","authors":["Fudan Zheng","Jindong Cao","Weijiang Yu","Zhiguang Chen","Nong Xiao","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2402.03783v1.pdf","comment":"Accepted by Pattern Recognition"},{"id":"http://arxiv.org/abs/2110.03854v2","updated":"2024-02-06T07:40:00Z","published":"2021-10-08T01:50:54Z","title":"Meta-Learning 3D Shape Segmentation Functions","summary":" Learning robust 3D shape segmentation functions with deep neural networks has\nemerged as a powerful paradigm, offering promising performance in producing a\nconsistent part segmentation of each 3D shape. Generalizing across 3D shape\nsegmentation functions requires robust learning of priors over the respective\nfunction space and enables consistent part segmentation of shapes in the presence\nof significant 3D structure variations. Existing generalization methods rely on\nextensive training of 3D shape segmentation functions on large-scale labeled\ndatasets. In this paper, we propose to formalize the learning of a 3D shape\nsegmentation function space as a meta-learning problem, aiming to predict a 3D\nsegmentation model that can be quickly adapted to new shapes with no or limited\ntraining data. More specifically, we define each task as unsupervised learning\nof a shape-conditioned 3D segmentation function that takes as input points in 3D\nspace and predicts the part-segment labels. The 3D segmentation function is\ntrained by a self-supervised 3D shape reconstruction loss without the need for\npart labels. Also, we introduce an auxiliary deep neural network as a\nmeta-learner which takes as input a 3D shape and predicts the prior over the\nrespective 3D segmentation function space. We show in experiments that our\nmeta-learning approach, denoted as Meta-3DSeg, leads to improvements on\nunsupervised 3D shape segmentation over the conventional designs of deep neural\nnetworks for 3D shape segmentation functions.\n","authors":["Yu Hao","Hao Huang","Shuaihang Yuan","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2110.03854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03769v1","updated":"2024-02-06T07:22:50Z","published":"2024-02-06T07:22:50Z","title":"AttackNet: Enhancing Biometric Security via Tailored Convolutional\n Neural Network Architectures for Liveness Detection","summary":" Biometric security is the cornerstone of modern identity verification and\nauthentication systems, where the integrity and reliability of biometric\nsamples are of paramount importance. This paper introduces AttackNet, a bespoke\nConvolutional Neural Network architecture, meticulously designed to combat\nspoofing threats in biometric systems. Rooted in deep learning methodologies,\nthis model offers a layered defense mechanism, seamlessly transitioning from\nlow-level feature extraction to high-level pattern discernment. 
Three\ndistinctive architectural phases form the crux of the model, each underpinned\nby judiciously chosen activation functions, normalization techniques, and\ndropout layers to ensure robustness and resilience against adversarial attacks.\nBenchmarking our model across diverse datasets affirms its prowess, showcasing\nsuperior performance metrics in comparison to contemporary models. Furthermore,\na detailed comparative analysis accentuates the model's efficacy, drawing\nparallels with prevailing state-of-the-art methodologies. Through iterative\nrefinement and an informed architectural strategy, AttackNet underscores the\npotential of deep learning in safeguarding the future of biometric security.\n","authors":["Oleksandr Kuznetsov","Dmytro Zakharov","Emanuele Frontoni","Andrea Maranesi"],"pdf_url":"https://arxiv.org/pdf/2402.03769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03766v1","updated":"2024-02-06T07:16:36Z","published":"2024-02-06T07:16:36Z","title":"MobileVLM V2: Faster and Stronger Baseline for Vision Language Model","summary":" We introduce MobileVLM V2, a family of vision language models significantly\nimproved upon MobileVLM, which proves that a delicate orchestration of novel\narchitectural design, an improved training scheme tailored for mobile VLMs, and\nrich high-quality dataset curation can substantially benefit VLMs' performance.\nSpecifically, MobileVLM V2 1.7B achieves better or on-par performance on\nstandard VLM benchmarks compared with much larger VLMs at the 3B scale.\nNotably, our 3B model outperforms a large variety of VLMs at the 7B+ scale. Our\nmodels will be released at https://github.com/Meituan-AutoML/MobileVLM .\n","authors":["Xiangxiang Chu","Limeng Qiao","Xinyu Zhang","Shuang Xu","Fei Wei","Yang Yang","Xiaofei Sun","Yiming Hu","Xinyang Lin","Bo Zhang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2402.03766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03762v1","updated":"2024-02-06T07:07:33Z","published":"2024-02-06T07:07:33Z","title":"MoD-SLAM: Monocular Dense Mapping for Unbounded 3D Scene Reconstruction","summary":" Neural implicit representations have recently been demonstrated in many\nfields including Simultaneous Localization And Mapping (SLAM). Current neural\nSLAM can achieve ideal results in reconstructing bounded scenes, but this\nrelies on the input of RGB-D images. Neural SLAM based only on RGB images\nis unable to reconstruct the scale of the scene accurately, and it also suffers\nfrom scale drift due to errors accumulated during tracking. To overcome these\nlimitations, we present MoD-SLAM, a monocular dense mapping method that allows\nglobal pose optimization and 3D reconstruction in real-time in unbounded\nscenes. Optimizing scene reconstruction by monocular depth estimation and using\nloop closure detection to update the camera pose enable detailed and precise\nreconstruction of large scenes. Compared to previous work, our approach is more\nrobust, scalable, and versatile. 
Our experiments demonstrate that MoD-SLAM achieves\nbetter mapping performance than prior neural SLAM methods, especially\nin large borderless scenes.\n","authors":["Heng Zhou","Zhetao Guo","Shuhong Liu","Lechen Zhang","Qihao Wang","Yuxiang Ren","Mingrui Li"],"pdf_url":"https://arxiv.org/pdf/2402.03762v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2303.07937v4","updated":"2024-02-06T06:49:43Z","published":"2023-03-14T14:24:31Z","title":"Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D\n Generation","summary":" Text-to-3D generation has shown rapid progress recently with the advent\nof score distillation, a methodology of using pretrained text-to-2D diffusion\nmodels to optimize a neural radiance field (NeRF) in the zero-shot setting.\nHowever, the lack of 3D awareness in the 2D diffusion models destabilizes score\ndistillation-based methods, preventing them from reconstructing a plausible 3D scene. To address\nthis issue, we propose 3DFuse, a novel framework that incorporates 3D awareness\ninto pretrained 2D diffusion models, enhancing the robustness and 3D\nconsistency of score distillation-based methods. We realize this by first\nconstructing a coarse 3D structure of a given text prompt and then utilizing a\nprojected, view-specific depth map as a condition for the diffusion model.\nAdditionally, we introduce a training strategy that enables the 2D diffusion\nmodel to learn to handle the errors and sparsity within the coarse 3D structure\nfor robust generation, as well as a method for ensuring semantic consistency\nthroughout all viewpoints of the scene. Our framework surpasses the limitations\nof prior arts, and has significant implications for 3D consistent generation of\n2D diffusion models.\n","authors":["Junyoung Seo","Wooseok Jang","Min-Seop Kwak","Hyeonsu Kim","Jaehoon Ko","Junho Kim","Jin-Hwa Kim","Jiyoung Lee","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2303.07937v4.pdf","comment":"Project page https://ku-cvlab.github.io/3DFuse/"},{"id":"http://arxiv.org/abs/2402.03758v1","updated":"2024-02-06T06:49:04Z","published":"2024-02-06T06:49:04Z","title":"Virtual Classification: Modulating Domain-Specific Knowledge for\n Multidomain Crowd Counting","summary":" Multidomain crowd counting aims to learn a general model for multiple diverse\ndatasets. However, deep networks prefer modeling distributions of the dominant\ndomains instead of all domains, which is known as domain bias. In this study,\nwe propose a simple-yet-effective Modulating Domain-specific Knowledge Network\n(MDKNet) to handle the domain bias issue in multidomain crowd counting. MDKNet\nis achieved by employing the idea of `modulating', enabling the deep network\nto balance and model different distributions of diverse datasets with little\nbias. Specifically, we propose an Instance-specific Batch Normalization (IsBN)\nmodule, which serves as a base modulator to refine the information flow to be\nadaptive to domain distributions. To precisely modulate the domain-specific\ninformation, the Domain-guided Virtual Classifier (DVC) is then introduced to\nlearn a domain-separable latent space. This space is employed as input\nguidance for the IsBN modulator, such that the mixture distributions of\nmultiple datasets can be handled well. Extensive experiments performed on\npopular benchmarks, including Shanghai-tech A/B, QNRF and NWPU, validate the\nsuperiority of MDKNet in tackling multidomain crowd counting and its\neffectiveness for multidomain learning. 
Code is available at\n\\url{https://github.com/csguomy/MDKNet}.\n","authors":["Mingyue Guo","Binghui Chen","Zhaoyi Yan","Yaowei Wang","Qixiang Ye"],"pdf_url":"https://arxiv.org/pdf/2402.03758v1.pdf","comment":"Multidomain learning; Domain-guided virtual classifier;\n Instance-specific batch normalization"},{"id":"http://arxiv.org/abs/2402.03757v1","updated":"2024-02-06T06:48:46Z","published":"2024-02-06T06:48:46Z","title":"The Instinctive Bias: Spurious Images lead to Hallucination in MLLMs","summary":" Large language models (LLMs) have recently experienced remarkable progress,\nwhere the advent of multi-modal large language models (MLLMs) has endowed LLMs\nwith visual capabilities, leading to impressive performance in various\nmulti-modal tasks. However, those powerful MLLMs such as GPT-4V still fail\nspectacularly when presented with certain image and text inputs. In this paper,\nwe identify a typical class of inputs that baffles MLLMs, which consists of\nimages that are highly relevant but inconsistent with answers, causing MLLMs to\nsuffer from hallucination. To quantify the effect, we propose CorrelationQA,\nthe first benchmark that assesses the hallucination level given spurious\nimages. This benchmark contains 7,308 text-image pairs across 13 categories.\nBased on the proposed CorrelationQA, we conduct a thorough analysis on 9\nmainstream MLLMs, illustrating that they universally suffer from this\ninstinctive bias to varying degrees. We hope that our curated benchmark and\nevaluation results aid in better assessments of the MLLMs' robustness in the\npresence of misleading images. The resource is available at\nhttps://github.com/MasaiahHan/CorrelationQA.\n","authors":["Tianyang Han","Qing Lian","Rui Pan","Renjie Pi","Jipeng Zhang","Shizhe Diao","Yong Lin","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.03757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03754v1","updated":"2024-02-06T06:46:46Z","published":"2024-02-06T06:46:46Z","title":"Intensive Vision-guided Network for Radiology Report Generation","summary":" Automatic radiology report generation is booming due to its huge application\npotential for the healthcare industry. However, existing computer vision and\nnatural language processing approaches to tackle this problem are limited in\ntwo aspects. First, when extracting image features, most of them neglect\nmulti-view reasoning in vision and model the single-view structure of medical\nimages, such as space-view or channel-view. However, clinicians rely on\nmulti-view imaging information for comprehensive judgment in daily clinical\ndiagnosis. Second, when generating reports, they overlook context reasoning\nwith multi-modal information and focus on pure textual optimization utilizing\nretrieval-based methods. We aim to address these two issues by proposing a\nmodel that better simulates clinicians' perspectives and generates more\naccurate reports. Given the above limitation in feature extraction, we propose\na Globally-intensive Attention (GIA) module in the medical image encoder to\nsimulate and integrate multi-view vision perception. GIA aims to learn three\ntypes of vision perception: depth view, space view, and pixel view. On the\nother hand, to address the above problem in report generation, we explore how\nto involve multi-modal signals to generate precisely matched reports, i.e., how\nto integrate previously predicted words with region-aware visual content in\nnext word prediction. 
Specifically, we design a Visual Knowledge-guided Decoder\n(VKGD), which can adaptively consider how much the model needs to rely on\nvisual information and previously predicted text to assist next word\nprediction. Hence, our final Intensive Vision-guided Network (IVGN) framework\nincludes a GIA-guided Visual Encoder and the VKGD. Experiments on two\ncommonly used datasets, IU X-Ray and MIMIC-CXR, demonstrate the superior ability\nof our method compared with other state-of-the-art approaches.\n","authors":["Fudan Zheng","Mengfei Li","Ying Wang","Weijiang Yu","Ruixuan Wang","Zhiguang Chen","Nong Xiao","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2402.03754v1.pdf","comment":"Accepted by Physics in Medicine & Biology"},{"id":"http://arxiv.org/abs/2402.03752v1","updated":"2024-02-06T06:41:24Z","published":"2024-02-06T06:41:24Z","title":"Pre-training of Lightweight Vision Transformers on Small Datasets with\n Minimally Scaled Images","summary":" Can a lightweight Vision Transformer (ViT) match or exceed the performance of\nConvolutional Neural Networks (CNNs) like ResNet on small datasets with small\nimage resolutions? This report demonstrates that a pure ViT can indeed achieve\nsuperior performance through pre-training, using a masked auto-encoder\ntechnique with minimal image scaling. Our experiments on the CIFAR-10 and\nCIFAR-100 datasets involved ViT models with fewer than 3.65 million parameters\nand a multiply-accumulate (MAC) count below 0.27G, qualifying them as\n'lightweight' models. Unlike previous approaches, our method attains\nstate-of-the-art performance among similar lightweight transformer-based\narchitectures without significantly scaling up images from CIFAR-10 and\nCIFAR-100. This achievement underscores the efficiency of our model, not only\nin handling small datasets but also in effectively processing images close to\ntheir original scale.\n","authors":["Jen Hong Tan"],"pdf_url":"https://arxiv.org/pdf/2402.03752v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.03161v2","updated":"2024-02-06T06:35:36Z","published":"2024-02-05T16:30:49Z","title":"Video-LaVIT: Unified Video-Language Pre-training with Decoupled\n Visual-Motional Tokenization","summary":" In light of recent advances in multimodal Large Language Models (LLMs), there\nis increasing interest in scaling them from image-text data to more\ninformative real-world videos. Compared to static images, video poses unique\nchallenges for effective large-scale pre-training due to the modeling of its\nspatiotemporal dynamics. In this paper, we address such limitations in\nvideo-language pre-training with an efficient video decomposition that\nrepresents each video as keyframes and temporal motions. These are then adapted\nto an LLM using well-designed tokenizers that discretize visual and temporal\ninformation as a few tokens, thus enabling unified generative pre-training of\nvideos, images, and text. At inference, the generated tokens from the LLM are\ncarefully recovered to the original continuous pixel space to create various\nvideo content. Our proposed framework is both capable of comprehending and\ngenerating image and video content, as demonstrated by its competitive\nperformance across 13 multimodal benchmarks in image and video understanding\nand generation. 
Our code and models will be available at\nhttps://video-lavit.github.io.\n","authors":["Yang Jin","Zhicheng Sun","Kun Xu","Kun Xu","Liwei Chen","Hao Jiang","Quzhe Huang","Chengru Song","Yuliang Liu","Di Zhang","Yang Song","Kun Gai","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2402.03161v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01217v2","updated":"2024-02-06T06:30:40Z","published":"2024-02-02T08:39:51Z","title":"Taming Uncertainty in Sparse-view Generalizable NeRF via Indirect\n Diffusion Guidance","summary":" Neural Radiance Fields (NeRF) have demonstrated effectiveness in synthesizing\nnovel views. However, their reliance on dense inputs and scene-specific\noptimization has limited their broader applicability. Generalizable NeRFs\n(Gen-NeRF), while intended to address this, often produce blurring artifacts in\nunobserved regions with sparse inputs, which are full of uncertainty. In this\npaper, we aim to diminish the uncertainty in Gen-NeRF for plausible renderings.\nWe assume that NeRF's inability to effectively mitigate this uncertainty stems\nfrom its inherent lack of generative capacity. Therefore, we\npropose an Indirect Diffusion-guided NeRF framework, termed ID-NeRF, to address\nthis uncertainty from a generative perspective by leveraging a distilled\ndiffusion prior as guidance. Specifically, to avoid model confusion caused by\ndirectly regularizing with inconsistent samplings as in previous methods, our\napproach introduces a strategy to indirectly inject the inherently missing\nimagination into the learned implicit function through a diffusion-guided\nlatent space. Empirical evaluation across various benchmarks demonstrates the\nsuperior performance of our approach in handling uncertainty with sparse\ninputs.\n","authors":["Yaokun Li","Chao Gou","Guang Tan"],"pdf_url":"https://arxiv.org/pdf/2402.01217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03749v1","updated":"2024-02-06T06:30:34Z","published":"2024-02-06T06:30:34Z","title":"Vision Superalignment: Weak-to-Strong Generalization for Vision\n Foundation Models","summary":" Recent advancements in large language models have sparked interest in their\nextraordinary and near-superhuman capabilities, leading researchers to explore\nmethods for evaluating and optimizing these abilities, which is called\nsuperalignment. In this context, our paper delves into the realm of vision\nfoundation models, focusing on the concept of weak-to-strong generalization,\nwhich involves using a weaker model to supervise a stronger one, aiming to\nenhance the latter's capabilities beyond the former's limits. We introduce a\nnovel and adaptively adjustable loss function for weak-to-strong supervision.\nOur comprehensive experiments span various scenarios, including few-shot\nlearning, transfer learning, noisy label learning, and common knowledge\ndistillation settings. The results are striking: our approach not only exceeds\nthe performance benchmarks set by strong-to-strong generalization but also\nsurpasses the outcomes of fine-tuning strong models with whole datasets. This\ncompelling evidence underscores the significant potential of weak-to-strong\ngeneralization, showcasing its capability to substantially elevate the\nperformance of vision foundation models. 
The code is available at\nhttps://github.com/ggjy/vision_weak_to_strong.\n","authors":["Jianyuan Guo","Hanting Chen","Chengcheng Wang","Kai Han","Chang Xu","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03749v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2402.03746v1","updated":"2024-02-06T06:27:40Z","published":"2024-02-06T06:27:40Z","title":"Tuning Large Multimodal Models for Videos using Reinforcement Learning\n from AI Feedback","summary":" Recent advancements in large language models have influenced the development\nof video large multimodal models (VLMMs). The previous approaches for VLMMs\ninvolved Supervised Fine-Tuning (SFT) with instruction-tuned datasets,\nintegrating LLM with visual encoders, and adding additional learnable modules.\nVideo and text multimodal alignment remains challenging, primarily due to the\ndeficient volume and quality of multimodal instruction-tune data compared to\ntext-only data. We present a novel alignment strategy that employs a multimodal\nAI system to oversee itself, called Reinforcement Learning from AI Feedback\n(RLAIF), providing self-preference feedback to refine itself and facilitating\nthe alignment of video and text modalities. Specifically, we propose\ncontext-aware reward modeling by providing detailed video descriptions as\ncontext during the generation of preference feedback in order to enrich the\nunderstanding of video content. Demonstrating enhanced performance across\ndiverse video benchmarks, our multimodal RLAIF approach, VLM-RLAIF, outperforms\nexisting approaches, including the SFT model. We commit to open-sourcing our\ncode, models, and datasets to foster further research in this area.\n","authors":["Daechul Ahn","Yura Choi","Youngjae Yu","Dongyeop Kang","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2402.03746v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2402.03738v1","updated":"2024-02-06T06:12:03Z","published":"2024-02-06T06:12:03Z","title":"AoSRNet: All-in-One Scene Recovery Networks via Multi-knowledge\n Integration","summary":" Scattering and attenuation of light in non-homogeneous imaging media or\ninconsistent light intensity will cause insufficient contrast and color\ndistortion in the collected images, which limits developments such as\nvision-driven smart urban systems, autonomous vehicles, and intelligent robots. In this\npaper, we propose an all-in-one scene recovery network via multi-knowledge\nintegration (termed AoSRNet) to improve the visibility of imaging devices in\ntypical low-visibility imaging scenes (e.g., haze, sand dust, and low light).\nIt combines gamma correction (GC) and optimized linear stretching (OLS) to\ncreate the detail enhancement module (DEM) and color restoration module (CRM).\nAdditionally, we suggest a multi-receptive field extraction module (MEM) to\nattenuate the loss of image texture details caused by GC nonlinear and OLS\nlinear transformations. Finally, we refine the coarse features generated by\nDEM, CRM, and MEM through an Encoder-Decoder to generate the final restored image.\nComprehensive experimental results demonstrate the effectiveness and stability\nof AoSRNet compared to other state-of-the-art methods. 
The source code is\navailable at \\url{https://github.com/LouisYuxuLu/AoSRNet}.\n","authors":["Yuxu Lu","Dong Yang","Yuan Gao","Ryan Wen Liu","Jun Liu","Yu Guo"],"pdf_url":"https://arxiv.org/pdf/2402.03738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11861v3","updated":"2024-02-06T06:04:08Z","published":"2023-02-23T08:59:56Z","title":"Out-of-Domain Robustness via Targeted Augmentations","summary":" Models trained on one set of domains often suffer performance drops on unseen\ndomains, e.g., when wildlife monitoring models are deployed in new camera\nlocations. In this work, we study principles for designing data augmentations\nfor out-of-domain (OOD) generalization. In particular, we focus on real-world\nscenarios in which some domain-dependent features are robust, i.e., some\nfeatures that vary across domains are predictive OOD. For example, in the\nwildlife monitoring application above, image backgrounds vary across camera\nlocations but indicate habitat type, which helps predict the species of\nphotographed animals. Motivated by theoretical analysis on a linear setting, we\npropose targeted augmentations, which selectively randomize spurious\ndomain-dependent features while preserving robust ones. We prove that targeted\naugmentations improve OOD performance, allowing models to generalize better\nwith fewer domains. In contrast, existing approaches such as generic\naugmentations, which fail to randomize domain-dependent features, and\ndomain-invariant augmentations, which randomize all domain-dependent features,\nboth perform poorly OOD. In experiments on three real-world datasets, we show\nthat targeted augmentations set new states-of-the-art for OOD performance by\n3.2-15.2 percentage points.\n","authors":["Irena Gao","Shiori Sagawa","Pang Wei Koh","Tatsunori Hashimoto","Percy Liang"],"pdf_url":"https://arxiv.org/pdf/2302.11861v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03723v1","updated":"2024-02-06T05:40:53Z","published":"2024-02-06T05:40:53Z","title":"Rig3DGS: Creating Controllable Portraits from Casual Monocular Videos","summary":" Creating controllable 3D human portraits from casual smartphone videos is\nhighly desirable due to their immense value in AR/VR applications. The recent\ndevelopment of 3D Gaussian Splatting (3DGS) has shown improvements in rendering\nquality and training efficiency. However, it still remains a challenge to\naccurately model and disentangle head movements and facial expressions from a\nsingle-view capture to achieve high-quality renderings. In this paper, we\nintroduce Rig3DGS to address this challenge. We represent the entire scene,\nincluding the dynamic subject, using a set of 3D Gaussians in a canonical\nspace. Using a set of control signals, such as head pose and expressions, we\ntransform them to the 3D space with learned deformations to generate the\ndesired rendering. Our key innovation is a carefully designed deformation\nmethod which is guided by a learnable prior derived from a 3D morphable model.\nThis approach is highly efficient in training and effective in controlling\nfacial expressions, head positions, and view synthesis across various captures.\nWe demonstrate the effectiveness of our learned deformation through extensive\nquantitative and qualitative experiments. 
The project page can be found at\nhttp://shahrukhathar.github.io/2024/02/05/Rig3DGS.html\n","authors":["Alfredo Rivero","ShahRukh Athar","Zhixin Shu","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2402.03723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10301v2","updated":"2024-02-06T05:34:12Z","published":"2023-10-16T11:37:53Z","title":"Multi-Body Neural Scene Flow","summary":" The test-time optimization of scene flow - using a coordinate network as a\nneural prior - has gained popularity due to its simplicity, lack of dataset\nbias, and state-of-the-art performance. We observe, however, that although\ncoordinate networks capture general motions by implicitly regularizing the\nscene flow predictions to be spatially smooth, the neural prior by itself is\nunable to identify the underlying multi-body rigid motions present in\nreal-world data. To address this, we show that multi-body rigidity can be\nachieved without the cumbersome and brittle strategy of constraining the\n$SE(3)$ parameters of each rigid body as done in previous works. This is\nachieved by regularizing the scene flow optimization to encourage isometry in\nflow predictions for rigid bodies. This strategy enables multi-body rigidity in\nscene flow while maintaining a continuous flow field, hence allowing dense\nlong-term scene flow integration across a sequence of point clouds. We conduct\nextensive experiments on real-world datasets and demonstrate that our approach\noutperforms the state-of-the-art in 3D scene flow and long-term point-wise 4D\ntrajectory prediction. The code is available at:\nhttps://github.com/kavisha725/MBNSF.\n","authors":["Kavisha Vidanapathirana","Shin-Fang Chng","Xueqian Li","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2310.10301v2.pdf","comment":"Accepted for 3DV'2024 (oral)"},{"id":"http://arxiv.org/abs/2302.01034v3","updated":"2024-02-06T05:33:24Z","published":"2023-02-02T11:57:41Z","title":"An Efficient Convex Hull-based Vehicle Pose Estimation Method for 3D\n LiDAR","summary":" Vehicle pose estimation with LiDAR is essential in the perception technology\nof autonomous driving. However, due to incomplete observation measurements and\nsparsity of the LiDAR point cloud, it is challenging to achieve satisfactory\npose extraction based on 3D LiDAR with the existing pose estimation methods. In\naddition, the demand for real-time performance further increases the difficulty\nof the pose estimation task. In this paper, we propose a novel vehicle pose\nestimation method based on the convex hull. The extracted 3D cluster is reduced\nto the convex hull, reducing the subsequent computation burden while preserving\nessential contour information. Subsequently, a novel criterion based on the\nminimum occlusion area is developed for the search-based algorithm, enabling\naccurate pose estimation. Additionally, this criterion renders the proposed\nalgorithm particularly well-suited for obstacle avoidance. The proposed\nalgorithm is validated on the KITTI dataset and a manually labeled dataset\nacquired at an industrial park. 
The results demonstrate that our proposed\nmethod can achieve better accuracy than the classical pose estimation method\nwhile maintaining real-time speed.\n","authors":["Ningning Ding"],"pdf_url":"https://arxiv.org/pdf/2302.01034v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02695v2","updated":"2024-02-06T05:15:20Z","published":"2024-01-05T08:05:07Z","title":"VoroNav: Voronoi-based Zero-shot Object Navigation with Large Language\n Model","summary":" In the realm of household robotics, the Zero-Shot Object Navigation (ZSON)\ntask empowers agents to adeptly traverse unfamiliar environments and locate\nobjects from novel categories without prior explicit training. This paper\nintroduces VoroNav, a novel semantic exploration framework that proposes the\nReduced Voronoi Graph to extract exploratory paths and planning nodes from a\nsemantic map constructed in real time. By harnessing topological and semantic\ninformation, VoroNav designs text-based descriptions of paths and images that\nare readily interpretable by a large language model (LLM). In particular, our\napproach presents a synergy of path and farsight descriptions to represent the\nenvironmental context, enabling the LLM to apply commonsense reasoning to ascertain\nwaypoints for navigation. Extensive evaluation on HM3D and HSSD validates that\nVoroNav surpasses existing benchmarks in both success rate and exploration\nefficiency (absolute improvement: +2.8% Success and +3.7% SPL on HM3D, +2.6%\nSuccess and +3.8% SPL on HSSD). Additionally, the introduced metrics that evaluate\nobstacle avoidance proficiency and perceptual efficiency further corroborate\nthe enhancements achieved by our method in ZSON planning. Project page:\nhttps://voro-nav.github.io\n","authors":["Pengying Wu","Yao Mu","Bingxian Wu","Yi Hou","Ji Ma","Shanghang Zhang","Chang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.02695v2.pdf","comment":"18 pages, 13 figures"},{"id":"http://arxiv.org/abs/2402.03716v1","updated":"2024-02-06T05:11:46Z","published":"2024-02-06T05:11:46Z","title":"Attention-based Shape and Gait Representations Learning for Video-based\n Cloth-Changing Person Re-Identification","summary":" Current state-of-the-art Video-based Person Re-Identification (Re-ID)\nprimarily relies on appearance features extracted by deep learning models.\nThese methods are not applicable for long-term analysis in real-world scenarios\nwhere persons have changed clothes, making appearance information unreliable.\nIn this work, we deal with the practical problem of Video-based Cloth-Changing\nPerson Re-ID (VCCRe-ID) by proposing \"Attention-based Shape and Gait\nRepresentations Learning\" (ASGL) for VCCRe-ID. Our ASGL framework improves\nRe-ID performance under clothing variations by learning clothing-invariant gait\ncues using a Spatial-Temporal Graph Attention Network (ST-GAT). Given the\n3D-skeleton-based spatial-temporal graph, our proposed ST-GAT comprises\nmulti-head attention modules, which are able to enhance the robustness of gait\nembeddings under viewpoint changes and occlusions. The ST-GAT amplifies the\nimportant motion ranges and reduces the influence of noisy poses. Then, the\nmulti-head learning module effectively preserves beneficial local temporal\ndynamics of movement. We also boost the discriminative power of person\nrepresentations by learning body shape cues using a GAT. 
Experiments on two\nlarge-scale VCCRe-ID datasets demonstrate that our proposed framework\noutperforms state-of-the-art methods by 12.2% in rank-1 accuracy and 7.0% in\nmAP.\n","authors":["Vuong D. Nguyen","Samiha Mirza","Pranav Mantini","Shishir K. Shah"],"pdf_url":"https://arxiv.org/pdf/2402.03716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01345v2","updated":"2024-02-06T05:10:33Z","published":"2024-02-02T12:02:46Z","title":"Skip \\n: A simple method to reduce hallucination in Large\n Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nfor multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks (\\n\\n), where the content before and\nafter '\\n\\n' in the training data frequently exhibits significant semantic\nchanges. This pattern leads the model to infer that the contents following\n'\\n\\n' should be obviously different from the preceding contents, with less\nhallucinatory descriptions, thereby increasing the probability of hallucinatory\ndescriptions subsequent to the '\\n\\n'. We have validated this hypothesis on\nmultiple publicly available LVLMs. Moreover, we find that deliberately inserting\n'\\n\\n' into the generated description can induce more hallucinations. A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of '\\n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2402.03708v1","updated":"2024-02-06T05:02:33Z","published":"2024-02-06T05:02:33Z","title":"SISP: A Benchmark Dataset for Fine-grained Ship Instance Segmentation in\n Panchromatic Satellite Images","summary":" Fine-grained ship instance segmentation in satellite images holds\nconsiderable significance for monitoring maritime activities at sea. However,\nexisting datasets often suffer from the scarcity of fine-grained information or\npixel-wise localization annotations, as well as insufficient image\ndiversity and variations, thus limiting the research of this task. To this end,\nwe propose a benchmark dataset for fine-grained Ship Instance Segmentation in\nPanchromatic satellite images, namely SISP, which contains 56,693\nwell-annotated ship instances with four fine-grained categories across 10,000\nsliced images, and all the images are collected from the SuperView-1 satellite with\na resolution of 0.5m. Targets in the proposed SISP dataset have\ncharacteristics that are consistent with real satellite scenes, such as high\nclass imbalance, various scenes, large variations in target densities and\nscales, and high inter-class similarity and intra-class diversity, all of which\nmake the SISP dataset more suitable for real-world applications. 
In addition,\nwe introduce a Dynamic Feature Refinement-assist Instance segmentation network,\nnamely DFRInst, as the benchmark method for ship instance segmentation in\nsatellite images, which can fortify the explicit representation of crucial\nfeatures, thus improving the performance of ship instance segmentation.\nExperiments and analysis are performed on the proposed SISP dataset to evaluate\nthe benchmark method and several state-of-the-art methods to establish\nbaselines for facilitating future research. The proposed dataset and source\ncode will be available at: https://github.com/Justlovesmile/SISP.\n","authors":["Pengming Feng","Mingjie Xie","Hongning Liu","Xuanjia Zhao","Guangjun He","Xueliang Zhang","Jian Guan"],"pdf_url":"https://arxiv.org/pdf/2402.03708v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2210.09138v2","updated":"2024-02-06T04:58:07Z","published":"2022-10-17T14:40:04Z","title":"An Open-source Benchmark of Deep Learning Models for Audio-visual\n Apparent and Self-reported Personality Recognition","summary":" Personality determines a wide variety of human daily and working behaviours,\nand is crucial for understanding human internal and external states. In recent\nyears, a large number of automatic personality computing approaches have been\ndeveloped to predict either the apparent personality or self-reported\npersonality of the subject based on non-verbal audio-visual behaviours.\nHowever, the majority of them suffer from complex and dataset-specific\npre-processing steps and model training tricks. In the absence of a\nstandardized benchmark with consistent experimental settings, it is not only\nimpossible to fairly compare the real performances of these personality\ncomputing models, but it is also difficult to reproduce them. In this paper,\nwe present the first reproducible audio-visual benchmarking framework to\nprovide a fair and consistent evaluation of eight existing personality\ncomputing models (e.g., audio, visual and audio-visual) and seven standard deep\nlearning models on both self-reported and apparent personality recognition\ntasks. Building upon a set of benchmarked models, we also investigate the\nimpact of two previously-used long-term modelling strategies for summarising\nshort-term/frame-level predictions on personality computing results. The\nresults conclude that: (i) apparent personality traits, inferred from facial\nbehaviours by most benchmarked deep learning models, show more reliability than\nself-reported ones; (ii) visual models frequently achieve superior\nperformance to audio models on personality recognition; (iii) non-verbal\nbehaviours contribute differently to predicting different personality traits;\nand (iv) our reproduced personality computing models generally achieve worse\nperformance than their originally reported results. 
Our benchmark is publicly\navailable at \\url{https://github.com/liaorongfan/DeepPersonality}.\n","authors":["Rongfan Liao","Siyang Song","Hatice Gunes"],"pdf_url":"https://arxiv.org/pdf/2210.09138v2.pdf","comment":"Accepted by IEEE Transactions on Affective Computing"},{"id":"http://arxiv.org/abs/2402.03706v1","updated":"2024-02-06T04:57:07Z","published":"2024-02-06T04:57:07Z","title":"MMAUD: A Comprehensive Multi-Modal Anti-UAV Dataset for Modern Miniature\n Drone Threats","summary":" In response to the evolving challenges posed by small unmanned aerial\nvehicles (UAVs), which possess the potential to transport harmful payloads or\nindependently cause damage, we introduce MMAUD: a comprehensive Multi-Modal\nAnti-UAV Dataset. MMAUD addresses a critical gap in contemporary threat\ndetection methodologies by focusing on drone detection, UAV-type\nclassification, and trajectory estimation. MMAUD stands out by combining\ndiverse sensory inputs, including stereo vision, various Lidars, Radars, and\naudio arrays. It offers a unique overhead aerial detection vital for addressing\nreal-world scenarios with higher fidelity than datasets captured on specific\nvantage points using thermal and RGB. Additionally, MMAUD provides accurate\nLeica-generated ground truth data, enhancing credibility and enabling confident\nrefinement of algorithms and models, which has never been seen in other\ndatasets. Most existing works do not disclose their datasets, making MMAUD an\ninvaluable resource for developing accurate and efficient solutions. Our\nproposed modalities are cost-effective and highly adaptable, allowing users to\nexperiment and implement new UAV threat detection tools. Our dataset closely\nsimulates real-world scenarios by incorporating ambient heavy machinery sounds.\nThis approach enhances the dataset's applicability, capturing the exact\nchallenges faced during proximate vehicular operations. It is expected that\nMMAUD can play a pivotal role in advancing UAV threat detection,\nclassification, trajectory estimation capabilities, and beyond. Our dataset,\ncodes, and designs will be available in https://github.com/ntu-aris/MMAUD.\n","authors":["Shenghai Yuan","Yizhuo Yang","Thien Hoang Nguyen","Thien-Minh Nguyen","Jianfei Yang","Fen Liu","Jianping Li","Han Wang","Lihua Xie"],"pdf_url":"https://arxiv.org/pdf/2402.03706v1.pdf","comment":"Accepted by ICRA 2024"},{"id":"http://arxiv.org/abs/2402.03705v1","updated":"2024-02-06T04:56:43Z","published":"2024-02-06T04:56:43Z","title":"FoolSDEdit: Deceptively Steering Your Edits Towards Targeted\n Attribute-aware Distribution","summary":" Guided image synthesis methods, like SDEdit based on the diffusion model,\nexcel at creating realistic images from user inputs such as stroke paintings.\nHowever, existing efforts mainly focus on image quality, often overlooking a\nkey point: the diffusion model represents a data distribution, not individual\nimages. This introduces a low but critical chance of generating images that\ncontradict user intentions, raising ethical concerns. For example, a user\ninputting a stroke painting with female characteristics might, with some\nprobability, get male faces from SDEdit. To expose this potential\nvulnerability, we aim to build an adversarial attack forcing SDEdit to generate\na specific data distribution aligned with a specified attribute (e.g., female),\nwithout changing the input's attribute characteristics. 
We propose the Targeted\nAttribute Generative Attack (TAGA), using an attribute-aware objective function\nand optimizing the adversarial noise added to the input stroke painting.\nEmpirical studies reveal that traditional adversarial noise struggles with\nTAGA, while natural perturbations like exposure and motion blur easily alter\ngenerated images' attributes. To execute effective attacks, we introduce\nFoolSDEdit: We design a joint adversarial exposure and blur attack, adding\nexposure and motion blur to the stroke painting and optimizing them together.\nWe optimize the execution strategy of various perturbations, framing it as a\nnetwork architecture search problem. We create SuperPert, a graph\nrepresenting diverse execution strategies for different perturbations. After\ntraining, we obtain the optimized execution strategy for effective TAGA against\nSDEdit. Comprehensive experiments on two datasets show that our method compels\nSDEdit to generate a targeted attribute-aware data distribution, significantly\noutperforming baselines.\n","authors":["Qi Zhou","Dongxia Wang","Tianlin Li","Zhihong Xu","Yang Liu","Kui Ren","Wenhai Wang","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2402.03705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14907v2","updated":"2024-02-06T04:45:08Z","published":"2023-10-23T13:16:51Z","title":"Orientation-Aware Leg Movement Learning for Action-Driven Human Motion\n Prediction","summary":" The task of action-driven human motion prediction aims to forecast future\nhuman motion based on the observed sequence while respecting the given action\nlabel. It requires modeling not only the stochasticity within human motion but\nthe smooth yet realistic transition between multiple action labels. However,\nthe fact that most datasets do not contain such transition data complicates\nthis task. Existing work tackles this issue by learning a smoothness prior to\nsimply promote smooth transitions, yet doing so can result in unnatural\ntransitions especially when the history and predicted motions differ\nsignificantly in orientations. In this paper, we argue that valid human motion\ntransitions should incorporate realistic leg movements to handle orientation\nchanges, and cast it as an action-conditioned in-betweening (ACB) learning task\nto encourage transition naturalness. Because modeling all possible transitions\nis virtually infeasible, our ACB is only performed on very few selected\naction classes with active gait motions, such as Walk or Run. Specifically, we\nfollow a two-stage forecasting strategy by first employing the motion diffusion\nmodel to generate the target motion with a specified future action, and then\nproducing the in-betweening to smoothly connect the observation and prediction\nto eventually address motion prediction. Our method is completely free from the\nlabeled motion transition data during training. 
To show the robustness of our\napproach, we generalize our trained in-betweening learning model on one dataset\nto two unseen large-scale motion datasets to produce natural transitions.\nExtensive experimental evaluations on three benchmark datasets demonstrate that\nour method yields state-of-the-art performance in terms of visual quality,\nprediction accuracy, and action faithfulness.\n","authors":["Chunzhi Gu","Chao Zhang","Shigeru Kuriyama"],"pdf_url":"https://arxiv.org/pdf/2310.14907v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03699v1","updated":"2024-02-06T04:40:27Z","published":"2024-02-06T04:40:27Z","title":"Automatic Robotic Development through Collaborative Framework by Large\n Language Models","summary":" Despite the remarkable code generation abilities of large language models\n(LLMs), they still face challenges in complex task handling. Robot development, a\nhighly intricate field, inherently demands human involvement in task allocation\nand collaborative teamwork. To enhance robot development, we propose an\ninnovative automated collaboration framework inspired by real-world robot\ndevelopers. This framework employs multiple LLMs in distinct roles: analysts,\nprogrammers, and testers. Analysts delve deep into user requirements, enabling\nprogrammers to produce precise code, while testers fine-tune the parameters\nbased on user feedback for practical robot application. Each LLM tackles\ndiverse, critical tasks within the development process. Clear collaboration\nrules emulate real-world teamwork among the LLMs. Analysts, programmers, and\ntesters form a cohesive team overseeing strategy, code, and parameter\nadjustments. Through this framework, we achieve complex robot development\nwithout requiring specialized knowledge, relying solely on the participation of\nnon-experts.\n","authors":["Zhirong Luan","Yujun Lai"],"pdf_url":"https://arxiv.org/pdf/2402.03699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05462v2","updated":"2024-02-06T04:39:08Z","published":"2023-12-09T04:17:20Z","title":"HumanReg: Self-supervised Non-rigid Registration of Human Point Cloud","summary":" In this paper, we present a novel registration framework, HumanReg, that\nlearns a non-rigid transformation between two human point clouds end-to-end. We\nintroduce a body prior into the registration process to efficiently handle this\ntype of point cloud. Unlike most existing supervised registration techniques\nthat require expensive point-wise flow annotations, HumanReg can be trained in\na self-supervised manner benefiting from a set of novel loss functions. To make\nour model better converge on real-world data, we also propose a pretraining\nstrategy and a synthetic dataset (HumanSyn4D) consisting of dynamic, sparse\nhuman point clouds and their auto-generated ground truth annotations. Our\nexperiments show that HumanReg achieves state-of-the-art performance on the\nCAPE-512 dataset and gains a qualitative result on another more challenging\nreal-world dataset. Furthermore, our ablation studies demonstrate the\neffectiveness of our synthetic dataset and novel loss functions. 
Our code and\nsynthetic dataset is available at https://github.com/chenyifanthu/HumanReg.\n","authors":["Yifan Chen","Zhiyu Pan","Zhicheng Zhong","Wenxuan Guo","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.05462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03697v1","updated":"2024-02-06T04:33:51Z","published":"2024-02-06T04:33:51Z","title":"SHMC-Net: A Mask-guided Feature Fusion Network for Sperm Head Morphology\n Classification","summary":" Male infertility accounts for about one-third of global infertility cases.\nManual assessment of sperm abnormalities through head morphology analysis\nencounters issues of observer variability and diagnostic discrepancies among\nexperts. Its alternative, Computer-Assisted Semen Analysis (CASA), suffers from\nlow-quality sperm images, small datasets, and noisy class labels. We propose a\nnew approach for sperm head morphology classification, called SHMC-Net, which\nuses segmentation masks of sperm heads to guide the morphology classification\nof sperm images. SHMC-Net generates reliable segmentation masks using image\npriors, refines object boundaries with an efficient graph-based method, and\ntrains an image network with sperm head crops and a mask network with the\ncorresponding masks. In the intermediate stages of the networks, image and mask\nfeatures are fused with a fusion scheme to better learn morphological features.\nTo handle noisy class labels and regularize training on small datasets,\nSHMC-Net applies Soft Mixup to combine mixup augmentation and a loss function.\nWe achieve state-of-the-art results on SCIAN and HuSHeM datasets, outperforming\nmethods that use additional pre-training or costly ensembling techniques.\n","authors":["Nishchal Sapkota","Yejia Zhang","Sirui Li","Peixian Liang","Zhuo Zhao","Danny Z Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03697v1.pdf","comment":"A shorter version is published on ISBI 2024"},{"id":"http://arxiv.org/abs/2308.11776v2","updated":"2024-02-06T04:31:11Z","published":"2023-08-22T20:35:24Z","title":"WS-SfMLearner: Self-supervised Monocular Depth and Ego-motion Estimation\n on Surgical Videos with Unknown Camera Parameters","summary":" Depth estimation in surgical video plays a crucial role in many image-guided\nsurgery procedures. However, it is difficult and time consuming to create depth\nmap ground truth datasets in surgical videos due in part to inconsistent\nbrightness and noise in the surgical scene. Therefore, building an accurate and\nrobust self-supervised depth and camera ego-motion estimation system is gaining\nmore attention from the computer vision community. Although several\nself-supervision methods alleviate the need for ground truth depth maps and\nposes, they still need known camera intrinsic parameters, which are often\nmissing or not recorded. Moreover, the camera intrinsic prediction methods in\nexisting works depend heavily on the quality of datasets. In this work, we\naimed to build a self-supervised depth and ego-motion estimation system which\ncan predict not only accurate depth maps and camera pose, but also camera\nintrinsic parameters. We proposed a cost-volume-based supervision manner to\ngive the system auxiliary supervision for camera parameters prediction. 
The\nexperimental results showed that the proposed method improved the accuracy of\nestimated camera parameters, ego-motion, and depth estimation.\n","authors":["Ange Lou","Jack Noble"],"pdf_url":"https://arxiv.org/pdf/2308.11776v2.pdf","comment":"Accepted by SPIE 2024"},{"id":"http://arxiv.org/abs/2402.03695v1","updated":"2024-02-06T04:30:49Z","published":"2024-02-06T04:30:49Z","title":"ConUNETR: A Conditional Transformer Network for 3D Micro-CT Embryonic\n Cartilage Segmentation","summary":" Studying the morphological development of cartilaginous and osseous\nstructures is critical to the early detection of life-threatening skeletal\ndysmorphology. Embryonic cartilage undergoes rapid structural changes within\nhours, introducing biological variations and morphological shifts that limit\nthe generalization of deep learning-based segmentation models that infer across\nmultiple embryonic age groups. Obtaining individual models for each age group\nis expensive and less effective, while direct transfer (predicting an age\nunseen during training) suffers a potential performance drop due to\nmorphological shifts. We propose a novel Transformer-based segmentation model\nwith improved biological priors that better distills morphologically diverse\ninformation through conditional mechanisms. This enables a single model to\naccurately predict cartilage across multiple age groups. Experiments on the\nmice cartilage dataset show the superiority of our new model compared to other\ncompetitive segmentation models. Additional studies on a separate mice\ncartilage dataset with a distinct mutation show that our model generalizes well\nand effectively captures age-based cartilage morphology patterns.\n","authors":["Nishchal Sapkota","Yejia Zhang","Susan M. Motch Perrine","Yuhan Hsi","Sirui Li","Meng Wu","Greg Holmes","Abdul R. Abdulai","Ethylin W. Jabs","Joan T. Richtsmeier","Danny Z Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03695v1.pdf","comment":"Published in ISBI 2024"},{"id":"http://arxiv.org/abs/2308.11774v2","updated":"2024-02-06T04:29:34Z","published":"2023-08-22T20:31:00Z","title":"SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene\n Reconstruction by Neural Radiance Field (NeRF)","summary":" The accurate reconstruction of surgical scenes from surgical videos is\ncritical for various applications, including intraoperative navigation and\nimage-guided robotic surgery automation. However, previous approaches, mainly\nrelying on depth estimation, have limited effectiveness in reconstructing\nsurgical scenes with moving surgical tools. To address this limitation and\nprovide accurate 3D position prediction for surgical tools in all frames, we\npropose a novel approach called SAMSNeRF that combines Segment Anything Model\n(SAM) and Neural Radiance Field (NeRF) techniques. Our approach generates\naccurate segmentation masks of surgical tools using SAM, which guides the\nrefinement of the dynamic surgical scene reconstruction by NeRF. Our\nexperimental results on public endoscopy surgical videos demonstrate that our\napproach successfully reconstructs high-fidelity dynamic surgical scenes and\naccurately reflects the spatial information of surgical tools. 
Our proposed\napproach can significantly enhance surgical navigation and automation by\nproviding surgeons with accurate 3D position information of surgical tools\nduring surgery. The source code will be released soon.\n","authors":["Ange Lou","Yamin Li","Xing Yao","Yike Zhang","Jack Noble"],"pdf_url":"https://arxiv.org/pdf/2308.11774v2.pdf","comment":"Accepted by SPIE 2024"},{"id":"http://arxiv.org/abs/2402.03690v1","updated":"2024-02-06T04:25:07Z","published":"2024-02-06T04:25:07Z","title":"3Doodle: Compact Abstraction of Objects with 3D Strokes","summary":" While free-hand sketching has long served as an efficient representation to\nconvey characteristics of an object, sketches are often subjective, deviating\nsignificantly from realistic representations. Moreover, sketches are not\nconsistent for arbitrary viewpoints, making it hard to capture 3D shapes. We\npropose 3Doodle, generating descriptive and view-consistent sketch images given\nmulti-view images of the target object. Our method is based on the idea that a\nset of 3D strokes can efficiently represent 3D structural information and\nrender view-consistent 2D sketches. We express 2D sketches as a union of\nview-independent and view-dependent components. 3D cubic Bézier curves\nindicate view-independent 3D feature lines, while contours of superquadrics\nexpress a smooth outline of the volume across varying viewpoints. Our pipeline\ndirectly optimizes the parameters of 3D stroke primitives to minimize\nperceptual losses in a fully differentiable manner. The resulting sparse set of\n3D strokes can be rendered as abstract sketches containing essential 3D\ncharacteristic shapes of various objects. We demonstrate that 3Doodle can\nfaithfully express concepts of the original images compared with recent sketch\ngeneration approaches.\n","authors":["Changwoon Choi","Jaeah Lee","Jaesik Park","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2402.03690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14998v2","updated":"2024-02-06T04:20:15Z","published":"2023-05-24T10:36:12Z","title":"An Examination of the Robustness of Reference-Free Image Captioning\n Evaluation Metrics","summary":" Recently, reference-free metrics such as CLIPScore (Hessel et al., 2021),\nUMIC (Lee et al., 2021), and PAC-S (Sarto et al., 2023) have been proposed for\nautomatic reference-free evaluation of image captions. Our focus lies in\nevaluating the robustness of these metrics in scenarios that require\ndistinguishing between two captions with high lexical overlap but very\ndifferent meanings. Our findings reveal that despite their high correlation\nwith human judgments, CLIPScore, UMIC, and PAC-S struggle to identify\nfine-grained errors. While all metrics exhibit strong sensitivity to visual\ngrounding errors, their sensitivity to caption implausibility errors is\nlimited. Furthermore, we found that all metrics are sensitive to variations in\nthe size of image-relevant objects mentioned in the caption, while CLIPScore\nand PAC-S are also sensitive to the number of mentions of image-relevant\nobjects in the caption. Regarding linguistic aspects of a caption, all metrics\nshow weak comprehension of negation, and CLIPScore and PAC-S are insensitive to\nthe structure of the caption to a great extent. 
We hope our findings will guide\nfurther improvements in reference-free evaluation of image captioning.\n","authors":["Saba Ahmadi","Aishwarya Agrawal"],"pdf_url":"https://arxiv.org/pdf/2305.14998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10373v3","updated":"2024-02-06T04:11:24Z","published":"2022-08-22T14:52:06Z","title":"Reversing Skin Cancer Adversarial Examples by Multiscale Diffusive and\n Denoising Aggregation Mechanism","summary":" Reliable skin cancer diagnosis models play an essential role in early\nscreening and medical intervention. Prevailing computer-aided skin cancer\nclassification systems employ deep learning approaches. However, recent studies\nreveal their extreme vulnerability to adversarial attacks -- often\nimperceptible perturbations that significantly reduce the performance of skin\ncancer diagnosis models. To mitigate these threats, this work presents a\nsimple, effective, and resource-efficient defense framework by reverse\nengineering adversarial perturbations in skin cancer images. Specifically, a\nmultiscale image pyramid is first established to better preserve discriminative\nstructures in the medical imaging domain. To neutralize adversarial effects,\nskin images at different scales are then progressively diffused by injecting\nisotropic Gaussian noises to move the adversarial examples to the clean image\nmanifold. Crucially, to further reverse adversarial noises and suppress\nredundant injected noises, a novel multiscale denoising mechanism is carefully\ndesigned that aggregates image information from neighboring scales. We\nevaluated the defensive effectiveness of our method on ISIC 2019, the largest\nskin cancer multiclass classification dataset. Experimental results demonstrate\nthat the proposed method can successfully reverse adversarial perturbations\nfrom different attacks and significantly outperform some state-of-the-art\nmethods in defending skin cancer diagnosis models.\n","authors":["Yongwei Wang","Yuan Li","Zhiqi Shen","Yuhui Qiao"],"pdf_url":"https://arxiv.org/pdf/2208.10373v3.pdf","comment":"Accepted by Computers in Biology and Medicine"},{"id":"http://arxiv.org/abs/2010.05784v4","updated":"2024-02-06T03:53:05Z","published":"2020-10-08T02:10:54Z","title":"Learning Calibrated Uncertainties for Domain Shift: A Distributionally\n Robust Learning Approach","summary":" We propose a framework for learning calibrated uncertainties under domain\nshifts, where the source (training) distribution differs from the target (test)\ndistribution. We detect such domain shifts via a differentiable density ratio\nestimator and train it together with the task network, composing an adjusted\nsoftmax predictive form concerning domain shift. In particular, the density\nratio estimation reflects the closeness of a target (test) sample to the source\n(training) distribution. We employ it to adjust the uncertainty of prediction\nin the task network. This idea of using the density ratio is based on the\ndistributionally robust learning (DRL) framework, which accounts for the domain\nshift by adversarial risk minimization. We show that our proposed method\ngenerates calibrated uncertainties that benefit downstream tasks, such as\nunsupervised domain adaptation (UDA) and semi-supervised learning (SSL). On\nthese tasks, methods like self-training and FixMatch use uncertainties to\nselect confident pseudo-labels for re-training. Our experiments show that the\nintroduction of DRL leads to significant improvements in cross-domain\nperformance. 
We also show that the estimated density ratios align with human\nselection frequencies, suggesting a positive correlation with a proxy of\nhuman-perceived uncertainties.\n","authors":["Haoxuan Wang","Zhiding Yu","Yisong Yue","Anima Anandkumar","Anqi Liu","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2010.05784v4.pdf","comment":"IJCAI 2023"},{"id":"http://arxiv.org/abs/2402.03666v1","updated":"2024-02-06T03:39:44Z","published":"2024-02-06T03:39:44Z","title":"QuEST: Low-bit Diffusion Model Quantization via Efficient Selective\n Finetuning","summary":" Diffusion models have achieved remarkable success in image generation tasks,\nyet their practical deployment is restrained by high memory and time\nconsumption. While quantization paves the way for diffusion model compression and\nacceleration, existing methods fail entirely when the models are quantized to\nlow bit-widths. In this paper, we unravel three properties in quantized diffusion\nmodels that compromise the efficacy of current methods: imbalanced activation\ndistributions, imprecise temporal information, and vulnerability to\nperturbations of specific modules. To alleviate the intensified low-bit\nquantization difficulty stemming from the distribution imbalance, we propose\nfinetuning the quantized model to better adapt to the activation distribution.\nBuilding on this idea, we identify two critical types of quantized layers:\nthose holding vital temporal information and those sensitive to reduced\nbit-width, and finetune them to mitigate performance degradation with\nefficiency. We empirically verify that our approach modifies the activation\ndistribution and provides meaningful temporal information, facilitating easier\nand more accurate quantization. Our method is evaluated over three\nhigh-resolution image generation tasks and achieves state-of-the-art\nperformance under various bit-width settings, as well as being the first method\nto generate readable images on full 4-bit (i.e. W4A4) Stable Diffusion.\n","authors":["Haoxuan Wang","Yuzhang Shang","Zhihang Yuan","Junyi Wu","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2402.03666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11708v2","updated":"2024-02-06T03:10:25Z","published":"2024-01-22T06:16:29Z","title":"Mastering Text-to-Image Diffusion: Recaptioning, Planning, and\n Generating with Multimodal LLMs","summary":" Diffusion models have exhibited exceptional performance in text-to-image\ngeneration and editing. However, existing methods often face challenges when\nhandling complex text prompts that involve multiple objects with multiple\nattributes and relationships. In this paper, we propose a brand-new\ntraining-free text-to-image generation/editing framework, namely Recaption,\nPlan and Generate (RPG), harnessing the powerful chain-of-thought reasoning\nability of multimodal LLMs to enhance the compositionality of text-to-image\ndiffusion models. Our approach employs the MLLM as a global planner to\ndecompose the process of generating complex images into multiple simpler\ngeneration tasks within subregions. We propose complementary regional diffusion\nto enable region-wise compositional generation. Furthermore, we integrate\ntext-guided image generation and editing within the proposed RPG in a\nclosed-loop fashion, thereby enhancing generalization ability. Extensive\nexperiments demonstrate that our RPG outperforms state-of-the-art text-to-image\ndiffusion models, including DALL-E 3 and SDXL, particularly in multi-category\nobject composition and text-image semantic alignment. 
Notably, our RPG\nframework exhibits wide compatibility with various MLLM architectures (e.g.,\nMiniGPT-4) and diffusion backbones (e.g., ControlNet). Our code is available\nat: https://github.com/YangLing0818/RPG-DiffusionMaster\n","authors":["Ling Yang","Zhaochen Yu","Chenlin Meng","Minkai Xu","Stefano Ermon","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2401.11708v2.pdf","comment":"Project: https://github.com/YangLing0818/RPG-DiffusionMaster"},{"id":"http://arxiv.org/abs/2402.03654v1","updated":"2024-02-06T03:02:39Z","published":"2024-02-06T03:02:39Z","title":"Reviewing FID and SID Metrics on Generative Adversarial Networks","summary":" The growth of generative adversarial network (GAN) models has increased the\ncapability of image processing and provides numerous industries with the\ntechnology to produce realistic image transformations. However, with the field\nbeing recently established, there are new evaluation metrics that can further\nthis research. Previous research has shown the Fr\\'echet Inception Distance\n(FID) to be an effective metric when testing these image-to-image GANs in\nreal-world applications. Signed Inception Distance (SID), a metric introduced in\n2023, expands on FID by allowing unsigned distances. This paper uses public\ndatasets that consist of fa\\c{c}ades, cityscapes, and maps within Pix2Pix and\nCycleGAN models. After training, these models are evaluated on both inception\ndistance metrics, which measure the generating performance of the trained\nmodels. Our findings indicate that SID is an efficient and effective metric\nthat can complement, or even exceed, the ability shown using the FID for\nimage-to-image GANs.\n","authors":["Ricardo de Deijn","Aishwarya Batra","Brandon Koch","Naseef Mansoor","Hema Makkena"],"pdf_url":"https://arxiv.org/pdf/2402.03654v1.pdf","comment":"14 pages, 9 figures, 1 table. Included in IOTBS, NLTM, AIMLA, DBDM -\n 2024 Conference Proceedings. Editor: David C. Wyld et al"},{"id":"http://arxiv.org/abs/2205.14756v6","updated":"2024-02-06T02:57:35Z","published":"2022-05-29T20:07:23Z","title":"EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense\n Prediction","summary":" High-resolution dense prediction enables many appealing real-world\napplications, such as computational photography, autonomous driving, etc.\nHowever, the vast computational cost makes deploying state-of-the-art\nhigh-resolution dense prediction models on hardware devices difficult. This\nwork presents EfficientViT, a new family of high-resolution vision models with\nnovel multi-scale linear attention. Unlike prior high-resolution dense\nprediction models that rely on heavy softmax attention, hardware-inefficient\nlarge-kernel convolution, or complicated topology structure to obtain good\nperformances, our multi-scale linear attention achieves the global receptive\nfield and multi-scale learning (two desirable features for high-resolution\ndense prediction) with only lightweight and hardware-efficient operations. As\nsuch, EfficientViT delivers remarkable performance gains over previous\nstate-of-the-art models with significant speedup on diverse hardware platforms,\nincluding mobile CPU, edge GPU, and cloud GPU. Without performance loss on\nCityscapes, our EfficientViT provides up to 13.9$\\times$ and 6.2$\\times$ GPU\nlatency reduction over SegFormer and SegNeXt, respectively. For\nsuper-resolution, EfficientViT delivers up to 6.4x speedup over Restormer while\nproviding 0.11dB gain in PSNR. 
For Segment Anything, EfficientViT delivers\n48.9x higher throughput on A100 GPU while achieving slightly better zero-shot\ninstance segmentation performance on COCO.\n","authors":["Han Cai","Junyan Li","Muyan Hu","Chuang Gan","Song Han"],"pdf_url":"https://arxiv.org/pdf/2205.14756v6.pdf","comment":"ICCV 2023; Update EfficientViT-SAM results"},{"id":"http://arxiv.org/abs/2402.02085v2","updated":"2024-02-06T02:51:00Z","published":"2024-02-03T08:52:06Z","title":"DeCoF: Generated Video Detection via Frame Consistency","summary":" The escalating quality of video generated by advanced video generation\nmethods leads to new security challenges in society, which makes generated\nvideo detection an urgent research priority. To foster collaborative research\nin this area, we construct the first open-source dataset explicitly for\ngenerated video detection, providing a valuable resource for the community to\nbenchmark and improve detection methodologies. Through a series of carefully\ndesigned probe experiments, our study explores the significance of temporal and\nspatial artifacts in developing general and robust detectors for generated\nvideo. Based on the principle of video frame consistency, we introduce a simple\nyet effective detection model (DeCoF) that eliminates the impact of spatial\nartifacts during generalizing feature learning. Our extensive experiments\ndemonstrate the efficacy of DeCoF in detecting videos produced by unseen video\ngeneration models and confirm its powerful generalization capabilities across\nseveral commercial proprietary models.\n","authors":["Long Ma","Jiajia Zhang","Hongping Deng","Ningyu Zhang","Yong Liao","Haiyang Yu"],"pdf_url":"https://arxiv.org/pdf/2402.02085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04877v2","updated":"2024-02-06T02:46:28Z","published":"2023-06-08T02:17:29Z","title":"Trojan Model Detection Using Activation Optimization","summary":" Training machine learning models can be very expensive or even unaffordable.\nThis may be, for example, due to data limitations (unavailability or being too\nlarge), or computational power limitations. Therefore, it is a common practice\nto rely on open-source pre-trained models whenever possible. However, this\npractice is alarming from a security perspective. Pre-trained models can be\ninfected with Trojan attacks, in which the attacker embeds a trigger in the\nmodel such that the model's behavior can be controlled by the attacker when the\ntrigger is present in the input. In this paper, we present a novel method for\ndetecting Trojan models. Our method creates a signature for a model based on\nactivation optimization. A classifier is then trained to detect a Trojan model\ngiven its signature. We call our method TRIGS for TRojan Identification from\nGradient-based Signatures. TRIGS achieves state-of-the-art performance on two\npublic datasets of convolutional models. Additionally, we introduce a new\nchallenging dataset of ImageNet models based on the vision transformer\narchitecture. TRIGS delivers the best performance on the new dataset,\nsurpassing the baseline methods by a large margin. Our experiments also show\nthat TRIGS requires only a small amount of clean samples to achieve good\nperformance, and works reasonably well even if the defender does not have prior\nknowledge about the attacker's model architecture. Our dataset will be released\nsoon.\n","authors":["Mohamed E. 
Hussein","Sudharshan Subramaniam Janakiraman","Wael AbdAlmageed"],"pdf_url":"https://arxiv.org/pdf/2306.04877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03898v3","updated":"2024-02-06T02:23:16Z","published":"2023-07-08T04:34:09Z","title":"StyleGAN3: Generative Networks for Improving the Equivariance of\n Translation and Rotation","summary":" StyleGAN can use style to affect facial posture and identity features, and\nnoise to affect hair, wrinkles, skin color and other details. Among these, the\noutcomes of the picture processing will vary slightly between different\nversions of styleGAN. As a result, the comparison of performance differences\nbetween styleGAN2 and the two modified versions of styleGAN3 will be the main\nfocus of this study. We used the FFHQ dataset as the dataset and FID, EQ-T, and\nEQ-R were used to be the assessment of the model. In the end, we discovered\nthat Stylegan3 version is a better generative network to improve the\nequivariance. Our findings have a positive impact on the creation of animation\nand videos.\n","authors":["Tianlei Zhu","Junqi Chen","Renzhe Zhu","Gaurav Gupta"],"pdf_url":"https://arxiv.org/pdf/2307.03898v3.pdf","comment":"But now we feel we haven't fully studied our work and have found some\n new great results. So after careful consideration, we're going to rework this\n manuscript and try to give a more accurate model"},{"id":"http://arxiv.org/abs/2312.08846v3","updated":"2024-02-06T02:21:10Z","published":"2023-12-14T12:02:24Z","title":"TiMix: Text-aware Image Mixing for Effective Vision-Language\n Pre-training","summary":" Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances\nmodern Vision-Language Pre-training (VLP) models by aligning visual and\nlinguistic modalities. Due to noises in web-harvested text-image pairs,\nhowever, scaling up training data volume in SMCL presents considerable\nobstacles in terms of computational cost and data inefficiency. To improve data\nefficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates\nmix-based data augmentation techniques into SMCL, yielding significant\nperformance improvements without significantly increasing computational\noverhead. We provide a theoretical analysis of TiMixfrom a mutual information\n(MI) perspective, showing that mixed data samples for cross-modal contrastive\nlearning implicitly serve as a regularizer for the contrastive loss. The\nexperimental results demonstrate that TiMix exhibits a comparable performance\non downstream tasks, even with a reduced amount of training data and shorter\ntraining time, when benchmarked against existing methods. This work empirically\nand theoretically demonstrates the potential of data mixing for data-efficient\nand computationally viable VLP, benefiting broader VLP model adoption in\npractical scenarios.\n","authors":["Chaoya Jiang","Wei ye","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.08846v3.pdf","comment":"Accepted on AAAI2024"},{"id":"http://arxiv.org/abs/2402.03634v1","updated":"2024-02-06T02:17:44Z","published":"2024-02-06T02:17:44Z","title":"BEAM: Beta Distribution Ray Denoising for Multi-view 3D Object Detection","summary":" Multi-view 3D object detectors struggle with duplicate predictions due to the\nlack of depth information, resulting in false positive detections. 
In this\nstudy, we introduce BEAM, a novel Beta Distribution Ray Denoising approach that\ncan be applied to any DETR-style multi-view 3D detector to explicitly\nincorporate structural prior knowledge of the scene. By generating rays from\ncameras to objects and sampling spatial denoising queries from the Beta\ndistribution family along these rays, BEAM enhances the model's ability to\ndistinguish spatial hard negative samples arising from ambiguous depths. BEAM\nis a plug-and-play technique that adds only marginal computational costs during\ntraining, while impressively preserving the inference speed. Extensive\nexperiments and ablation studies on the NuScenes dataset demonstrate\nsignificant improvements over strong baselines, outperforming the\nstate-of-the-art method StreamPETR by 1.9% mAP. The code will be available at\nhttps://github.com/LiewFeng/BEAM.\n","authors":["Feng Liu","Tengteng Huang","Qianjing Zhang","Haotian Yao","Chi Zhang","Fang Wan","Qixiang Ye","Yanzhao Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.03634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03631v1","updated":"2024-02-06T02:00:18Z","published":"2024-02-06T02:00:18Z","title":"CAT-SAM: Conditional Tuning Network for Few-Shot Adaptation of\n Segmentation Anything Model","summary":" The recent Segment Anything Model (SAM) has demonstrated remarkable zero-shot\ncapability and flexible geometric prompting in general image segmentation.\nHowever, SAM often struggles when handling various unconventional images, such\nas aerial, medical, and non-RGB images. This paper presents CAT-SAM, a\nConditionAl Tuning network that adapts SAM toward various unconventional target\ntasks with just few-shot target samples. CAT-SAM freezes the entire SAM and\nadapts its mask decoder and image encoder simultaneously with a small number of\nlearnable parameters. The core design is a prompt bridge structure that enables\ndecoder-conditioned joint tuning of the heavyweight image encoder and the\nlightweight mask decoder. The bridging maps the prompt token of the mask\ndecoder to the image encoder, fostering synergic adaptation of the encoder and\nthe decoder with mutual benefits. We develop two representative tuning\nstrategies for the image encoder, which lead to two CAT-SAM variants: one\ninjecting learnable prompt tokens in the input space and the other inserting\nlightweight adapter networks. Extensive experiments over 11 unconventional\ntasks show that both CAT-SAM variants achieve superior target segmentation\nperformance consistently even under the very challenging one-shot adaptation\nsetup. Project page: \\url{https://xiaoaoran.github.io/projects/CAT-SAM}\n","authors":["Aoran Xiao","Weihao Xuan","Heli Qi","Yun Xing","Ruijie Ren","Xiaoqin Zhang","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2402.03631v1.pdf","comment":"Project page: https://xiaoaoran.github.io/projects/CAT-SAM"},{"id":"http://arxiv.org/abs/2311.15607v2","updated":"2024-02-06T01:31:24Z","published":"2023-11-27T08:00:53Z","title":"Spatially Covariant Image Registration with Text Prompts","summary":" Medical images are often characterized by their structured anatomical\nrepresentations and spatially inhomogeneous contrasts. Leveraging anatomical\npriors in neural networks can greatly enhance their utility in\nresource-constrained clinical settings. Prior research has harnessed such\ninformation for image segmentation, yet progress in deformable image\nregistration has been modest. 
Our work introduces textSCF, a novel method that\nintegrates spatially covariant filters and textual anatomical prompts encoded\nby visual-language models, to fill this gap. This approach optimizes an\nimplicit function that correlates text embeddings of anatomical regions to\nfilter weights, relaxing the typical translation-invariance constraint of\nconvolutional operations. TextSCF not only boosts computational efficiency but\ncan also retain or improve registration accuracy. By capturing the contextual\ninterplay between anatomical regions, it offers impressive inter-regional\ntransferability and the ability to preserve structural discontinuities during\nregistration. TextSCF's performance has been rigorously tested on inter-subject\nbrain MRI and abdominal CT registration tasks, outperforming existing\nstate-of-the-art models in the MICCAI Learn2Reg 2021 challenge and leading the\nleaderboard. In abdominal registrations, textSCF's larger model variant\nimproved the Dice score by 11.3% over the second-best model, while its smaller\nvariant maintained similar accuracy but with an 89.13% reduction in network\nparameters and a 98.34\\% decrease in computational operations.\n","authors":["Xiang Chen","Min Liu","Rongguang Wang","Renjiu Hu","Dongdong Liu","Gaolei Li","Hang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15607v2.pdf","comment":"13 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.04148v2","updated":"2024-02-06T01:27:17Z","published":"2023-09-08T06:24:44Z","title":"Representation Synthesis by Probabilistic Many-Valued Logic Operation in\n Self-Supervised Learning","summary":" In this paper, we propose a new self-supervised learning (SSL) method for\nrepresentations that enable logic operations. Representation learning has been\napplied to various tasks, such as image generation and retrieval. The logical\ncontrollability of representations is important for these tasks. Although some\nmethods have been shown to enable the intuitive control of representations\nusing natural languages as the inputs, representation control via logic\noperations between representations has not been demonstrated. Some SSL methods\nusing representation synthesis (e.g., elementwise mean and maximum operations)\nhave been proposed, but the operations performed in these methods do not\nincorporate logic operations. In this work, we propose a logic-operable\nself-supervised representation learning method by replacing the existing\nrepresentation synthesis with the OR operation on the probabilistic extension\nof many-valued logic. The representations comprise a set of feature-possession\ndegrees, which are truth values indicating the presence or absence of each\nfeature in the image, and realize the logic operations (e.g., OR and AND). Our\nmethod can generate a representation that has the features of both\nrepresentations or only those features common to both representations. In\naddition, the expression of the ambiguous presence of a feature is realized by\nindicating the feature-possession degree by the probability distribution of\ntruth values of the many-valued logic. We showed that our method performs\ncompetitively in single and multi-label classification tasks compared with\nprior SSL methods using synthetic representations. 
Moreover, experiments on\nimage retrieval using MNIST and PascalVOC showed that the representations of\nour method can be manipulated with OR and AND operations.\n","authors":["Hiroki Nakamura","Masashi Okada","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2309.04148v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2402.03607v1","updated":"2024-02-06T00:51:27Z","published":"2024-02-06T00:51:27Z","title":"Improving Contextual Congruence Across Modalities for Effective\n Multimodal Marketing using Knowledge-infused Learning","summary":" The prevalence of smart devices with the ability to capture moments in\nmultiple modalities has enabled users to experience multimodal information\nonline. However, Large Language Models (LLMs) and Large Vision Models (LVMs)\nare still limited in capturing holistic meaning with cross-modal semantic\nrelationships. Without explicit, common sense knowledge (e.g., as a knowledge\ngraph), Visual Language Models (VLMs) only learn implicit representations by\ncapturing high-level patterns in vast corpora, missing essential contextual\ncross-modal cues. In this work, we design a framework to couple explicit\ncommonsense knowledge in the form of knowledge graphs with large VLMs to\nimprove the performance of a downstream task, predicting the effectiveness of\nmulti-modal marketing campaigns. While the marketing application provides a\ncompelling metric for assessing our methods, our approach enables the early\ndetection of likely persuasive multi-modal campaigns and the assessment and\naugmentation of marketing theory.\n","authors":["Trilok Padhi","Ugur Kursuncu","Yaman Kumar","Valerie L. Shalin","Lane Peterson Fronczek"],"pdf_url":"https://arxiv.org/pdf/2402.03607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03592v1","updated":"2024-02-06T00:03:44Z","published":"2024-02-06T00:03:44Z","title":"GRASP: GRAph-Structured Pyramidal Whole Slide Image Representation","summary":" Cancer subtyping is one of the most challenging tasks in digital pathology,\nwhere Multiple Instance Learning (MIL) by processing gigapixel whole slide\nimages (WSIs) has been in the spotlight of recent research. However, MIL\napproaches do not take advantage of inter- and intra-magnification information\ncontained in WSIs. In this work, we present GRASP, a novel graph-structured\nmulti-magnification framework for processing WSIs in digital pathology. Our\napproach is designed to dynamically emulate the pathologist's behavior in\nhandling WSIs and benefits from the hierarchical structure of WSIs. GRASP,\nwhich introduces a convergence-based node aggregation instead of traditional\npooling mechanisms, outperforms state-of-the-art methods over two distinct\ncancer datasets by a margin of up to 10% balanced accuracy, while being 7 times\nsmaller than the closest-performing state-of-the-art model in terms of the\nnumber of parameters. Our results show that GRASP is dynamic in finding and\nconsulting with different magnifications for subtyping cancers and is reliable\nand stable across different hyperparameters. The model's behavior has been\nevaluated by two expert pathologists confirming the interpretability of the\nmodel's dynamics. We also provide a theoretical foundation, along with\nempirical evidence, for our work, explaining how GRASP interacts with different\nmagnifications and nodes in the graph to make predictions. 
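As a concrete reading of the representation synthesis in the many-valued-logic entry above: under a probabilistic extension, elementwise algebraic sum and product act as OR/AND over feature-possession degrees. A minimal sketch, assuming these standard probabilistic operators rather than the authors' exact ones:

import numpy as np

def prob_or(a, b):
    # Algebraic (probabilistic) sum: degree to which a feature is in either image.
    return a + b - a * b

def prob_and(a, b):
    # Algebraic product: degree to which a feature is common to both images.
    return a * b

# Representations as vectors of feature-possession degrees in [0, 1].
x = np.array([0.9, 0.1, 0.5])
y = np.array([0.2, 0.8, 0.5])
print(prob_or(x, y))   # synthesized representation with the features of both
print(prob_and(x, y))  # representation keeping only the shared features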
We believe that the\nstrong characteristics yet simple structure of GRASP will encourage the\ndevelopment of interpretable, structure-based designs for WSI representation in\ndigital pathology. Furthermore, we publish two large graph datasets of rare\nOvarian and Bladder cancers to contribute to the field.\n","authors":["Ali Khajegili Mirabadi","Graham Archibald","Amirali Darbandsari","Alberto Contreras-Sanz","Ramin Ebrahim Nakhli","Maryam Asadi","Allen Zhang","C. Blake Gilks","Peter Black","Gang Wang","Hossein Farahani","Ali Bashashati"],"pdf_url":"https://arxiv.org/pdf/2402.03592v1.pdf","comment":"Early version: To be updated"},{"id":"http://arxiv.org/abs/2402.04476v1","updated":"2024-02-06T23:52:10Z","published":"2024-02-06T23:52:10Z","title":"Dual-View Visual Contextualization for Web Navigation","summary":" Automatic web navigation aims to build a web agent that can follow language\ninstructions to execute complex and diverse tasks on real-world websites.\nExisting work primarily takes HTML documents as input, which define the\ncontents and action spaces (i.e., actionable elements and operations) of\nwebpages. Nevertheless, HTML documents may not provide a clear task-related\ncontext for each element, making it hard to select the right (sequence of)\nactions. In this paper, we propose to contextualize HTML elements through their\n\"dual views\" in webpage screenshots: each HTML element has its corresponding\nbounding box and visual content in the screenshot. We build upon the insight --\nweb developers tend to arrange task-related elements nearby on webpages to\nenhance user experiences -- and propose to contextualize each element with its\nneighbor elements, using both textual and visual features. The resulting\nrepresentations of HTML elements are more informative for the agent to take\naction. We validate our method on the recently released Mind2Web dataset, which\nfeatures diverse navigation domains and tasks on real-world websites. Our\nmethod consistently outperforms the baseline in all the scenarios, including\ncross-task, cross-website, and cross-domain ones.\n","authors":["Jihyung Kil","Chan Hee Song","Boyuan Zheng","Xiang Deng","Yu Su","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2402.04476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04465v1","updated":"2024-02-06T23:18:29Z","published":"2024-02-06T23:18:29Z","title":"BAdaCost: Multi-class Boosting with Costs","summary":" We present BAdaCost, a multi-class cost-sensitive classification algorithm.\nIt combines a set of cost-sensitive multi-class weak learners to obtain a\nstrong classification rule within the Boosting framework. To derive the\nalgorithm we introduce CMEL, a Cost-sensitive Multi-class Exponential Loss that\ngeneralizes the losses optimized in various classification algorithms such as\nAdaBoost, SAMME, Cost-sensitive AdaBoost and PIBoost. Hence unifying them under\na common theoretical framework. In the experiments performed we prove that\nBAdaCost achieves significant gains in performance when compared to previous\nmulti-class cost-sensitive approaches. The advantages of the proposed algorithm\nin asymmetric multi-class classification are also evaluated in practical\nmulti-view face and car detection problems.\n","authors":["Antonio Fernández-Baldera","José M. 
Buenaposada","Luis Baumela"],"pdf_url":"https://arxiv.org/pdf/2402.04465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04446v1","updated":"2024-02-06T22:32:05Z","published":"2024-02-06T22:32:05Z","title":"Pushing the limits of cell segmentation models for imaging mass\n cytometry","summary":" Imaging mass cytometry (IMC) is a relatively new technique for imaging\nbiological tissue at subcellular resolution. In recent years, learning-based\nsegmentation methods have enabled precise quantification of cell type and\nmorphology, but typically rely on large datasets with fully annotated ground\ntruth (GT) labels. This paper explores the effects of imperfect labels on\nlearning-based segmentation models and evaluates the generalisability of these\nmodels to different tissue types. Our results show that removing 50% of cell\nannotations from GT masks only reduces the dice similarity coefficient (DSC)\nscore to 0.874 (from 0.889 achieved by a model trained on fully annotated GT\nmasks). This implies that annotation time can in fact be reduced by at least\nhalf without detrimentally affecting performance. Furthermore, training our\nsingle-tissue model on imperfect labels only decreases DSC by 0.031 on an\nunseen tissue type compared to its multi-tissue counterpart, with negligible\nqualitative differences in segmentation. Additionally, bootstrapping the\nworst-performing model (with 5% of cell annotations) a total of ten times\nimproves its original DSC score of 0.720 to 0.829. These findings imply that\nless time and work can be put into the process of producing comparable\nsegmentation models; this includes eliminating the need for multiple IMC tissue\ntypes during training, whilst also providing the potential for models with very\nfew labels to improve on themselves. Source code is available on GitHub:\nhttps://github.com/kimberley/ISBI2024.\n","authors":["Kimberley M. Bird","Xujiong Ye","Alan M. Race","James M. Brown"],"pdf_url":"https://arxiv.org/pdf/2402.04446v1.pdf","comment":"International Symposium on Biomedical Imaging (ISBI) 2024 Submission"},{"id":"http://arxiv.org/abs/2402.04426v1","updated":"2024-02-06T21:56:38Z","published":"2024-02-06T21:56:38Z","title":"Quantitative Metrics for Benchmarking Medical Image Harmonization","summary":" Image harmonization is an important preprocessing strategy to address domain\nshifts arising from data acquired using different machines and scanning\nprotocols in medical imaging. However, benchmarking the effectiveness of\nharmonization techniques has been a challenge due to the lack of widely\navailable standardized datasets with ground truths. In this context, we propose\nthree metrics: two intensity harmonization metrics and one anatomy preservation\nmetric for medical images during harmonization, where no ground truths are\nrequired. Through extensive studies on a dataset with available harmonization\nground truth, we demonstrate that our metrics are correlated with established\nimage quality assessment metrics. We show how these novel metrics may be\napplied to real-world scenarios where no harmonization ground truth exists.\nAdditionally, we provide insights into different interpretations of the metric\nvalues, shedding light on their significance in the context of the\nharmonization process. As a result of our findings, we advocate for the\nadoption of these quantitative harmonization metrics as a standard for\nbenchmarking the performance of image harmonization techniques.\n","authors":["Abhijeet Parida","Zhifan Jiang","Roger J. 
Packer","Robert A. Avery","Syed M. Anwar","Marius G. Linguraru"],"pdf_url":"https://arxiv.org/pdf/2402.04426v1.pdf","comment":"Accepted for presentation at the ISBI 2024"},{"id":"http://arxiv.org/abs/2312.14395v2","updated":"2024-02-06T21:51:04Z","published":"2023-12-22T02:52:54Z","title":"Unsupervised Deep Learning Image Verification Method","summary":" Although deep learning are commonly employed for image recognition, usually\nhuge amount of labeled training data is required, which may not always be\nreadily available. This leads to a noticeable performance disparity when\ncompared to state-of-the-art unsupervised face verification techniques. In this\nwork, we propose a method to narrow this gap by leveraging an autoencoder to\nconvert the face image vector into a novel representation. Notably, the\nautoencoder is trained to reconstruct neighboring face image vectors rather\nthan the original input image vectors. These neighbor face image vectors are\nchosen through an unsupervised process based on the highest cosine scores with\nthe training face image vectors. The proposed method achieves a relative\nimprovement of 56\\% in terms of EER over the baseline system on Labeled Faces\nin the Wild (LFW) dataset. This has successfully narrowed down the performance\ngap between cosine and PLDA scoring systems.\n","authors":["Enoch Solomon","Abraham Woubie","Eyael Solomon Emiru"],"pdf_url":"https://arxiv.org/pdf/2312.14395v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01701v4","updated":"2024-02-06T21:50:42Z","published":"2023-10-02T23:38:17Z","title":"Transcending Domains through Text-to-Image Diffusion: A Source-Free\n Approach to Domain Adaptation","summary":" Domain Adaptation (DA) is a method for enhancing a model's performance on a\ntarget domain with inadequate annotated data by applying the information the\nmodel has acquired from a related source domain with sufficient labeled data.\nThe escalating enforcement of data-privacy regulations like HIPAA, COPPA,\nFERPA, etc. have sparked a heightened interest in adapting models to novel\ndomains while circumventing the need for direct access to the source data, a\nproblem known as Source-Free Domain Adaptation (SFDA). In this paper, we\npropose a novel framework for SFDA that generates source data using a\ntext-to-image diffusion model trained on the target domain samples. Our method\nstarts by training a text-to-image diffusion model on the labeled target domain\nsamples, which is then fine-tuned using the pre-trained source model to\ngenerate samples close to the source data. Finally, we use Domain Adaptation\ntechniques to align the artificially generated source data with the target\ndomain data, resulting in significant performance improvements of the model on\nthe target domain. 
Through extensive comparison against several baselines on\nthe standard Office-31, Office-Home, and VisDA benchmarks, we demonstrate the\neffectiveness of our approach for the SFDA task.\n","authors":["Shivang Chopra","Suraj Kothawade","Houda Aynaou","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2310.01701v4.pdf","comment":"Revamped the whole paper; new version will be re-submitted"},{"id":"http://arxiv.org/abs/2105.00582v2","updated":"2024-02-06T21:41:26Z","published":"2021-05-03T00:14:43Z","title":"Semi-supervised learning for generalizable intracranial hemorrhage\n detection and segmentation","summary":" Purpose: To develop and evaluate a semi-supervised learning model for\nintracranial hemorrhage detection and segmentation on an out-of-distribution\nhead CT evaluation set.\n Materials and Methods: This retrospective study used semi-supervised learning\nto bootstrap performance. An initial \"teacher\" deep learning model was trained\non 457 pixel-labeled head CT scans collected from one US institution from\n2010-2017 and used to generate pseudo-labels on a separate unlabeled corpus of\n25000 examinations from the RSNA and ASNR. A second \"student\" model was trained\non this combined pixel- and pseudo-labeled dataset. Hyperparameter tuning was\nperformed on a validation set of 93 scans. Testing for both classification\n(n=481 examinations) and segmentation (n=23 examinations, or 529 images) was\nperformed on CQ500, a dataset of 481 scans performed in India, to evaluate\nout-of-distribution generalizability. The semi-supervised model was compared\nwith a baseline model trained on only labeled data using area under the\nreceiver operating characteristic curve (AUC), Dice similarity coefficient\n(DSC), and average precision (AP) metrics.\n Results: The semi-supervised model achieved statistically significantly\nhigher examination AUC on CQ500 compared with the baseline (0.939 [0.938,\n0.940] vs. 0.907 [0.906, 0.908]) (p=0.009). It also achieved a higher DSC\n(0.829 [0.825, 0.833] vs. 0.809 [0.803, 0.812]) (p=0.012) and Pixel AP (0.848\n[0.843, 0.853] vs. 0.828 [0.817, 0.828]) compared to the baseline.\n Conclusion: The addition of unlabeled data in a semi-supervised learning\nframework demonstrates stronger generalizability potential for intracranial\nhemorrhage detection and segmentation compared with a supervised baseline.\n","authors":["Emily Lin","Esther Yuh"],"pdf_url":"https://arxiv.org/pdf/2105.00582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04416v1","updated":"2024-02-06T21:29:37Z","published":"2024-02-06T21:29:37Z","title":"A Data Centric Approach for Unsupervised Domain Generalization via\n Retrieval from Web Scale Multimodal Data","summary":" Domain generalization (DG) is an important problem that learns a model that\ncan generalize to unseen test domains leveraging one or more source domains,\nunder the assumption of shared label spaces. However, most DG methods assume\naccess to abundant source data in the target label space, a requirement that\nproves overly stringent for numerous real-world applications, where acquiring\nthe same label space as the target task is prohibitively expensive. For this\nsetting, we tackle the multimodal version of the unsupervised domain\ngeneralization (UDG) problem, which uses a large task-agnostic unlabeled source\ndataset, such as LAION-2B, during finetuning. Our framework does not explicitly\nassume any relationship between the source dataset and target task. 
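The teacher-student recipe in the hemorrhage entry above follows the standard pseudo-labeling pattern; a framework-agnostic sketch is below, where fit and predict are assumed stand-ins for the actual training and inference code:

def semi_supervised(fit, predict, labeled, unlabeled):
    # 1) Train a "teacher" on the pixel-labeled scans only.
    teacher = fit(labeled)
    # 2) Use it to pseudo-label the large unlabeled corpus.
    pseudo = [(x, predict(teacher, x)) for x in unlabeled]
    # 3) Train a "student" on the labeled and pseudo-labeled data combined.
    return fit(labeled + pseudo)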
Instead, it\nrelies only on the premise that the source dataset can be efficiently searched\nin a joint vision-language space. For this multimodal UDG setting, we propose a\nnovel method to build a small ($<$100K) subset of the source data in three\nsimple steps: (1) diversified retrieval using label names as queries, (2) rank\npseudo-labeling, and (3) clustering to find representative samples. To\ndemonstrate the value of studying the multimodal UDG problem, we compare our\nresults against state-of-the-art source-free DG and zero-shot (ZS) methods on\ntheir respective benchmarks and show up to 10% improvement in accuracy on 20\ndiverse target datasets. Additionally, our multi-stage dataset construction\nmethod achieves 3% improvement on average over nearest neighbors retrieval.\nCode is available: https://github.com/Chris210634/mudg\n","authors":["Christopher Liao","Theodoros Tsiligkaridis","Brian Kulis"],"pdf_url":"https://arxiv.org/pdf/2402.04416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09869v3","updated":"2024-02-06T21:12:24Z","published":"2022-11-17T20:17:04Z","title":"RenderDiffusion: Image Diffusion for 3D Reconstruction, Inpainting and\n Generation","summary":" Diffusion models currently achieve state-of-the-art performance for both\nconditional and unconditional image generation. However, so far, image\ndiffusion models do not support tasks required for 3D understanding, such as\nview-consistent 3D generation or single-view object reconstruction. In this\npaper, we present RenderDiffusion, the first diffusion model for 3D generation\nand inference, trained using only monocular 2D supervision. Central to our\nmethod is a novel image denoising architecture that generates and renders an\nintermediate three-dimensional representation of a scene in each denoising\nstep. This enforces a strong inductive structure within the diffusion process,\nproviding a 3D consistent representation while only requiring 2D supervision.\nThe resulting 3D representation can be rendered from any view. We evaluate\nRenderDiffusion on FFHQ, AFHQ, ShapeNet and CLEVR datasets, showing competitive\nperformance for generation of 3D scenes and inference of 3D scenes from 2D\nimages. Additionally, our diffusion-based approach allows us to use 2D\ninpainting to edit 3D scenes.\n","authors":["Titas Anciukevicius","Zexiang Xu","Matthew Fisher","Paul Henderson","Hakan Bilen","Niloy J. Mitra","Paul Guerrero"],"pdf_url":"https://arxiv.org/pdf/2211.09869v3.pdf","comment":"Accepted at CVPR 2023. Project page:\n https://github.com/Anciukevicius/RenderDiffusion"},{"id":"http://arxiv.org/abs/2402.04408v1","updated":"2024-02-06T21:07:09Z","published":"2024-02-06T21:07:09Z","title":"Detection Transformer for Teeth Detection, Segmentation, and Numbering\n in Oral Rare Diseases: Focus on Data Augmentation and Inpainting Techniques","summary":" In this work, we focused on deep learning image processing in the context of\noral rare diseases, which pose challenges due to limited data availability. A\ncrucial step involves teeth detection, segmentation and numbering in panoramic\nradiographs. To this end, we used a dataset consisting of 156 panoramic\nradiographs from individuals with rare oral diseases and labeled by experts. We\ntrained the Detection Transformer (DETR) neural network for teeth detection,\nsegmentation, and numbering the 52 teeth classes. In addition, we used data\naugmentation techniques, including geometric transformations. 
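The three-step subset construction in the multimodal UDG entry above can be sketched over precomputed joint vision-language embeddings (e.g., from a CLIP-style encoder). All names, the k-means choice, and the distance-based representative selection are assumptions, not the released code:

import numpy as np
from sklearn.cluster import KMeans

def build_source_subset(img_emb, label_emb, per_label=100, n_reps=1000):
    # (1) Diversified retrieval: nearest images per label-name text query.
    sims = img_emb @ label_emb.T                                   # (N, L)
    cand = np.unique(np.concatenate(
        [np.argsort(-sims[:, l])[:per_label] for l in range(sims.shape[1])]))
    # (2) Rank pseudo-labeling: each candidate keeps its best-scoring label.
    pseudo = sims[cand].argmax(axis=1)
    # (3) Clustering: keep candidates closest to k-means centers as representatives.
    k = min(n_reps, len(cand))
    km = KMeans(n_clusters=k, n_init=10).fit(img_emb[cand])
    dist = np.linalg.norm(img_emb[cand] - km.cluster_centers_[km.labels_], axis=1)
    keep = np.argsort(dist)[:n_reps]
    return cand[keep], pseudo[keep]   # original indices and pseudo-labels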
Finally, we\ngenerated new panoramic images using inpainting techniques with stable\ndiffusion, by removing teeth from a panoramic radiograph and integrating teeth\ninto it. The results showed an mAP exceeding 0.69 for DETR without data\naugmentation. The mAP improved to 0.82 when data augmentation techniques were\nused. Furthermore, we observed promising performance when using new panoramic\nradiographs generated with the inpainting technique, with an mAP of 0.76.\n","authors":["Hocine Kadi","Théo Sourget","Marzena Kawczynski","Sara Bendjama","Bruno Grollemund","Agnès Bloch-Zupan"],"pdf_url":"https://arxiv.org/pdf/2402.04408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17245v4","updated":"2024-02-06T20:39:17Z","published":"2023-11-28T21:39:20Z","title":"LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and\n 200+ FPS","summary":" Recent advancements in real-time neural rendering using point-based\ntechniques have paved the way for the widespread adoption of 3D\nrepresentations. However, foundational approaches like 3D Gaussian Splatting\ncome with a substantial storage overhead caused by growing the SfM points to\nmillions, often demanding gigabyte-level disk space for a single unbounded\nscene, posing significant scalability challenges and hindering the splatting\nefficiency.\n To address this challenge, we introduce LightGaussian, a novel method\ndesigned to transform 3D Gaussians into a more efficient and compact format.\nDrawing inspiration from the concept of Network Pruning, LightGaussian\nidentifies Gaussians that are insignificant in contributing to the scene\nreconstruction and adopts a pruning and recovery process, effectively reducing\nredundancy in Gaussian counts while preserving visual effects. Additionally,\nLightGaussian employs distillation and pseudo-view augmentation to distill\nspherical harmonics to a lower degree, allowing knowledge transfer to more\ncompact representations while maintaining reflectance. Furthermore, we propose\na hybrid scheme, VecTree Quantization, to quantize all attributes, resulting in\nlower bitwidth representations with minimal accuracy losses.\n In summary, LightGaussian achieves an average compression rate of over 15x\nwhile boosting the FPS from 139 to 215, enabling an efficient representation of\ncomplex scenes on the Mip-NeRF 360 and Tanks and Temples datasets.\n Project website: https://lightgaussian.github.io/\n","authors":["Zhiwen Fan","Kevin Wang","Kairun Wen","Zehao Zhu","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17245v4.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.09943v3","updated":"2024-02-06T20:17:13Z","published":"2023-10-15T20:41:07Z","title":"Evaluating Robustness of Visual Representations for Object Assembly Task\n Requiring Spatio-Geometrical Reasoning","summary":" This paper primarily focuses on evaluating and benchmarking the robustness of\nvisual representations in the context of object assembly tasks. Specifically,\nit investigates the alignment and insertion of objects with geometrical\nextrusions and intrusions, commonly referred to as a peg-in-hole task. The\naccuracy required to detect and orient the peg and the hole geometry in SE(3)\nspace for successful assembly poses significant challenges. Addressing this, we\nemploy a general framework in visuomotor policy learning that utilizes visual\npretraining models as vision encoders. 
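To illustrate the pruning-and-recovery idea in the LightGaussian entry above, here is a toy importance-based pruning pass; the opacity-times-volume significance proxy and all names are assumptions rather than the paper's exact global significance score:

import numpy as np

def prune_gaussians(opacity, scales, keep_ratio=0.34):
    # Toy significance proxy: opacity times ellipsoid volume. Low-scoring
    # Gaussians contribute least to reconstruction and are dropped; the real
    # method follows pruning with a recovery (fine-tuning) phase.
    score = opacity * np.prod(scales, axis=1)
    k = max(1, int(len(score) * keep_ratio))
    return np.argsort(-score)[:k]          # indices of Gaussians to keep

keep = prune_gaussians(np.random.rand(10000), np.random.rand(10000, 3))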
Our study investigates the robustness of\nthis framework when applied to a dual-arm manipulation setup, specifically to\nthe grasp variations. Our quantitative analysis shows that existing pretrained\nmodels fail to capture the essential visual features necessary for this task.\nHowever, a visual encoder trained from scratch consistently outperforms the\nfrozen pretrained models. Moreover, we discuss rotation representations and\nassociated loss functions that substantially improve policy learning. We\npresent a novel task scenario designed to evaluate the progress in visuomotor\npolicy learning, with a specific focus on improving the robustness of intricate\nassembly tasks that require both geometrical and spatial reasoning. Videos,\nadditional experiments, dataset, and code are available at\nhttps://bit.ly/geometric-peg-in-hole .\n","authors":["Chahyon Ku","Carl Winge","Ryan Diaz","Wentao Yuan","Karthik Desingh"],"pdf_url":"https://arxiv.org/pdf/2310.09943v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04273v1","updated":"2024-02-06T20:09:52Z","published":"2024-02-06T20:09:52Z","title":"Breaking Data Silos: Cross-Domain Learning for Multi-Agent Perception\n from Independent Private Sources","summary":" The diverse agents in multi-agent perception systems may be from different\ncompanies. Each company might use the identical classic neural network\narchitecture based encoder for feature extraction. However, the data source to\ntrain the various agents is independent and private in each company, leading to\nthe Distribution Gap of different private data for training distinct agents in\nmulti-agent perception system. The data silos by the above Distribution Gap\ncould result in a significant performance decline in multi-agent perception. In\nthis paper, we thoroughly examine the impact of the distribution gap on\nexisting multi-agent perception systems. To break the data silos, we introduce\nthe Feature Distribution-aware Aggregation (FDA) framework for cross-domain\nlearning to mitigate the above Distribution Gap in multi-agent perception. FDA\ncomprises two key components: Learnable Feature Compensation Module and\nDistribution-aware Statistical Consistency Module, both aimed at enhancing\nintermediate features to minimize the distribution gap among multi-agent\nfeatures. Intensive experiments on the public OPV2V and V2XSet datasets\nunderscore FDA's effectiveness in point cloud-based 3D object detection,\npresenting it as an invaluable augmentation to existing multi-agent perception\nsystems.\n","authors":["Jinlong Li","Baolu Li","Xinyu Liu","Runsheng Xu","Jiaqi Ma","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2402.04273v1.pdf","comment":"Accepted by the 2024 IEEE International Conference on Robotics and\n Automation (ICRA)"},{"id":"http://arxiv.org/abs/2311.12610v2","updated":"2024-02-06T19:49:11Z","published":"2023-11-21T13:52:31Z","title":"VALUED -- Vision and Logical Understanding Evaluation Dataset","summary":" Starting with early successes in computer vision tasks, deep learning based\ntechniques have since overtaken state of the art approaches in a multitude of\ndomains. However, it has been demonstrated time and again that these techniques\nfail to capture semantic context and logical constraints, instead often relying\non spurious correlations to arrive at the answer. 
Since the application of deep\nlearning techniques to critical scenarios is dependent on adherence to\ndomain-specific constraints, several attempts have been made to address this\nissue. One limitation holding back a thorough exploration of this area is a\nlack of suitable datasets which feature a rich set of rules. In order to\naddress this, we present the VALUE (Vision And Logical Understanding\nEvaluation) Dataset, consisting of 200,000$+$ annotated images and an\nassociated rule set, based on the popular board game - chess. The curated rule\nset considerably constrains the set of allowable predictions, and is designed\nto probe key semantic abilities like localization and enumeration. Alongside\nstandard metrics, additional metrics that measure performance with regard to\nlogical consistency are presented. We analyze several popular and state of the\nart vision models on this task, and show that, although their performance on\nstandard metrics is laudable, they produce a plethora of incoherent results,\nindicating that this dataset presents a significant challenge for future\nworks.\n","authors":["Soumadeep Saha","Saptarshi Saha","Utpal Garain"],"pdf_url":"https://arxiv.org/pdf/2311.12610v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04356v1","updated":"2024-02-06T19:42:18Z","published":"2024-02-06T19:42:18Z","title":"Bidirectional Autoregressive Diffusion Model for Dance Generation","summary":" Dance serves as a powerful medium for expressing human emotions, but the\nlifelike generation of dance is still a considerable challenge. Recently,\ndiffusion models have showcased remarkable generative abilities across various\ndomains. They hold promise for human motion generation due to their adaptable\nmany-to-many nature. Nonetheless, current diffusion-based motion generation\nmodels often create entire motion sequences directly and unidirectionally,\nlacking focus on the motion with local and bidirectional enhancement. When\nchoreographing high-quality dance movements, people need to take into account\nnot only the musical context but also the nearby music-aligned dance motions.\nTo authentically capture human behavior, we propose a Bidirectional\nAutoregressive Diffusion Model (BADM) for music-to-dance generation, where a\nbidirectional encoder is built to enforce that the generated dance is\nharmonious in both the forward and backward directions. To make the generated\ndance motion smoother, a local information decoder is built for local motion\nenhancement. The proposed framework is able to generate new motions based on\nthe input conditions and nearby motions, which foresees individual motion\nslices iteratively and consolidates all predictions. To further refine the\nsynchronicity between the generated dance and the beat, the beat information is\nincorporated as an input to generate better music-aligned dance movements.\nExperimental results demonstrate that the proposed model achieves\nstate-of-the-art performance compared to existing unidirectional approaches on\nthe prominent benchmark for music-to-dance generation.\n","authors":["Canyu Zhang","Youbao Tang","Ning Zhang","Ruei-Sung Lin","Mei Han","Jing Xiao","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04354v1","updated":"2024-02-06T19:37:05Z","published":"2024-02-06T19:37:05Z","title":"3D printer-controlled syringe pumps for dual, active, regulable and\n simultaneous dispensing of reagents. 
Manufacturing of immunochromatographic\n test strips","summary":" Lateral flow immunoassays (LFIA) are widely used worldwide for the detection\nof different analytes because they combine multiple advantages such as low\nproduction cost, simplicity, and portability, which allows biomarker detection\nwithout requiring infrastructure or highly trained personnel. Here we propose\nto provide solutions to the manufacturing process of LFIA at laboratory scale,\nparticularly to the controlled and active dispensing of the reagents in the\nform of the Test Lines (TL) and the Control Lines (CL). To accomplish this\ntask, we adapted a 3D printer to also control Syringe Pumps (SP), since the\nproposed adaptation of a 3D printer is easy and free, and many laboratories\nalready have one in their infrastructure. In turn, the standard function of the\n3D printer can be easily restored by disconnecting the SPs and reconnecting the\nextruder. Additionally, the unified control of the 3D printer enables dual,\nactive, regulable and simultaneous dispensing, four features that are typically\nfound only in certain high-cost commercial equipment. With the proposed setup,\nthe challenge of simultaneously dispensing at least two lines (CL and TL) with\nSPs controlled by a 3D printer was addressed, including regulation in the width\nof dispensed lines within experimental limits. Also, the construction of an\nLFIA for the detection of leptospirosis is shown as a practical example of\nautomated reagent dispensing.\n","authors":["Gabriel Siano","Leandro Peretti","Juan Manuel Marquez","Nazarena Pujato","Leonardo Giovanini","Claudio Berli"],"pdf_url":"https://arxiv.org/pdf/2402.04354v1.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.04324v1","updated":"2024-02-06T19:08:18Z","published":"2024-02-06T19:08:18Z","title":"ConsistI2V: Enhancing Visual Consistency for Image-to-Video Generation","summary":" Image-to-video (I2V) generation aims to use the initial frame (alongside a\ntext prompt) to create a video sequence. A grand challenge in I2V generation is\nto maintain visual consistency throughout the video: existing methods often\nstruggle to preserve the integrity of the subject, background, and style from\nthe first frame, as well as ensure a fluid and logical progression within the\nvideo narrative. To mitigate these issues, we propose ConsistI2V, a\ndiffusion-based method to enhance visual consistency for I2V generation.\nSpecifically, we introduce (1) spatiotemporal attention over the first frame to\nmaintain spatial and motion consistency, (2) noise initialization from the\nlow-frequency band of the first frame to enhance layout consistency. These two\napproaches enable ConsistI2V to generate highly consistent videos. We also\nextend the proposed approaches to show their potential to improve consistency\nin auto-regressive long video generation and camera motion control. To verify\nthe effectiveness of our method, we propose I2V-Bench, a comprehensive\nevaluation benchmark for I2V generation. 
Our automatic and human evaluation\nresults demonstrate the superiority of ConsistI2V over existing methods.\n","authors":["Weiming Ren","Harry Yang","Ge Zhang","Cong Wei","Xinrun Du","Stephen Huang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04324v1.pdf","comment":"Project Page: https://tiger-ai-lab.github.io/ConsistI2V/"},{"id":"http://arxiv.org/abs/2402.03214v2","updated":"2024-02-06T18:57:10Z","published":"2024-02-05T17:25:04Z","title":"Organic or Diffused: Can We Distinguish Human Art from AI-generated\n Images?","summary":" The advent of generative AI images has completely disrupted the art world.\nDistinguishing AI generated images from human art is a challenging problem\nwhose impact is growing over time. A failure to address this problem allows bad\nactors to defraud individuals paying a premium for human art and companies\nwhose stated policies forbid AI imagery. It is also critical for content owners\nto establish copyright, and for model trainers interested in curating training\ndata in order to avoid potential model collapse.\n There are several different approaches to distinguishing human art from AI\nimages, including classifiers trained by supervised learning, research tools\ntargeting diffusion models, and identification by professional artists using\ntheir knowledge of artistic techniques. In this paper, we seek to understand\nhow well these approaches can perform against today's modern generative models\nin both benign and adversarial settings. We curate real human art across 7\nstyles, generate matching images from 5 generative models, and apply 8\ndetectors (5 automated detectors and 3 different human groups including 180\ncrowdworkers, 4000+ professional artists, and 13 expert artists experienced at\ndetecting AI). Both Hive and expert artists do very well, but make mistakes in\ndifferent ways (Hive is weaker against adversarial perturbations while Expert\nartists produce higher false positives). We believe these weaknesses will\nremain as models continue to evolve, and use our data to demonstrate why a\ncombined team of human and automated detectors provides the best combination of\naccuracy and robustness.\n","authors":["Anna Yoo Jeong Ha","Josephine Passananti","Ronik Bhaskar","Shawn Shan","Reid Southen","Haitao Zheng","Ben Y. Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.03214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04301v1","updated":"2024-02-06T17:00:19Z","published":"2024-02-06T17:00:19Z","title":"Deep PCCT: Photon Counting Computed Tomography Deep Learning\n Applications Review","summary":" Medical imaging faces challenges such as limited spatial resolution,\ninterference from electronic noise and poor contrast-to-noise ratios. Photon\nCounting Computed Tomography (PCCT) has emerged as a solution, addressing these\nissues with its innovative technology. This review delves into the recent\ndevelopments and applications of PCCT in pre-clinical research, emphasizing its\npotential to overcome traditional imaging limitations. For example PCCT has\ndemonstrated remarkable efficacy in improving the detection of subtle\nabnormalities in breast, providing a level of detail previously unattainable.\nExamining the current literature on PCCT, it presents a comprehensive analysis\nof the technology, highlighting the main features of scanners and their varied\napplications. In addition, it explores the integration of deep learning into\nPCCT, along with the study of radiomic features, presenting successful\napplications in data processing. 
While acknowledging these advances, it also\ndiscusses the existing challenges in this field, paving the way for future\nresearch and improvements in medical imaging technologies. Despite the limited\nnumber of articles on this subject, due to the recent integration of PCCT at a\nclinical level, its potential benefits extend to various diagnostic\napplications.\n","authors":["Ana Carolina Alves","André Ferreira","Gijs Luijten","Jens Kleesiek","Behrus Puladi","Jan Egger","Victor Alves"],"pdf_url":"https://arxiv.org/pdf/2402.04301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04297v1","updated":"2024-02-06T15:42:38Z","published":"2024-02-06T15:42:38Z","title":"Road Surface Defect Detection -- From Image-based to Non-image-based: A\n Survey","summary":" Ensuring traffic safety is crucial, which necessitates the detection and\nprevention of road surface defects. As a result, there has been a growing\ninterest in the literature on the subject, leading to the development of\nvarious road surface defect detection methods. The methods for detecting road\ndefects can be categorised in various ways depending on the input data types or\ntraining methodologies. The predominant approach involves image-based methods,\nwhich analyse pixel intensities and surface textures to identify defects.\nDespite their popularity, image-based methods share the distinct limitation of\nvulnerability to weather and lighting changes. To address this issue,\nresearchers have explored the use of additional sensors, such as laser scanners\nor LiDARs, providing explicit depth information to enable the detection of\ndefects in terms of scale and volume. However, the exploration of data beyond\nimages has not been sufficiently investigated. In this survey paper, we provide\na comprehensive review of road surface defect detection studies, categorising\nthem based on input data types and methodologies used. Additionally, we review\nrecently proposed non-image-based methods and discuss several challenges and\nopen problems associated with these techniques.\n","authors":["Jongmin Yu","Jiaqi Jiang","Sebastiano Fichera","Paolo Paoletti","Lisa Layzell","Devansh Mehta","Shan Luo"],"pdf_url":"https://arxiv.org/pdf/2402.04297v1.pdf","comment":"Survey papers"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.17748v2","updated":"2024-02-06T17:57:00Z","published":"2023-12-29T18:59:58Z","title":"K-PERM: Personalized Response Generation Using Dynamic Knowledge\n Retrieval and Persona-Adaptive Queries","summary":" Personalizing conversational agents can enhance the quality of conversations\nand increase user engagement. However, they often lack external knowledge to\nappropriately tend to a user's persona. This is particularly crucial for\npractical applications like mental health support, nutrition planning,\nculturally sensitive conversations, or reducing toxic behavior in\nconversational agents. To enhance the relevance and comprehensiveness of\npersonalized responses, we propose using a two-step approach that involves (1)\nselectively integrating user personas and (2) contextualizing the response with\nsupplementing information from a background knowledge source. We develop K-PERM\n(Knowledge-guided PErsonalization with Reward Modulation), a dynamic\nconversational agent that combines these elements. K-PERM achieves\nstate-of-the-art performance on the popular FoCus dataset, containing\nreal-world personalized conversations concerning global landmarks. 
We show that\nusing responses from K-PERM can improve performance in state-of-the-art LLMs\n(GPT 3.5) by 10.5%, highlighting the impact of K-PERM for personalizing\nchatbots.\n","authors":["Kanak Raj","Kaushik Roy","Vamshi Bonagiri","Priyanshul Govil","Krishnaprasad Thirunarayanan","Manas Gaur"],"pdf_url":"https://arxiv.org/pdf/2312.17748v2.pdf","comment":"Accepted at AAAI 2024 Spring Symposium Series"},{"id":"http://arxiv.org/abs/2302.06014v2","updated":"2024-02-06T16:08:10Z","published":"2023-02-12T22:04:27Z","title":"Online Recommendations for Agents with Discounted Adaptive Preferences","summary":" We consider a bandit recommendations problem in which an agent's preferences\n(representing selection probabilities over recommended items) evolve as a\nfunction of past selections, according to an unknown $\\textit{preference\nmodel}$. In each round, we show a menu of $k$ items (out of $n$ total) to the\nagent, who then chooses a single item, and we aim to minimize regret with\nrespect to some $\\textit{target set}$ (a subset of the item simplex) for\nadversarial losses over the agent's choices. Extending the setting from Agarwal\nand Brown (2022), where uniform-memory agents were considered, here we allow\nfor non-uniform memory in which a discount factor is applied to the agent's\nmemory vector at each subsequent round. In the \"long-term memory\" regime (when\nthe effective memory horizon scales with $T$ sublinearly), we show that\nefficient sublinear regret is obtainable with respect to the set of\n$\\textit{everywhere instantaneously realizable distributions}$ (the \"EIRD set\",\nas formulated in prior work) for any $\\textit{smooth}$ preference model.\nFurther, for preferences which are bounded above and below by linear functions\nof memory weight (we call these \"scale-bounded\" preferences) we give an\nalgorithm which obtains efficient sublinear regret with respect to nearly the\n$\\textit{entire}$ item simplex. We show an NP-hardness result for expanding to\ntargets beyond EIRD in general. In the \"short-term memory\" regime (when the\nmemory horizon is constant), we show that scale-bounded preferences again\nenable efficient sublinear regret for nearly the entire simplex even without\nsmoothness if losses do not change too frequently, yet we show an\ninformation-theoretic barrier for competing against the EIRD set under\narbitrary smooth preference models even when losses are constant.\n","authors":["Arpit Agarwal","William Brown"],"pdf_url":"https://arxiv.org/pdf/2302.06014v2.pdf","comment":"Updates for camera-ready version (ALT 2024)"},{"id":"http://arxiv.org/abs/2401.09885v3","updated":"2024-02-06T15:09:13Z","published":"2024-01-18T10:56:27Z","title":"Source Code Clone Detection Using Unsupervised Similarity Measures","summary":" Assessing similarity in source code has gained significant attention in\nrecent years due to its importance in software engineering tasks such as clone\ndetection and code search and recommendation. This work presents a comparative\nanalysis of unsupervised similarity measures for identifying source code clone\ndetection. The goal is to overview the current state-of-the-art techniques,\ntheir strengths, and weaknesses. To do that, we compile the existing\nunsupervised strategies and evaluate their performance on a benchmark dataset\nto guide software engineers in selecting appropriate methods for their specific\nuse cases. 
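The discounted-memory dynamics in the adaptive-preferences entry above reduce to a short update rule; this is a sketch of the setting under the assumption of a one-hot reinforcement of the chosen item, not the paper's algorithms:

import numpy as np

def update_memory(memory, choice, gamma=0.95):
    # Decay the whole memory vector by the discount factor, then reinforce
    # the item the agent just selected (non-uniform memory).
    e = np.zeros_like(memory)
    e[choice] = 1.0
    return gamma * memory + e

m = np.zeros(5)
for pick in [2, 2, 4]:
    m = update_memory(m, pick)   # selection probabilities then depend on m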
The source code of this study is available at\nhttps://github.com/jorge-martinez-gil/codesim\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2401.09885v3.pdf","comment":"Accepted for publication as Full Paper in the Software Quality Days\n 2024, Vienna, Austria"},{"id":"http://arxiv.org/abs/2401.16979v2","updated":"2024-02-06T15:05:02Z","published":"2024-01-30T13:04:20Z","title":"Re3val: Reinforced and Reranked Generative Retrieval","summary":" Generative retrieval models encode pointers to information in a corpus as an\nindex within the model's parameters. These models serve as part of a larger\npipeline, where retrieved information conditions generation for\nknowledge-intensive NLP tasks. However, we identify two limitations: first,\ngenerative retrieval does not account for contextual information; second, the\nretrieval cannot be tuned for the downstream readers, as decoding the page\ntitle is a non-differentiable operation. This paper introduces Re3val, trained\nwith generative reranking and reinforcement learning using limited data. Re3val\nleverages context acquired via Dense Passage Retrieval to rerank the retrieved\npage titles and utilizes REINFORCE to maximize rewards generated by constrained\ndecoding. Additionally, we generate questions from our pre-training dataset to\nmitigate epistemic uncertainty and bridge the domain gap between the\npre-training and fine-tuning datasets. Subsequently, we extract and rerank\ncontexts from the KILT database using the reranked page titles. Upon grounding\nthe top five reranked contexts, Re3val achieves the top-1 KILT scores among all\nother generative retrieval models across five KILT datasets.\n","authors":["EuiYul Song","Sangryul Kim","Haeju Lee","Joonkee Kim","James Thorne"],"pdf_url":"https://arxiv.org/pdf/2401.16979v2.pdf","comment":"17 pages, 4 figures, Findings of the Association for Computational\n Linguistics: EACL 2023"},{"id":"http://arxiv.org/abs/2402.03916v1","updated":"2024-02-06T11:33:57Z","published":"2024-02-06T11:33:57Z","title":"Can Large Language Models Detect Rumors on Social Media?","summary":" In this work, we investigate the use of Large Language Models (LLMs) for\nrumor detection on social media. However, it is challenging for LLMs to reason\nover the entire propagation information on social media, which contains news\ncontent and numerous comments, because LLMs may not concentrate on key clues in\nthe complex propagation information, and have trouble reasoning when facing\nmassive and redundant information. 
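As one representative of the unsupervised similarity measures compared in the clone-detection entry above, a token-set Jaccard score is among the simplest baselines; an illustrative sketch, not the study's full measure suite:

def jaccard_clone_score(code_a: str, code_b: str) -> float:
    # Jaccard overlap of whitespace token sets; higher means more clone-like.
    ta, tb = set(code_a.split()), set(code_b.split())
    return len(ta & tb) / len(ta | tb) if (ta | tb) else 1.0

print(jaccard_clone_score("int add(int a, int b)", "int add(int x, int y)"))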
Accordingly, we propose an LLM-empowered\nRumor Detection (LeRuD) approach, in which we design prompts to teach LLMs to\nreason over important clues in news and comments, and divide the entire\npropagation information into a Chain-of-Propagation for reducing LLMs' burden.\nWe conduct extensive experiments on the Twitter and Weibo datasets, and LeRuD\noutperforms several state-of-the-art rumor detection models by 2.4% to 7.6%.\nMeanwhile, by applying LLMs, LeRuD requires no data for training, and thus\nshows more promising rumor detection ability in few-shot or zero-shot\nscenarios.\n","authors":["Qiang Liu","Xiang Tao","Junfei Wu","Shu Wu","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.03916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03915v1","updated":"2024-02-06T11:31:04Z","published":"2024-02-06T11:31:04Z","title":"Learning Metrics that Maximise Power for Accelerated A/B-Tests","summary":" Online controlled experiments are a crucial tool to allow for confident\ndecision-making in technology companies. A North Star metric is defined (such\nas long-term revenue or user retention), and system variants that statistically\nsignificantly improve on this metric in an A/B-test can be considered superior.\nNorth Star metrics are typically delayed and insensitive. As a result, the cost\nof experimentation is high: experiments need to run for a long time, and even\nthen, type-II errors (i.e. false negatives) are prevalent.\n We propose to tackle this by learning metrics from short-term signals that\ndirectly maximise the statistical power they harness with respect to the North\nStar. We show that existing approaches are prone to overfitting, in that higher\naverage metric sensitivity does not imply improved type-II errors, and propose\nto instead minimise the $p$-values a metric would have produced on a log of\npast experiments. We collect such datasets from two social media applications\nwith over 160 million Monthly Active Users each, totalling over 153 A/B-pairs.\nEmpirical results show that we are able to increase statistical power by up to\n78% when using our learnt metrics stand-alone, and by up to 210% when used in\ntandem with the North Star. Alternatively, we can obtain constant statistical\npower at a sample size that is down to 12% of what the North Star requires,\nsignificantly reducing the cost of experimentation.\n","authors":["Olivier Jeunen","Aleksei Ustimenko"],"pdf_url":"https://arxiv.org/pdf/2402.03915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03801v1","updated":"2024-02-06T08:39:44Z","published":"2024-02-06T08:39:44Z","title":"On Practical Diversified Recommendation with Controllable Category\n Diversity Framework","summary":" Recommender systems have made significant strides in various industries,\nprimarily driven by extensive efforts to enhance recommendation accuracy.\nHowever, this pursuit of accuracy has inadvertently given rise to echo\nchamber/filter bubble effects. Especially in industry, it could impair users'\nexperiences and prevent users from accessing a wider range of items. One of the\nsolutions is to take diversity into account. However, most existing works\nfocus on users' explicit preferences, while rarely exploring users'\nnon-interaction preferences. 
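A minimal sketch of the objective in the A/B-metric entry above: score a candidate metric, here a weighted sum of short-term signals, by the p-value it would have produced on one logged experiment, and minimize that across the log. The names and the Welch t-test choice are assumptions:

import numpy as np
from scipy.stats import ttest_ind

def metric_pvalue(weights, a_signals, b_signals):
    # Project each user's short-term signals onto the learnt metric, then
    # compute the two-sample p-value for this A/B pair (smaller = more power).
    return ttest_ind(a_signals @ weights, b_signals @ weights, equal_var=False).pvalue

# Training would choose `weights` minimizing the mean p-value over many logged
# A/B pairs, subject to agreeing with the North Star's direction.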
These neglected non-interaction preferences are\nespecially important for broadening users' interests and alleviating echo\nchamber/filter bubble effects. Therefore, in this paper, we first define\ndiversity via two distinct notions, i.e., user-explicit diversity\n(U-diversity) and user-item non-interaction diversity (N-diversity), based on\nuser historical behaviors. Then, we propose a succinct and effective method,\nnamed Controllable Category Diversity Framework (CCDF), to achieve both high\nU-diversity and N-diversity simultaneously. Specifically, CCDF consists of two\nstages, User-Category Matching and Constrained Item Matching. The User-Category\nMatching stage utilizes the DeepU2C model and a combined loss to capture users'\npreferences in categories, and then selects the top-$K$ categories with a\ncontrollable parameter $K$. These top-$K$ categories will be used as trigger\ninformation in Constrained Item Matching. Offline experimental results show\nthat our proposed DeepU2C outperforms state-of-the-art diversity-oriented\nmethods, especially on the N-diversity task. The whole framework is validated\nin a real-world production environment by conducting online A/B testing.\n","authors":["Tao Zhang","Luwei Yang","Zhibo Xiao","Wen Jiang","Wei Ning"],"pdf_url":"https://arxiv.org/pdf/2402.03801v1.pdf","comment":"A Two-stage Controllable Category Diversity Framework for\n Recommendation"},{"id":"http://arxiv.org/abs/2210.10718v2","updated":"2024-02-06T06:01:27Z","published":"2022-10-19T16:53:08Z","title":"Whole Page Unbiased Learning to Rank","summary":" Page presentation biases in information retrieval systems, especially on\nclick behavior, are a well-known challenge that hinders improving ranking\nmodels' performance with implicit user feedback. Unbiased Learning to\nRank (ULTR) algorithms are thus proposed to learn an unbiased ranking model\nwith biased click data. However, most existing algorithms are specifically\ndesigned to mitigate position-related bias, e.g., trust bias, without\nconsidering biases induced by other features in search result page\npresentation (SERP), e.g., attractiveness bias induced by multimedia.\nUnfortunately, those biases widely exist in industrial systems and may lead to\nan unsatisfactory search experience. Therefore, we introduce a new problem,\ni.e., whole-page Unbiased Learning to Rank (WP-ULTR), aiming to handle biases\ninduced by whole-page SERP features simultaneously. It presents tremendous\nchallenges: (1) a suitable user behavior model (user behavior hypothesis) can\nbe hard to find; and (2) complex biases cannot be handled by existing\nalgorithms. To address the above challenges, we propose a Bias Agnostic\nwhole-page unbiased Learning to rank algorithm, named BAL, to automatically\nfind the user behavior model with causal discovery and mitigate the biases\ninduced by multiple SERP features with no specific design. Experimental results\non a real-world dataset verify the effectiveness of BAL.\n","authors":["Haitao Mao","Lixin Zou","Yujia Zheng","Jiliang Tang","Xiaokai Chu","Jiashu Zhao","Qian Wang","Dawei Yin"],"pdf_url":"https://arxiv.org/pdf/2210.10718v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2304.07944v4","updated":"2024-02-06T05:14:22Z","published":"2023-04-17T01:55:40Z","title":"An In-depth Investigation of User Response Simulation for Conversational\n Search","summary":" Conversational search has seen increased recent attention in both the IR and\nNLP communities. 
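The two-stage CCDF flow above amounts to picking the top-K categories and then matching items only inside them; the sketch below uses illustrative names (DeepU2C itself is a learned user-to-category model this stub does not reproduce):

import numpy as np

def ccdf_recommend(user_cat_scores, item_cats, item_scores, k=3, n_items=20):
    # Stage 1: User-Category Matching with a controllable parameter K.
    top_cats = set(np.argsort(-user_cat_scores)[:k].tolist())
    # Stage 2: Constrained Item Matching restricted to the triggered categories.
    allowed = np.array([c in top_cats for c in item_cats])
    ranked = np.argsort(-np.where(allowed, item_scores, -np.inf))
    return ranked[:min(n_items, int(allowed.sum()))]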
It seeks to clarify and solve users' search needs through\nmulti-turn natural language interactions. However, most existing systems are\ntrained and demonstrated with recorded or artificial conversation logs.\nEventually, conversational search systems should be trained, evaluated, and\ndeployed in an open-ended setting with unseen conversation trajectories. A key\nchallenge is that training and evaluating such systems both require a\nhuman-in-the-loop, which is expensive and does not scale. One strategy is to\nsimulate users, thereby reducing the scaling costs. However, current user\nsimulators are either limited to only responding to yes-no questions from the\nconversational search system or unable to produce high-quality responses in\ngeneral.\n In this paper, we show that existing user simulation systems could be\nsignificantly improved by a smaller finetuned natural language generation\nmodel. However, rather than merely reporting it as the new state-of-the-art, we\nconsider it a strong baseline and present an in-depth investigation of\nsimulating user response for conversational search. Our goal is to supplement\nexisting work with an insightful hand-analysis of unsolved challenges by the\nbaseline and propose our solutions. The challenges we identified include (1) a\nblind spot that is difficult to learn, and (2) a specific type of misevaluation\nin the standard setup. We propose a new generation system to effectively cover\nthe training blind spot and suggest a new evaluation setup to avoid\nmisevaluation. Our proposed system leads to significant improvements over\nexisting systems and large language models such as GPT-4. Additionally, our\nanalysis provides insights into the nature of user simulation to facilitate\nfuture work.\n","authors":["Zhenduo Wang","Zhichao Xu","Qingyao Ai","Vivek Srikumar"],"pdf_url":"https://arxiv.org/pdf/2304.07944v4.pdf","comment":"To appear in The Web Conference 2024, 8 pages with Appendices"},{"id":"http://arxiv.org/abs/2308.07134v5","updated":"2024-02-06T03:08:44Z","published":"2023-08-14T13:41:09Z","title":"Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models has revolutionized\nvarious AI research domains. Transformers-based Large Language Models (LLMs)\nhave gradually replaced CNNs and RNNs to unify fields of computer vision and\nnatural language processing. Compared with independent data samples such as\nimages, videos or texts, graphs usually contain rich structural and relational\ninformation. Meanwhile, language, especially natural language, being one of the\nmost expressive mediums, excels in describing complex structures. However,\nexisting work on incorporating graph problems into the generative language\nmodeling framework remains very limited. Considering the rising prominence of\nLLMs, it becomes essential to explore whether LLMs can also replace GNNs as the\nfoundation model for graphs. In this paper, we propose InstructGLM\n(Instruction-finetuned Graph Language Model) with highly scalable prompts based\non natural language instructions. We use natural language to describe\nmulti-scale geometric structure of the graph and then instruction finetune an\nLLM to perform graph tasks, which enables Generative Graph Learning. Our method\nsurpasses all GNN baselines on ogbn-arxiv, Cora and PubMed datasets,\nunderscoring its effectiveness and sheds light on generative LLMs as new\nfoundation model for graph machine learning. 
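Instruction-tuning an LLM on graphs, as in the InstructGLM entry above, starts by verbalizing a node's neighborhood as natural language; the template below is an illustrative guess at such a prompt format, not the paper's exact one:

def graph_to_instruction(edges, node_text, target):
    # Verbalize the 1-hop structure around `target` as a natural-language task.
    nbrs = sorted({v for u, v in edges if u == target} | {u for u, v in edges if v == target})
    lines = [f"Node {target}: {node_text[target]}."]
    lines.append("Neighbors: " + "; ".join(f"node {n} ({node_text[n]})" for n in nbrs) + ".")
    lines.append(f"Instruction: predict the category of node {target}.")
    return "\n".join(lines)

print(graph_to_instruction([(0, 1), (2, 0)], {0: "GNN survey", 1: "BERT", 2: "GCN"}, 0))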
Our code is open-sourced at\nhttps://github.com/agiresearch/InstructGLM.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v5.pdf","comment":"In EACL 2024"},{"id":"http://arxiv.org/abs/2402.03049v2","updated":"2024-02-06T02:51:23Z","published":"2024-02-05T14:33:56Z","title":"EasyInstruct: An Easy-to-use Instruction Processing Framework for Large\n Language Models","summary":" In recent years, instruction tuning has gained increasing attention and\nemerged as a crucial technique to enhance the capabilities of Large Language\nModels (LLMs). To construct high-quality instruction datasets, many instruction\nprocessing approaches have been proposed, aiming to achieve a delicate balance\nbetween data quantity and data quality. Nevertheless, due to inconsistencies\nthat persist among various instruction processing methods, there is no standard\nopen-source instruction processing implementation framework available for the\ncommunity, which hinders practitioners from further development and\nadvancement. To facilitate instruction processing research and development, we\npresent EasyInstruct, an easy-to-use instruction processing framework for LLMs,\nwhich modularizes instruction generation, selection, and prompting, while also\nconsidering their combination and interaction. EasyInstruct is publicly\nreleased and actively maintained at https://github.com/zjunlp/EasyInstruct,\nalong with a running demo App at\nhttps://huggingface.co/spaces/zjunlp/EasyInstruct for a quick start, calling\nfor broader research centered on instruction data.\n","authors":["Yixin Ou","Ningyu Zhang","Honghao Gui","Ziwen Xu","Shuofei Qiao","Yida Xue","Runnan Fang","Kangwei Liu","Lei Li","Zhen Bi","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03049v2.pdf","comment":"Ongoing work; the project website is at\n https://zjunlp.github.io/project/EasyInstruct, code is at\n https://github.com/zjunlp/EasyInstruct, demo is at\n https://huggingface.co/spaces/zjunlp/EasyInstruct"},{"id":"http://arxiv.org/abs/2402.01253v3","updated":"2024-02-06T02:24:59Z","published":"2024-02-02T09:20:48Z","title":"RimiRec: Modeling Refined Multi-interest in Hierarchical Structure for\n Recommendation","summary":" Industrial recommender systems usually consist of the retrieval stage and the\nranking stage, to handle billions of users and items. The retrieval\nstage retrieves candidate items relevant to user interests for recommendations\nand has attracted much attention. Frequently, a user shows refined\nmulti-interests in a hierarchical structure. For example, a user likes Conan\nand Kuroba Kaito, which are roles in the hierarchical structure \"Animation,\nJapanese Animation, Detective Conan\". However, most existing methods ignore\nthis hierarchical nature, and simply average the fine-grained interest\ninformation. Therefore, we propose a novel two-stage approach to explicitly\nmodel refined multi-interests in a hierarchical structure for recommendation.\nIn the first stage, hierarchical multi-interest mining, hierarchical\nclustering and a transformer-based model adaptively generate circles or\nsub-circles that users are interested in. In the second stage, the partition of\nthe retrieval space allows the EBR models to deal only with items within each\ncircle and accurately capture users' refined interests. Experimental results\nshow that the proposed approach achieves state-of-the-art performance. 
Our\nframework has also been deployed at Lofter.\n","authors":["Haolei Pei","Yuanyuan Xu","Yangping Zhu","Yuan Nie"],"pdf_url":"https://arxiv.org/pdf/2402.01253v3.pdf","comment":"4 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.03635v1","updated":"2024-02-06T02:19:06Z","published":"2024-02-06T02:19:06Z","title":"Retrieval Augmented Cross-Modal Tag Recommendation in Software Q&A Sites","summary":" Posts in software Q\&A sites often consist of three main parts: title,\ndescription and code, which are interconnected and jointly describe the\nquestion. Existing tag recommendation methods often treat different modalities\nas a whole or inadequately consider the interaction between different\nmodalities. Additionally, they focus on extracting information directly from\nthe post itself, neglecting the information from external knowledge sources.\nTherefore, we propose a Retrieval Augmented Cross-Modal (RACM) Tag\nRecommendation Model for software Q\&A sites. Specifically, we first use the\ninput post as a query and enhance the representation of different modalities by\nretrieving information from external knowledge sources. For the\nretrieval-augmented representations, we employ a cross-modal context-aware\nattention mechanism to leverage the main modality (description) for targeted\nfeature extraction across the submodalities (title and code). In the fusion\nprocess, a gate mechanism is employed to achieve fine-grained feature selection,\ncontrolling the amount of information extracted from the submodalities.\nFinally, the fused information is used for tag recommendation. Experimental\nresults on three real-world datasets demonstrate that our model outperforms\nstate-of-the-art counterparts.\n","authors":["Sijin Lu","Pengyu Xu","Bing Liu","Hongjian Sun","Liping Jing","Jian Yu"],"pdf_url":"https://arxiv.org/pdf/2402.03635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03616v1","updated":"2024-02-06T01:05:14Z","published":"2024-02-06T01:05:14Z","title":"Leveraging Large Language Models for Hybrid Workplace Decision Support","summary":" Large Language Models (LLMs) hold the potential to perform a variety of text\nprocessing tasks and provide textual explanations for proposed actions or\ndecisions. In the era of hybrid work, LLMs can provide intelligent decision\nsupport for workers who are designing their hybrid work plans. In particular,\nthey can offer suggestions and explanations to workers balancing numerous\ndecision factors, thereby enhancing their work experience. In this paper, we\npresent a decision support model for workspaces in hybrid work environments,\nleveraging the reasoning skill of LLMs. We first examine the LLM's capability\nto make suitable workspace suggestions. We find that its reasoning extends\nbeyond the guidelines in the prompt and the LLM can manage the trade-off among\nthe available resources in the workspaces. We conduct an extensive user study\nto understand workers' decision process for workspace choices and evaluate the\neffectiveness of the system. We observe that a worker's decision could be\ninfluenced by the LLM's suggestions and explanations. The participants in our\nstudy find the system to be convenient, regardless of whether reasons are\nprovided or not. 
Our results show that employees can benefit from the\nLLM-empowered system for their workspace selection in a hybrid workplace.\n","authors":["Yujin Kim","Chin-Chia Hsu"],"pdf_url":"https://arxiv.org/pdf/2402.03616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03600v1","updated":"2024-02-06T00:20:49Z","published":"2024-02-06T00:20:49Z","title":"Understanding and Counteracting Feature-Level Bias in Click-Through Rate\n Prediction","summary":" Common click-through rate (CTR) prediction recommender models tend to exhibit\nfeature-level bias, which leads to unfair recommendations among item groups and\ninaccurate recommendations for users. While existing methods address this issue\nby adjusting the learning of CTR models, such as through additional\noptimization objectives, they fail to consider how the bias is caused within\nthese models. To address this research gap, our study performs a top-down\nanalysis on representative CTR models. By blocking different components of\na trained CTR model one by one, we identify the key contribution of the linear\ncomponent to feature-level bias. We conduct a theoretical analysis of the\nlearning process for the weights in the linear component, revealing how\ngroup-wise properties of training data influence them. Our experimental and\nstatistical analyses demonstrate a strong correlation between imbalanced\npositive sample ratios across item groups and feature-level bias. Based on this\nunderstanding, we propose a minimally invasive yet effective strategy to\ncounteract feature-level bias in CTR models by removing the biased linear\nweights from trained models. Additionally, we present a linear weight adjusting\nstrategy that requires fewer random exposure records than relevant debiasing\nmethods. The superiority of our proposed strategies is validated through\nextensive experiments on three real-world datasets.\n","authors":["Jinqiu Jin","Sihao Ding","Wenjie Wang","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2402.03600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03597v1","updated":"2024-02-06T00:14:53Z","published":"2024-02-06T00:14:53Z","title":"Identifying Reasons for Contraceptive Switching from Real-World Data\n Using Large Language Models","summary":" Prescription contraceptives play a critical role in supporting women's\nreproductive health. With nearly 50 million women in the United States using\ncontraceptives, understanding the factors that drive contraceptive selection\nand switching is of significant interest. However, many factors related to\nmedication switching are often only captured in unstructured clinical notes and\ncan be difficult to extract. Here, we evaluate the zero-shot abilities of a\nrecently developed large language model, GPT-4 (via HIPAA-compliant Microsoft\nAzure API), to identify reasons for switching between classes of contraceptives\nfrom the UCSF Information Commons clinical notes dataset. We demonstrate that\nGPT-4 can accurately extract reasons for contraceptive switching, outperforming\nbaseline BERT-based models with micro-F1 scores of 0.849 and 0.881 for\ncontraceptive start and stop extraction, respectively. Human evaluation of\nGPT-4-extracted reasons for switching showed 91.4% accuracy, with minimal\nhallucinations. Using extracted reasons, we identified patient preference,\nadverse events, and insurance as key reasons for switching using unsupervised\ntopic modeling approaches. 
Notably, using our approach, we also showed that\n\"weight gain/mood change\" and \"insurance coverage\" are disproportionately found\nas reasons for contraceptive switching in specific demographic populations. Our\ncode and supplemental data are available at\nhttps://github.com/BMiao10/contraceptive-switching.\n","authors":["Brenda Y. Miao","Christopher YK Williams","Ebenezer Chinedu-Eneh","Travis Zack","Emily Alsentzer","Atul J. Butte","Irene Y. Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04457v1","updated":"2024-02-06T22:52:47Z","published":"2024-02-06T22:52:47Z","title":"Reliability quality measures for recommender systems","summary":" Users want to know the reliability of the recommendations; they do not accept\nhigh predictions if there is no reliability evidence. Recommender systems\nshould provide reliability values associated with the predictions. Research\ninto reliability measures requires the existence of simple, plausible and\nuniversal reliability quality measures. Research into recommender system\nquality measures has focused on accuracy. Moreover, novelty, serendipity and\ndiversity have been studied; nevertheless, there is an important lack of\nresearch into reliability/confidence quality measures.\n This paper proposes a reliability quality prediction measure (RPI) and a\nreliability quality recommendation measure (RRI). Both quality measures are\nbased on the hypothesis that the more suitable a reliability measure is, the\nbetter accuracy results it will provide when applied. These reliability quality\nmeasures show accuracy improvements when appropriate reliability values are\nassociated with their predictions (i.e., high reliability values associated\nwith correct predictions or low reliability values associated with incorrect\npredictions).\n The proposed reliability quality metrics will lead to the design of brand-new\nrecommender system reliability measures. These measures could be applied to\ndifferent matrix factorization techniques and to content-based, context-aware\nand social recommendation approaches. The recommender system reliability\nmeasures designed could be tested, compared and improved using the proposed\nreliability quality metrics.\n","authors":["Jesús Bobadilla","Abraham Gutierrez","Fernando Ortega","Bo Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.04457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04453v1","updated":"2024-02-06T22:42:28Z","published":"2024-02-06T22:42:28Z","title":"The Potential of AutoML for Recommender Systems","summary":" Automated Machine Learning (AutoML) has greatly advanced applications of\nMachine Learning (ML) including model compression, machine translation, and\ncomputer vision. Recommender Systems (RecSys) can be seen as an application of\nML. Yet, AutoML has found little attention in the RecSys community; nor has\nRecSys found notable attention in the AutoML community. Only a few relatively\nsimple Automated Recommender Systems (AutoRecSys) libraries exist that adopt\nAutoML techniques. However, these libraries are based on student projects and\ndo not offer the features and thorough development of AutoML libraries. We set\nout to determine how AutoML libraries perform in the scenario of an\ninexperienced user who wants to implement a recommender system. We compared the\npredictive performance of 60 AutoML, AutoRecSys, ML, and RecSys algorithms from\n15 libraries, including a mean predictor baseline, on 14 explicit feedback\nRecSys datasets. 
To simulate the perspective of an inexperienced user, the\nalgorithms were evaluated with default hyperparameters. We found that AutoML\nand AutoRecSys libraries performed best. AutoML libraries performed best for\nsix of the 14 datasets (43%), but it was not always the same AutoML library\nperforming best. The single-best library was the AutoRecSys library\nAuto-Surprise, which performed best on five datasets (36%). On three datasets\n(21%), AutoML libraries performed poorly, and RecSys libraries with default\nparameters performed best. However, while obtaining 50% of all placements in\nthe top five per dataset, RecSys algorithms fall behind AutoML on average. ML\nalgorithms generally performed the worst.\n","authors":["Tobias Vente","Joeran Beel"],"pdf_url":"https://arxiv.org/pdf/2402.04453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04357v1","updated":"2024-02-06T19:43:52Z","published":"2024-02-06T19:43:52Z","title":"Building Retrieval Systems for the ClueWeb22-B Corpus","summary":" The ClueWeb22 dataset containing nearly 10 billion documents was released in\n2022 to support academic and industry research. The goal of this project was to\nbuild retrieval baselines for the English section of the \"super head\" part\n(category B) of this dataset. These baselines can then be used by the research\ncommunity to compare their systems and also to generate data to train/evaluate\nnew retrieval and ranking algorithms. The report covers sparse and dense\nfirst-stage retrieval as well as neural rerankers that were implemented for\nthis dataset. These systems are available as a service on a Carnegie Mellon\nUniversity cluster.\n","authors":["Harshit Mehrotra","Jamie Callan","Zhen Fan"],"pdf_url":"https://arxiv.org/pdf/2402.04357v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2402.04249v1","updated":"2024-02-06T18:59:08Z","published":"2024-02-06T18:59:08Z","title":"HarmBench: A Standardized Evaluation Framework for Automated Red Teaming\n and Robust Refusal","summary":" Automated red teaming holds substantial promise for uncovering and mitigating\nthe risks associated with the malicious use of large language models (LLMs),\nyet the field lacks a standardized evaluation framework to rigorously assess\nnew methods. To address this issue, we introduce HarmBench, a standardized\nevaluation framework for automated red teaming. We identify several desirable\nproperties previously unaccounted for in red teaming evaluations and\nsystematically design HarmBench to meet these criteria. Using HarmBench, we\nconduct a large-scale comparison of 18 red teaming methods and 33 target LLMs\nand defenses, yielding novel insights. We also introduce a highly efficient\nadversarial training method that greatly enhances LLM robustness across a wide\nrange of attacks, demonstrating how HarmBench enables co-development of attacks\nand defenses. We open-source HarmBench at\nhttps://github.com/centerforaisafety/HarmBench.\n","authors":["Mantas Mazeika","Long Phan","Xuwang Yin","Andy Zou","Zifan Wang","Norman Mu","Elham Sakhaee","Nathaniel Li","Steven Basart","Bo Li","David Forsyth","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2402.04249v1.pdf","comment":"Website: https://www.harmbench.org"},{"id":"http://arxiv.org/abs/2402.04248v1","updated":"2024-02-06T18:56:35Z","published":"2024-02-06T18:56:35Z","title":"Can Mamba Learn How to Learn? 
A Comparative Study on In-Context Learning\n Tasks","summary":" State-space models (SSMs), such as Mamba (Gu & Dao, 2023), have been proposed\nas alternatives to Transformer networks in language modeling, by incorporating\ngating, convolutions, and input-dependent token selection to mitigate the\nquadratic cost of multi-head attention. Although SSMs exhibit competitive\nperformance, their in-context learning (ICL) capabilities, a remarkable\nemergent property of modern language models that enables task execution without\nparameter optimization, remain underexplored compared to Transformers. In this\nstudy, we evaluate the ICL performance of SSMs, focusing on Mamba, against\nTransformer models across various tasks. Our results show that SSMs perform\ncomparably to Transformers in standard regression ICL tasks, while\noutperforming them in tasks like sparse parity learning. However, SSMs fall\nshort in tasks involving non-standard retrieval functionality. To address these\nlimitations, we introduce a hybrid model, MambaFormer, that combines Mamba with\nattention blocks, surpassing individual models in tasks where they struggle\nindependently. Our findings suggest that hybrid architectures offer promising\navenues for enhancing ICL in language models.\n","authors":["Jongho Park","Jaeseung Park","Zheyang Xiong","Nayoung Lee","Jaewoong Cho","Samet Oymak","Kangwook Lee","Dimitris Papailiopoulos"],"pdf_url":"https://arxiv.org/pdf/2402.04248v1.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.18948v3","updated":"2024-02-06T18:56:18Z","published":"2023-10-29T09:15:22Z","title":"Building a Safer Maritime Environment Through Multi-Path Long-Term\n Vessel Trajectory Forecasting","summary":" Maritime transportation is paramount in achieving global economic growth,\nentailing concurrent ecological obligations in sustainability and safeguarding\nendangered marine species, most notably preserving large whale populations. In\nthis regard, the Automatic Identification System (AIS) data plays a significant\nrole by offering real-time streaming data on vessel movement, allowing enhanced\ntraffic monitoring. This study explores using AIS data to prevent\nvessel-to-whale collisions by forecasting long-term vessel trajectories from\nengineered AIS data sequences. For such a task, we have developed an\nencoder-decoder model architecture using Bidirectional Long Short-Term Memory\nNetworks (Bi-LSTM) to predict the next 12 hours of vessel trajectories using 1\nto 3 hours of AIS data as input. We feed the model with probabilistic features\nengineered from historical AIS data that refer to each trajectory's potential\nroute and destination. The model then predicts the vessel's trajectory,\nconsidering these additional features by leveraging convolutional layers for\nspatial feature learning and a position-aware attention mechanism that\nincreases the importance of recent timesteps of a sequence during temporal\nfeature learning. The probabilistic features have F1 scores of approximately\n85% and 75% for the two feature types, respectively, demonstrating their\neffectiveness in augmenting information to the neural network. We test our\nmodel on the Gulf of St. Lawrence, a region known to be the habitat of North\nAtlantic Right Whales (NARW). Our model achieved a high R2 score of over 98%\nusing various techniques and features. It stands out among other approaches as\nit can make complex decisions during turns and path selection. 
Our study\nhighlights the potential of data engineering and trajectory forecasting models\nfor marine life species preservation.\n","authors":["Gabriel Spadon","Jay Kumar","Matthew Smith","Sarah Vela","Romina Gehrmann","Derek Eden","Joshua van Berkel","Amilcar Soares","Ronan Fablet","Ronald Pelot","Stan Matwin"],"pdf_url":"https://arxiv.org/pdf/2310.18948v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06118v2","updated":"2024-02-06T18:55:25Z","published":"2024-01-11T18:54:44Z","title":"Extreme Compression of Large Language Models via Additive Quantization","summary":" The emergence of accurate open large language models (LLMs) has led to a race\ntowards quantization techniques for such models enabling execution on end-user\ndevices. In this paper, we revisit the problem of \"extreme\" LLM\ncompression--defined as targeting extremely low bit counts, such as 2 to 3 bits\nper parameter--from the point of view of classic methods in Multi-Codebook\nQuantization (MCQ). Our work builds on top of Additive Quantization, a classic\nalgorithm from the MCQ family, and adapts it to the quantization of language\nmodels. The resulting algorithm advances the state-of-the-art in LLM\ncompression, outperforming all recently-proposed techniques in terms of\naccuracy at a given compression budget. For instance, when compressing Llama 2\nmodels to 2 bits per parameter, our algorithm quantizes the 7B model to 6.93\nperplexity (a 1.29 improvement relative to the best prior work, and 1.81 points\nfrom FP16), the 13B model to 5.70 perplexity (a 0.36 improvement) and the 70B\nmodel to 3.94 perplexity (a 0.22 improvement) on WikiText2. We release our\nimplementation of Additive Quantization for Language Models (AQLM) as a\nbaseline to facilitate future research in LLM quantization.\n","authors":["Vage Egiazarian","Andrei Panferov","Denis Kuznedelev","Elias Frantar","Artem Babenko","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2401.06118v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2402.04247v1","updated":"2024-02-06T18:54:07Z","published":"2024-02-06T18:54:07Z","title":"Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science","summary":" Intelligent agents powered by large language models (LLMs) have demonstrated\nsubstantial promise in autonomously conducting experiments and facilitating\nscientific discoveries across various disciplines. While their capabilities are\npromising, they also introduce novel vulnerabilities that demand careful\nconsideration for safety. However, there exists a notable gap in the\nliterature, as there has been no comprehensive exploration of these\nvulnerabilities. This position paper fills this gap by conducting a thorough\nexamination of vulnerabilities in LLM-based agents within scientific domains,\nshedding light on potential risks associated with their misuse and emphasizing\nthe need for safety measures. We begin by providing a comprehensive overview of\nthe potential risks inherent to scientific LLM agents, taking into account user\nintent, the specific scientific domain, and their potential impact on the\nexternal environment. Then, we delve into the origins of these vulnerabilities\nand provide a scoping review of the limited existing works. Based on our\nanalysis, we propose a triadic framework involving human regulation, agent\nalignment, and an understanding of environmental feedback (agent regulation) to\nmitigate these identified risks. 
Furthermore, we highlight the limitations and\nchallenges associated with safeguarding scientific agents and advocate for the\ndevelopment of improved models, robust benchmarks, and comprehensive\nregulations to address these issues effectively.\n","authors":["Xiangru Tang","Qiao Jin","Kunlun Zhu","Tongxin Yuan","Yichi Zhang","Wangchunshu Zhou","Meng Qu","Yilun Zhao","Jian Tang","Zhuosheng Zhang","Arman Cohan","Zhiyong Lu","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2402.04247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04239v1","updated":"2024-02-06T18:47:52Z","published":"2024-02-06T18:47:52Z","title":"CAST: Clustering Self-Attention using Surrogate Tokens for Efficient\n Transformers","summary":" The Transformer architecture has been shown to be a powerful tool for a wide\nrange of tasks. It is based on the self-attention mechanism, which is an\ninherently computationally expensive operation with quadratic computational\ncomplexity: memory usage and compute time increase quadratically with the\nlength of the input sequences, thus limiting the application of Transformers.\nIn this work, we propose a novel Clustering self-Attention mechanism using\nSurrogate Tokens (CAST), to optimize the attention computation and achieve\nefficient transformers. CAST utilizes learnable surrogate tokens to construct a\ncluster affinity matrix, used to cluster the input sequence and generate novel\ncluster summaries. The self-attention from within each cluster is then combined\nwith the cluster summaries of other clusters, enabling information flow across\nthe entire input sequence. CAST improves efficiency by reducing the complexity\nfrom $O(N^2)$ to $O(\alpha N)$, where $N$ is the sequence length and $\alpha$\nis a constant determined by the number of clusters and samples per cluster. We\nshow that CAST performs better than or comparably to baseline Transformers on\nlong-range sequence modeling tasks, while also achieving better time and memory\nefficiency than other efficient transformers.\n","authors":["Adjorn van Engelenhoven","Nicola Strisciuglio","Estefanía Talavera"],"pdf_url":"https://arxiv.org/pdf/2402.04239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03300v2","updated":"2024-02-06T18:39:38Z","published":"2024-02-05T18:55:32Z","title":"DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open\n Language Models","summary":" Mathematical reasoning poses a significant challenge for language models due\nto its complex and structured nature. In this paper, we introduce DeepSeekMath\n7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B\nmath-related tokens sourced from Common Crawl, together with natural language\nand code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the\ncompetition-level MATH benchmark without relying on external toolkits and\nvoting techniques, approaching the performance level of Gemini-Ultra and GPT-4.\nSelf-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH.\nThe mathematical reasoning capability of DeepSeekMath is attributed to two key\nfactors: First, we harness the significant potential of publicly available web\ndata through a meticulously engineered data selection pipeline. 
Second, we\nintroduce Group Relative Policy Optimization (GRPO), a variant of Proximal\nPolicy Optimization (PPO), which enhances mathematical reasoning abilities\nwhile concurrently optimizing the memory usage of PPO.\n","authors":["Zhihong Shao","Peiyi Wang","Qihao Zhu","Runxin Xu","Junxiao Song","Mingchuan Zhang","Y. K. Li","Y. Wu","Daya Guo"],"pdf_url":"https://arxiv.org/pdf/2402.03300v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04229v1","updated":"2024-02-06T18:36:52Z","published":"2024-02-06T18:36:52Z","title":"MusicRL: Aligning Music Generation to Human Preferences","summary":" We propose MusicRL, the first music generation system finetuned from human\nfeedback. Appreciation of text-to-music models is particularly subjective since\nthe concept of musicality as well as the specific intention behind a caption\nare user-dependent (e.g., a caption such as \"upbeat work-out music\" can map to\na retro guitar solo or a techno pop beat). Not only does this make supervised\ntraining of such models challenging, but it also calls for integrating\ncontinuous human feedback into their post-deployment finetuning. MusicRL is a\npretrained autoregressive MusicLM (Agostinelli et al., 2023) model of discrete\naudio tokens finetuned with reinforcement learning to maximise sequence-level\nrewards. We design reward functions related specifically to text-adherence and\naudio quality with help from selected raters, and use those to finetune\nMusicLM into MusicRL-R. We deploy MusicLM to users and collect a substantial\ndataset comprising 300,000 pairwise preferences. Using Reinforcement Learning\nfrom Human Feedback (RLHF), we train MusicRL-U, the first text-to-music model\nthat incorporates human feedback at scale. Human evaluations show that both\nMusicRL-R and MusicRL-U are preferred to the baseline. Ultimately, MusicRL-RU\ncombines the two approaches and results in the best model according to human\nraters. Ablation studies shed light on the musical attributes influencing human\npreferences, indicating that text adherence and quality only account for a part\nof it. This underscores the prevalence of subjectivity in musical appreciation\nand calls for further involvement of human listeners in the finetuning of music\ngeneration models.\n","authors":["Geoffrey Cideron","Sertan Girgin","Mauro Verzetti","Damien Vincent","Matej Kastelic","Zalán Borsos","Brian McWilliams","Victor Ungureanu","Olivier Bachem","Olivier Pietquin","Matthieu Geist","Léonard Hussenot","Neil Zeghidour","Andrea Agostinelli"],"pdf_url":"https://arxiv.org/pdf/2402.04229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15574v3","updated":"2024-02-06T18:36:42Z","published":"2023-12-25T01:00:58Z","title":"Clustered Switchback Experiments: Near-Optimal Rates Under\n Spatiotemporal Interference","summary":" We consider experimentation in the presence of non-stationarity, inter-unit\n(spatial) interference, and carry-over effects (temporal interference), where\nwe wish to estimate the global average treatment effect (GATE), the difference\nbetween average outcomes having exposed all units at all times to treatment or\nto control. We suppose spatial interference is described by a graph, where a\nunit's outcome depends on its neighborhood's treatment assignments, and that\ntemporal interference is described by a hidden Markov decision process, where\nthe transition kernel under either treatment (action) satisfies a rapid mixing\ncondition. 
We propose a clustered switchback design, where units are grouped\ninto clusters, time steps are grouped into blocks, and each whole\ncluster-block combination is assigned a single random treatment. Under this\ndesign, we show that for graphs that admit good clustering, a truncated\nexposure-mapping Horvitz-Thompson estimator achieves $\tilde O(1/NT)$\nmean-squared error (MSE), matching an $\Omega(1/NT)$ lower bound up to\nlogarithmic terms. Our results simultaneously generalize the $N=1$ setting of\nHu and Wager (2022) (and improve on the MSE bound shown therein for\ndifference-in-means estimators) as well as the $T=1$ settings of Ugander et al.\n(2013) and Leung (2022). Simulation studies validate the favorable performance\nof our approach.\n","authors":["Su Jia","Nathan Kallus","Christina Lee Yu"],"pdf_url":"https://arxiv.org/pdf/2312.15574v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04216v1","updated":"2024-02-06T18:17:02Z","published":"2024-02-06T18:17:02Z","title":"Resource-Aware Hierarchical Federated Learning in Wireless Video Caching\n Networks","summary":" Backhaul traffic congestion caused by the video traffic of a few popular\nfiles can be alleviated by storing the to-be-requested content at various\nlevels in wireless video caching networks. Typically, content service providers\n(CSPs) own the content, and the users request their preferred content from the\nCSPs using their (wireless) internet service providers (ISPs). As these parties\ndo not reveal their private information and business secrets, traditional\ntechniques may not be readily used to predict the dynamic changes in users'\nfuture demands. Motivated by this, we propose a novel resource-aware\nhierarchical federated learning (RawHFL) solution for predicting users' future\ncontent requests. A practical data acquisition technique is used that allows\nthe user to update its local training dataset based on its requested content.\nMoreover, since networking and other computational resources are limited, and\nonly a subset of the users participate in the model training, we derive the\nconvergence bound of the proposed algorithm. Based on this bound, we minimize a\nweighted utility function for jointly configuring the controllable parameters\nto train RawHFL energy-efficiently under practical resource constraints. Our\nextensive simulation results validate the proposed algorithm's superiority, in\nterms of test accuracy and energy cost, over existing baselines.\n","authors":["Md Ferdous Pervej","Andreas F. Molisch"],"pdf_url":"https://arxiv.org/pdf/2402.04216v1.pdf","comment":"Under review for possible publication in IEEE TWC"},{"id":"http://arxiv.org/abs/2402.04211v1","updated":"2024-02-06T18:09:05Z","published":"2024-02-06T18:09:05Z","title":"Variational Shapley Network: A Probabilistic Approach to Self-Explaining\n Shapley values with Uncertainty Quantification","summary":" Shapley values have emerged as a foundational tool in machine learning (ML)\nfor elucidating model decision-making processes. Despite their widespread\nadoption and unique ability to satisfy essential explainability axioms,\ncomputational challenges persist in their estimation when ($i$) evaluating a\nmodel over all possible subsets of input feature combinations, ($ii$)\nestimating model marginals, and ($iii$) addressing variability in explanations.\nWe introduce a novel, self-explaining method that simplifies the computation of\nShapley values significantly, requiring only a single forward pass. 
Recognizing\nthe deterministic treatment of Shapley values as a limitation, we explore\nincorporating a probabilistic framework to capture the inherent uncertainty in\nexplanations. Unlike alternatives, our technique does not rely directly on the\nobserved data space to estimate marginals; instead, it uses adaptable baseline\nvalues derived from a latent, feature-specific embedding space, generated by a\nnovel masked neural network architecture. Evaluations on simulated and real\ndatasets underscore our technique's robust predictive and explanatory\nperformance.\n","authors":["Mert Ketenci","Iñigo Urteaga","Victor Alfonso Rodriguez","Noémie Elhadad","Adler Perotte"],"pdf_url":"https://arxiv.org/pdf/2402.04211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03301v2","updated":"2024-02-06T18:08:40Z","published":"2024-01-06T20:52:04Z","title":"On Sample-Efficient Offline Reinforcement Learning: Data Diversity,\n Posterior Sampling, and Beyond","summary":" We seek to understand what facilitates sample-efficient learning from\nhistorical datasets for sequential decision-making, a problem that is popularly\nknown as offline reinforcement learning (RL). Further, we are interested in\nalgorithms that enjoy sample efficiency while leveraging (value) function\napproximation. In this paper, we address these fundamental questions by (i)\nproposing a notion of data diversity that subsumes the previous notions of\ncoverage measures in offline RL and (ii) using this notion to unify three\ndistinct classes of offline RL algorithms based on version spaces (VS),\nregularized optimization (RO), and posterior sampling (PS). We establish that\nVS-based, RO-based, and PS-based algorithms, under standard assumptions,\nachieve \emph{comparable} sample efficiency, which recovers the\nstate-of-the-art sub-optimality bounds for finite and linear model classes with\nthe standard assumptions. This result is surprising, given that the prior work\nsuggested an unfavorable sample complexity of the RO-based algorithm compared\nto the VS-based algorithm, whereas posterior sampling is rarely considered in\noffline RL due to its explorative nature. Notably, our proposed model-free\nPS-based algorithm for offline RL is novel, with sub-optimality bounds that\nare frequentist (i.e., worst-case) in nature.\n","authors":["Thanh Nguyen-Tang","Raman Arora"],"pdf_url":"https://arxiv.org/pdf/2401.03301v2.pdf","comment":"NeurIPS'23; Arxiv is the authors' preferred version; v2: add a\n missing related work"},{"id":"http://arxiv.org/abs/2306.03933v4","updated":"2024-02-06T18:05:54Z","published":"2023-06-06T18:01:03Z","title":"High-dimensional and Permutation Invariant Anomaly Detection","summary":" Methods for anomaly detection of new physics processes are often limited to\nlow-dimensional spaces due to the difficulty of learning high-dimensional\nprobability densities. Particularly at the constituent level, incorporating\ndesirable properties such as permutation invariance and variable-length inputs\nbecomes difficult within popular density estimation methods. In this work, we\nintroduce a permutation-invariant density estimator for particle physics data\nbased on diffusion models, specifically designed to handle variable-length\ninputs. We demonstrate the efficacy of our methodology by utilizing the learned\ndensity as a permutation-invariant anomaly detection score, effectively\nidentifying jets with low likelihood under the background-only hypothesis. 
To\nvalidate our density estimation method, we investigate the ratio of learned\ndensities and compare them to those obtained by a supervised classification\nalgorithm.\n","authors":["Vinicius Mikuni","Benjamin Nachman"],"pdf_url":"https://arxiv.org/pdf/2306.03933v4.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.04209v1","updated":"2024-02-06T18:05:30Z","published":"2024-02-06T18:05:30Z","title":"Acute kidney injury prediction for non-critical care patients: a\n retrospective external and internal validation study","summary":" Background: Acute kidney injury (AKI), the decline of kidney excretory\nfunction, occurs in up to 18% of hospitalized admissions. Progression of AKI\nmay lead to irreversible kidney damage. Methods: This retrospective cohort\nstudy includes adult patients admitted to a non-intensive care unit at the\nUniversity of Pittsburgh Medical Center (UPMC) (n = 46,815) and University of\nFlorida Health (UFH) (n = 127,202). We developed and compared deep learning and\nconventional machine learning models to predict progression to Stage 2 or\nhigher AKI within the next 48 hours. We trained local models for each site (UFH\nModel trained on UFH, UPMC Model trained on UPMC) and a separate model with a\ndevelopment cohort of patients from both sites (UFH-UPMC Model). We internally\nand externally validated the models on each site and performed subgroup\nanalyses across sex and race. Results: Stage 2 or higher AKI occurred in 3%\n(n=3,257) and 8% (n=2,296) of UFH and UPMC patients, respectively. Area under\nthe receiver operating characteristic curve (AUROC) values for the UFH test\ncohort ranged between 0.77 (UPMC Model) and 0.81 (UFH Model), while AUROC\nvalues ranged between 0.79 (UFH Model) and 0.83 (UPMC Model) for the UPMC test\ncohort. The UFH-UPMC Model achieved an AUROC of 0.81 (95% confidence interval\n[CI] [0.80, 0.83]) for UFH and 0.82 (95% CI [0.81, 0.84]) for UPMC test\ncohorts, and area under the precision-recall curve (AUPRC) values of 0.06 (95%\nCI [0.05, 0.06]) for UFH and 0.13 (95% CI [0.11, 0.15]) for UPMC test cohorts.\nKinetic estimated glomerular filtration rate, nephrotoxic drug burden and blood\nurea nitrogen remained the top three features with the highest influence across\nthe models and health centers. Conclusion: Locally developed models displayed\nmarginally reduced discrimination when tested on another institution, while the\ntop set of influencing features remained the same across the models and sites.\n","authors":["Esra Adiyeke","Yuanfang Ren","Benjamin Shickel","Matthew M. Ruppert","Ziyuan Guan","Sandra L. Kane-Gill","Raghavan Murugan","Nabihah Amatullah","Britney A. Stottlemyer","Tiffany L. Tran","Dan Ricketts","Christopher M Horvat","Parisa Rashidi","Azra Bihorac","Tezcan Ozrazgat-Baslanti"],"pdf_url":"https://arxiv.org/pdf/2402.04209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02416v2","updated":"2024-02-06T18:02:01Z","published":"2024-02-04T09:24:51Z","title":"Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction","summary":" Efforts to align Large Language Models (LLMs) are mainly conducted via\nReinforcement Learning from Human Feedback (RLHF) methods. However, RLHF\nencounters major challenges including training reward models, actor-critic\nengineering, and importantly, it requires access to LLM parameters. Here we\nintroduce Aligner, a new efficient alignment paradigm that bypasses the whole\nRLHF process by learning the correctional residuals between the aligned and the\nunaligned answers. 
Our Aligner offers several key advantages. Firstly, it is an\nautoregressive seq2seq model that is trained on the query-answer-correction\ndataset via supervised learning; this offers a parameter-efficient alignment\nsolution with minimal resources. Secondly, the Aligner facilitates\nweak-to-strong generalization; finetuning large pretrained models with\nAligner's supervisory signals yields a strong performance boost. Thirdly,\nAligner functions as a model-agnostic plug-and-play module, allowing for its\ndirect application to different open-source and API-based models. Remarkably,\nAligner-7B improves 11 different LLMs by 21.9% in helpfulness and 23.8% in\nharmlessness on average (GPT-4 by 17.5% and 26.9%). When finetuning (strong)\nLlama2-70B with (weak) Aligner-13B's supervision, we can improve Llama2 by 8.2%\nin helpfulness and 61.6% in harmlessness. See our dataset and code at\nhttps://aligner2024.github.io\n","authors":["Jiaming Ji","Boyuan Chen","Hantao Lou","Donghai Hong","Borong Zhang","Xuehai Pan","Juntao Dai","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2402.02416v2.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2308.09687v4","updated":"2024-02-06T18:00:18Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over the state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. This work brings LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Michal Podstawski","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00405v3","updated":"2024-02-06T17:56:03Z","published":"2023-07-01T18:35:21Z","title":"Provably Efficient UCB-type Algorithms For Learning Predictive State\n Representations","summary":" The general sequential decision-making problem, which includes Markov\ndecision processes (MDPs) and partially observable MDPs (POMDPs) as special\ncases, aims at maximizing a cumulative reward by making a sequence of decisions\nbased on a history of observations and actions over time. Recent studies have\nshown that the sequential decision-making problem is statistically learnable if\nit admits a low-rank structure modeled by predictive state representations\n(PSRs). Despite these advancements, existing approaches typically involve\noracles or steps that are computationally intractable. 
On the other hand, the\nupper confidence bound (UCB) based approaches, which have served successfully\nas computationally efficient methods in bandits and MDPs, have not been\ninvestigated for more general PSRs, due to the difficulty of optimistic bonus\ndesign in these more challenging settings. This paper proposes the first known\nUCB-type approach for PSRs, featuring a novel bonus term that upper bounds the\ntotal variation distance between the estimated and true models. We further\ncharacterize the sample complexity bounds for our designed UCB-type algorithms\nfor both online and offline PSRs. In contrast to existing approaches for PSRs,\nour UCB-type algorithms enjoy computational tractability, last-iterate\nguaranteed near-optimal policy, and guaranteed model accuracy.\n","authors":["Ruiquan Huang","Yingbin Liang","Jing Yang"],"pdf_url":"https://arxiv.org/pdf/2307.00405v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.02441v2","updated":"2024-02-06T17:53:31Z","published":"2024-02-04T10:41:40Z","title":"TopoX: A Suite of Python Packages for Machine Learning on Topological\n Domains","summary":" We introduce topox, a Python software suite that provides reliable and\nuser-friendly building blocks for computing and machine learning on topological\ndomains that extend graphs: hypergraphs, simplicial, cellular, path and\ncombinatorial complexes. topox consists of three packages: toponetx facilitates\nconstructing and computing on these domains, including working with nodes,\nedges and higher-order cells; topoembedx provides methods to embed topological\ndomains into vector spaces, akin to popular graph-based embedding algorithms\nsuch as node2vec; topomodelx is built on top of PyTorch and offers a\ncomprehensive toolbox of higher-order message passing functions for neural\nnetworks on topological domains. The extensively documented and unit-tested\nsource code of topox is available under MIT license at\nhttps://github.com/pyt-team.\n","authors":["Mustafa Hajij","Mathilde Papillon","Florian Frantzen","Jens Agerberg","Ibrahem AlJabea","Ruben Ballester","Claudio Battiloro","Guillermo Bernárdez","Tolga Birdal","Aiden Brent","Peter Chin","Sergio Escalera","Odin Hoff Gardaa","Gurusankar Gopalakrishnan","Devendra Govil","Josef Hoppe","Maneel Reddy Karri","Jude Khouja","Manuel Lecha","Neal Livesay","Jan Meißner","Soham Mukherjee","Alexander Nikitin","Theodore Papamarkou","Jaro Prílepok","Karthikeyan Natesan Ramamurthy","Paul Rosen","Aldo Guzmán-Sáenz","Alessandro Salatiello","Shreyas N. Samaga","Michael T. Schaub","Luca Scofano","Indro Spinelli","Lev Telyatnikov","Quang Truong","Robin Walters","Maosheng Yang","Olga Zaghen","Ghada Zamzmi","Ali Zia","Nina Miolane"],"pdf_url":"https://arxiv.org/pdf/2402.02441v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09844v2","updated":"2024-02-06T17:53:02Z","published":"2023-09-18T14:59:11Z","title":"CC-SGG: Corner Case Scenario Generation using Learned Scene Graphs","summary":" Corner case scenarios are an essential tool for testing and validating the\nsafety of autonomous vehicles (AVs). As these scenarios are often\ninsufficiently present in naturalistic driving datasets, augmenting the data\nwith synthetic corner cases greatly enhances the safe operation of AVs in\nunique situations. However, the generation of synthetic, yet realistic, corner\ncases poses a significant challenge. 
In this work, we introduce a novel\napproach based on Heterogeneous Graph Neural Networks (HGNNs) to transform\nregular driving scenarios into corner cases. To achieve this, we first generate\nconcise representations of regular driving scenes as scene graphs, minimally\nmanipulating their structure and properties. Our model then learns to perturb\nthose graphs to generate corner cases using attention and triple embeddings.\nThe input and perturbed graphs are then imported back into the simulation to\ngenerate corner case scenarios. Our model successfully learned to produce\ncorner cases from input scene graphs, achieving 89.9% prediction accuracy on\nour testing dataset. We further validate the generated scenarios on baseline\nautonomous driving methods, demonstrating our model's ability to effectively\ncreate critical situations for the baselines.\n","authors":["George Drayson","Efimia Panagiotaki","Daniel Omeiza","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2309.09844v2.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2402.02772v2","updated":"2024-02-06T17:49:44Z","published":"2024-02-05T07:12:02Z","title":"Contrastive Diffuser: Planning Towards High Return States via\n Contrastive Learning","summary":" Applying diffusion models in reinforcement learning for long-term planning\nhas gained much attention recently. Several diffusion-based methods have\nsuccessfully leveraged the modeling capabilities of diffusion for arbitrary\ndistributions. These methods generate subsequent trajectories for planning and\nhave demonstrated significant improvement. However, these methods are limited\nby their plain base distributions and by overlooking the diversity of\nsamples, in which different states have different returns. They simply leverage\ndiffusion to learn the distribution of the offline dataset and generate\ntrajectories whose states share the same distribution as the offline dataset.\nAs a result, the probability of these models reaching the high-return states is\nlargely dependent on the dataset distribution. Even when equipped with a\nguidance model, the performance is still suppressed. To address these\nlimitations, in this paper, we propose a novel method called CDiffuser, which\ndevises a return contrast mechanism to pull the states in generated\ntrajectories towards high-return states while pushing them away from low-return\nstates to improve the base distribution. Experiments on 14 commonly used D4RL\nbenchmarks demonstrate the effectiveness of our proposed method.\n","authors":["Yixiang Shan","Zhengbang Zhu","Ting Long","Qifan Liang","Yi Chang","Weinan Zhang","Liang Yin"],"pdf_url":"https://arxiv.org/pdf/2402.02772v2.pdf","comment":"13 pages with appendix and references, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2402.04193v1","updated":"2024-02-06T17:49:02Z","published":"2024-02-06T17:49:02Z","title":"Gradient Coding in Decentralized Learning for Evading Stragglers","summary":" In this paper, we consider a decentralized learning problem in the presence\nof stragglers. Although gradient coding techniques have been developed for\ndistributed learning to evade stragglers, where the devices send encoded\ngradients with redundant training data, it is difficult to apply those\ntechniques directly to decentralized learning scenarios. To deal with this\nproblem, we propose a new gossip-based decentralized learning method with\ngradient coding (GOCO). 
In the proposed method, to avoid the negative impact of\nstragglers, the parameter vectors are updated locally using encoded gradients\nbased on the framework of stochastic gradient coding and then averaged in a\ngossip-based manner. We analyze the convergence performance of GOCO for\nstrongly convex loss functions, and we provide simulation results that\ndemonstrate the superiority of the proposed method over baseline methods in\nterms of learning performance.\n","authors":["Chengxi Li","Mikael Skoglund"],"pdf_url":"https://arxiv.org/pdf/2402.04193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00809v2","updated":"2024-02-06T17:48:56Z","published":"2024-02-01T17:45:26Z","title":"Position Paper: Bayesian Deep Learning in the Age of Large-Scale AI","summary":" In the current landscape of deep learning research, there is a predominant\nemphasis on achieving high predictive accuracy in supervised tasks involving\nlarge image and language datasets. However, a broader perspective reveals a\nmultitude of overlooked metrics, tasks, and data types, such as uncertainty,\nactive and continual learning, and scientific data, that demand attention.\nBayesian deep learning (BDL) constitutes a promising avenue, offering\nadvantages across these diverse settings. This paper posits that BDL can\nelevate the capabilities of deep learning. It revisits the strengths of BDL,\nacknowledges existing challenges, and highlights some exciting research avenues\naimed at addressing these obstacles. Looking ahead, the discussion focuses on\npossible ways to combine large-scale foundation models with BDL to unlock their\nfull potential.\n","authors":["Theodore Papamarkou","Maria Skoularidou","Konstantina Palla","Laurence Aitchison","Julyan Arbel","David Dunson","Maurizio Filippone","Vincent Fortuin","Philipp Hennig","Jose Miguel Hernandez Lobato","Aliaksandr Hubin","Alexander Immer","Theofanis Karaletsos","Mohammad Emtiyaz Khan","Agustinus Kristiadi","Yingzhen Li","Stephan Mandt","Christopher Nemeth","Michael A. Osborne","Tim G. J. Rudner","David Rügamer","Yee Whye Teh","Max Welling","Andrew Gordon Wilson","Ruqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.00809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04182v1","updated":"2024-02-06T17:42:39Z","published":"2024-02-06T17:42:39Z","title":"Reinforcement Learning with Ensemble Model Predictive Safety\n Certification","summary":" Reinforcement learning algorithms need exploration to learn. However,\nunsupervised exploration prevents the deployment of such algorithms on\nsafety-critical tasks and limits their real-world applicability. In this paper,\nwe propose a new algorithm called Ensemble Model Predictive Safety\nCertification that combines model-based deep reinforcement learning with\ntube-based model predictive control to correct the actions taken by a learning\nagent, keeping safety constraint violations at a minimum through planning. Our\napproach aims to reduce the amount of prior knowledge about the actual system\nby requiring only offline data generated by a safe controller. Our results show\nthat we can achieve significantly fewer constraint violations than comparable\nreinforcement learning methods.\n","authors":["Sven Gronauer","Tom Haider","Felippe Schmoeller da Roza","Klaus Diepold"],"pdf_url":"https://arxiv.org/pdf/2402.04182v1.pdf","comment":"Published in: Proc. 
of the 23rd International Conference on\n Autonomous Agents and Multiagent Systems (AAMAS 2024)"},{"id":"http://arxiv.org/abs/2402.04177v1","updated":"2024-02-06T17:31:20Z","published":"2024-02-06T17:31:20Z","title":"Scaling Laws for Downstream Task Performance of Large Language Models","summary":" Scaling laws provide important insights that can guide the design of large\nlanguage models (LLMs). Existing work has primarily focused on studying scaling\nlaws for pretraining (upstream) loss. However, in transfer learning settings,\nin which LLMs are pretrained on an unsupervised dataset and then finetuned on a\ndownstream task, we often also care about the downstream performance. In this\nwork, we study the scaling behavior in a transfer learning setting, where LLMs\nare finetuned for machine translation tasks. Specifically, we investigate how\nthe choice of the pretraining data and its size affect downstream performance\n(translation quality) as judged by two metrics: downstream cross-entropy and\nBLEU score. Our experiments indicate that the size of the finetuning dataset\nand the distribution alignment between the pretraining and downstream data\nsignificantly influence the scaling behavior. With sufficient alignment, both\ndownstream cross-entropy and BLEU score improve monotonically with more\npretraining data. In such cases, we show that it is possible to predict the\ndownstream BLEU score with good accuracy using a log-law. However, there are\nalso cases where moderate misalignment causes the BLEU score to fluctuate or\nget worse with more pretraining, whereas downstream cross-entropy monotonically\nimproves. By analyzing these observations, we provide new practical insights\nfor choosing appropriate pretraining data.\n","authors":["Berivan Isik","Natalia Ponomareva","Hussein Hazimeh","Dimitris Paparas","Sergei Vassilvitskii","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2402.04177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04168v1","updated":"2024-02-06T17:24:06Z","published":"2024-02-06T17:24:06Z","title":"Informed Reinforcement Learning for Situation-Aware Traffic Rule\n Exceptions","summary":" Reinforcement Learning is a highly active research field with promising\nadvancements. In the field of autonomous driving, however, often only very\nsimple scenarios are examined. Common approaches use non-interpretable control\ncommands as the action space and reward designs that lack structure. In this\nwork, we introduce Informed Reinforcement Learning, where a structured rulebook\nis integrated as a knowledge source. We learn trajectories and assess them with\na situation-aware reward design, leading to a dynamic reward that allows the\nagent to learn situations that require controlled traffic rule exceptions. Our\nmethod is applicable to arbitrary RL models. We successfully demonstrate high\ncompletion rates of complex scenarios with recent model-based agents.\n","authors":["Daniel Bogdoll","Jing Qin","Moritz Nekolla","Ahmed Abouelazm","Tim Joseph","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2402.04168v1.pdf","comment":"Daniel Bogdoll and Jing Qin contributed equally. 
Accepted for\n publication at ICRA 2024"},{"id":"http://arxiv.org/abs/2402.04163v1","updated":"2024-02-06T17:21:06Z","published":"2024-02-06T17:21:06Z","title":"Tempered Calculus for ML: Application to Hyperbolic Model Embedding","summary":" Most mathematical distortions used in ML are fundamentally integral in\nnature: $f$-divergences, Bregman divergences, (regularized) optimal transport\ndistances, integral probability metrics, geodesic distances, etc. In this\npaper, we unveil a grounded theory and tools which can help improve these\ndistortions to better cope with ML requirements. We start with a generalization\nof Riemann integration that also encapsulates functions that are not strictly\nadditive but are, more generally, $t$-additive, as in nonextensive statistical\nmechanics. Notably, this recovers Volterra's product integral as a special\ncase. We then generalize the Fundamental Theorem of calculus using an extension\nof the (Euclidean) derivative. This, along with a series of more specific\nTheorems, serves as a basis for results showing how one can specifically\ndesign, alter, or change fundamental properties of distortion measures in a\nsimple way, with a special emphasis on geometric- and ML-related properties\nthat are the metricity, hyperbolicity, and encoding. We show how to apply it to\na problem that has recently gained traction in ML: hyperbolic embeddings with a\n\"cheap\" and accurate encoding along the hyperbolic vs Euclidean scale. We\nunveil a new application for which the Poincar\\'e disk model has very appealing\nfeatures, and our theory comes in handy: \\textit{model} embeddings for boosted\ncombinations of decision trees, trained using the log-loss (trees) and logistic\nloss (combinations).\n","authors":["Richard Nock","Ehsan Amid","Frank Nielsen","Alexander Soen","Manfred K. Warmuth"],"pdf_url":"https://arxiv.org/pdf/2402.04163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04161v1","updated":"2024-02-06T17:18:59Z","published":"2024-02-06T17:18:59Z","title":"Attention with Markov: A Framework for Principled Analysis of\n Transformers via Markov Chains","summary":" In recent years, attention-based transformers have achieved tremendous\nsuccess across a variety of disciplines including natural languages. A key\ningredient behind their success is the generative pretraining procedure, during\nwhich these models are trained on a large text corpus in an auto-regressive\nmanner. To shed light on this phenomenon, we propose a new framework that\nallows both theory and systematic experiments to study the sequential modeling\ncapabilities of transformers through the lens of Markov chains. Inspired by the\nMarkovianity of natural languages, we model the data as a Markovian source and\nutilize this framework to systematically study the interplay between the\ndata-distributional properties, the transformer architecture, the learnt\ndistribution, and the final model performance. In particular, we theoretically\ncharacterize the loss landscape of single-layer transformers and show the\nexistence of global minima and bad local minima contingent upon the specific\ndata characteristics and the transformer architecture. Backed by experiments,\nwe demonstrate that our theoretical findings are in congruence with the\nempirical results. We further investigate these findings in the broader context\nof higher order Markov chains and deeper architectures, and outline open\nproblems in this arena. 
Code is available at\n\\url{https://github.com/Bond1995/Markov}.\n","authors":["Ashok Vardhan Makkuva","Marco Bondaschi","Adway Girish","Alliot Nagle","Martin Jaggi","Hyeji Kim","Michael Gastpar"],"pdf_url":"https://arxiv.org/pdf/2402.04161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04154v1","updated":"2024-02-06T17:09:25Z","published":"2024-02-06T17:09:25Z","title":"Read to Play (R2-Play): Decision Transformer with Multimodal Game\n Instruction","summary":" Developing a generalist agent is a longstanding objective in artificial\nintelligence. Previous efforts utilizing extensive offline datasets from\nvarious tasks demonstrate remarkable performance in multitasking scenarios\nwithin Reinforcement Learning. However, these works encounter challenges in\nextending their capabilities to new tasks. Recent approaches integrate textual\nguidance or visual trajectory into decision networks to provide task-specific\ncontextual cues, representing a promising direction. However, it is observed\nthat relying solely on textual guidance or visual trajectory is insufficient\nfor accurately conveying the contextual information of tasks. This paper\nexplores enhanced forms of task guidance for agents, enabling them to\ncomprehend gameplay instructions, thereby facilitating a \"read-to-play\"\ncapability. Drawing inspiration from the success of multimodal instruction\ntuning in visual tasks, we treat the visual-based RL task as a long-horizon\nvision task and construct a set of multimodal game instructions to incorporate\ninstruction tuning into a decision transformer. Experimental results demonstrate\nthat incorporating multimodal game instructions significantly enhances the\ndecision transformer's multitasking and generalization capabilities.\n","authors":["Yonggang Jin","Ge Zhang","Hao Zhao","Tianyu Zheng","Jiawei Guo","Liuyu Xiang","Shawn Yue","Stephen W. Huang","Wenhu Chen","Zhaofeng He","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2402.04154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09902v3","updated":"2024-02-06T17:05:48Z","published":"2024-01-18T11:32:50Z","title":"Interplay between depth and width for interpolation in neural ODEs","summary":" Neural ordinary differential equations (neural ODEs) have emerged as a\nnatural tool for supervised learning from a control perspective, yet a complete\nunderstanding of their optimal architecture remains elusive. In this work, we\nexamine the interplay between their width $p$ and number of layer transitions\n$L$ (effectively the depth $L+1$). Specifically, we assess the model\nexpressivity in terms of its capacity to interpolate either a finite dataset\n$D$ comprising $N$ pairs of points or two probability measures in\n$\\mathbb{R}^d$ within a Wasserstein error margin $\\varepsilon>0$. Our findings\nreveal a balancing trade-off between $p$ and $L$, with $L$ scaling as\n$O(1+N/p)$ for dataset interpolation, and\n$L=O\\left(1+(p\\varepsilon^d)^{-1}\\right)$ for measure interpolation.\n In the autonomous case, where $L=0$, a separate study is required, which we\nundertake focusing on dataset interpolation. We address the relaxed problem of\n$\\varepsilon$-approximate controllability and establish an error decay of\n$\\varepsilon\\sim O(\\log(p)p^{-1/d})$. This decay rate is a consequence of\napplying a universal approximation theorem to a custom-built Lipschitz vector\nfield that interpolates $D$. 
In the high-dimensional setting, we further\ndemonstrate that $p=O(N)$ neurons are likely sufficient to achieve exact\ncontrol.\n","authors":["Antonio Álvarez-López","Arselane Hadj Slimane","Enrique Zuazua"],"pdf_url":"https://arxiv.org/pdf/2401.09902v3.pdf","comment":"16 pages, 10 figures, double column"},{"id":"http://arxiv.org/abs/2310.02031v5","updated":"2024-02-06T17:00:08Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reason may be the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever LLM in the ocean\ndomain, which is an expert in various ocean science tasks. We propose DoInstruct,\na novel framework to automatically obtain a large volume of ocean domain\ninstruction data, which generates instructions based on multi-agent\ncollaboration. Additionally, we construct the first oceanography benchmark,\nOceanBench, to evaluate the capabilities of LLMs in the ocean domain. Through\ncomprehensive experiments, OceanGPT not only shows a higher level of knowledge\nexpertise for ocean science tasks but also gains preliminary embodied\nintelligence capabilities in ocean technology. Codes, data and checkpoints will\nsoon be available at https://github.com/zjunlp/KnowLM.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Daxiong Ji","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v5.pdf","comment":"Work in progress. Project Website:\n https://zjunlp.github.io/project/OceanGPT/"},{"id":"http://arxiv.org/abs/2402.04146v1","updated":"2024-02-06T16:54:59Z","published":"2024-02-06T16:54:59Z","title":"Interpretable Multi-Source Data Fusion Through Latent Variable Gaussian\n Process","summary":" With the advent of artificial intelligence (AI) and machine learning (ML),\nvarious domains of science and engineering communities have leveraged data-driven\nsurrogates to model complex systems from numerous sources of information\n(data). The proliferation has led to a significant reduction in the cost and time\ninvolved in the development of superior systems designed to perform specific\nfunctionalities. A high proportion of such surrogates are built by extensively\nfusing multiple sources of data, be it published papers, patents, open\nrepositories, or other resources. However, not much attention has been paid to\nthe differences in quality and comprehensiveness of the known and unknown\nunderlying physical parameters of the information sources that could have\ndownstream implications during system optimization. Towards resolving this\nissue, a multi-source data fusion framework based on Latent Variable Gaussian\nProcess (LVGP) is proposed. The individual data sources are tagged as a\ncharacteristic categorical variable that is mapped into a physically\ninterpretable latent space, allowing the development of source-aware data\nfusion modeling. 
Additionally, a dissimilarity metric based on the latent\nvariables of LVGP is introduced to study and understand the differences in the\nsources of data. The proposed approach is demonstrated on and analyzed through\ntwo mathematical (representative parabola problem, 2D Ackley function) and two\nmaterials science (design of FeCrAl and SmCoFe alloys) case studies. From the\ncase studies, it is observed that compared to using single-source and source\nunaware ML models, the proposed multi-source data fusion framework can provide\nbetter predictions for sparse-data problems, interpretability regarding the\nsources, and enhanced modeling capabilities by taking advantage of the\ncorrelations and relationships among different sources.\n","authors":["Sandipp Krishnan Ravi","Yigitcan Comlek","Wei Chen","Arjun Pathak","Vipul Gupta","Rajnikant Umretiya","Andrew Hoffman","Ghanshyam Pilania","Piyush Pandita","Sayan Ghosh","Nathaniel Mckeever","Liping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04146v1.pdf","comment":"27 Pages,9 Figures, 3 Supplementary Figures, 2 Supplementary Tables"},{"id":"http://arxiv.org/abs/2305.17028v3","updated":"2024-02-06T16:49:03Z","published":"2023-05-26T15:36:59Z","title":"Better Batch for Deep Probabilistic Time Series Forecasting","summary":" Deep probabilistic time series forecasting has gained attention for its\nsuperior performance in nonlinear approximation and its capability to offer\nvaluable uncertainty quantification for decision-making. However, existing\nmodels often oversimplify the problem by assuming a time-independent error\nprocess, overlooking serial correlation. To overcome this limitation, we\npropose an innovative training method that incorporates error autocorrelation\nto enhance probabilistic forecasting accuracy. Our method constructs a\nmini-batch as a collection of $D$ consecutive time series segments for model\ntraining. It explicitly learns a time-varying covariance matrix over each\nmini-batch, encoding error correlation among adjacent time steps. The learned\ncovariance matrix can be used to improve prediction accuracy and enhance\nuncertainty quantification. We evaluate our method on two different neural\nforecasting models and multiple public datasets. Experimental results confirm\nthe effectiveness of the proposed approach in improving the performance of both\nmodels across a range of datasets, resulting in notable improvements in\npredictive accuracy.\n","authors":["Vincent Zhihao Zheng","Seongjin Choi","Lijun Sun"],"pdf_url":"https://arxiv.org/pdf/2305.17028v3.pdf","comment":"10 pages, 3 figures, modified peer-review version, accepted to The\n 27th International Conference on Artificial Intelligence and Statistics\n (AISTATS 2024)"},{"id":"http://arxiv.org/abs/2401.10119v2","updated":"2024-02-06T16:36:40Z","published":"2024-01-18T16:50:55Z","title":"Towards Principled Graph Transformers","summary":" Graph learning architectures based on the k-dimensional Weisfeiler-Leman\n(k-WL) hierarchy offer a theoretically well-understood expressive power.\nHowever, such architectures often fail to deliver solid predictive performance\non real-world tasks, limiting their practical impact. In contrast, global\nattention-based models such as graph transformers demonstrate strong\nperformance in practice, but comparing their expressive power with the k-WL\nhierarchy remains challenging, particularly since these architectures rely on\npositional or structural encodings for their expressivity and predictive\nperformance. 
To address this, we show that the recently proposed Edge\nTransformer, a global attention model operating on node pairs instead of nodes,\nhas at least 3-WL expressive power. Empirically, we demonstrate that the Edge\nTransformer surpasses other theoretically aligned architectures regarding\npredictive performance while not relying on positional or structural encodings.\n","authors":["Luis Müller","Daniel Kusuma","Christopher Morris"],"pdf_url":"https://arxiv.org/pdf/2401.10119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00768v2","updated":"2024-02-06T16:33:48Z","published":"2023-11-01T18:23:12Z","title":"Language Model Training Paradigms for Clinical Feature Embeddings","summary":" In research areas with scarce data, representation learning plays a\nsignificant role. This work aims to enhance representation learning for\nclinical time series by deriving universal embeddings for clinical features,\nsuch as heart rate and blood pressure. We use self-supervised training\nparadigms for language models to learn high-quality clinical feature\nembeddings, achieving a finer granularity than existing time-step and\npatient-level representation learning. We visualize the learnt embeddings via\nunsupervised dimension reduction techniques and observe a high degree of\nconsistency with prior clinical knowledge. We also evaluate the model\nperformance on the MIMIC-III benchmark and demonstrate the effectiveness of\nusing clinical feature embeddings. We publish our code online for replication.\n","authors":["Yurong Hu","Manuel Burger","Gunnar Rätsch","Rita Kuznetsova"],"pdf_url":"https://arxiv.org/pdf/2311.00768v2.pdf","comment":"Poster at \"NeurIPS 2023 Workshop: Self-Supervised Learning - Theory\n and Practice\""},{"id":"http://arxiv.org/abs/2401.10463v2","updated":"2024-02-06T16:32:56Z","published":"2024-01-19T03:24:36Z","title":"Critical Data Size of Language Models from a Grokking Perspective","summary":" We explore the critical data size in language models, a threshold that marks\na fundamental shift from quick memorization to slow generalization. We\nformalize the phase transition under the grokking configuration into the Data\nEfficiency Hypothesis and identify data insufficiency, sufficiency, and surplus\nregimes in language model training dynamics. We develop a grokking\nconfiguration to stably reproduce grokking on simple language models by\nrescaling initialization and weight decay. We show that generalization occurs\nonly when language models reach a critical size. We analyze grokking from both\nsample-wise and model-wise perspectives, verifying the proposed data efficiency hypothesis.\nOur experiments reveal smoother phase transitions occurring at the critical\ndataset size for language datasets. As the model size increases, this critical\npoint also becomes larger, indicating that larger models require more data. 
Our\nresults deepen the understanding of language model training, offering a novel\nperspective on the role of data in the learning mechanism of language models.\n","authors":["Xuekai Zhu","Yao Fu","Bowen Zhou","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04129v1","updated":"2024-02-06T16:31:11Z","published":"2024-02-06T16:31:11Z","title":"OVOR: OnePrompt with Virtual Outlier Regularization for Rehearsal-Free\n Class-Incremental Learning","summary":" Recent works have shown that by using large pre-trained models along with\nlearnable prompts, rehearsal-free methods for class-incremental learning (CIL)\nsettings can achieve superior performance to prominent rehearsal-based ones.\nRehearsal-free CIL methods struggle with distinguishing classes from different\ntasks, as those are not trained together. In this work, we propose a\nregularization method based on virtual outliers to tighten decision boundaries\nof the classifier, such that confusion of classes among different tasks is\nmitigated. Recent prompt-based methods often require a pool of task-specific\nprompts, in order to prevent overwriting knowledge of previous tasks with that\nof the new task, leading to extra computation in querying and composing an\nappropriate prompt from the pool. This additional cost can be eliminated,\nwithout sacrificing accuracy, as we reveal in the paper. We illustrate that a\nsimplified prompt-based method can achieve results comparable to previous\nstate-of-the-art (SOTA) methods equipped with a prompt pool, using far fewer\nlearnable parameters and a lower inference cost. Our regularization method has\ndemonstrated its compatibility with different prompt-based methods, boosting\nthose previous SOTA rehearsal-free CIL methods' accuracy on the ImageNet-R and\nCIFAR-100 benchmarks. Our source code is available at\nhttps://github.com/jpmorganchase/ovor.\n","authors":["Wei-Cheng Huang","Chun-Fu Chen","Hsiang Hsu"],"pdf_url":"https://arxiv.org/pdf/2402.04129v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04119v1","updated":"2024-02-06T16:12:36Z","published":"2024-02-06T16:12:36Z","title":"Scientific Language Modeling: A Quantitative Review of Large Language\n Models in Molecular Science","summary":" Efficient molecular modeling and design are crucial for the discovery and\nexploration of novel molecules, and the incorporation of deep learning methods\nhas revolutionized this field. In particular, large language models (LLMs)\noffer a fresh approach to tackle scientific problems from a natural language\nprocessing (NLP) perspective, introducing a research paradigm called scientific\nlanguage modeling (SLM). However, two key issues remain: how to quantify the\nmatch between model and data modalities and how to identify the\nknowledge-learning preferences of models. To address these challenges, we\npropose a multi-modal benchmark, named ChEBI-20-MM, and perform 1263\nexperiments to assess the model's compatibility with data modalities and\nknowledge acquisition. Through the modal transition probability matrix, we\nprovide insights into the most suitable modalities for tasks. Furthermore, we\nintroduce a statistically interpretable approach to discover context-specific\nknowledge mapping by localized feature filtering. 
Our pioneering analysis\noffers an exploration of the learning mechanism and paves the way for advancing\nSLM in molecular science.\n","authors":["Pengfei Liu","Jun Tao","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2402.04119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06014v2","updated":"2024-02-06T16:08:10Z","published":"2023-02-12T22:04:27Z","title":"Online Recommendations for Agents with Discounted Adaptive Preferences","summary":" We consider a bandit recommendations problem in which an agent's preferences\n(representing selection probabilities over recommended items) evolve as a\nfunction of past selections, according to an unknown $\\textit{preference\nmodel}$. In each round, we show a menu of $k$ items (out of $n$ total) to the\nagent, who then chooses a single item, and we aim to minimize regret with\nrespect to some $\\textit{target set}$ (a subset of the item simplex) for\nadversarial losses over the agent's choices. Extending the setting from Agarwal\nand Brown (2022), where uniform-memory agents were considered, here we allow\nfor non-uniform memory in which a discount factor is applied to the agent's\nmemory vector at each subsequent round. In the \"long-term memory\" regime (when\nthe effective memory horizon scales with $T$ sublinearly), we show that\nefficient sublinear regret is obtainable with respect to the set of\n$\\textit{everywhere instantaneously realizable distributions}$ (the \"EIRD set\",\nas formulated in prior work) for any $\\textit{smooth}$ preference model.\nFurther, for preferences which are bounded above and below by linear functions\nof memory weight (we call these \"scale-bounded\" preferences) we give an\nalgorithm which obtains efficient sublinear regret with respect to nearly the\n$\\textit{entire}$ item simplex. We show an NP-hardness result for expanding to\ntargets beyond EIRD in general. In the \"short-term memory\" regime (when the\nmemory horizon is constant), we show that scale-bounded preferences again\nenable efficient sublinear regret for nearly the entire simplex even without\nsmoothness if losses do not change too frequently, yet we show an\ninformation-theoretic barrier for competing against the EIRD set under\narbitrary smooth preference models even when losses are constant.\n","authors":["Arpit Agarwal","William Brown"],"pdf_url":"https://arxiv.org/pdf/2302.06014v2.pdf","comment":"Updates for camera-ready version (ALT 2024)"},{"id":"http://arxiv.org/abs/2309.01945v2","updated":"2024-02-06T16:07:23Z","published":"2023-09-05T04:39:34Z","title":"OHQ: On-chip Hardware-aware Quantization","summary":" Quantization emerges as one of the most promising approaches for deploying\nadvanced deep models on resource-constrained hardware. Mixed-precision\nquantization leverages multiple bit-width architectures to unleash the accuracy\nand efficiency potential of quantized models. However, existing mixed-precision\nquantization suffers from an exhaustive search space that causes immense computational\noverhead. 
The quantization process thus relies on separate high-performance\ndevices rather than running locally, which also leads to a significant gap between the\nconsidered hardware metrics and the real deployment. In this paper, we propose\nan On-chip Hardware-aware Quantization (OHQ) framework that performs\nhardware-aware mixed-precision quantization without accessing online devices.\nFirst, we construct the On-chip Quantization Awareness (OQA) pipeline, enabling\nperception of the actual efficiency metrics of the quantization operator on the\nhardware. Second, we propose the Mask-guided Quantization Estimation (MQE) technique\nto efficiently estimate the accuracy metrics of operators under the constraints\nof on-chip-level computing power. By synthesizing network and hardware insights\nthrough linear programming, we obtain optimized bit-width configurations.\nNotably, the quantization process occurs entirely on-chip, without any\nadditional computing devices or data access. We demonstrate accelerated\ninference after quantization for various architectures and compression ratios,\nachieving 70% and 73% accuracy for ResNet-18 and MobileNetV3, respectively. OHQ\nimproves latency by 15~30% compared to INT8 on deployment.\n","authors":["Wei Huang","Haotong Qin","Yangdong Liu","Jingzhuo Liang","Yulun Zhang","Ying Li","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.01945v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.04114v1","updated":"2024-02-06T16:06:59Z","published":"2024-02-06T16:06:59Z","title":"SCAFFLSA: Quantifying and Eliminating Heterogeneity Bias in Federated\n Linear Stochastic Approximation and Temporal Difference Learning","summary":" In this paper, we perform a non-asymptotic analysis of the federated linear\nstochastic approximation (FedLSA) algorithm. We explicitly quantify the bias\nintroduced by local training with heterogeneous agents, and investigate the\nsample complexity of the algorithm. We show that the communication complexity\nof FedLSA scales polynomially with the desired precision $\\epsilon$, which\nlimits the benefits of federation. To overcome this, we propose SCAFFLSA, a\nnovel variant of FedLSA, that uses control variates to correct the bias of\nlocal training, and prove its convergence without assumptions on statistical\nheterogeneity. We apply the proposed methodology to federated temporal\ndifference learning with linear function approximation, and analyze the\ncorresponding complexity improvements.\n","authors":["Paul Mangold","Sergey Samsonov","Safwan Labbi","Ilya Levin","Reda Alami","Alexey Naumov","Eric Moulines"],"pdf_url":"https://arxiv.org/pdf/2402.04114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04108v1","updated":"2024-02-06T16:02:17Z","published":"2024-02-06T16:02:17Z","title":"Hierarchical Delay Attribution Classification using Unstructured Text in\n Train Management Systems","summary":" EU directives stipulate a systematic follow-up of train delays. In Sweden,\nthe Swedish Transport Administration registers and assigns an appropriate delay\nattribution code. However, this delay attribution code is assigned manually,\nwhich is a complex task. In this paper, a machine learning-based decision\nsupport for assigning delay attribution codes based on event descriptions is\ninvestigated. The text is transformed using TF-IDF, and two models, Random\nForest and Support Vector Machine, are evaluated against a random uniform\nclassifier and the classification performance of the Swedish Transport\nAdministration. 
Further, the problem is modeled using both a hierarchical and a flat\napproach. The results indicate that a hierarchical approach performs better\nthan a flat approach. Both approaches perform better than the random uniform\nclassifier but perform worse than the manual classification.\n","authors":["Anton Borg","Per Lingvall","Martin Svensson"],"pdf_url":"https://arxiv.org/pdf/2402.04108v1.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.04103v1","updated":"2024-02-06T15:58:14Z","published":"2024-02-06T15:58:14Z","title":"An Exploration of Clustering Algorithms for Customer Segmentation in the\n UK Retail Market","summary":" Recently, people's awareness of online purchases has significantly risen. This\nhas given rise to online retail platforms and the need for a better\nunderstanding of customer purchasing behaviour. Retail companies are pressed\nwith the need to deal with a high volume of customer purchases, which requires\nsophisticated approaches to perform more accurate and efficient customer\nsegmentation. Customer segmentation is a marketing analytical tool that aids\ncustomer-centric service and thus enhances profitability. In this paper, we aim\nto develop a customer segmentation model to improve decision-making processes\nin the retail market industry. To achieve this, we employed a UK-based online\nretail dataset obtained from the UCI machine learning repository. The retail\ndataset consists of 541,909 customer records and eight features. Our study\nadopted the RFM (recency, frequency, and monetary) framework to quantify\ncustomer values. Thereafter, we compared several state-of-the-art (SOTA)\nclustering algorithms, namely, K-means clustering, the Gaussian mixture model\n(GMM), density-based spatial clustering of applications with noise (DBSCAN),\nagglomerative clustering, and balanced iterative reducing and clustering using\nhierarchies (BIRCH). The results showed that the GMM outperformed other approaches,\nwith a Silhouette Score of 0.80.\n","authors":["Jeen Mary John","Olamilekan Shobayo","Bayode Ogunleye"],"pdf_url":"https://arxiv.org/pdf/2402.04103v1.pdf","comment":"15 pages, Journal of Analytics"},{"id":"http://arxiv.org/abs/2402.02018v2","updated":"2024-02-06T15:47:26Z","published":"2024-02-03T04:21:07Z","title":"The Landscape and Challenges of HPC Research and LLMs","summary":" Recently, language models (LMs), especially large language models (LLMs),\nhave revolutionized the field of deep learning. Both encoder-decoder models and\nprompt-based techniques have shown immense potential for natural language\nprocessing and code-based tasks. Over the past several years, many research\nlabs and institutions have invested heavily in high-performance computing,\napproaching or breaching exascale performance levels. In this paper, we posit\nthat adapting and utilizing such language model-based techniques for tasks in\nhigh-performance computing (HPC) would be very beneficial. This study presents\nour reasoning behind the aforementioned position and highlights how existing\nideas can be improved and adapted for HPC tasks.\n","authors":["Le Chen","Nesreen K. Ahmed","Akash Dutta","Arijit Bhattacharjee","Sixing Yu","Quazi Ishtiaque Mahmud","Waqwoya Abebe","Hung Phan","Aishwarya Sarkar","Branden Butler","Niranjan Hasabnis","Gal Oren","Vy A. Vo","Juan Pablo Munoz","Theodore L. 
Willke","Tim Mattson","Ali Jannesari"],"pdf_url":"https://arxiv.org/pdf/2402.02018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04088v1","updated":"2024-02-06T15:46:31Z","published":"2024-02-06T15:46:31Z","title":"The Use of a Large Language Model for Cyberbullying Detection","summary":" The dominance of social media has added to the channels of bullying for\nperpetrators. Unfortunately, cyberbullying (CB) is the most prevalent\nphenomenon in todays cyber world, and is a severe threat to the mental and\nphysical health of citizens. This opens the need to develop a robust system to\nprevent bullying content from online forums, blogs, and social media platforms\nto manage the impact in our society. Several machine learning (ML) algorithms\nhave been proposed for this purpose. However, their performances are not\nconsistent due to high class imbalance and generalisation issues. In recent\nyears, large language models (LLMs) like BERT and RoBERTa have achieved\nstate-of-the-art (SOTA) results in several natural language processing (NLP)\ntasks. Unfortunately, the LLMs have not been applied extensively for CB\ndetection. In our paper, we explored the use of these models for cyberbullying\n(CB) detection. We have prepared a new dataset (D2) from existing studies\n(Formspring and Twitter). Our experimental results for dataset D1 and D2 showed\nthat RoBERTa outperformed other models.\n","authors":["Bayode Ogunleye","Babitha Dharmaraj"],"pdf_url":"https://arxiv.org/pdf/2402.04088v1.pdf","comment":"14 pages, Journal of Analytics"},{"id":"http://arxiv.org/abs/2402.04087v1","updated":"2024-02-06T15:45:27Z","published":"2024-02-06T15:45:27Z","title":"A Hard-to-Beat Baseline for Training-free CLIP-based Adaptation","summary":" Contrastive Language-Image Pretraining (CLIP) has gained popularity for its\nremarkable zero-shot capacity. Recent research has focused on developing\nefficient fine-tuning methods, such as prompt learning and adapter, to enhance\nCLIP's performance in downstream tasks. However, these methods still require\nadditional training time and computational resources, which is undesirable for\ndevices with limited resources. In this paper, we revisit a classical\nalgorithm, Gaussian Discriminant Analysis (GDA), and apply it to the downstream\nclassification of CLIP. Typically, GDA assumes that features of each class\nfollow Gaussian distributions with identical covariance. By leveraging Bayes'\nformula, the classifier can be expressed in terms of the class means and\ncovariance, which can be estimated from the data without the need for training.\nTo integrate knowledge from both visual and textual modalities, we ensemble it\nwith the original zero-shot classifier within CLIP. Extensive results on 17\ndatasets validate that our method surpasses or achieves comparable results with\nstate-of-the-art methods on few-shot classification, imbalanced learning, and\nout-of-distribution generalization. In addition, we extend our method to\nbase-to-new generalization and unsupervised learning, once again demonstrating\nits superiority over competing approaches. 
Our code is publicly available at\n\\url{https://github.com/mrflogs/ICLR24}.\n","authors":["Zhengbo Wang","Jian Liang","Lijun Sheng","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2402.04087v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04084v1","updated":"2024-02-06T15:39:09Z","published":"2024-02-06T15:39:09Z","title":"Provably learning a multi-head attention layer","summary":" The multi-head attention layer is one of the key components of the\ntransformer architecture that sets it apart from traditional feed-forward\nmodels. Given a sequence length $k$, attention matrices\n$\\mathbf{\\Theta}_1,\\ldots,\\mathbf{\\Theta}_m\\in\\mathbb{R}^{d\\times d}$, and\nprojection matrices $\\mathbf{W}_1,\\ldots,\\mathbf{W}_m\\in\\mathbb{R}^{d\\times\nd}$, the corresponding multi-head attention layer $F: \\mathbb{R}^{k\\times d}\\to\n\\mathbb{R}^{k\\times d}$ transforms length-$k$ sequences of $d$-dimensional\ntokens $\\mathbf{X}\\in\\mathbb{R}^{k\\times d}$ via $F(\\mathbf{X}) \\triangleq\n\\sum^m_{i=1}\n\\mathrm{softmax}(\\mathbf{X}\\mathbf{\\Theta}_i\\mathbf{X}^\\top)\\mathbf{X}\\mathbf{W}_i$.\nIn this work, we initiate the study of provably learning a multi-head attention\nlayer from random examples and give the first nontrivial upper and lower bounds\nfor this problem:\n - Provided $\\{\\mathbf{W}_i, \\mathbf{\\Theta}_i\\}$ satisfy certain\nnon-degeneracy conditions, we give a $(dk)^{O(m^3)}$-time algorithm that learns\n$F$ to small error given random labeled examples drawn uniformly from $\\{\\pm\n1\\}^{k\\times d}$.\n - We prove computational lower bounds showing that in the worst case,\nexponential dependence on $m$ is unavoidable.\n We focus on Boolean $\\mathbf{X}$ to mimic the discrete nature of tokens in\nlarge language models, though our techniques naturally extend to standard\ncontinuous settings, e.g. Gaussian. Our algorithm, which is centered around\nusing examples to sculpt a convex body containing the unknown parameters, is a\nsignificant departure from existing provable algorithms for learning\nfeedforward networks, which predominantly exploit algebraic and rotation\ninvariance properties of the Gaussian distribution. In contrast, our analysis\nis more flexible as it primarily relies on various upper and lower tail bounds\nfor the input distribution and \"slices\" thereof.\n","authors":["Sitan Chen","Yuanzhi Li"],"pdf_url":"https://arxiv.org/pdf/2402.04084v1.pdf","comment":"105 pages, comments welcome"},{"id":"http://arxiv.org/abs/2309.02908v2","updated":"2024-02-06T15:37:11Z","published":"2023-09-06T11:02:53Z","title":"DECODE: Data-driven Energy Consumption Prediction leveraging Historical\n Data and Environmental Factors in Buildings","summary":" Energy prediction in buildings plays a crucial role in effective energy\nmanagement. Precise predictions are essential for achieving optimal energy\nconsumption and distribution within the grid. This paper introduces a Long\nShort-Term Memory (LSTM) model designed to forecast building energy consumption\nusing historical energy data, occupancy patterns, and weather conditions. The\nLSTM model provides accurate short, medium, and long-term energy predictions\nfor residential and commercial buildings compared to existing prediction\nmodels. We compare our LSTM model with established prediction methods,\nincluding linear regression, decision trees, and random forest. 
Encouragingly,\nthe proposed LSTM model emerges as the superior performer across all metrics.\nIt demonstrates exceptional prediction accuracy, boasting the highest R2 score\nof 0.97 and the most favorable mean absolute error (MAE) of 0.007. An\nadditional advantage of our developed model is its capacity to achieve\nefficient energy consumption forecasts even when trained on a limited dataset.\nWe address concerns about overfitting (variance) and underfitting (bias)\nthrough rigorous training and evaluation on real-world data. In summary, our\nresearch contributes to energy prediction by offering a robust LSTM model that\noutperforms alternative methods and operates with remarkable efficiency,\ngeneralizability, and reliability.\n","authors":["Aditya Mishra","Haroon R. Lone","Aayush Mishra"],"pdf_url":"https://arxiv.org/pdf/2309.02908v2.pdf","comment":"10 pages, 7 figures, 7 tables"},{"id":"http://arxiv.org/abs/2402.04082v1","updated":"2024-02-06T15:36:06Z","published":"2024-02-06T15:36:06Z","title":"An Optimal House Price Prediction Algorithm: XGBoost","summary":" An accurate prediction of house prices is a fundamental requirement for\nvarious sectors including real estate and mortgage lending. It is widely\nrecognized that a property value is not solely determined by its physical\nattributes but is significantly influenced by its surrounding neighbourhood.\nMeeting the diverse housing needs of individuals while balancing budget\nconstraints is a primary concern for real estate developers. To this end, we\naddressed the house price prediction problem as a regression task and thus\nemployed various machine learning techniques capable of expressing the\nsignificance of independent variables. We made use of the housing dataset of\nAmes City in Iowa, USA to compare support vector regressor, random forest\nregressor, XGBoost, multilayer perceptron and multiple linear regression\nalgorithms for house price prediction. Afterwards, we identified the key\nfactors that influence housing costs. Our results show that XGBoost is the best\nperforming model for house price prediction.\n","authors":["Hemlata Sharma","Hitesh Harsora","Bayode Ogunleye"],"pdf_url":"https://arxiv.org/pdf/2402.04082v1.pdf","comment":"16 pages, Journal of Analytics"},{"id":"http://arxiv.org/abs/2402.04081v1","updated":"2024-02-06T15:34:44Z","published":"2024-02-06T15:34:44Z","title":"Improved Generalization of Weight Space Networks via Augmentations","summary":" Learning in deep weight spaces (DWS), where neural networks process the\nweights of other neural networks, is an emerging research direction, with\napplications to 2D and 3D neural fields (INRs, NeRFs), as well as making\ninferences about other types of neural networks. Unfortunately, weight space\nmodels tend to suffer from substantial overfitting. We empirically analyze the\nreasons for this overfitting and find that a key reason is the lack of\ndiversity in DWS datasets. While a given object can be represented by many\ndifferent weight configurations, typical INR training sets fail to capture\nvariability across INRs that represent the same object. To address this, we\nexplore strategies for data augmentation in weight spaces and propose a MixUp\nmethod adapted for weight spaces. We demonstrate the effectiveness of these\nmethods in two setups. In classification, they improve performance similarly to\nhaving up to 10 times more data. 
In self-supervised contrastive learning, they\nyield substantial 5-10% gains in downstream classification.\n","authors":["Aviv Shamsian","Aviv Navon","David W. Zhang","Yan Zhang","Ethan Fetaya","Gal Chechik","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2402.04081v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2402.04080v1","updated":"2024-02-06T15:34:30Z","published":"2024-02-06T15:34:30Z","title":"Entropy-regularized Diffusion Policy with Q-Ensembles for Offline\n Reinforcement Learning","summary":" This paper presents advanced techniques of training diffusion policies for\noffline reinforcement learning (RL). At the core is a mean-reverting stochastic\ndifferential equation (SDE) that transfers a complex action distribution into a\nstandard Gaussian and then samples actions conditioned on the environment state\nwith a corresponding reverse-time SDE, like a typical diffusion policy. We show\nthat such an SDE has a solution that we can use to calculate the log\nprobability of the policy, yielding an entropy regularizer that improves the\nexploration of offline datasets. To mitigate the impact of inaccurate value\nfunctions from out-of-distribution data points, we further propose to learn the\nlower confidence bound of Q-ensembles for more robust policy improvement. By\ncombining the entropy-regularized diffusion policy with Q-ensembles in offline\nRL, our method achieves state-of-the-art performance on most tasks in D4RL\nbenchmarks. Code is available at\n\\href{https://github.com/ruoqizzz/Entropy-Regularized-Diffusion-Policy-with-QEnsemble}{https://github.com/ruoqizzz/Entropy-Regularized-Diffusion-Policy-with-QEnsemble}.\n","authors":["Ruoqi Zhang","Ziwei Luo","Jens Sjölund","Thomas B. Schön","Per Mattsson"],"pdf_url":"https://arxiv.org/pdf/2402.04080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00899v2","updated":"2024-02-06T15:27:03Z","published":"2024-01-31T20:36:13Z","title":"Weakly Supervised Learners for Correction of AI Errors with Provable\n Performance Guarantees","summary":" We present a new methodology for handling AI errors by introducing weakly\nsupervised AI error correctors with a priori performance guarantees. These AI\ncorrectors are auxiliary maps whose role is to moderate the decisions of some\npreviously constructed underlying classifier by either approving or rejecting\nits decisions. The rejection of a decision can be used as a signal to suggest\nabstaining from making a decision. A key technical focus of the work is in\nproviding performance guarantees for these new AI correctors through bounds on\nthe probabilities of incorrect decisions. These bounds are distribution\nagnostic and do not rely on assumptions on the data dimension. Our empirical\nexample illustrates how the framework can be applied to improve the performance\nof an image classifier in a challenging real-world task where training data are\nscarce.\n","authors":["Ivan Y. Tyukin","Tatiana Tyukina","Daniel van Helden","Zedong Zheng","Evgeny M. Mirkes","Oliver J. Sutton","Qinghua Zhou","Alexander N. Gorban","Penelope Allison"],"pdf_url":"https://arxiv.org/pdf/2402.00899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13033v2","updated":"2024-02-06T15:22:45Z","published":"2023-10-19T13:18:57Z","title":"LASER: Linear Compression in Wireless Distributed Optimization","summary":" Data-parallel SGD is the de facto algorithm for distributed optimization,\nespecially for large scale machine learning. 
Despite its merits, communication\nbottleneck is one of its persistent issues. Most compression schemes to\nalleviate this either assume noiseless communication links, or fail to achieve\ngood performance on practical tasks. In this paper, we close this gap and\nintroduce LASER: LineAr CompreSsion in WirEless DistRibuted Optimization. LASER\ncapitalizes on the inherent low-rank structure of gradients and transmits them\nefficiently over the noisy channels. Whilst enjoying theoretical guarantees\nsimilar to those of the classical SGD, LASER shows consistent gains over\nbaselines on a variety of practical benchmarks. In particular, it outperforms\nthe state-of-the-art compression schemes on challenging computer vision and GPT\nlanguage modeling tasks. On the latter, we obtain $50$-$64 \\%$ improvement in\nperplexity over our baselines for noisy channels.\n","authors":["Ashok Vardhan Makkuva","Marco Bondaschi","Thijs Vogels","Martin Jaggi","Hyeji Kim","Michael C. Gastpar"],"pdf_url":"https://arxiv.org/pdf/2310.13033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04068v1","updated":"2024-02-06T15:13:17Z","published":"2024-02-06T15:13:17Z","title":"Retrieve to Explain: Evidence-driven Predictions with Language Models","summary":" Machine learning models, particularly language models, are notoriously\ndifficult to introspect. Black-box models can mask both issues in model\ntraining and harmful biases. For human-in-the-loop processes, opaque\npredictions can drive lack of trust, limiting a model's impact even when it\nperforms effectively. To address these issues, we introduce Retrieve to Explain\n(R2E). R2E is a retrieval-based language model that prioritizes amongst a\npre-defined set of possible answers to a research question based on the\nevidence in a document corpus, using Shapley values to identify the relative\nimportance of pieces of evidence to the final prediction. R2E can adapt to new\nevidence without retraining, and incorporate structured data through templating\ninto natural language. We assess on the use case of drug target identification\nfrom published scientific literature, where we show that the model outperforms\nan industry-standard genetics-based approach on predicting clinical trial\noutcomes.\n","authors":["Ravi Patel","Angus Brayne","Rogier Hintzen","Daniel Jaroslawicz","Georgiana Neculae","Dane Corneil"],"pdf_url":"https://arxiv.org/pdf/2402.04068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04062v1","updated":"2024-02-06T15:05:40Z","published":"2024-02-06T15:05:40Z","title":"Link Prediction with Relational Hypergraphs","summary":" Link prediction with knowledge graphs has been thoroughly studied in graph\nmachine learning, leading to a rich landscape of graph neural network\narchitectures with successful applications. Nonetheless, it remains challenging\nto transfer the success of these architectures to link prediction with\nrelational hypergraphs. The presence of relational hyperedges makes link\nprediction a task between $k$ nodes for varying choices of $k$, which is\nsubstantially harder than link prediction with knowledge graphs, where every\nrelation is binary ($k=2$). In this paper, we propose two frameworks for link\nprediction with relational hypergraphs and conduct a thorough analysis of the\nexpressive power of the resulting model architectures via corresponding\nrelational Weisfeiler-Leman algorithms, and also via some natural logical\nformalisms. 
Through extensive empirical analysis, we validate the power of the\nproposed model architectures on various relational hypergraph benchmarks. The\nresulting model architectures substantially outperform every baseline for\ninductive link prediction, and lead to state-of-the-art results for\ntransductive link prediction. Our study therefore unlocks applications of graph\nneural networks to fully relational structures.\n","authors":["Xingyue Huang","Miguel Romero Orth","Pablo Barceló","Michael M. Bronstein","İsmail İlkan Ceylan"],"pdf_url":"https://arxiv.org/pdf/2402.04062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04061v1","updated":"2024-02-06T15:05:25Z","published":"2024-02-06T15:05:25Z","title":"TopoNav: Topological Navigation for Efficient Exploration in Sparse\n Reward Environments","summary":" Autonomous robots exploring unknown areas face a significant challenge --\nnavigating effectively without prior maps and with limited external feedback.\nThis challenge intensifies in sparse reward environments, where traditional\nexploration techniques often fail. In this paper, we introduce TopoNav, a novel\nframework that empowers robots to overcome these constraints and achieve\nefficient, adaptable, and goal-oriented exploration. TopoNav's fundamental\nbuilding blocks are active topological mapping, intrinsic reward mechanisms,\nand hierarchical objective prioritization. Throughout its exploration, TopoNav\nconstructs a dynamic topological map that captures key locations and pathways.\nIt utilizes intrinsic rewards to guide the robot towards designated sub-goals\nwithin this map, fostering structured exploration even in sparse reward\nsettings. To ensure efficient navigation, TopoNav employs the Hierarchical\nObjective-Driven Active Topologies framework, enabling the robot to prioritize\nimmediate tasks like obstacle avoidance while maintaining focus on the overall\ngoal. We demonstrate TopoNav's effectiveness in simulated environments that\nreplicate real-world conditions. Our results reveal significant improvements in\nexploration efficiency, navigational accuracy, and adaptability to unforeseen\nobstacles, showcasing its potential to revolutionize autonomous exploration in\na wide range of applications, including search and rescue, environmental\nmonitoring, and planetary exploration.\n","authors":["Jumman Hossain","Abu-Zaher Faridee","Nirmalya Roy","Jade Freeman","Timothy Gregory","Theron T. Trout"],"pdf_url":"https://arxiv.org/pdf/2402.04061v1.pdf","comment":"Paper under review"},{"id":"http://arxiv.org/abs/2402.04059v1","updated":"2024-02-06T15:03:53Z","published":"2024-02-06T15:03:53Z","title":"Deep Learning for Multivariate Time Series Imputation: A Survey","summary":" The ubiquitous missing values cause the multivariate time series data to be\npartially observed, destroying the integrity of time series and hindering the\neffective time series data analysis. Recently deep learning imputation methods\nhave demonstrated remarkable success in elevating the quality of corrupted time\nseries data, subsequently enhancing performance in downstream tasks. In this\npaper, we conduct a comprehensive survey on the recently proposed deep learning\nimputation methods. First, we propose a taxonomy for the reviewed methods, and\nthen provide a structured review of these methods by highlighting their\nstrengths and limitations. We also conduct empirical experiments to study\ndifferent methods and compare their enhancement for downstream tasks. 
Finally,\nthe open issues for future research on multivariate time series imputation are\npointed out. All code and configurations of this work, including a regularly\nmaintained multivariate time series imputation paper list, can be found in the\nGitHub repository~\\url{https://github.com/WenjieDu/Awesome\\_Imputation}.\n","authors":["Jun Wang","Wenjie Du","Wei Cao","Keli Zhang","Wenjia Wang","Yuxuan Liang","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2402.04059v1.pdf","comment":"9 pages, 1 figure, 5 tables, 58 referred papers"},{"id":"http://arxiv.org/abs/2304.05099v2","updated":"2024-02-06T15:03:21Z","published":"2023-04-11T09:51:13Z","title":"Feudal Graph Reinforcement Learning","summary":" Graph-based representations and weight-sharing modular policies constitute\nprominent approaches to tackling composable control problems in Reinforcement\nLearning (RL). However, as shown by recent graph deep learning literature,\nmessage-passing operators can create bottlenecks in information propagation and\nhinder global coordination. The issue becomes dramatic in tasks where\nhigh-level planning is needed. In this work, we propose a novel methodology,\nnamed Feudal Graph Reinforcement Learning (FGRL), that addresses such\nchallenges by relying on hierarchical RL and a pyramidal message-passing\narchitecture. In particular, FGRL defines a hierarchy of policies where\nhigh-level commands are propagated from the top of the hierarchy down through a\nlayered graph structure. The bottom layers mimic the morphology of the physical\nsystem, while the upper layers capture more abstract sub-modules. The resulting\nagents are then characterized by a committee of policies where actions at a\ncertain level set goals for the level below, thus implementing a hierarchical\ndecision-making structure that encompasses task decomposition. We evaluate the\nproposed framework on locomotion tasks on benchmark MuJoCo environments and\nshow that FGRL compares favorably against relevant baselines. Furthermore, an\nin-depth analysis of the command propagation mechanism provides evidence that\nthe introduced message-passing scheme favors the learning of hierarchical\ndecision-making policies.\n","authors":["Tommaso Marzi","Arshjot Khehra","Andrea Cini","Cesare Alippi"],"pdf_url":"https://arxiv.org/pdf/2304.05099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01881v2","updated":"2024-02-06T15:03:09Z","published":"2024-02-02T20:12:05Z","title":"Large Language Model Agent for Hyper-Parameter Optimization","summary":" Hyperparameter optimization is critical in modern machine learning, requiring\nexpert knowledge, numerous trials, and high computational and human resources.\nDespite the advancements in Automated Machine Learning (AutoML), challenges in\nterms of trial efficiency, setup complexity, and interoperability still\npersist. To address these issues, we introduce a novel paradigm leveraging\nLarge Language Models (LLMs) to automate hyperparameter optimization across\ndiverse machine learning tasks, which is named AgentHPO (short for LLM\nAgent-based Hyperparameter Optimization). Specifically, AgentHPO processes the\ntask information autonomously, conducts experiments with specific\nhyperparameters (HPs), and iteratively optimizes them based on historical\ntrials. This human-like optimization process largely reduces the number of\nrequired trials, simplifies the setup process, and enhances interpretability\nand user trust, compared to traditional AutoML methods. 
Extensive empirical\nexperiments conducted on 12 representative machine-learning tasks indicate that\nAgentHPO not only matches but also often surpasses the best human trials in\nterms of performance while simultaneously providing explainable results.\nFurther analysis sheds light on the strategies employed by the LLM in\noptimizing these tasks, highlighting its effectiveness and adaptability in\nvarious scenarios.\n","authors":["Siyi Liu","Chen Gao","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2402.01881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04054v1","updated":"2024-02-06T15:00:08Z","published":"2024-02-06T15:00:08Z","title":"More Flexible PAC-Bayesian Meta-Learning by Learning Learning Algorithms","summary":" We introduce a new framework for studying meta-learning methods using\nPAC-Bayesian theory. Its main advantage over previous work is that it allows\nfor more flexibility in how the transfer of knowledge between tasks is\nrealized. For previous approaches, this could only happen indirectly, by means\nof learning prior distributions over models. In contrast, the new\ngeneralization bounds that we prove express the process of meta-learning much\nmore directly as learning the learning algorithm that should be used for future\ntasks. The flexibility of our framework makes it suitable to analyze a wide\nrange of meta-learning mechanisms and even design new mechanisms. Beyond\nour theoretical contributions, we also show empirically that our framework\nimproves the prediction quality in practical meta-learning mechanisms.\n","authors":["Hossein Zakerinia","Amin Behjati","Christoph H. Lampert"],"pdf_url":"https://arxiv.org/pdf/2402.04054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16624v2","updated":"2024-02-06T14:57:31Z","published":"2023-12-27T16:13:14Z","title":"Dual-stage optimizer for systematic overestimation adjustment applied to\n multi-objective genetic algorithms for biomarker selection","summary":" The challenge in biomarker discovery using machine learning from omics data\nlies in the abundance of molecular features but the scarcity of samples. Most\nfeature selection methods in machine learning require evaluating various sets\nof features (models) to determine the most effective combination. This process,\ntypically conducted using a validation dataset, involves testing different\nfeature sets to optimize the model's performance. Evaluations carry performance-estimation\nerror, and when the selection involves many models, the best ones are\nalmost certainly overestimated. Biomarker identification with feature selection\nmethods can be addressed as a multi-objective problem with trade-offs between\npredictive ability and parsimony in the number of features. Genetic algorithms\nare a popular tool for multi-objective optimization, but they evolve numerous\nsolutions and are thus prone to overestimation. Methods have been proposed to\nreduce the overestimation after a model has already been selected in\nsingle-objective problems, but no existing algorithm was capable of reducing the\noverestimation during the optimization, of improving model selection, or of operating\nin the more general multi-objective domain. We propose DOSA-MO, a novel\nmulti-objective optimization wrapper algorithm that learns how the original\nestimation, its variance, and the feature set size of the solutions predict the\noverestimation. DOSA-MO adjusts the expectation of the performance during the\noptimization, improving the composition of the solution set. 
We verify that\nDOSA-MO improves the performance of a state-of-the-art genetic algorithm on\nleft-out or external sample sets, when predicting cancer subtypes and/or\npatient overall survival, using three transcriptomics datasets for kidney and\nbreast cancer.\n","authors":["Luca Cattelani","Vittorio Fortino"],"pdf_url":"https://arxiv.org/pdf/2312.16624v2.pdf","comment":"Added a picture with the algorithm steps and a supplementary section\n with disambiguation of the technical terms. Moved sections in the\n supplementary to shorten the main text. Fixed typos"},{"id":"http://arxiv.org/abs/2402.04051v1","updated":"2024-02-06T14:53:28Z","published":"2024-02-06T14:53:28Z","title":"Analysis of Linear Mode Connectivity via Permutation-Based Weight\n Matching","summary":" Recently, Ainsworth et al. showed that using weight matching (WM) to minimize\nthe $L_2$ distance in a permutation search of model parameters effectively\nidentifies permutations that satisfy linear mode connectivity (LMC), in which\nthe loss along a linear path between two independently trained models with\ndifferent seeds remains nearly constant. This paper provides a theoretical\nanalysis of LMC using WM, which is crucial for understanding stochastic\ngradient descent's effectiveness and its application in areas like model\nmerging. We first experimentally and theoretically show that permutations found\nby WM do not significantly reduce the $L_2$ distance between two models and the\noccurrence of LMC is not merely due to distance reduction by WM in itself. We\nthen provide theoretical insights showing that permutations can change the\ndirections of the singular vectors, but not the singular values, of the weight\nmatrices in each layer. This finding shows that permutations found by WM mainly\nalign the directions of singular vectors associated with large singular values\nacross models. This alignment brings the singular vectors with large singular\nvalues, which determine the model functionality, closer between pre-merged and\npost-merged models, so that the post-merged model retains functionality similar\nto the pre-merged models, making it easy to satisfy LMC. Finally, we analyze\nthe difference between WM and straight-through estimator (STE), a\ndataset-dependent permutation search method, and show that WM outperforms STE,\nespecially when merging three or more models.\n","authors":["Akira Ito","Masanori Yamada","Atsutoshi Kumagai"],"pdf_url":"https://arxiv.org/pdf/2402.04051v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2402.04050v1","updated":"2024-02-06T14:53:19Z","published":"2024-02-06T14:53:19Z","title":"Connecting the Dots: Collaborative Fine-tuning for Black-Box\n Vision-Language Models","summary":" With the emergence of pretrained vision-language models (VLMs), considerable\nefforts have been devoted to fine-tuning them for downstream tasks. Despite the\nprogress made in designing efficient fine-tuning methods, such methods require\naccess to the model's parameters, which can be challenging as model owners\noften opt to provide their models as a black box to safeguard model ownership.\nThis paper proposes a \\textbf{C}ollabo\\textbf{ra}tive\n\\textbf{F}ine-\\textbf{T}uning (\\textbf{CraFT}) approach for fine-tuning\nblack-box VLMs to downstream tasks, where one only has access to the input\nprompts and the output predictions of the model. 
CraFT comprises two modules, a\nprompt generation module for learning text prompts and a prediction refinement\nmodule for enhancing output predictions in residual style. Additionally, we\nintroduce an auxiliary prediction-consistent loss to promote consistent\noptimization across these modules. These modules are optimized by a novel\ncollaborative training algorithm. Extensive experiments on few-shot\nclassification over 15 datasets demonstrate the superiority of CraFT. The\nresults show that CraFT achieves a decent gain of about 12\\% with 16-shot\ndatasets and only 8,000 queries. Moreover, CraFT trains faster and uses only\nabout 1/80 of the memory footprint for deployment, while sacrificing only\n1.62\\% compared to the white-box method.\n","authors":["Zhengbo Wang","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2402.04050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04046v1","updated":"2024-02-06T14:48:34Z","published":"2024-02-06T14:48:34Z","title":"Generative Modeling of Graphs via Joint Diffusion of Node and Edge\n Attributes","summary":" Graph generation is integral to various engineering and scientific\ndisciplines. Nevertheless, existing methodologies tend to overlook the\ngeneration of edge attributes. However, we identify critical applications where\nedge attributes are essential, making prior methods potentially unsuitable in\nsuch contexts. Moreover, while trivial adaptations are available, empirical\ninvestigations reveal their limited efficacy as they do not properly model the\ninterplay among graph components. To address this, we propose a joint\nscore-based model of nodes and edges for graph generation that considers all\ngraph components. Our approach offers two key novelties: (i) node and edge\nattributes are combined in an attention module that generates samples based on\nthe two ingredients; and (ii) node, edge and adjacency information are mutually\ndependent during the graph diffusion process. We evaluate our method on\nchallenging benchmarks involving real-world and synthetic datasets in which\nedge features are crucial. Additionally, we introduce a new synthetic dataset\nthat incorporates edge values. Furthermore, we propose a novel application that\ngreatly benefits from the method due to its nature: the generation of traffic\nscenes represented as graphs. Our method outperforms other graph generation\nmethods, demonstrating a significant advantage in edge-related measures.\n","authors":["Nimrod Berman","Eitan Kosman","Dotan Di Castro","Omri Azencot"],"pdf_url":"https://arxiv.org/pdf/2402.04046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08249v2","updated":"2024-02-06T14:42:41Z","published":"2023-09-15T08:46:53Z","title":"Deep Nonnegative Matrix Factorization with Beta Divergences","summary":" Deep Nonnegative Matrix Factorization (deep NMF) has recently emerged as a\nvaluable technique for extracting multiple layers of features across different\nscales. However, all existing deep NMF models and algorithms have primarily\ncentered their evaluation on the least squares error, which may not be the most\nappropriate metric for assessing the quality of approximations on diverse\ndatasets. For instance, when dealing with data types such as audio signals and\ndocuments, it is widely acknowledged that $\\beta$-divergences offer a more\nsuitable alternative. In this paper, we develop new models and algorithms for\ndeep NMF using some $\\beta$-divergences, with a focus on the Kullback-Leibler\ndivergence. 
Subsequently, we apply these techniques to the extraction of facial\nfeatures, the identification of topics within document collections, and the\nidentification of materials within hyperspectral images.\n","authors":["Valentin Leplat","Le Thi Khanh Hien","Akwum Onwunta","Nicolas Gillis"],"pdf_url":"https://arxiv.org/pdf/2309.08249v2.pdf","comment":"32 pages. We have improved the presentation of the paper, and added\n numerical experiments for beta=3/2 with 4 layers on the CBCL data set"},{"id":"http://arxiv.org/abs/2402.04038v1","updated":"2024-02-06T14:34:17Z","published":"2024-02-06T14:34:17Z","title":"PAC-Bayesian Adversarially Robust Generalization Bounds for Graph Neural\n Network","summary":" Graph neural networks (GNNs) have gained popularity for various graph-related\ntasks. However, similar to deep neural networks, GNNs are also vulnerable to\nadversarial attacks. Empirical studies have shown that adversarially robust\ngeneralization has a pivotal role in establishing effective defense algorithms\nagainst adversarial attacks. In this paper, we contribute by providing\nadversarially robust generalization bounds for two kinds of popular GNNs, graph\nconvolutional network (GCN) and message passing graph neural network, using the\nPAC-Bayesian framework. Our result reveals that spectral norm of the diffusion\nmatrix on the graph and spectral norm of the weights as well as the\nperturbation factor govern the robust generalization bounds of both models. Our\nbounds are nontrivial generalizations of the results developed in (Liao et al.,\n2020) from the standard setting to adversarial setting while avoiding\nexponential dependence of the maximum node degree. As corollaries, we derive\nbetter PAC-Bayesian robust generalization bounds for GCN in the standard\nsetting, which improve the bounds in (Liao et al., 2020) by avoiding\nexponential dependence on the maximum node degree.\n","authors":["Tan Sun","Junhong Lin"],"pdf_url":"https://arxiv.org/pdf/2402.04038v1.pdf","comment":"32pages"},{"id":"http://arxiv.org/abs/2309.16883v3","updated":"2024-02-06T14:30:26Z","published":"2023-09-28T22:41:47Z","title":"The Lipschitz-Variance-Margin Tradeoff for Enhanced Randomized Smoothing","summary":" Real-life applications of deep neural networks are hindered by their unsteady\npredictions when faced with noisy inputs and adversarial attacks. The certified\nradius is in this context a crucial indicator of the robustness of models.\nHowever how to design an efficient classifier with an associated certified\nradius? Randomized smoothing provides a promising framework by relying on noise\ninjection into the inputs to obtain a smoothed and robust classifier. In this\npaper, we first show that the variance introduced by the Monte-Carlo sampling\nin the randomized smoothing procedure estimate closely interacts with two other\nimportant properties of the classifier, \\textit{i.e.} its Lipschitz constant\nand margin. More precisely, our work emphasizes the dual impact of the\nLipschitz constant of the base classifier, on both the smoothed classifier and\nthe empirical variance. Moreover, to increase the certified robust radius, we\nintroduce a different way to convert logits to probability vectors for the base\nclassifier to leverage the variance-margin trade-off. We leverage the use of\nBernstein's concentration inequality along with enhanced Lipschitz bounds for\nrandomized smoothing. Experimental results show a significant improvement in\ncertified accuracy compared to current state-of-the-art methods. 
Our novel\ncertification procedure allows us to use pre-trained models that are used with\nrandomized smoothing, effectively improving the current certification radius in\na zero-shot manner.\n","authors":["Blaise Delattre","Alexandre Araujo","Quentin Barthélemy","Alexandre Allauzen"],"pdf_url":"https://arxiv.org/pdf/2309.16883v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04033v1","updated":"2024-02-06T14:26:22Z","published":"2024-02-06T14:26:22Z","title":"On provable privacy vulnerabilities of graph representations","summary":" Graph representation learning (GRL) is critical for extracting insights from\ncomplex network structures, but it also raises security concerns due to\npotential privacy vulnerabilities in these representations. This paper\ninvestigates the structural vulnerabilities in graph neural models where\nsensitive topological information can be inferred through edge reconstruction\nattacks. Our research primarily addresses the theoretical underpinnings of\ncosine-similarity-based edge reconstruction attacks (COSERA), providing\ntheoretical and empirical evidence that such attacks can perfectly reconstruct\nsparse Erdos Renyi graphs with independent random features as graph size\nincreases. Conversely, we establish that sparsity is a critical factor for\nCOSERA's effectiveness, as demonstrated through analysis and experiments on\nstochastic block models. Finally, we explore the resilience of (provably)\nprivate graph representations produced via noisy aggregation (NAG) mechanism\nagainst COSERA. We empirically delineate instances wherein COSERA demonstrates\nboth efficacy and deficiency in its capacity to function as an instrument for\nelucidating the trade-off between privacy and utility.\n","authors":["Ruofan Wu","Guanhua Fang","Qiying Pan","Mingyang Zhang","Tengfei Liu","Weiqiang Wang","Wenbiao Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.04033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04031v1","updated":"2024-02-06T14:26:02Z","published":"2024-02-06T14:26:02Z","title":"Polyp-DDPM: Diffusion-Based Semantic Polyp Synthesis for Enhanced\n Segmentation","summary":" This study introduces Polyp-DDPM, a diffusion-based method for generating\nrealistic images of polyps conditioned on masks, aimed at enhancing the\nsegmentation of gastrointestinal (GI) tract polyps. Our approach addresses the\nchallenges of data limitations, high annotation costs, and privacy concerns\nassociated with medical images. By conditioning the diffusion model on\nsegmentation masks-binary masks that represent abnormal areas-Polyp-DDPM\noutperforms state-of-the-art methods in terms of image quality (achieving a\nFrechet Inception Distance (FID) score of 78.47, compared to scores above\n83.79) and segmentation performance (achieving an Intersection over Union (IoU)\nof 0.7156, versus less than 0.6694 for synthetic images from baseline models\nand 0.7067 for real data). Our method generates a high-quality, diverse\nsynthetic dataset for training, thereby enhancing polyp segmentation models to\nbe comparable with real images and offering greater data augmentation\ncapabilities to improve segmentation models. 
The source code and pretrained\nweights for Polyp-DDPM are made publicly available at\nhttps://github.com/mobaidoctor/polyp-ddpm.\n","authors":["Zolnamar Dorjsembe","Hsing-Kuo Pao","Furen Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.04031v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2402.04030v1","updated":"2024-02-06T14:25:09Z","published":"2024-02-06T14:25:09Z","title":"Reducing the Cost of Quantum Chemical Data By Backpropagating Through\n Density Functional Theory","summary":" Density Functional Theory (DFT) accurately predicts the quantum chemical\nproperties of molecules, but scales as $O(N_{\\text{electrons}}^3)$. Sch\\\"utt et\nal. (2019) successfully approximate DFT 1000x faster with Neural Networks (NN).\nArguably, the biggest problem one faces when scaling to larger molecules is the\ncost of DFT labels. For example, it took years to create the PCQ dataset\n(Nakata & Shimazaki, 2017) on which subsequent NNs are trained within a week.\nDFT labels molecules by minimizing energy $E(\\cdot )$ as a \"loss function.\" We\nbypass dataset creation by directly training NNs with $E(\\cdot )$ as a loss\nfunction. For comparison, Sch\\\"utt et al. (2019) spent 626 hours creating a\ndataset on which they trained their NN for 160h, for a total of 786h; our\nmethod achieves comparable performance within 31h.\n","authors":["Alexander Mathiasen","Hatem Helal","Paul Balanca","Adam Krzywaniak","Ali Parviz","Frederik Hvilshøj","Blazej Banaszewski","Carlo Luschi","Andrew William Fitzgibbon"],"pdf_url":"https://arxiv.org/pdf/2402.04030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04029v1","updated":"2024-02-06T14:24:29Z","published":"2024-02-06T14:24:29Z","title":"Positive concave deep equilibrium models","summary":" Deep equilibrium (DEQ) models are widely recognized as a memory efficient\nalternative to standard neural networks, achieving state-of-the-art performance\nin language modeling and computer vision tasks. These models solve a fixed\npoint equation instead of explicitly computing the output, which sets them\napart from standard neural networks. However, existing DEQ models often lack\nformal guarantees of the existence and uniqueness of the fixed point, and the\nconvergence of the numerical scheme used for computing the fixed point is not\nformally established. As a result, DEQ models are potentially unstable in\npractice. To address these drawbacks, we introduce a novel class of DEQ models\ncalled positive concave deep equilibrium (pcDEQ) models. Our approach, which is\nbased on nonlinear Perron-Frobenius theory, enforces nonnegative weights and\nactivation functions that are concave on the positive orthant. By imposing\nthese constraints, we can easily ensure the existence and uniqueness of the\nfixed point without relying on additional complex assumptions commonly found in\nthe DEQ literature, such as those based on monotone operator theory in convex\nanalysis. Furthermore, the fixed point can be computed with the standard fixed\npoint algorithm, and we provide theoretical guarantees of geometric\nconvergence, which, in particular, simplifies the training process. Experiments\ndemonstrate the competitiveness of our pcDEQ models against other implicit\nmodels.\n","authors":["Mateusz Gabor","Tomasz Piotrowski","Renato L. G. 
Cavalcante"],"pdf_url":"https://arxiv.org/pdf/2402.04029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04028v1","updated":"2024-02-06T14:24:28Z","published":"2024-02-06T14:24:28Z","title":"AlbNews: A Corpus of Headlines for Topic Modeling in Albanian","summary":" The scarcity of available text corpora for low-resource languages like\nAlbanian is a serious hurdle for research in natural language processing tasks.\nThis paper introduces AlbNews, a collection of 600 topically labeled news\nheadlines and 2600 unlabeled ones in Albanian. The data can be freely used for\nconducting topic modeling research. We report the initial classification scores\nof some traditional machine learning classifiers trained with the AlbNews\nsamples. These results show that basic models outrun the ensemble learning ones\nand can serve as a baseline for future experiments.\n","authors":["Erion Çano","Dario Lamaj"],"pdf_url":"https://arxiv.org/pdf/2402.04028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03892v2","updated":"2024-02-06T14:12:49Z","published":"2024-01-08T13:43:56Z","title":"Sampling in Unit Time with Kernel Fisher-Rao Flow","summary":" We introduce a new mean-field ODE and corresponding interacting particle\nsystems (IPS) for sampling from an unnormalized target density. The IPS are\ngradient-free, available in closed form, and only require the ability to sample\nfrom a reference density and compute the (unnormalized) target-to-reference\ndensity ratio. The mean-field ODE is obtained by solving a Poisson equation for\na velocity field that transports samples along the geometric mixture of the two\ndensities, which is the path of a particular Fisher-Rao gradient flow. We\nemploy a RKHS ansatz for the velocity field, which makes the Poisson equation\ntractable and enables discretization of the resulting mean-field ODE over\nfinite samples. The mean-field ODE can be additionally be derived from a\ndiscrete-time perspective as the limit of successive linearizations of the\nMonge-Amp\\`ere equations within a framework known as sample-driven optimal\ntransport. We introduce a stochastic variant of our approach and demonstrate\nempirically that our IPS can produce high-quality samples from varied target\ndistributions, outperforming comparable gradient-free particle systems and\ncompetitive with gradient-based alternatives.\n","authors":["Aimee Maurais","Youssef Marzouk"],"pdf_url":"https://arxiv.org/pdf/2401.03892v2.pdf","comment":"Updated with additional numerical examples and a stochastic variant\n of the approach"},{"id":"http://arxiv.org/abs/2402.04022v1","updated":"2024-02-06T14:12:46Z","published":"2024-02-06T14:12:46Z","title":"A General Theory for Kernel Packets: from state space model to compactly\n supported basis","summary":" It is well known that the state space (SS) model formulation of a Gaussian\nprocess (GP) can lower its training and prediction time both to O(n) for n data\npoints. We prove that an $m$-dimensional SS model formulation of GP is\nequivalent to a concept we introduce as the general right Kernel Packet (KP): a\ntransformation for the GP covariance function $K$ such that\n$\\sum_{i=0}^{m}a_iD_t^{(j)}K(t,t_i)=0$ holds for any $t \\leq t_1$, 0 $\\leq j\n\\leq m-1$, and $m+1$ consecutive points $t_i$, where ${D}_t^{(j)}f(t) $ denotes\n$j$-th order derivative acting on $t$. 
We extend this idea to the backward SS\nmodel formulation of the GP, leading to the concept of the left KP for the next $m$\nconsecutive points: $\\sum_{i=0}^{m}b_i{D}_t^{(j)}K(t,t_{m+i})=0$ for any $t\\geq\nt_{2m}$. By combining both left and right KPs, we can prove that a suitable\nlinear combination of these covariance functions yields $m$ compactly supported\nKP functions: $\\phi^{(j)}(t)=0$ for any $t\\not\\in(t_0,t_{2m})$ and\n$j=0,\\cdots,m-1$. KPs further reduce the prediction time of GPs to O(log n) or\neven O(1) and can be applied to more general problems involving the derivative\nof GPs.\n","authors":["Liang Ding","Tuo Rui"],"pdf_url":"https://arxiv.org/pdf/2402.04022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.13059v4","updated":"2024-02-06T14:11:06Z","published":"2022-02-26T04:49:01Z","title":"Theoretical Error Analysis of Entropy Approximation for Gaussian Mixture","summary":" Gaussian mixture distributions are commonly employed to represent general\nprobability distributions. Despite the importance of using Gaussian mixtures\nfor uncertainty estimation, the entropy of a Gaussian mixture cannot be\nanalytically calculated. Notably, Gal and Ghahramani [2016] proposed an\napproximate entropy that is the sum of the entropies of the unimodal Gaussian\ndistributions. This approximation is easy to analytically calculate regardless\nof dimension, but it lacks theoretical guarantees. In this paper, we\ntheoretically analyze the approximation error between the true entropy and the\napproximate one to reveal when this approximation works effectively. This error\nis controlled by how far apart the Gaussian components of the mixture are.\nTo measure such separation, we introduce the ratios of the distances between\nthe means to the sum of the variances of each Gaussian component of the\nGaussian mixture, and we reveal that the error converges to zero as the ratios\ntend to infinity. This convergence situation is more likely to occur in higher\ndimensional spaces. Therefore, our results provide a guarantee that this\napproximation works well in higher dimensional problems, particularly in\nscenarios such as neural networks that involve a large number of weights.\n","authors":["Takashi Furuya","Hiroyuki Kusumoto","Koichi Taniguchi","Naoya Kanno","Kazuma Suetake"],"pdf_url":"https://arxiv.org/pdf/2202.13059v4.pdf","comment":"34 pages, 4 figures"},{"id":"http://arxiv.org/abs/2303.10931v2","updated":"2024-02-06T14:05:49Z","published":"2023-03-20T08:09:13Z","title":"Approaching an unknown communication system by latent space exploration\n and causal inference","summary":" This paper proposes a methodology for discovering meaningful properties in\ndata by exploring the latent space of unsupervised deep generative models. We\ncombine manipulation of individual latent variables to extreme values with\nmethods inspired by causal inference into an approach we call causal\ndisentanglement with extreme values (CDEV) and show that this method yields\ninsights for model interpretability. With this, we can test for what properties\nof unknown data the model encodes as meaningful, using it to glean insight into\nthe communication system of sperm whales (Physeter macrocephalus), one of the\nmost intriguing and understudied animal communication systems. The network\narchitecture used has been shown to learn meaningful representations of speech;\nhere, it is used as a learning mechanism to decipher the properties of another\nvocal communication system for which we have no ground truth. 
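A small numerical illustration of the separation effect analyzed in the entropy-approximation entry above: the Monte-Carlo entropy of a two-component mixture approaches the closed-form well-separated limit, the weighted component entropies plus the entropy of the weights, as the means move apart.

```python
import numpy as np

rng = np.random.default_rng(0)
pi, sigma = np.array([0.5, 0.5]), 1.0
H_comp = 0.5 * np.log(2 * np.pi * np.e * sigma**2)   # entropy of N(mu, sigma^2)
H_cat = -np.sum(pi * np.log(pi))                     # entropy of the weights

for gap in [0.0, 2.0, 5.0, 10.0]:
    mus = np.array([0.0, gap])
    z = rng.choice(2, size=200_000, p=pi)
    x = rng.normal(mus[z], sigma)
    dens = sum(p * np.exp(-(x - m)**2 / (2 * sigma**2)) /
               np.sqrt(2 * np.pi * sigma**2) for p, m in zip(pi, mus))
    H_mc = -np.mean(np.log(dens))                    # Monte-Carlo entropy
    print(f"gap={gap:5.1f}  H_mc={H_mc:.4f}  limit={H_comp + H_cat:.4f}")
```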
The proposed\nmethodology suggests that sperm whales encode information using the number of\nclicks in a sequence, the regularity of their timing, and audio properties such\nas the spectral mean and the acoustic regularity of the sequences. Some of\nthese findings are consistent with existing hypotheses, while others are\nproposed for the first time. We also argue that our models uncover rules that\ngovern the structure of units in the communication system and apply them while\ngenerating innovative data not shown during training. This paper suggests that\nan interpretation of the outputs of deep neural networks with causal inference\nmethodology can be a viable strategy for approaching data about which little is\nknown and presents another case of how deep learning can limit the hypothesis\nspace. Finally, the proposed approach can be extended to other architectures\nand datasets.\n","authors":["Gašper Beguš","Andrej Leban","Shane Gero"],"pdf_url":"https://arxiv.org/pdf/2303.10931v2.pdf","comment":"25 pages, 23 figures; new format and section layout (moved some\n sections to the appendix), added replication experiments, updated references:\n to a subsequent experimental validation of the work, as well as to related\n methodological work"},{"id":"http://arxiv.org/abs/2402.04010v1","updated":"2024-02-06T14:05:05Z","published":"2024-02-06T14:05:05Z","title":"Efficient Availability Attacks against Supervised and Contrastive\n Learning Simultaneously","summary":" Availability attacks can prevent the unauthorized use of private data and\ncommercial datasets by generating imperceptible noise and making unlearnable\nexamples before release. Ideally, the obtained unlearnability prevents\nalgorithms from training usable models. When supervised learning (SL)\nalgorithms have failed, a malicious data collector possibly resorts to\ncontrastive learning (CL) algorithms to bypass the protection. Through\nevaluation, we have found that most of the existing methods are unable to\nachieve both supervised and contrastive unlearnability, which poses risks to\ndata protection. Different from recent methods based on contrastive error\nminimization, we employ contrastive-like data augmentations in supervised error\nminimization or maximization frameworks to obtain attacks effective for both SL\nand CL. Our proposed AUE and AAP attacks achieve state-of-the-art worst-case\nunlearnability across SL and CL algorithms with less computation consumption,\nshowcasing prospects in real-world applications.\n","authors":["Yihan Wang","Yifan Zhu","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2402.04010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04005v1","updated":"2024-02-06T14:00:43Z","published":"2024-02-06T14:00:43Z","title":"Bayesian Uncertainty for Gradient Aggregation in Multi-Task Learning","summary":" As machine learning becomes more prominent there is a growing demand to\nperform several inference tasks in parallel. Running a dedicated model for each\ntask is computationally expensive and therefore there is a great interest in\nmulti-task learning (MTL). MTL aims at learning a single model that solves\nseveral tasks efficiently. Optimizing MTL models is often achieved by computing\na single gradient per task and aggregating them for obtaining a combined update\ndirection. However, these approaches do not consider an important aspect, the\nsensitivity in the gradient dimensions. Here, we introduce a novel gradient\naggregation approach using Bayesian inference. 
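A simplified stand-in for the uncertainty-aware aggregation described in the multi-task entry above: given per-task gradient means and variances, weight each gradient dimension by its precision before averaging (the paper's Bayesian rule is richer than this).

```python
import numpy as np

def aggregate(grad_means, grad_vars, eps=1e-8):
    """grad_means, grad_vars: arrays of shape (n_tasks, n_params)."""
    precision = 1.0 / (grad_vars + eps)        # confident dimensions weigh more
    return (precision * grad_means).sum(axis=0) / precision.sum(axis=0)

g_mean = np.array([[1.0, 0.2], [0.8, -1.0]])   # two tasks, two parameters
g_var = np.array([[0.1, 5.0], [0.2, 0.1]])     # task 0 is unsure about dim 1
print(aggregate(g_mean, g_var))                # dim 1 is dominated by task 1
```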
We place a probability\ndistribution over the task-specific parameters, which in turn induce a\ndistribution over the gradients of the tasks. This additional valuable\ninformation allows us to quantify the uncertainty in each of the gradients\ndimensions, which can then be factored in when aggregating them. We empirically\ndemonstrate the benefits of our approach in a variety of datasets, achieving\nstate-of-the-art performance.\n","authors":["Idan Achituve","Idit Diamant","Arnon Netzer","Gal Chechik","Ethan Fetaya"],"pdf_url":"https://arxiv.org/pdf/2402.04005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04004v1","updated":"2024-02-06T13:59:56Z","published":"2024-02-06T13:59:56Z","title":"Understanding the Effect of Noise in LLM Training Data with Algorithmic\n Chains of Thought","summary":" During both pretraining and fine-tuning, Large Language Models\n(\\textbf{LLMs}) are trained on trillions of tokens of text of widely varying\nquality. Both phases of training typically involve heuristically filtering out\n``low-quality'' or \\textit{noisy} training samples, yet little is known\nquantitatively about how the type or intensity of noise affects downstream\nperformance. In this work, we study how noise in chain of thought\n(\\textbf{CoT}) impacts task performance in the highly-controlled setting of\nalgorithmically solvable tasks. First, we develop the Traced Integer\n(\\textbf{TInt}) framework to generate highly customizable noised execution\ntraces for any arithmetic function on lists of integers. We then define two\ntypes of noise: \\textit{static} noise, a local form of noise which is applied\nafter the CoT trace is computed, and \\textit{dynamic} noise, a global form of\nnoise which propagates errors in the trace as it is computed. We then evaluate\nthe test performance of pretrained models both prompted and fine-tuned on\nnoised datasets with varying levels of dataset contamination and intensity. We\nfind fine-tuned models are extremely robust to high levels of static noise but\nstruggle significantly more with lower levels of dynamic noise. In contrast,\nfew-shot prompted models appear more sensitive to even static noise. We\nconclude with a discussion of how our findings impact noise filtering\nbest-practices, in particular emphasizing the importance of removing samples\ncontaining destructive dynamic noise with global errors.\n","authors":["Alex Havrilla","Maia Iyer"],"pdf_url":"https://arxiv.org/pdf/2402.04004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03994v1","updated":"2024-02-06T13:47:12Z","published":"2024-02-06T13:47:12Z","title":"Gradient Sketches for Training Data Attribution and Studying the Loss\n Landscape","summary":" Random projections or sketches of gradients and Hessian vector products play\nan essential role in applications where one needs to store many such vectors\nwhile retaining accurate information about their relative geometry. Two\nimportant scenarios are training data attribution (tracing a model's behavior\nto the training data), where one needs to store a gradient for each training\nexample, and the study of the spectrum of the Hessian (to analyze the training\ndynamics), where one needs to store multiple Hessian vector products. While\nsketches that use dense matrices are easy to implement, they are memory bound\nand cannot be scaled to modern neural networks. Motivated by work on the\nintrinsic dimension of neural networks, we propose and study a design space for\nscalable sketching algorithms. 
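The core memory argument of the gradient-sketching entry in code: a dense Gaussian sketch approximately preserves the relative geometry of gradients at a fraction of the storage; the paper's point is that scalable alternatives to this dense matrix are needed at modern model sizes.

```python
import numpy as np

rng = np.random.default_rng(0)
d, k = 20_000, 256                          # parameter dim -> sketch dim
S = rng.normal(0.0, 1.0 / np.sqrt(k), size=(k, d))

g1 = rng.normal(size=d)                     # per-example gradient #1
g2 = g1 + 0.5 * rng.normal(size=d)          # a correlated gradient #2
cos = lambda a, b: a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos(g1, g2), cos(S @ g1, S @ g2))     # nearly identical cosines
# storing S @ g uses k/d = 256/20000 of the memory per example
```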
We demonstrate the efficacy of our approach in\nthree applications: training data attribution, the analysis of the Hessian\nspectrum and the computation of the intrinsic dimension when fine-tuning\npre-trained language models.\n","authors":["Andrea Schioppa"],"pdf_url":"https://arxiv.org/pdf/2402.03994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03992v1","updated":"2024-02-06T13:45:01Z","published":"2024-02-06T13:45:01Z","title":"Space Group Constrained Crystal Generation","summary":" Crystals are the foundation of numerous scientific and industrial\napplications. While various learning-based approaches have been proposed for\ncrystal generation, existing methods seldom consider the space group constraint\nwhich is crucial in describing the geometry of crystals and closely relevant to\nmany desirable properties. However, considering space group constraint is\nchallenging owing to its diverse and nontrivial forms. In this paper, we reduce\nthe space group constraint into an equivalent formulation that is more\ntractable to be handcrafted into the generation process. In particular, we\ntranslate the space group constraint into two parts: the basis constraint of\nthe invariant logarithmic space of the lattice matrix and the Wyckoff position\nconstraint of the fractional coordinates. Upon the derived constraints, we then\npropose DiffCSP++, a novel diffusion model that has enhanced a previous work\nDiffCSP by further taking space group constraint into account. Experiments on\nseveral popular datasets verify the benefit of the involvement of the space\ngroup constraint, and show that our DiffCSP++ achieves promising performance on\ncrystal structure prediction, ab initio crystal generation and controllable\ngeneration with customized space groups.\n","authors":["Rui Jiao","Wenbing Huang","Yu Liu","Deli Zhao","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.03992v1.pdf","comment":"ICLR 2024 poster"},{"id":"http://arxiv.org/abs/2402.03991v1","updated":"2024-02-06T13:44:39Z","published":"2024-02-06T13:44:39Z","title":"Neural Rank Collapse: Weight Decay and Small Within-Class Variability\n Yield Low-Rank Bias","summary":" Recent work in deep learning has shown strong empirical and theoretical\nevidence of an implicit low-rank bias: weight matrices in deep networks tend to\nbe approximately low-rank and removing relatively small singular values during\ntraining or from available trained models may significantly reduce model size\nwhile maintaining or even improving model performance. However, the majority of\nthe theoretical investigations around low-rank bias in neural networks deal\nwith oversimplified deep linear networks. In this work, we consider general\nnetworks with nonlinear activations and the weight decay parameter, and we show\nthe presence of an intriguing neural rank collapse phenomenon, connecting the\nlow-rank bias of trained networks with networks' neural collapse properties: as\nthe weight decay parameter grows, the rank of each layer in the network\ndecreases proportionally to the within-class variability of the hidden-space\nembeddings of the previous layers. 
Our theoretical findings are supported by a\nrange of experimental evaluations illustrating the phenomenon.\n","authors":["Emanuele Zangrando","Piero Deidda","Simone Brugiapaglia","Nicola Guglielmi","Francesco Tudisco"],"pdf_url":"https://arxiv.org/pdf/2402.03991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03990v1","updated":"2024-02-06T13:43:22Z","published":"2024-02-06T13:43:22Z","title":"Subsampling is not Magic: Why Large Batch Sizes Work for Differentially\n Private Stochastic Optimisation","summary":" We study the effect of the batch size on the total gradient variance in\ndifferentially private stochastic gradient descent (DP-SGD), seeking a\ntheoretical explanation for the usefulness of large batch sizes. As DP-SGD is\nthe basis of modern DP deep learning, its properties have been widely studied,\nand recent works have empirically found large batch sizes to be beneficial.\nHowever, theoretical explanations of this benefit are currently heuristic at\nbest. We first observe that the total gradient variance in DP-SGD can be\ndecomposed into subsampling-induced and noise-induced variances. We then prove\nthat in the limit of an infinite number of iterations, the effective\nnoise-induced variance is invariant to the batch size. The remaining\nsubsampling-induced variance decreases with larger batch sizes, so large\nbatches reduce the effective total gradient variance. We confirm numerically\nthat the asymptotic regime is relevant in practical settings when the batch\nsize is not small, and find that outside the asymptotic regime, the total\ngradient variance decreases even more with large batch sizes. We also find a\nsufficient condition that implies that large batch sizes similarly reduce\neffective DP noise variance for one iteration of DP-SGD.\n","authors":["Ossi Räisä","Joonas Jälkö","Antti Honkela"],"pdf_url":"https://arxiv.org/pdf/2402.03990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00344v2","updated":"2024-02-06T13:40:35Z","published":"2023-09-30T11:38:13Z","title":"HarmonyDream: Task Harmonization Inside World Models","summary":" Model-based reinforcement learning (MBRL) holds the promise of\nsample-efficient learning by utilizing a world model, which models how the\nenvironment works and typically encompasses components for two tasks:\nobservation modeling and reward modeling. In this paper, through a dedicated\nempirical investigation, we gain a deeper understanding of the role each task\nplays in world models and uncover the overlooked potential of sample-efficient\nMBRL by mitigating the domination of either observation or reward modeling. Our\nkey insight is that while prevalent approaches of explicit MBRL attempt to\nrestore abundant details of the environment via observation models, it is\ndifficult due to the environment's complexity and limited model capacity. On\nthe other hand, reward models, while dominating implicit MBRL and adept at\nlearning compact task-centric dynamics, are inadequate for sample-efficient\nlearning without richer learning signals. Motivated by these insights and\ndiscoveries, we propose a simple yet effective approach, HarmonyDream, which\nautomatically adjusts loss coefficients to maintain task harmonization, i.e., a\ndynamic equilibrium between the two tasks in world model learning. 
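A quick simulation of the variance decomposition described in the DP-SGD entry above (clipping omitted and constants arbitrary): the estimator's variance splits into a subsampling term scaling as 1/B and a noise term scaling as 1/B^2, both shrinking as the batch size B grows.

```python
import numpy as np

rng = np.random.default_rng(0)
N, sigma, C = 10_000, 1.0, 1.0                 # dataset size, noise mult., clip norm
per_example = rng.normal(0.3, 1.0, size=N)     # scalar per-example gradients

for B in [32, 256, 2048]:
    # DP-SGD estimate: subsampled mean plus Gaussian privacy noise on the mean
    est = [per_example[rng.choice(N, B)].mean() + rng.normal(0, sigma * C / B)
           for _ in range(2000)]
    predicted = per_example.var() / B + (sigma * C / B) ** 2
    print(f"B={B:5d}  empirical var={np.var(est):.2e}  predicted={predicted:.2e}")
```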
Our\nexperiments show that the base MBRL method equipped with HarmonyDream gains\n10%-69% absolute performance boosts on visual robotic tasks and sets a new\nstate-of-the-art result on the Atari 100K benchmark.\n","authors":["Haoyu Ma","Jialong Wu","Ningya Feng","Chenjun Xiao","Dong Li","Jianye Hao","Jianmin Wang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2310.00344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.02926v3","updated":"2024-02-06T13:38:40Z","published":"2021-06-05T16:11:02Z","title":"IM-META: Influence Maximization Using Node Metadata in Networks With\n Unknown Topology","summary":" Since the structure of complex networks is often unknown, we may identify the\nmost influential seed nodes by exploring only a part of the underlying network,\ngiven a small budget for node queries. We propose IM-META, a solution to\ninfluence maximization (IM) in networks with unknown topology by retrieving\ninformation from queries and node metadata. Since using such metadata is not\nwithout risk due to the noisy nature of metadata and uncertainties in\nconnectivity inference, we formulate a new IM problem that aims to find both\nseed nodes and queried nodes. In IM-META, we develop an effective method that\niteratively performs three steps: 1) we learn the relationship between\ncollected metadata and edges via a Siamese neural network, 2) we select a\nnumber of inferred confident edges to construct a reinforced graph, and 3) we\nidentify the next node to query by maximizing the inferred influence spread\nusing our topology-aware ranking strategy. Through experimental evaluation of\nIM-META on four real-world datasets, we demonstrate a) the speed of network\nexploration via node queries, b) the effectiveness of each module, c) the\nsuperiority over benchmark methods, d) the robustness to more difficult\nsettings, e) the hyperparameter sensitivity, and f) the scalability.\n","authors":["Cong Tran","Won-Yong Shin","Andreas Spitz"],"pdf_url":"https://arxiv.org/pdf/2106.02926v3.pdf","comment":"14 pages, 11 figures, 4 tables, to appear in the IEEE Transactions on\n Network Science and Engineering (Please cite our journal version that will\n appear in an upcoming issue.)"},{"id":"http://arxiv.org/abs/2209.07481v3","updated":"2024-02-06T13:35:14Z","published":"2022-09-15T17:22:04Z","title":"Variational Representations of Annealing Paths: Bregman Information\n under Monotonic Embedding","summary":" Markov Chain Monte Carlo methods for sampling from complex distributions and\nestimating normalization constants often simulate samples from a sequence of\nintermediate distributions along an annealing path, which bridges between a\ntractable initial distribution and a target density of interest. Prior works\nhave constructed annealing paths using quasi-arithmetic means, and interpreted\nthe resulting intermediate densities as minimizing an expected divergence to\nthe endpoints. To analyze these variational representations of annealing paths,\nwe extend known results showing that the arithmetic mean over arguments\nminimizes the expected Bregman divergence to a single representative point. In\nparticular, we obtain an analogous result for quasi-arithmetic means, when the\ninputs to the Bregman divergence are transformed under a monotonic embedding\nfunction. 
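The geometric annealing path referenced in the entry above, in code: intermediate log-densities are convex combinations of the endpoint log-densities (up to normalization), which is the quasi-arithmetic mean under a logarithmic embedding.

```python
import numpy as np

def log_gauss(x, mu, sig):
    return -0.5 * ((x - mu) / sig) ** 2 - np.log(sig * np.sqrt(2 * np.pi))

x = np.linspace(-6.0, 6.0, 1201)
for beta in [0.0, 0.25, 0.5, 0.75, 1.0]:
    # log p_beta = (1 - beta) * log p0 + beta * log p1
    logp = (1 - beta) * log_gauss(x, -2.0, 0.5) + beta * log_gauss(x, 3.0, 1.0)
    p = np.exp(logp)
    p /= p.sum() * (x[1] - x[0])                   # normalize on the grid
    print(beta, round(float(x[np.argmax(p)]), 2))  # mode slides from -2 toward 3
```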
Our analysis highlights the interplay between quasi-arithmetic means,\nparametric families, and divergence functionals using the rho-tau\nrepresentational Bregman divergence framework, and associates common divergence\nfunctionals with intermediate densities along an annealing path.\n","authors":["Rob Brekelmans","Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2209.07481v3.pdf","comment":"Published in Information Geometry (Info. Geo. 2024)"},{"id":"http://arxiv.org/abs/2402.03985v1","updated":"2024-02-06T13:20:46Z","published":"2024-02-06T13:20:46Z","title":"A Bias-Variance Decomposition for Ensembles over Multiple Synthetic\n Datasets","summary":" Recent studies have highlighted the benefits of generating multiple synthetic\ndatasets for supervised learning, from increased accuracy to more effective\nmodel selection and uncertainty estimation. These benefits have clear empirical\nsupport, but the theoretical understanding of them is currently very light. We\nseek to increase the theoretical understanding by deriving bias-variance\ndecompositions for several settings of using multiple synthetic datasets. Our\ntheory predicts multiple synthetic datasets to be especially beneficial for\nhigh-variance downstream predictors, and yields a simple rule of thumb to\nselect the appropriate number of synthetic datasets in the case of mean-squared\nerror and Brier score. We investigate how our theory works in practice by\nevaluating the performance of an ensemble over many synthetic datasets for\nseveral real datasets and downstream predictors. The results follow our theory,\nshowing that our insights are also practically relevant.\n","authors":["Ossi Räisä","Antti Honkela"],"pdf_url":"https://arxiv.org/pdf/2402.03985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19190v4","updated":"2024-02-06T13:19:57Z","published":"2023-05-30T16:34:28Z","title":"Inverse Approximation Theory for Nonlinear Recurrent Neural Networks","summary":" We prove an inverse approximation theorem for the approximation of nonlinear\nsequence-to-sequence relationships using recurrent neural networks (RNNs). This\nis a so-called Bernstein-type result in approximation theory, which deduces\nproperties of a target function under the assumption that it can be effectively\napproximated by a hypothesis space. In particular, we show that nonlinear\nsequence relationships that can be stably approximated by nonlinear RNNs must\nhave an exponential decaying memory structure - a notion that can be made\nprecise. This extends the previously identified curse of memory in linear RNNs\ninto the general nonlinear setting, and quantifies the essential limitations of\nthe RNN architecture for learning sequential relationships with long-term\nmemory. Based on the analysis, we propose a principled reparameterization\nmethod to overcome the limitations. Our theoretical results are confirmed by\nnumerical experiments. The code has been released in\nhttps://github.com/radarFudan/Curse-of-memory\n","authors":["Shida Wang","Zhong Li","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2305.19190v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03982v1","updated":"2024-02-06T13:19:26Z","published":"2024-02-06T13:19:26Z","title":"On Convergence of Adam for Stochastic Optimization under Relaxed\n Assumptions","summary":" The Adaptive Momentum Estimation (Adam) algorithm is highly effective in\ntraining various deep learning tasks. 
Despite this, there's limited theoretical\nunderstanding for Adam, especially when focusing on its vanilla form in\nnon-convex smooth scenarios with potential unbounded gradients and affine\nvariance noise. In this paper, we study vanilla Adam under these challenging\nconditions. We introduce a comprehensive noise model which governs affine\nvariance noise, bounded noise and sub-Gaussian noise. We show that Adam can\nfind a stationary point with a $\\mathcal{O}(\\text{poly}(\\log T)/\\sqrt{T})$ rate\nin high probability under this general noise model where $T$ denotes total\nnumber iterations, matching the lower rate of stochastic first-order algorithms\nup to logarithm factors. More importantly, we reveal that Adam is free of\ntuning step-sizes with any problem-parameters, yielding a better adaptation\nproperty than the Stochastic Gradient Descent under the same conditions. We\nalso provide a probabilistic convergence result for Adam under a generalized\nsmooth condition which allows unbounded smoothness parameters and has been\nillustrated empirically to more accurately capture the smooth property of many\npractical objective functions.\n","authors":["Yusu Hong","Junhong Lin"],"pdf_url":"https://arxiv.org/pdf/2402.03982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03979v1","updated":"2024-02-06T13:16:50Z","published":"2024-02-06T13:16:50Z","title":"Cross Entropy versus Label Smoothing: A Neural Collapse Perspective","summary":" Label smoothing loss is a widely adopted technique to mitigate overfitting in\ndeep neural networks. This paper studies label smoothing from the perspective\nof Neural Collapse (NC), a powerful empirical and theoretical framework which\ncharacterizes model behavior during the terminal phase of training. We first\nshow empirically that models trained with label smoothing converge faster to\nneural collapse solutions and attain a stronger level of neural collapse.\nAdditionally, we show that at the same level of NC1, models under label\nsmoothing loss exhibit intensified NC2. These findings provide valuable\ninsights into the performance benefits and enhanced model calibration under\nlabel smoothing loss. We then leverage the unconstrained feature model to\nderive closed-form solutions for the global minimizers for both loss functions\nand further demonstrate that models under label smoothing have a lower\nconditioning number and, therefore, theoretically converge faster. Our study,\ncombining empirical evidence and theoretical results, not only provides nuanced\ninsights into the differences between label smoothing and cross-entropy losses,\nbut also serves as an example of how the powerful neural collapse framework can\nbe used to improve our understanding of DNNs.\n","authors":["Li Guo","Keith Ross","Zifan Zhao","Andriopoulos George","Shuyang Ling","Yufeng Xu","Zixuan Dong"],"pdf_url":"https://arxiv.org/pdf/2402.03979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19302v3","updated":"2024-02-06T13:14:35Z","published":"2023-05-30T15:26:43Z","title":"Smooth, exact rotational symmetrization for deep learning on point\n clouds","summary":" Point clouds are versatile representations of 3D objects and have found\nwidespread application in science and engineering. Many successful\ndeep-learning models have been proposed that use them as input. The domain of\nchemical and materials modeling is especially challenging because exact\ncompliance with physical constraints is highly desirable for a model to be\nusable in practice. 
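For the label-smoothing entry above, the standard smoothed target and loss: the one-hot label is mixed with the uniform distribution, q = (1 - eps) * onehot + eps / K, and the loss is the cross-entropy against q. A minimal reference implementation:

```python
import numpy as np

def label_smoothing_ce(logits, label, eps=0.1):
    K = logits.shape[-1]
    m = logits.max()
    logp = logits - m - np.log(np.exp(logits - m).sum())  # stable log-softmax
    q = np.full(K, eps / K)
    q[label] += 1.0 - eps                                 # smoothed target
    return -np.sum(q * logp)

print(label_smoothing_ce(np.array([2.0, 0.5, -1.0]), label=0))
```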
These constraints include smoothness and invariance with\nrespect to translations, rotations, and permutations of identical atoms. If\nthese requirements are not rigorously fulfilled, atomistic simulations might\nlead to absurd outcomes even if the model has excellent accuracy. Consequently,\ndedicated architectures, which achieve invariance by restricting their design\nspace, have been developed. General-purpose point-cloud models are more varied\nbut often disregard rotational symmetry. We propose a general symmetrization\nmethod that adds rotational equivariance to any given model while preserving\nall the other requirements. Our approach simplifies the development of better\natomic-scale machine-learning schemes by relaxing the constraints on the design\nspace and making it possible to incorporate ideas that proved effective in\nother domains. We demonstrate this idea by introducing the Point Edge\nTransformer (PET) architecture, which is not intrinsically equivariant but\nachieves state-of-the-art performance on several benchmark datasets of\nmolecules and solids. A-posteriori application of our general protocol makes\nPET exactly equivariant, with minimal changes to its accuracy.\n","authors":["Sergey N. Pozdnyakov","Michele Ceriotti"],"pdf_url":"https://arxiv.org/pdf/2305.19302v3.pdf","comment":"Enhancing figures; minor polishing"},{"id":"http://arxiv.org/abs/2311.01344v2","updated":"2024-02-06T13:10:23Z","published":"2023-11-02T15:55:20Z","title":"Like an Open Book? Read Neural Network Architecture with Simple Power\n Analysis on 32-bit Microcontrollers","summary":" Model extraction is a growing concern for the security of AI systems. For\ndeep neural network models, the architecture is the most important information\nan adversary aims to recover. Being a sequence of repeated computation blocks,\nneural network models deployed on edge-devices will generate distinctive\nside-channel leakages. The latter can be exploited to extract critical\ninformation when targeted platforms are physically accessible. By combining\ntheoretical knowledge about deep learning practices and analysis of a\nwidespread implementation library (ARM CMSIS-NN), our purpose is to answer this\ncritical question: how far can we extract architecture information by simply\nexamining an EM side-channel trace? For the first time, we propose an\nextraction methodology for traditional MLP and CNN models running on a high-end\n32-bit microcontroller (Cortex-M7) that relies only on simple pattern\nrecognition analysis. Despite few challenging cases, we claim that, contrary to\nparameters extraction, the complexity of the attack is relatively low and we\nhighlight the urgent need for practicable protections that could fit the strong\nmemory and latency requirements of such platforms.\n","authors":["Raphael Joud","Pierre-Alain Moellic","Simon Pontie","Jean-Baptiste Rigaud"],"pdf_url":"https://arxiv.org/pdf/2311.01344v2.pdf","comment":"Accepted CARDIS 2023; ANR PICTURE PROJECT (ANR-20-CE39-0013)"},{"id":"http://arxiv.org/abs/2402.03973v1","updated":"2024-02-06T13:06:14Z","published":"2024-02-06T13:06:14Z","title":"Humans Beat Deep Networks at Recognizing Objects in Unusual Poses, Given\n Enough Time","summary":" Deep learning is closing the gap with humans on several object recognition\nbenchmarks. Here we investigate this gap in the context of challenging images\nwhere objects are seen from unusual viewpoints. 
We find that humans excel at\nrecognizing objects in unusual poses, in contrast with state-of-the-art\npretrained networks (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) which are\nsystematically brittle in this condition. Remarkably, as we limit image\nexposure time, human performance degrades to the level of deep networks,\nsuggesting that additional mental processes (requiring additional time) take\nplace when humans identify objects in unusual poses. Finally, our analysis of\nerror patterns of humans vs. networks reveals that even time-limited humans are\ndissimilar to feed-forward deep networks. We conclude that more work is needed\nto bring computer vision systems to the level of robustness of the human visual\nsystem. Understanding the nature of the mental processes taking place during\nextra viewing time may be key to attain such robustness.\n","authors":["Netta Ollikka","Amro Abbas","Andrea Perin","Markku Kilpeläinen","Stéphane Deny"],"pdf_url":"https://arxiv.org/pdf/2402.03973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03970v1","updated":"2024-02-06T12:59:02Z","published":"2024-02-06T12:59:02Z","title":"Tabular Data: Is Attention All You Need?","summary":" Deep Learning has revolutionized the field of AI and led to remarkable\nachievements in applications involving image and text data. Unfortunately,\nthere is inconclusive evidence on the merits of neural networks for structured\ntabular data. In this paper, we introduce a large-scale empirical study\ncomparing neural networks against gradient-boosted decision trees on tabular\ndata, but also transformer-based architectures against traditional multi-layer\nperceptrons (MLP) with residual connections. In contrast to prior work, our\nempirical findings indicate that neural networks are competitive against\ndecision trees. Furthermore, we assess that transformer-based architectures do\nnot outperform simpler variants of traditional MLP architectures on tabular\ndatasets. As a result, this paper helps the research and practitioner\ncommunities make informed choices on deploying neural networks on future\ntabular data applications.\n","authors":["Guri Zabërgja","Arlind Kadra","Josif Grabocka"],"pdf_url":"https://arxiv.org/pdf/2402.03970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03969v1","updated":"2024-02-06T12:58:38Z","published":"2024-02-06T12:58:38Z","title":"In-context learning agents are asymmetric belief updaters","summary":" We study the in-context learning dynamics of large language models (LLMs)\nusing three instrumental learning tasks adapted from cognitive psychology. We\nfind that LLMs update their beliefs in an asymmetric manner and learn more from\nbetter-than-expected outcomes than from worse-than-expected ones. Furthermore,\nwe show that this effect reverses when learning about counterfactual feedback\nand disappears when no agency is implied. We corroborate these findings by\ninvestigating idealized in-context learning agents derived through\nmeta-reinforcement learning, where we observe similar patterns. Taken together,\nour results contribute to our understanding of how in-context learning works by\nhighlighting that the framing of a problem significantly influences how\nlearning occurs, a phenomenon also observed in human cognition.\n","authors":["Johannes A. Schubert","Akshay K. 
Jagadish","Marcel Binz","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2402.03969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03966v1","updated":"2024-02-06T12:56:55Z","published":"2024-02-06T12:56:55Z","title":"On dimensionality of feature vectors in MPNNs","summary":" We revisit the classical result of Morris et al.~(AAAI'19) that\nmessage-passing graphs neural networks (MPNNs) are equal in their\ndistinguishing power to the Weisfeiler--Leman (WL) isomorphism test.\n Morris et al.~show their simulation result with ReLU activation function and\n$O(n)$-dimensional feature vectors, where $n$ is the number of nodes of the\ngraph. Recently, by introducing randomness into the architecture, Aamand et\nal.~(NeurIPS'22) were able to improve this bound to $O(\\log n)$-dimensional\nfeature vectors, although at the expense of guaranteeing perfect simulation\nonly with high probability.\n In all these constructions, to guarantee equivalence to the WL test, the\ndimension of feature vectors in the MPNN has to increase with the size of the\ngraphs. However, architectures used in practice have feature vectors of\nconstant dimension. Thus, there is a gap between the guarantees provided by\nthese results and the actual characteristics of architectures used in practice.\nIn this paper we close this gap by showing that, for \\emph{any} non-polynomial\nanalytic (like the sigmoid) activation function, to guarantee that MPNNs are\nequivalent to the WL test, feature vectors of dimension $d=1$ is all we need,\nindependently of the size of the graphs.\n Our main technical insight is that for simulating multi-sets in the WL-test,\nit is enough to use linear independence of feature vectors over rationals\ninstead of reals. Countability of the set of rationals together with nice\nproperties of analytic functions allow us to carry out the simulation invariant\nover the iterations of the WL test without increasing the dimension of the\nfeature vectors.\n","authors":["César Bravo","Alexander Kozachinskiy","Cristóbal Rojas"],"pdf_url":"https://arxiv.org/pdf/2402.03966v1.pdf","comment":"15 pages, 2 figures"},{"id":"http://arxiv.org/abs/2402.03021v2","updated":"2024-02-06T12:50:12Z","published":"2024-02-05T14:00:53Z","title":"Data-induced multiscale losses and efficient multirate gradient descent\n schemes","summary":" This paper investigates the impact of multiscale data on machine learning\nalgorithms, particularly in the context of deep learning. A dataset is\nmultiscale if its distribution shows large variations in scale across different\ndirections. This paper reveals multiscale structures in the loss landscape,\nincluding its gradients and Hessians inherited from the data. Correspondingly,\nit introduces a novel gradient descent approach, drawing inspiration from\nmultiscale algorithms used in scientific computing. This approach seeks to\ntranscend empirical learning rate selection, offering a more systematic,\ndata-informed strategy to enhance training efficiency, especially in the later\nstages.\n","authors":["Juncai He","Liangchen Liu","Yen-Hsi Richard Tsai"],"pdf_url":"https://arxiv.org/pdf/2402.03021v2.pdf","comment":"28 pages, 4 figures, submitted under review"},{"id":"http://arxiv.org/abs/2211.06108v5","updated":"2024-02-06T12:41:20Z","published":"2022-11-11T10:24:42Z","title":"RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object\n Detection Systems","summary":" In autonomous driving, LiDAR and radar are crucial for environmental\nperception. 
LiDAR offers precise 3D spatial sensing information but struggles\nin adverse weather like fog. Conversely, radar signals can penetrate rain or\nmist due to their specific wavelength but are prone to noise disturbances.\nRecent state-of-the-art works reveal that the fusion of radar and LiDAR can\nlead to robust detection in adverse weather. The existing works adopt\nconvolutional neural network architecture to extract features from each sensor's\ndata, then align and aggregate the two branch features to predict object\ndetection results. However, these methods yield low accuracy for predicted\nbounding boxes due to a simplistic design of label assignment and fusion\nstrategies. In this paper, we propose a bird's-eye view fusion learning-based\nanchor box-free object detection system, which fuses the feature derived from\nthe radar range-azimuth heatmap and the LiDAR point cloud to estimate possible\nobjects. Different label assignment strategies have been designed to facilitate\nthe consistency between the classification of foreground or background anchor\npoints and the corresponding bounding box regressions. Furthermore, the\nperformance of the proposed object detector is further enhanced by employing a\nnovel interactive transformer module. The superior performance of the methods\nproposed in this paper has been demonstrated using the recently published\nOxford Radar RobotCar dataset. Our system's average precision significantly\noutperforms the state-of-the-art method by 13.1% and 19.0% at Intersection over\nUnion (IoU) of 0.8 under 'Clear+Foggy' training conditions for 'Clear' and\n'Foggy' testing, respectively.\n","authors":["Yanlong Yang","Jianan Liu","Tao Huang","Qing-Long Han","Gang Ma","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.06108v5.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.07335v2","updated":"2024-02-06T12:39:18Z","published":"2023-12-12T14:53:18Z","title":"Momentum Particle Maximum Likelihood","summary":" Maximum likelihood estimation (MLE) of latent variable models is often recast\nas an optimization problem over the extended space of parameters and\nprobability distributions. For example, the Expectation Maximization (EM)\nalgorithm can be interpreted as coordinate descent applied to a suitable free\nenergy functional over this space. Recently, this perspective has been combined\nwith insights from optimal transport and Wasserstein gradient flows to develop\nparticle-based algorithms applicable to wider classes of models than standard\nEM.\n Drawing inspiration from prior works which interpret `momentum-enriched'\noptimisation algorithms as discretizations of ordinary differential equations,\nwe propose an analogous dynamical systems-inspired approach to minimizing the\nfree energy functional over the extended space of parameters and probability\ndistributions. The result is a dynamical system that blends elements of\nNesterov's Accelerated Gradient method, the underdamped Langevin diffusion, and\nparticle methods.\n Under suitable assumptions, we establish quantitative convergence of the\nproposed system to the unique minimiser of the functional in continuous time.\nWe then propose a numerical discretization of this system which enables its\napplication to parameter estimation in latent variable models. Through\nnumerical experiments, we demonstrate that the resulting algorithm converges\nfaster than existing methods and compares favourably with other (approximate)\nMLE algorithms.\n","authors":["Jen Ning Lim","Juan Kuntz","Samuel Power","Adam M. 
Johansen"],"pdf_url":"https://arxiv.org/pdf/2312.07335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00736v2","updated":"2024-02-06T12:20:06Z","published":"2024-01-01T12:25:57Z","title":"Diffusion Models, Image Super-Resolution And Everything: A Survey","summary":" Diffusion Models (DMs) have disrupted the image Super-Resolution (SR) field\nand further closed the gap between image quality and human perceptual\npreferences. They are easy to train and can produce very high-quality samples\nthat exceed the realism of those produced by previous generative methods.\nDespite their promising results, they also come with new challenges that need\nfurther research: high computational demands, comparability, lack of\nexplainability, color shifts, and more. Unfortunately, entry into this field is\noverwhelming because of the abundance of publications. To address this, we\nprovide a unified recount of the theoretical foundations underlying DMs applied\nto image SR and offer a detailed analysis that underscores the unique\ncharacteristics and methodologies within this domain, distinct from broader\nexisting reviews in the field. This survey articulates a cohesive understanding\nof DM principles and explores current research avenues, including alternative\ninput domains, conditioning techniques, guidance mechanisms, corruption spaces,\nand zero-shot learning approaches. By offering a detailed examination of the\nevolution and current trends in image SR through the lens of DMs, this survey\nsheds light on the existing challenges and charts potential future directions,\naiming to inspire further innovation in this rapidly advancing area.\n","authors":["Brian B. Moser","Arundhati S. Shanbhag","Federico Raue","Stanislav Frolov","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2401.00736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03941v1","updated":"2024-02-06T12:18:54Z","published":"2024-02-06T12:18:54Z","title":"Discovery of the Hidden World with Large Language Models","summary":" Science originates with discovering new causal knowledge from a combination\nof known facts and observations. Traditional causal discovery approaches mainly\nrely on high-quality measured variables, usually given by human experts, to\nfind causal relations. However, the causal variables are usually unavailable in\na wide range of real-world applications. The rise of large language models\n(LLMs) that are trained to learn rich knowledge from the massive observations\nof the world, provides a new opportunity to assist with discovering high-level\nhidden variables from the raw observational data. Therefore, we introduce COAT:\nCausal representatiOn AssistanT. COAT incorporates LLMs as a factor proposer\nthat extracts the potential causal factors from unstructured data. Moreover,\nLLMs can also be instructed to provide additional information used to collect\ndata values (e.g., annotation criteria) and to further parse the raw\nunstructured data into structured data. The annotated data will be fed to a\ncausal learning module (e.g., the FCI algorithm) that provides both rigorous\nexplanations of the data, as well as useful feedback to further improve the\nextraction of causal factors by LLMs. 
We verify the effectiveness of COAT in\nuncovering the underlying causal system with two case studies of review rating\nanalysis and neuropathic diagnosis.\n","authors":["Chenxi Liu","Yongqiang Chen","Tongliang Liu","Mingming Gong","James Cheng","Bo Han","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.03941v1.pdf","comment":"Preliminary version of an ongoing project; Chenxi and Yongqiang\n contributed equally; 26 pages, 41 figures; Project page:\n https://causalcoat.github.io/"},{"id":"http://arxiv.org/abs/2311.17431v9","updated":"2024-02-06T12:18:29Z","published":"2023-11-29T08:21:42Z","title":"Grounding Foundation Models through Federated Transfer Learning: A\n General Framework","summary":" Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and\npowerful emergent abilities have achieved remarkable success in various natural\nlanguage processing and computer vision tasks. Grounding FMs by adapting them\nto domain-specific tasks or augmenting them with domain-specific knowledge\nenables us to exploit the full potential of FMs. However, grounding FMs faces\nseveral challenges, stemming primarily from constrained computing resources,\ndata privacy, model heterogeneity, and model ownership. Federated Transfer\nLearning (FTL), the combination of federated learning and transfer learning,\nprovides promising solutions to address these challenges. In recent years, the\nneed for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in\nboth academia and industry. Motivated by the strong growth in FTL-FM research\nand the potential impact of FTL-FM on industrial applications, we propose an\nFTL-FM framework that formulates problems of grounding FMs in the federated\nlearning setting, construct a detailed taxonomy based on the FTL-FM framework\nto categorize state-of-the-art FTL-FM works, and comprehensively overview\nFTL-FM works based on the proposed taxonomy. We also establish correspondences\nbetween FTL-FM and conventional phases of adapting FM so that FM practitioners\ncan align their research works with FTL-FM. In addition, we overview advanced\nefficiency-improving and privacy-preserving techniques because efficiency and\nprivacy are critical concerns in FTL-FM. Last, we discuss opportunities and\nfuture research directions of FTL-FM.\n","authors":["Yan Kang","Tao Fan","Hanlin Gu","Xiaojin Zhang","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17431v9.pdf","comment":"In progress. fixed some typos, errors, and revised the text a little\n bit"},{"id":"http://arxiv.org/abs/2108.08677v3","updated":"2024-02-06T12:06:44Z","published":"2021-08-19T13:38:43Z","title":"Order Optimal Bounds for One-Shot Federated Learning over non-Convex\n Loss Functions","summary":" We consider the problem of federated learning in a one-shot setting in which\nthere are $m$ machines, each observing $n$ sample functions from an unknown\ndistribution on non-convex loss functions. Let $F:[-1,1]^d\\to\\mathbb{R}$ be the\nexpected loss function with respect to this unknown distribution. The goal is\nto find an estimate of the minimizer of $F$. Based on its observations, each\nmachine generates a signal of bounded length $B$ and sends it to a server. The\nserver collects signals of all machines and outputs an estimate of the\nminimizer of $F$. We show that the expected loss of any algorithm is lower\nbounded by $\\max\\big(1/(\\sqrt{n}(mB)^{1/d}), 1/\\sqrt{mn}\\big)$, up to a\nlogarithmic factor. 
We then prove that this lower bound is order optimal in $m$\nand $n$ by presenting a distributed learning algorithm, called Multi-Resolution\nEstimator for Non-Convex loss function (MRE-NC), whose expected loss matches\nthe lower bound for large $mn$ up to polylogarithmic factors.\n","authors":["Arsalan Sharifnassab","Saber Salehkaleybar","S. Jamaloddin Golestani"],"pdf_url":"https://arxiv.org/pdf/2108.08677v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03931v1","updated":"2024-02-06T12:01:00Z","published":"2024-02-06T12:01:00Z","title":"Fully autonomous tuning of a spin qubit","summary":" Spanning over two decades, the study of qubits in semiconductors for quantum\ncomputing has yielded significant breakthroughs. However, the development of\nlarge-scale semiconductor quantum circuits is still limited by challenges in\nefficiently tuning and operating these circuits. Identifying optimal operating\nconditions for these qubits is complex, involving the exploration of vast\nparameter spaces. This presents a real 'needle in the haystack' problem, which,\nuntil now, has resisted complete automation due to device variability and\nfabrication imperfections. In this study, we present the first fully autonomous\ntuning of a semiconductor qubit, from a grounded device to Rabi oscillations, a\nclear indication of successful qubit operation. We demonstrate this automation,\nachieved without human intervention, in a Ge/Si core/shell nanowire device. Our\napproach integrates deep learning, Bayesian optimization, and computer vision\ntechniques. We expect this automation algorithm to apply to a wide range of\nsemiconductor qubit devices, allowing for statistical studies of qubit quality\nmetrics. As a demonstration of the potential of full automation, we\ncharacterise how the Rabi frequency and g-factor depend on barrier gate\nvoltages for one of the qubits found by the algorithm. Twenty years after the\ninitial demonstrations of spin qubit operation, this significant advancement is\npoised to finally catalyze the operation of large, previously unexplored\nquantum circuits.\n","authors":["Jonas Schuff","Miguel J. Carballido","Madeleine Kotzagiannidis","Juan Carlos Calvo","Marco Caselli","Jacob Rawling","David L. Craig","Barnaby van Straaten","Brandon Severin","Federico Fedele","Simon Svab","Pierre Chevalier Kwon","Rafael S. Eggli","Taras Patlatiuk","Nathan Korda","Dominik Zumbühl","Natalia Ares"],"pdf_url":"https://arxiv.org/pdf/2402.03931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03923v1","updated":"2024-02-06T11:46:47Z","published":"2024-02-06T11:46:47Z","title":"Return-Aligned Decision Transformer","summary":" Traditional approaches in offline reinforcement learning aim to learn the\noptimal policy that maximizes the cumulative reward, also known as return.\nHowever, as applications broaden, it becomes increasingly crucial to train\nagents that not only maximize the returns, but align the actual return with a\nspecified target return, giving control over the agent's performance. Decision\nTransformer (DT) optimizes a policy that generates actions conditioned on the\ntarget return through supervised learning and is equipped with a mechanism to\ncontrol the agent using the target return. Despite being designed to align the\nactual return with the target return, we have empirically identified a\ndiscrepancy between the actual return and the target return in DT. 
In this\npaper, we propose Return-Aligned Decision Transformer (RADT), designed to\neffectively align the actual return with the target return. Our model decouples\nreturns from the conventional input sequence, which typically consists of\nreturns, states, and actions, to enhance the relationships between returns and\nstates, as well as returns and actions. Extensive experiments show that RADT\nreduces the discrepancies between the actual return and the target return of\nDT-based methods.\n","authors":["Tsunehiko Tanaka","Kenshi Abe","Kaito Ariu","Tetsuro Morimura","Edgar Simo-Serra"],"pdf_url":"https://arxiv.org/pdf/2402.03923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03921v1","updated":"2024-02-06T11:44:06Z","published":"2024-02-06T11:44:06Z","title":"Large Language Models to Enhance Bayesian Optimization","summary":" Bayesian optimization (BO) is a powerful approach for optimizing complex and\nexpensive-to-evaluate black-box functions. Its importance is underscored in\nmany applications, notably including hyperparameter tuning, but its efficacy\ndepends on efficiently balancing exploration and exploitation. While there has\nbeen substantial progress in BO methods, striking this balance still remains a\ndelicate process. In this light, we present \\texttt{LLAMBO}, a novel approach\nthat integrates the capabilities of large language models (LLM) within BO. At a\nhigh level, we frame the BO problem in natural language terms, enabling LLMs to\niteratively propose promising solutions conditioned on historical evaluations.\nMore specifically, we explore how combining contextual understanding, few-shot\nlearning proficiency, and domain knowledge of LLMs can enhance various\ncomponents of model-based BO. Our findings illustrate that \\texttt{LLAMBO} is\neffective at zero-shot warmstarting, and improves surrogate modeling and\ncandidate sampling, especially in the early stages of search when observations\nare sparse. Our approach is performed in context and does not require LLM\nfinetuning. Additionally, it is modular by design, allowing individual\ncomponents to be integrated into existing BO frameworks, or function cohesively\nas an end-to-end method. We empirically validate \\texttt{LLAMBO}'s efficacy on\nthe problem of hyperparameter tuning, highlighting strong empirical performance\nacross a range of diverse benchmarks, proprietary, and synthetic tasks.\n","authors":["Tennison Liu","Nicolás Astorga","Nabeel Seedat","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2402.03921v1.pdf","comment":"Accepted as Poster at ICLR2024"},{"id":"http://arxiv.org/abs/2402.03917v1","updated":"2024-02-06T11:35:02Z","published":"2024-02-06T11:35:02Z","title":"Elastic Feature Consolidation for Cold Start Exemplar-free Incremental\n Learning","summary":" Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a\nsequence of tasks without having access to previous task data. In this paper,\nwe consider the challenging Cold Start scenario in which insufficient data is\navailable in the first task to learn a high-quality backbone. This is\nespecially challenging for EFCIL since it requires high plasticity, which\nresults in feature drift which is difficult to compensate for in the\nexemplar-free setting. To address this problem, we propose a simple and\neffective approach that consolidates feature representations by regularizing\ndrift in directions highly relevant to previous tasks and employs prototypes to\nreduce task-recency bias. 
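A minimal sketch of the drift-regularization idea just described: penalize feature drift more strongly along directions relevant to previous tasks. The importance matrix `M` below is a random positive semi-definite stand-in, not the Empirical Feature Matrix the abstract goes on to define.

```python
# Sketch of drift regularization under a pseudo-metric: penalize feature
# drift (f_new - f_old) more strongly along directions the matrix M marks
# as important. M is a random PSD stand-in for an importance matrix.
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(64, 64))
M = A @ A.T / 64                           # PSD stand-in for an importance matrix
f_old = rng.normal(size=64)                # features under the previous-task model
f_new = f_old + 0.1 * rng.normal(size=64)  # features after new-task updates
drift = f_new - f_old
penalty = drift @ M @ drift                # quadratic form: the induced pseudo-metric
print(float(penalty))
```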
Our method, called Elastic Feature Consolidation\n(EFC), exploits a tractable second-order approximation of feature drift based\non an Empirical Feature Matrix (EFM). The EFM induces a pseudo-metric in\nfeature space which we use to regularize feature drift in important directions\nand to update Gaussian prototypes used in a novel asymmetric cross entropy loss\nwhich effectively balances prototype rehearsal with data from new tasks.\nExperimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and\nImageNet-1K demonstrate that Elastic Feature Consolidation is better able to\nlearn new tasks by maintaining model plasticity and significantly outperforms\nthe state of the art.\n","authors":["Simone Magistri","Tomaso Trinci","Albin Soutif-Cormerais","Joost van de Weijer","Andrew D. Bagdanov"],"pdf_url":"https://arxiv.org/pdf/2402.03917v1.pdf","comment":"Accepted at Twelfth International Conference on Learning\n Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2402.03915v1","updated":"2024-02-06T11:31:04Z","published":"2024-02-06T11:31:04Z","title":"Learning Metrics that Maximise Power for Accelerated A/B-Tests","summary":" Online controlled experiments are a crucial tool to allow for confident\ndecision-making in technology companies. A North Star metric is defined (such\nas long-term revenue or user retention), and system variants that statistically\nsignificantly improve on this metric in an A/B-test can be considered superior.\nNorth Star metrics are typically delayed and insensitive. As a result, the cost\nof experimentation is high: experiments need to run for a long time, and even\nthen, type-II errors (i.e. false negatives) are prevalent.\n We propose to tackle this by learning metrics from short-term signals that\ndirectly maximise the statistical power they harness with respect to the North\nStar. We show that existing approaches are prone to overfitting, in that higher\naverage metric sensitivity does not imply improved type-II errors, and propose\nto instead minimise the $p$-values a metric would have produced on a log of\npast experiments. We collect such datasets from two social media applications\nwith over 160 million Monthly Active Users each, totalling over 153 A/B-pairs.\nEmpirical results show that we are able to increase statistical power by up to\n78% when using our learnt metrics stand-alone, and by up to 210% when used in\ntandem with the North Star. Alternatively, we can obtain constant statistical\npower at a sample size that is down to 12% of what the North Star requires,\nsignificantly reducing the cost of experimentation.\n","authors":["Olivier Jeunen","Aleksei Ustimenko"],"pdf_url":"https://arxiv.org/pdf/2402.03915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03905v1","updated":"2024-02-06T11:17:16Z","published":"2024-02-06T11:17:16Z","title":"Employee Turnover Analysis Using Machine Learning Algorithms","summary":" An employee's knowledge is an organizational asset. Turnover may impose apparent\nand hidden costs and irreparable damage. To mitigate this risk, employees'\nconditions should be monitored. Due to the high complexity of analyzing\nwell-being features, predicting employee turnover can be delegated to machine\nlearning techniques. In this paper, we discuss the employee attrition rate. Three\ndifferent supervised learning algorithms, AdaBoost, SVM and\nRandomForest, are used to benchmark employee attrition prediction accuracy. 
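A minimal sketch of the benchmarking setup the turnover abstract describes, using the three named classifiers from scikit-learn; the synthetic features, label, and split ratio are illustrative assumptions, not details from the paper.

```python
# Sketch of benchmarking AdaBoost, SVM, and Random Forest on a binary
# attrition label; the data below are random stand-ins for HR features.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 12))                         # stand-in well-being features
y = (X[:, 0] + rng.normal(size=1000) > 1).astype(int)   # stand-in attrition label

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
for name, clf in [("AdaBoost", AdaBoostClassifier()),
                  ("SVM", SVC()),
                  ("RandomForest", RandomForestClassifier())]:
    clf.fit(X_tr, y_tr)
    print(name, accuracy_score(y_te, clf.predict(X_te)))
```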
The resulting models\ncan help in establishing predictive analytics.\n","authors":["Mahyar Karimi","Kamyar Seyedkazem Viliyani"],"pdf_url":"https://arxiv.org/pdf/2402.03905v1.pdf","comment":"6 pages, 11 figures, 2 tables"},{"id":"http://arxiv.org/abs/2402.03903v1","updated":"2024-02-06T11:13:57Z","published":"2024-02-06T11:13:57Z","title":"Compound Returns Reduce Variance in Reinforcement Learning","summary":" Multistep returns, such as $n$-step returns and $\lambda$-returns, are\ncommonly used to improve the sample efficiency of reinforcement learning (RL)\nmethods. The variance of the multistep returns becomes the limiting factor in\ntheir length; looking too far into the future increases variance and reverses\nthe benefits of multistep learning. In our work, we demonstrate the ability of\ncompound returns -- weighted averages of $n$-step returns -- to reduce\nvariance. We prove for the first time that any compound return with the same\ncontraction modulus as a given $n$-step return has strictly lower variance. We\nadditionally prove that this variance-reduction property improves the\nfinite-sample complexity of temporal-difference learning under linear function\napproximation. Because general compound returns can be expensive to implement,\nwe introduce two-bootstrap returns which reduce variance while remaining\nefficient, even when using minibatched experience replay. We conduct\nexperiments showing that two-bootstrap returns can improve the sample\nefficiency of $n$-step deep RL agents, with little additional computational\ncost.\n","authors":["Brett Daley","Martha White","Marlos C. Machado"],"pdf_url":"https://arxiv.org/pdf/2402.03903v1.pdf","comment":"Preprint. 8 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2402.03902v1","updated":"2024-02-06T11:13:54Z","published":"2024-02-06T11:13:54Z","title":"A phase transition between positional and semantic learning in a\n solvable model of dot-product attention","summary":" We investigate how a dot-product attention layer learns a positional\nattention matrix (with tokens attending to each other based on their respective\npositions) and a semantic attention matrix (with tokens attending to each other\nbased on their meaning). For an algorithmic task, we experimentally show how\nthe same simple architecture can learn to implement a solution using either the\npositional or semantic mechanism. On the theoretical side, we study the\nlearning of a non-linear self-attention layer with trainable tied and low-rank\nquery and key matrices. In the asymptotic limit of high-dimensional data and a\ncomparably large number of training samples, we provide a closed-form\ncharacterization of the global minimum of the non-convex empirical loss\nlandscape. We show that this minimum corresponds to either a positional or a\nsemantic mechanism and evidence an emergent phase transition from the former to\nthe latter with increasing sample complexity. 
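A minimal sketch of the compound-return idea from the abstract above: an n-step return bootstraps from a value estimate after n rewards, and a compound return is a weighted average of several such returns (a two-bootstrap return averages exactly two). The trajectory, value estimates, discount, and the 0.5/0.5 weighting are illustrative assumptions.

```python
# Sketch of n-step and compound (two-bootstrap) returns.
import numpy as np

def n_step_return(rewards, values, t, n, gamma=0.99):
    """G_t^(n) = r_t + ... + gamma^{n-1} r_{t+n-1} + gamma^n V(s_{t+n})."""
    G = sum(gamma**k * rewards[t + k] for k in range(n))
    return G + gamma**n * values[t + n]

rewards = np.ones(20)   # stand-in trajectory rewards
values = np.zeros(21)   # stand-in V(s) estimates

# Two-bootstrap return: a weighted average of two n-step returns.
G = 0.5 * n_step_return(rewards, values, 0, 2) + 0.5 * n_step_return(rewards, values, 0, 8)
print(G)
```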
Finally, we compare the\ndot-product attention layer to a linear positional baseline, and show that it\noutperforms the latter using the semantic mechanism provided it has access to\nsufficient data.\n","authors":["Hugo Cui","Freya Behrens","Florent Krzakala","Lenka Zdeborová"],"pdf_url":"https://arxiv.org/pdf/2402.03902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03901v1","updated":"2024-02-06T11:13:26Z","published":"2024-02-06T11:13:26Z","title":"Batch Universal Prediction","summary":" Large language models (LLMs) have recently gained much popularity due to\ntheir surprising ability to generate human-like English sentences. LLMs are\nessentially predictors, estimating the probability of a sequence of words given\nthe past. Therefore, it is natural to evaluate their performance from a\nuniversal prediction perspective. In order to do that fairly, we introduce the\nnotion of batch regret as a modification of the classical average regret, and\nwe study its asymptotical value for add-constant predictors, in the case of\nmemoryless sources and first-order Markov sources.\n","authors":["Marco Bondaschi","Michael Gastpar"],"pdf_url":"https://arxiv.org/pdf/2402.03901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03898v1","updated":"2024-02-06T11:10:35Z","published":"2024-02-06T11:10:35Z","title":"DistiLLM: Towards Streamlined Distillation for Large Language Models","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\na smaller student model, reducing its inference cost and memory footprint while\npreserving model capabilities. However, current KD methods for auto-regressive\nsequence models (e.g., large language models) suffer from the lack of a\nstandardized objective function. Moreover, the recent use of student-generated\noutputs to address training-inference mismatches has significantly escalated\ncomputational costs. To tackle these issues, we introduce DistiLLM, a more\neffective and efficient KD framework for auto-regressive language models.\nDistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence\nloss, where we unveil and leverage its theoretical properties, and (2) an\nadaptive off-policy approach designed to enhance the efficiency in utilizing\nstudent-generated outputs. Extensive experiments, including\ninstruction-following tasks, demonstrate the effectiveness of DistiLLM in\nbuilding high-performing student models while achieving up to 4.3$\times$\nspeedup compared to recent KD methods.\n","authors":["Jongwoo Ko","Sungnyun Kim","Tianyi Chen","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2402.03898v1.pdf","comment":"Code is available at https://github.com/jongwooko/distillm"},{"id":"http://arxiv.org/abs/2205.15059v4","updated":"2024-02-06T10:58:38Z","published":"2022-05-30T12:40:32Z","title":"Hilbert Curve Projection Distance for Distribution Comparison","summary":" Distribution comparison plays a central role in many machine learning tasks\nlike data classification and generative modeling. In this study, we propose a\nnovel metric, called Hilbert curve projection (HCP) distance, to measure the\ndistance between two probability distributions with low complexity. In\nparticular, we first project two high-dimensional probability distributions\nusing Hilbert curve to obtain a coupling between them, and then calculate the\ntransport distance between these two distributions in the original space,\naccording to the coupling. 
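A minimal sketch of a skew Kullback-Leibler divergence, the kind of loss the DistiLLM abstract above refers to. The specific mixing convention KL(p || alpha*p + (1-alpha)*q) and the value of alpha are assumptions for illustration; the paper's exact definition may differ.

```python
# Sketch of a skew KL divergence between two categorical distributions.
# Mixing p into the second argument keeps the divergence finite even when
# q assigns zero probability somewhere p does not.
import numpy as np

def skew_kl(p, q, alpha=0.1):
    m = alpha * p + (1 - alpha) * q   # skewed mixture (assumed convention)
    return float(np.sum(p * np.log(p / m)))

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.1, 0.8])
print(skew_kl(p, q))
```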
We show that HCP distance is a proper metric and is\nwell-defined for probability measures with bounded supports. Furthermore, we\ndemonstrate that the modified empirical HCP distance with the $L_p$ cost in the\n$d$-dimensional space converges to its population counterpart at a rate of no\nmore than $O(n^{-1/(2\max\{d,p\})})$. To mitigate the curse of dimensionality, we\nalso develop two variants of the HCP distance using (learnable) subspace\nprojections. Experiments on both synthetic and real-world data show that our\nHCP distance works as an effective surrogate of the Wasserstein distance with\nlow complexity and overcomes the drawbacks of the sliced Wasserstein distance.\n","authors":["Tao Li","Cheng Meng","Hongteng Xu","Jun Yu"],"pdf_url":"https://arxiv.org/pdf/2205.15059v4.pdf","comment":"33 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.03893v1","updated":"2024-02-06T10:58:13Z","published":"2024-02-06T10:58:13Z","title":"Prediction Horizon Requirements for Automated Driving: Optimizing\n Safety, Comfort, and Efficiency","summary":" Predicting the movement of other road users is beneficial for improving\nautomated vehicle (AV) performance. However, the relationship between the time\nhorizon associated with these predictions and AV performance remains unclear.\nDespite the existence of numerous trajectory prediction algorithms, no studies\nhave been conducted on how varying prediction lengths affect AV safety and\nother vehicle performance metrics, resulting in undefined horizon requirements\nfor prediction methods. Our study addresses this gap by examining the effects\nof different prediction horizons on AV performance, focusing on safety,\ncomfort, and efficiency. Through multiple experiments using a state-of-the-art,\nrisk-based predictive trajectory planner, we simulated predictions with\nhorizons up to 20 seconds. Based on our simulations, we propose a framework for\nspecifying the minimum required and optimal prediction horizons based on\nspecific AV performance criteria and application needs. Our results indicate\nthat a horizon of 1.6 seconds is required to prevent collisions with crossing\npedestrians, horizons of 7-8 seconds yield the best efficiency, and horizons up\nto 15 seconds improve passenger comfort. We conclude that prediction horizon\nrequirements are application-dependent, and recommend aiming for a prediction\nhorizon of 11.8 seconds as a general guideline for applications involving\ncrossing pedestrians.\n","authors":["Manuel Muñoz Sánchez","Chris van der Ploeg","Robin Smit","Jos Elfring","Emilia Silvas","René van de Molengraft"],"pdf_url":"https://arxiv.org/pdf/2402.03893v1.pdf","comment":"Submitted to IEEE Intelligent Vehicles Symposium. 9 pages. 10\n figures. 6 tables"},{"id":"http://arxiv.org/abs/2402.03885v1","updated":"2024-02-06T10:48:46Z","published":"2024-02-06T10:48:46Z","title":"MOMENT: A Family of Open Time-series Foundation Models","summary":" We introduce MOMENT, a family of open-source foundation models for\ngeneral-purpose time-series analysis. Pre-training large models on time-series\ndata is challenging due to (1) the absence of a large and cohesive public\ntime-series repository, and (2) diverse time-series characteristics which make\nmulti-dataset training onerous. Additionally, (3) experimental benchmarks to\nevaluate these models, especially in scenarios with limited resources, time,\nand supervision, are still in their nascent stages. 
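A minimal sketch of the curve-projection idea from the HCP abstract above: order both samples along a space-filling curve, pair points by rank to form a coupling, and evaluate the transport cost in the original space. A Z-order (Morton) curve stands in for the Hilbert curve here, and the data and squared-Euclidean cost are illustrative assumptions.

```python
# Sketch of a space-filling-curve coupling: sort both samples by a Morton
# (Z-order) key, pair by rank, and average the transport cost.
import numpy as np

def morton_key(pt, bits=10):
    x, y = (np.clip(pt, 0, 1) * (2**bits - 1)).astype(int)
    key = 0
    for b in range(bits):  # interleave the bits of x and y
        key |= ((x >> b) & 1) << (2 * b + 1) | ((y >> b) & 1) << (2 * b)
    return key

rng = np.random.default_rng(0)
X, Y = rng.random((256, 2)), rng.random((256, 2))
Xs = X[np.argsort([morton_key(p) for p in X])]
Ys = Y[np.argsort([morton_key(p) for p in Y])]
cost = np.mean(np.sum((Xs - Ys) ** 2, axis=1))  # squared-Euclidean cost, p = 2
print(cost)
```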
To address these\nchallenges, we compile a large and diverse collection of public time-series,\ncalled the Time-series Pile, and systematically tackle time-series-specific\nchallenges to unlock large-scale multi-dataset pre-training. Finally, we build\non recent work to design a benchmark to evaluate time-series foundation models\non diverse tasks and datasets in limited supervision settings. Experiments on\nthis benchmark demonstrate the effectiveness of our pre-trained models with\nminimal data and task-specific fine-tuning. Finally, we present several\ninteresting empirical observations about large pre-trained time-series models.\nOur code is available anonymously at anonymous.4open.science/r/BETT-773F/.\n","authors":["Mononito Goswami","Konrad Szafer","Arjun Choudhry","Yifu Cai","Shuo Li","Artur Dubrawski"],"pdf_url":"https://arxiv.org/pdf/2402.03885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03883v1","updated":"2024-02-06T10:45:51Z","published":"2024-02-06T10:45:51Z","title":"A Framework for Bilevel Optimization on Riemannian Manifolds","summary":" Bilevel optimization has seen an increasing presence in various domains of\napplications. In this work, we propose a framework for solving bilevel\noptimization problems where variables of both lower and upper level problems\nare constrained on Riemannian manifolds. We provide several hypergradient\nestimation strategies on manifolds and study their estimation error. We provide\nconvergence and complexity analysis for the proposed hypergradient descent\nalgorithm on manifolds. We also extend the developments to stochastic bilevel\noptimization and to the use of general retraction. We showcase the utility of\nthe proposed framework on various applications.\n","authors":["Andi Han","Bamdev Mishra","Pratik Jawanpuria","Akiko Takeda"],"pdf_url":"https://arxiv.org/pdf/2402.03883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03871v1","updated":"2024-02-06T10:32:39Z","published":"2024-02-06T10:32:39Z","title":"Geometric quantum machine learning of BQP$^A$ protocols and latent graph\n classifiers","summary":" Geometric quantum machine learning (GQML) aims to embed problem symmetries\nfor learning efficient solving protocols. However, the question remains if\n(G)QML can be routinely used for constructing protocols with an exponential\nseparation from classical analogs. In this Letter we consider Simon's problem\nfor learning properties of Boolean functions, and show that this can be related\nto an unsupervised circuit classification problem. Using the workflow of\ngeometric QML, we learn from first principles Simon's algorithm, thus\ndiscovering an example of BQP$^A\\neq$BPP protocol with respect to some dataset\n(oracle $A$). Our key findings include the development of an equivariant\nfeature map for embedding Boolean functions, based on twirling with respect to\nidentified bitflip and permutational symmetries, and measurement based on\ninvariant observables with a sampling advantage. The proposed workflow points\nto the importance of data embeddings and classical post-processing, while\nkeeping the variational circuit as a trivial identity operator. Next,\ndeveloping the intuition for the function learning, we visualize instances as\ndirected computational hypergraphs, and observe that the GQML protocol can\naccess their global topological features for distinguishing bijective and\nsurjective functions. 
Finally, we discuss the prospects for learning other\nBQP$^A$-type protocols, and conjecture that this depends on the ability of\nsimplifying embeddings-based oracles $A$ applied as a linear combination of\nunitaries.\n","authors":["Chukwudubem Umeano","Vincent E. Elfving","Oleksandr Kyriienko"],"pdf_url":"https://arxiv.org/pdf/2402.03871v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.03179v2","updated":"2024-02-06T10:26:43Z","published":"2024-02-05T16:45:38Z","title":"Cool-chic video: Learned video coding with 800 parameters","summary":" We propose a lightweight learned video codec with 900 multiplications per\ndecoded pixel and 800 parameters overall. To the best of our knowledge, this is\none of the neural video codecs with the lowest decoding complexity. It is built\nupon the overfitted image codec Cool-chic and supplements it with an inter\ncoding module to leverage the video's temporal redundancies. The proposed model\nis able to compress videos using both low-delay and random access\nconfigurations and achieves rate-distortion close to AVC while out-performing\nother overfitted codecs such as FFNeRV. The system is made open-source:\norange-opensource.github.io/Cool-Chic.\n","authors":["Thomas Leguay","Théo Ladune","Pierrick Philippe","Olivier Déforges"],"pdf_url":"https://arxiv.org/pdf/2402.03179v2.pdf","comment":"10 pages, published in Data Compression Conference 2024"},{"id":"http://arxiv.org/abs/2310.19391v2","updated":"2024-02-06T10:25:37Z","published":"2023-10-30T09:53:42Z","title":"Causal Fair Metric: Bridging Causality, Individual Fairness, and\n Adversarial Robustness","summary":" Despite the essential need for comprehensive considerations in responsible\nAI, factors like robustness, fairness, and causality are often studied in\nisolation. Adversarial perturbation, used to identify vulnerabilities in\nmodels, and individual fairness, aiming for equitable treatment of similar\nindividuals, despite initial differences, both depend on metrics to generate\ncomparable input data instances. Previous attempts to define such joint metrics\noften lack general assumptions about data or structural causal models and were\nunable to reflect counterfactual proximity. To address this, our paper\nintroduces a causal fair metric formulated based on causal structures\nencompassing sensitive attributes and protected causal perturbation. To enhance\nthe practicality of our metric, we propose metric learning as a method for\nmetric estimation and deployment in real-world problems in the absence of\nstructural causal models. We also demonstrate the application of our novel\nmetric in classifiers. Empirical evaluation of real-world and synthetic\ndatasets illustrates the effectiveness of our proposed metric in achieving an\naccurate classifier with fairness, resilience to adversarial perturbations, and\na nuanced understanding of causal relationships.\n","authors":["Ahmad-Reza Ehyaei","Golnoosh Farnadi","Samira Samadi"],"pdf_url":"https://arxiv.org/pdf/2310.19391v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03864v1","updated":"2024-02-06T10:24:36Z","published":"2024-02-06T10:24:36Z","title":"The Challenges of the Nonlinear Regime for Physics-Informed Neural\n Networks","summary":" The Neural Tangent Kernel (NTK) viewpoint represents a valuable approach to\nexamine the training dynamics of Physics-Informed Neural Networks (PINNs) in\nthe infinite width limit. 
We leverage this perspective and focus on the case of\nnonlinear Partial Differential Equations (PDEs) solved by PINNs. We provide\ntheoretical results on the different behaviors of the NTK depending on the\nlinearity of the differential operator. Moreover, inspired by our theoretical\nresults, we emphasize the advantage of employing second-order methods for\ntraining PINNs. Additionally, we explore the convergence capabilities of\nsecond-order methods and address the challenges of spectral bias and slow\nconvergence. Every theoretical result is supported by numerical examples with\nboth linear and nonlinear PDEs, and we validate our training method on\nbenchmark test cases.\n","authors":["Andrea Bonfanti","Giuseppe Bruno","Cristina Cipriani"],"pdf_url":"https://arxiv.org/pdf/2402.03864v1.pdf","comment":"8 pages, 10 figures, appendix of 10 additional pages"},{"id":"http://arxiv.org/abs/2304.11062v2","updated":"2024-02-06T10:16:54Z","published":"2023-04-19T16:18:54Z","title":"Scaling Transformer to 1M tokens and beyond with RMT","summary":" A major limitation for the broader scope of problems solvable by transformers\nis the quadratic scaling of computational complexity with input size. In this\nstudy, we investigate the recurrent memory augmentation of pre-trained\ntransformer models to extend input context length while linearly scaling\ncompute. Our approach demonstrates the capability to store information in\nmemory for sequences of up to an unprecedented two million tokens while\nmaintaining high retrieval accuracy. Experiments with language modeling tasks\nshow perplexity improvement as the number of processed input segments\nincreases. These results underscore the effectiveness of our method, which has\nsignificant potential to enhance long-term dependency handling in natural\nlanguage understanding and generation tasks, as well as enable large-scale\ncontext processing for memory-intensive applications.\n","authors":["Aydar Bulatov","Yuri Kuratov","Yermek Kapushev","Mikhail S. Burtsev"],"pdf_url":"https://arxiv.org/pdf/2304.11062v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01148v3","updated":"2024-02-06T10:06:22Z","published":"2024-01-02T10:58:54Z","title":"PAC-Bayes-Chernoff bounds for unbounded losses","summary":" We introduce a new PAC-Bayes oracle bound for unbounded losses. This result\ncan be understood as a PAC-Bayesian version of the Cram\\'er-Chernoff bound. The\nproof technique relies on controlling the tails of certain random variables\ninvolving the Cram\\'er transform of the loss. We highlight several applications\nof the main theorem. First, we show that our result naturally allows exact\noptimization of the free parameter on many PAC-Bayes bounds. Second, we recover\nand generalize previous results. Finally, we show that our approach allows\nworking with richer assumptions that result in more informative and potentially\ntighter bounds. In this direction, we provide a general bound under a new\n``model-dependent bounded CGF\" assumption from which we obtain bounds based on\nparameter norms and log-Sobolev inequalities. All these bounds can be minimized\nto obtain novel posteriors.\n","authors":["Ioar Casado","Luis A. Ortega","Andrés R. 
Masegosa","Aritz Pérez"],"pdf_url":"https://arxiv.org/pdf/2401.01148v3.pdf","comment":"Updated Section 5"},{"id":"http://arxiv.org/abs/2402.03855v1","updated":"2024-02-06T10:06:13Z","published":"2024-02-06T10:06:13Z","title":"Position Paper: Toward New Frameworks for Studying Model Representations","summary":" Mechanistic interpretability (MI) aims to understand AI models by\nreverse-engineering the exact algorithms neural networks learn. Most works in\nMI so far have studied behaviors and capabilities that are trivial and\ntoken-aligned. However, most capabilities are not that trivial, which advocates\nfor the study of hidden representations inside these networks as the unit of\nanalysis. We do a literature review, formalize representations for features and\nbehaviors, highlight their importance and evaluation, and perform some basic\nexploration in the mechanistic interpretability of representations. With\ndiscussion and exploratory results, we justify our position that studying\nrepresentations is an important and under-studied field, and that currently\nestablished methods in MI are not sufficient to understand representations,\nthus pushing for the research community to work toward new frameworks for\nstudying representations.\n","authors":["Satvik Golechha","James Dao"],"pdf_url":"https://arxiv.org/pdf/2402.03855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02229v2","updated":"2024-02-06T09:51:09Z","published":"2024-02-03T18:19:46Z","title":"Vanilla Bayesian Optimization Performs Great in High Dimensions","summary":" High-dimensional problems have long been considered the Achilles' heel of\nBayesian optimization algorithms. Spurred by the curse of dimensionality, a\nlarge collection of algorithms aim to make it more performant in this setting,\ncommonly by imposing various simplifying assumptions on the objective. In this\npaper, we identify the degeneracies that make vanilla Bayesian optimization\npoorly suited to high-dimensional tasks, and further show how existing\nalgorithms address these degeneracies through the lens of lowering the model\ncomplexity. Moreover, we propose an enhancement to the prior assumptions that\nare typical to vanilla Bayesian optimization algorithms, which reduces the\ncomplexity to manageable levels without imposing structural restrictions on the\nobjective. Our modification - a simple scaling of the Gaussian process\nlengthscale prior with the dimensionality - reveals that standard Bayesian\noptimization works drastically better than previously thought in high\ndimensions, clearly outperforming existing state-of-the-art algorithms on\nmultiple commonly considered real-world high-dimensional tasks.\n","authors":["Carl Hvarfner","Erik Orm Hellsten","Luigi Nardi"],"pdf_url":"https://arxiv.org/pdf/2402.02229v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03846v1","updated":"2024-02-06T09:48:33Z","published":"2024-02-06T09:48:33Z","title":"Efficient Generation of Hidden Outliers for Improved Outlier Detection","summary":" Outlier generation is a popular technique used for solving important outlier\ndetection tasks. Generating outliers with realistic behavior is challenging.\nPopular existing methods tend to disregard the 'multiple views' property of\noutliers in high-dimensional spaces. The only existing method accounting for\nthis property falls short in efficiency and effectiveness. We propose BISECT, a\nnew outlier generation method that creates realistic outliers mimicking said\nproperty. 
To do so, BISECT employs a novel proposition, introduced in this\narticle, stating how to efficiently generate such realistic outliers. Our method\nhas better guarantees and complexity than the current methodology for\nrecreating 'multiple views'. We use the synthetic outliers generated by BISECT\nto effectively enhance outlier detection in diverse datasets, for multiple use\ncases. For instance, oversampling with BISECT reduced the error by up to a factor of\n3 when compared with the baselines.\n","authors":["Jose Cribeiro-Ramallo","Vadim Arzamasov","Klemens Böhm"],"pdf_url":"https://arxiv.org/pdf/2402.03846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14777v3","updated":"2024-02-06T09:41:44Z","published":"2023-05-24T06:31:05Z","title":"Generative Modeling through the Semi-dual Formulation of Unbalanced\n Optimal Transport","summary":" The Optimal Transport (OT) problem studies a transport map that bridges two\ndistributions while minimizing a given cost function. In this regard, OT\nbetween tractable prior distribution and data has been utilized for generative\nmodeling tasks. However, OT-based methods are susceptible to outliers and face\noptimization challenges during training. In this paper, we propose a novel\ngenerative model based on the semi-dual formulation of Unbalanced Optimal\nTransport (UOT). Unlike OT, UOT relaxes the hard constraint on distribution\nmatching. This approach provides better robustness against outliers, stability\nduring training, and faster convergence. We validate these properties\nempirically through experiments. Moreover, we study the theoretical upper-bound\nof divergence between distributions in UOT. Our model outperforms existing\nOT-based generative models, achieving FID scores of 2.97 on CIFAR-10 and 6.36\non CelebA-HQ-256. The code is available at\n\url{https://github.com/Jae-Moo/UOTM}.\n","authors":["Jaemoo Choi","Jaewoong Choi","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2305.14777v3.pdf","comment":"23 pages, 15 figures"},{"id":"http://arxiv.org/abs/2402.03845v1","updated":"2024-02-06T09:41:43Z","published":"2024-02-06T09:41:43Z","title":"On gauge freedom, conservativity and intrinsic dimensionality estimation\n in diffusion models","summary":" Diffusion models are generative models that have recently demonstrated\nimpressive performance in terms of sampling quality and density estimation in\nhigh dimensions. They rely on a forward continuous diffusion process and a\nbackward continuous denoising process, which can be described by a\ntime-dependent vector field and is used as a generative model. In the original\nformulation of the diffusion model, this vector field is assumed to be the\nscore function (i.e. it is the gradient of the log-probability at a given time\nin the diffusion process). Curiously, on the practical side, most studies on\ndiffusion models implement this vector field as a neural network function and\ndo not constrain it to be the gradient of some energy function (that is, most\nstudies do not constrain the vector field to be conservative). Even though some\nstudies investigated empirically whether such a constraint will lead to a\nperformance gain, they led to contradictory results and failed to provide\nanalytical results. Here, we provide three analytical results regarding the\nextent of the modeling freedom of this vector field. Firstly, we propose a\nnovel decomposition of vector fields into a conservative component and an\northogonal component which satisfies a given (gauge) freedom. 
Secondly, from\nthis orthogonal decomposition, we show that exact density estimation and exact\nsampling are achieved when the conservative component exactly equals the\ntrue score, and therefore conservativity is neither necessary nor sufficient to\nobtain exact density estimation and exact sampling. Finally, we show that when\nit comes to inferring local information of the data manifold, constraining the\nvector field to be conservative is desirable.\n","authors":["Christian Horvat","Jean-Pascal Pfister"],"pdf_url":"https://arxiv.org/pdf/2402.03845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13227v2","updated":"2024-02-06T09:39:54Z","published":"2024-01-24T04:50:16Z","title":"LPNL: Scalable Link Prediction with Large Language Models","summary":" Exploring the application of large language models (LLMs) to graph learning\nis an emerging endeavor. However, the vast amount of information inherent in\nlarge graphs poses significant challenges to this process. This work focuses on\nthe link prediction task and introduces $\textbf{LPNL}$ (Link Prediction via\nNatural Language), a framework based on large language models designed for\nscalable link prediction on large-scale heterogeneous graphs. We design novel\nprompts for link prediction that articulate graph details in natural language.\nWe propose a two-stage sampling pipeline to extract crucial information from\nthe graphs, and a divide-and-conquer strategy to control the input tokens\nwithin predefined limits, addressing the challenge of overwhelming information.\nWe fine-tune a T5 model using a self-supervised learning objective designed for link\nprediction. Extensive experimental results demonstrate that LPNL outperforms\nmultiple advanced baselines in link prediction tasks on large-scale graphs.\n","authors":["Baolong Bi","Shenghua Liu","Yiwei Wang","Lingrui Mei","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.13227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03838v1","updated":"2024-02-06T09:35:40Z","published":"2024-02-06T09:35:40Z","title":"Gaussian process regression with Sliced Wasserstein Weisfeiler-Lehman\n graph kernels","summary":" Supervised learning has recently garnered significant attention in the field\nof computational physics due to its ability to effectively extract complex\npatterns for tasks like solving partial differential equations, or predicting\nmaterial properties. Traditionally, such datasets consist of inputs given as\nmeshes with a large number of nodes representing the problem geometry (seen as\ngraphs), and corresponding outputs obtained with a numerical solver. This means\nthe supervised learning model must be able to handle large and sparse graphs\nwith continuous node attributes. In this work, we focus on Gaussian process\nregression, for which we introduce the Sliced Wasserstein Weisfeiler-Lehman\n(SWWL) graph kernel. In contrast to existing graph kernels, the proposed SWWL\nkernel enjoys positive definiteness and a drastic complexity reduction, which\nmakes it possible to process datasets that were previously impossible to\nhandle. The new kernel is first validated on graph classification for molecular\ndatasets, where the input graphs have a few tens of nodes. 
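A minimal numerical illustration of the conservativity property discussed in the diffusion-model abstract above: on a simply connected domain, a smooth vector field is conservative (the gradient of some energy) exactly when its Jacobian is symmetric. Both test fields are illustrative stand-ins for a learned score network.

```python
# Check conservativity of a vector field via Jacobian symmetry,
# using central finite differences.
import numpy as np

def jacobian(v, x, eps=1e-5):
    d = x.size
    J = np.zeros((d, d))
    for j in range(d):
        e = np.zeros(d); e[j] = eps
        J[:, j] = (v(x + e) - v(x - e)) / (2 * eps)
    return J

grad_field = lambda x: 2 * x                    # gradient of ||x||^2: conservative
rot_field = lambda x: np.array([-x[1], x[0]])   # rotation field: not conservative

for name, v in [("gradient", grad_field), ("rotation", rot_field)]:
    J = jacobian(v, np.array([0.3, -0.7]))
    print(name, "symmetric Jacobian:", np.allclose(J, J.T, atol=1e-4))
```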
The efficiency of\nthe SWWL kernel is then illustrated on graph regression in computational fluid\ndynamics and solid mechanics, where the input graphs are made up of tens of\nthousands of nodes.\n","authors":["Raphaël Carpintero Perez","Sébastien da Veiga","Josselin Garnier","Brian Staber"],"pdf_url":"https://arxiv.org/pdf/2402.03838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05591v3","updated":"2024-02-06T09:33:48Z","published":"2023-07-10T17:59:21Z","title":"Linear Alignment of Vision-language Models for Image Captioning","summary":" Recently, vision-language models like CLIP have advanced the state of the art\nin a variety of multi-modal tasks including image captioning and caption\nevaluation. Many approaches adapt CLIP-style models to a downstream task by\ntraining a mapping network between CLIP and a language model. This is costly as\nit usually involves calculating gradients for large models. We propose a more\nefficient training protocol that fits a linear mapping between image and text\nembeddings of CLIP via a closed-form solution. This bypasses the need for\ngradient computation and results in a lightweight captioning method called\nReCap, which can be trained up to 1000 times faster than existing lightweight\nmethods. Moreover, we propose two new learning-based image-captioning metrics\nthat build on CLIP score along with our linear mapping. Furthermore, we combine\nReCap with our new metrics to design an iterative datastore-augmentation loop\n(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k,\nVizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art\nlightweight methods on established metrics while outperforming them on our new\nmetrics, which are better aligned with human ratings on Flickr8k-Expert and\nFlickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to\nother domains and that our DAL leads to a performance boost.\n","authors":["Fabian Paischer","Markus Hofmarcher","Sepp Hochreiter","Thomas Adler"],"pdf_url":"https://arxiv.org/pdf/2307.05591v3.pdf","comment":"8 pages (+ references and appendix)"},{"id":"http://arxiv.org/abs/2312.12044v2","updated":"2024-02-06T09:32:36Z","published":"2023-12-19T10:57:12Z","title":"XLand-MiniGrid: Scalable Meta-Reinforcement Learning Environments in JAX","summary":" Inspired by the diversity and depth of XLand and the simplicity and\nminimalism of MiniGrid, we present XLand-MiniGrid, a suite of tools and\ngrid-world environments for meta-reinforcement learning research. Written in\nJAX, XLand-MiniGrid is designed to be highly scalable and can potentially run\non GPU or TPU accelerators, democratizing large-scale experimentation with\nlimited resources. Along with the environments, XLand-MiniGrid provides\npre-sampled benchmarks with millions of unique tasks of varying difficulty and\neasy-to-use baselines that allow users to quickly start training adaptive\nagents. 
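A minimal sketch of the closed-form linear alignment described in the image-captioning (ReCap) abstract above: fit a linear map from image embeddings to text embeddings by least squares, with no gradient computation. The random embeddings, dimensions, and ridge term are illustrative assumptions.

```python
# Closed-form linear mapping between paired embedding spaces via
# ridge-regularized least squares; no gradients are computed.
import numpy as np

rng = np.random.default_rng(0)
I = rng.normal(size=(1000, 512))   # stand-in image embeddings
T = rng.normal(size=(1000, 512))   # stand-in paired text embeddings

lam = 1e-3                          # ridge term, assumed for numerical stability
W = np.linalg.solve(I.T @ I + lam * np.eye(512), I.T @ T)  # closed-form solution
pred = I @ W                        # image embeddings mapped into text space
print("relative residual:", np.linalg.norm(pred - T) / np.linalg.norm(T))
```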
In addition, we have conducted a preliminary analysis of scaling and\ngeneralization, showing that our baselines are capable of reaching millions of\nsteps per second during training and validating that the proposed benchmarks\nare challenging.\n","authors":["Alexander Nikulin","Vladislav Kurenkov","Ilya Zisman","Artem Agarkov","Viacheslav Sinii","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2312.12044v2.pdf","comment":"NeurIPS 2023, Workshop, Source code:\n https://github.com/corl-team/xland-minigrid"},{"id":"http://arxiv.org/abs/2005.09218v2","updated":"2024-02-06T09:21:28Z","published":"2020-05-19T05:28:35Z","title":"Large Margin Mechanism and Pseudo Query Set on Cross-Domain Few-Shot\n Learning","summary":" In recent years, few-shot learning problems have received a lot of attention.\nWhile methods in most previous works were trained and tested on datasets in one\nsingle domain, cross-domain few-shot learning is a brand-new branch of few-shot\nlearning problems, where models handle datasets in different domains between\ntraining and testing phases. In this paper, to solve the problem that the model\nis pre-trained (meta-trained) on a single dataset while fine-tuned on datasets\nin four different domains, including common objects, satellite images, and\nmedical images, we propose a novel large margin fine-tuning method (LMM-PQS),\nwhich generates pseudo query images from support images and fine-tunes the\nfeature extraction modules with a large margin mechanism inspired by methods in\nface recognition. According to the experiment results, LMM-PQS surpasses the\nbaseline models by a significant margin and demonstrates that our approach is\nrobust and can easily adapt pre-trained models to new domains with few data.\n","authors":["Jia-Fong Yeh","Hsin-Ying Lee","Bing-Chen Tsai","Yi-Rong Chen","Ping-Chia Huang","Winston H. Hsu"],"pdf_url":"https://arxiv.org/pdf/2005.09218v2.pdf","comment":"Full version of the CDFSL competition report (in CVPRW'20), archived"},{"id":"http://arxiv.org/abs/2402.02430v2","updated":"2024-02-06T09:18:44Z","published":"2024-02-04T09:59:18Z","title":"Exploiting Low-level Representations for Ultra-Fast Road Segmentation","summary":" Achieving real-time and accuracy on embedded platforms has always been the\npursuit of road segmentation methods. To this end, they have proposed many\nlightweight networks. However, they ignore the fact that roads are \"stuff\"\n(background or environmental elements) rather than \"things\" (specific\nidentifiable objects), which inspires us to explore the feasibility of\nrepresenting roads with low-level instead of high-level features. Surprisingly,\nwe find that the primary stage of mainstream network models is sufficient to\nrepresent most pixels of the road for segmentation. Motivated by this, we\npropose a Low-level Feature Dominated Road Segmentation network (LFD-RoadSeg).\nSpecifically, LFD-RoadSeg employs a bilateral structure. The spatial detail\nbranch is firstly designed to extract low-level feature representation for the\nroad by the first stage of ResNet-18. To suppress texture-less regions mistaken\nas the road in the low-level feature, the context semantic branch is then\ndesigned to extract the context feature in a fast manner. To this end, in the\nsecond branch, we asymmetrically downsample the input image and design an\naggregation module to achieve comparable receptive fields to the third stage of\nResNet-18 but with less time consumption. 
Finally, to segment the road from the\nlow-level feature, a selective fusion module is proposed to calculate\npixel-wise attention between the low-level representation and context feature,\nand to suppress the non-road low-level response with this attention. On\nKITTI-Road, LFD-RoadSeg achieves a maximum F1-measure (MaxF) of 95.21% and an\naverage precision of 93.71%, while reaching 238 FPS on a single TITAN Xp and 54\nFPS on a Jetson TX2, all with a compact model size of just 936k parameters. The\nsource code is available at https://github.com/zhouhuan-hust/LFD-RoadSeg.\n","authors":["Huan Zhou","Feng Xue","Yucong Li","Shi Gong","Yiqun Li","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.02430v2.pdf","comment":"11 pages, 7 figures, IEEE TITS"},{"id":"http://arxiv.org/abs/2402.03828v1","updated":"2024-02-06T09:17:07Z","published":"2024-02-06T09:17:07Z","title":"Estimating Barycenters of Distributions with Neural Optimal Transport","summary":" Given a collection of probability measures, a practitioner sometimes needs to\nfind an \"average\" distribution which adequately aggregates reference\ndistributions. A theoretically appealing notion of such an average is the\nWasserstein barycenter, which is the primal focus of our work. By building upon\nthe dual formulation of Optimal Transport (OT), we propose a new scalable\napproach for solving the Wasserstein barycenter problem. Our methodology is\nbased on the recent Neural OT solver: it has a bi-level adversarial learning\nobjective and works for general cost functions. These are key advantages of our\nmethod, since the typical adversarial algorithms leveraging barycenter tasks\nutilize tri-level optimization and focus mostly on quadratic cost. We also\nestablish theoretical error bounds for our proposed approach and showcase its\napplicability and effectiveness on illustrative scenarios and image data\nsetups.\n","authors":["Alexander Kolesov","Petr Mokrov","Igor Udovichenko","Milena Gazdieva","Gudmund Pammer","Evgeny Burnaev","Alexander Korotin"],"pdf_url":"https://arxiv.org/pdf/2402.03828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00487v3","updated":"2024-02-06T09:12:09Z","published":"2023-01-31T11:34:56Z","title":"A Comprehensive Survey of Continual Learning: Theory, Method and\n Application","summary":" To cope with real-world dynamics, an intelligent system needs to\nincrementally acquire, update, accumulate, and exploit knowledge throughout its\nlifetime. This ability, known as continual learning, provides a foundation for\nAI systems to develop themselves adaptively. In a general sense, continual\nlearning is explicitly limited by catastrophic forgetting, where learning a new\ntask usually results in a dramatic performance degradation of the old tasks.\nBeyond this, a growing number of advances have emerged in recent years that\nlargely extend the understanding and application of continual learning. The\ngrowing and widespread interest in this direction demonstrates its realistic\nsignificance as well as complexity. In this work, we present a comprehensive\nsurvey of continual learning, seeking to bridge the basic settings, theoretical\nfoundations, representative methods, and practical applications. Based on\nexisting theoretical and empirical results, we summarize the general objectives\nof continual learning as ensuring a proper stability-plasticity trade-off and\nan adequate intra/inter-task generalizability in the context of resource\nefficiency. 
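The LFD-RoadSeg entry that concludes above describes a selective fusion module computing pixel-wise attention between the low-level representation and a context feature from an asymmetrically downsampled branch. A PyTorch sketch of one plausible reading; the layer sizes, attention form, and downsampling ratios are assumptions, not the authors' released code:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelectiveFusion(nn.Module):
    """Suppress non-road low-level responses with pixel-wise attention."""
    def __init__(self, low_ch: int, ctx_ch: int):
        super().__init__()
        # A 1x1 conv produces a single-channel attention map from both features.
        self.attn = nn.Conv2d(low_ch + ctx_ch, 1, kernel_size=1)

    def forward(self, low: torch.Tensor, ctx: torch.Tensor) -> torch.Tensor:
        # Context features come from a downsampled branch, so bring them
        # back to the low-level feature resolution before fusing.
        ctx = F.interpolate(ctx, size=low.shape[-2:], mode="bilinear",
                            align_corners=False)
        a = torch.sigmoid(self.attn(torch.cat([low, ctx], dim=1)))
        return low * a  # attention gates the low-level response

# Asymmetric downsampling of the input image for the context branch: height is
# reduced more aggressively than width (the exact ratios are not in the abstract).
img = torch.randn(1, 3, 384, 1280)
ctx_in = F.interpolate(img, scale_factor=(0.25, 0.5), mode="bilinear",
                       align_corners=False)
fused = SelectiveFusion(64, 3)(torch.randn(1, 64, 96, 320), ctx_in)
print(fused.shape)  # torch.Size([1, 64, 96, 320])
```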
Then we provide a state-of-the-art and elaborated taxonomy,\nextensively analyzing how representative methods address continual learning,\nand how they are adapted to particular challenges in realistic applications.\nThrough an in-depth discussion of promising directions, we believe that such a\nholistic perspective can greatly facilitate subsequent exploration in this\nfield and beyond.\n","authors":["Liyuan Wang","Xingxing Zhang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.00487v3.pdf","comment":"The concise version is in IEEE Transactions on Pattern Analysis and\n Machine Intelligence (TPAMI)"},{"id":"http://arxiv.org/abs/2402.03822v1","updated":"2024-02-06T09:10:35Z","published":"2024-02-06T09:10:35Z","title":"RevOrder: A Novel Method for Enhanced Arithmetic in Language Models","summary":" This paper presents RevOrder, a novel technique aimed at improving arithmetic\noperations in large language models (LLMs) by reversing the output digits in\naddition, subtraction, and n-digit by 1-digit (nD by 1D) multiplication tasks.\nOur method significantly reduces the Count of Sequential Intermediate Digits\n(CSID) to $\\mathcal{O}(1)$, a new metric we introduce to assess equation\ncomplexity. Through comprehensive testing, RevOrder not only achieves perfect\naccuracy in basic arithmetic operations but also substantially boosts LLM\nperformance in division tasks, particularly with large numbers where\ntraditional models struggle. Implementation of RevOrder is cost-effective for\nboth training and inference phases. Moreover, applying RevOrder to fine-tune\nthe LLaMA2-7B model on the GSM8K math task results in a considerable\nimprovement, reducing equation calculation errors by 46% and increasing overall\nscores from 41.6 to 44.4.\n","authors":["Si Shen","Peijun Shen","Danhao Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.03822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03819v1","updated":"2024-02-06T09:07:41Z","published":"2024-02-06T09:07:41Z","title":"Theoretical and experimental study of SMOTE: limitations and comparisons\n of rebalancing strategies","summary":" Synthetic Minority Oversampling Technique (SMOTE) is a common rebalancing\nstrategy for handling imbalanced data sets. Asymptotically, we prove that SMOTE\n(with default parameter) regenerates the original distribution by simply\ncopying the original minority samples. We also prove that SMOTE density\nvanishes near the boundary of the support of the minority distribution,\ntherefore justifying the common BorderLine SMOTE strategy. Then we introduce\ntwo new SMOTE-related strategies, and compare them with state-of-the-art\nrebalancing procedures. We show that rebalancing strategies are only required\nwhen the data set is highly imbalanced. For such data sets, SMOTE, our\nproposals, or undersampling procedures are the best strategies.\n","authors":["Abdoulaye Sakho","Erwan Scornet","Emmanuel Malherbe"],"pdf_url":"https://arxiv.org/pdf/2402.03819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03818v1","updated":"2024-02-06T09:07:26Z","published":"2024-02-06T09:07:26Z","title":"Asymptotic generalization error of a single-layer graph convolutional\n network","summary":" While graph convolutional networks show great practical promises, the\ntheoretical understanding of their generalization properties as a function of\nthe number of samples is still in its infancy compared to the more broadly\nstudied case of supervised fully connected neural networks. 
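The SMOTE entry above turns on how synthetic minority samples are generated. A minimal sketch of the classic SMOTE interpolation step that the paper analyzes (numpy only; the paper's own new variants are not reproduced here):

```python
import numpy as np

def smote_sample(minority: np.ndarray, k: int = 5, n_new: int = 100,
                 seed: int = 0) -> np.ndarray:
    """Generate synthetic minority points by interpolating towards one of
    the k nearest minority neighbors, as in classic SMOTE."""
    rng = np.random.default_rng(seed)
    out = []
    for _ in range(n_new):
        i = rng.integers(len(minority))
        x = minority[i]
        # k nearest minority neighbors of x (excluding x itself)
        d = np.linalg.norm(minority - x, axis=1)
        nn_idx = np.argsort(d)[1:k + 1]
        z = minority[rng.choice(nn_idx)]
        lam = rng.uniform()            # interpolation weight in [0, 1]
        out.append(x + lam * (z - x))  # point on the segment [x, z]
    return np.asarray(out)

minority = np.random.default_rng(1).normal(size=(50, 2))
print(smote_sample(minority, n_new=5))
```

Every synthetic point lies on a segment between two existing minority points, which is the geometric fact behind the entry's claims about boundary behavior.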
In this article, we\npredict the performance of a single-layer graph convolutional network (GCN)\ntrained on data produced by attributed stochastic block models (SBMs) in the\nhigh-dimensional limit. Previously, only ridge regression on the contextual-SBM\n(CSBM) has been considered in Shi et al. 2022; we generalize the analysis to\narbitrary convex loss and regularization for the CSBM and add the analysis for\nanother data model, the neural-prior SBM. We also study the high\nsignal-to-noise ratio limit, detail the convergence rates of the GCN and show\nthat, while consistent, it does not reach the Bayes-optimal rate for any of the\nconsidered cases.\n","authors":["O. Duranthon","L. Zdeborová"],"pdf_url":"https://arxiv.org/pdf/2402.03818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18168v5","updated":"2024-02-06T09:04:04Z","published":"2023-10-27T14:27:43Z","title":"Personas as a Way to Model Truthfulness in Language Models","summary":" Large language models (LLMs) are trained on vast amounts of text from the\ninternet, which contains both factual and misleading information about the\nworld. While unintuitive from a classic view of LMs, recent work has shown that\nthe truth value of a statement can be elicited from the model's\nrepresentations. This paper presents an explanation for why LMs appear to know\nthe truth despite not being trained with truth labels. We hypothesize that the\npretraining data is generated by groups of (un)truthful agents whose outputs\nshare common features, which together form an (un)truthful persona. By training\non this data, LMs can infer and represent the persona in their activation\nspace. This allows the model to separate truth from falsehoods and control the\ntruthfulness of its generation. We show evidence for the persona hypothesis via\ntwo observations: (1) we can probe whether a model's answer will be truthful\nbefore it is generated; (2) finetuning a model on a set of facts improves its\ntruthfulness on unseen topics. Next, using arithmetic as a synthetic\nenvironment, we show that structures of the pretraining data are crucial for\nthe model to infer the truthful persona. Overall, our findings suggest that\nmodels can exploit hierarchical structures in the data to learn abstract\nconcepts like truthfulness.\n","authors":["Nitish Joshi","Javier Rando","Abulhair Saparov","Najoung Kim","He He"],"pdf_url":"https://arxiv.org/pdf/2310.18168v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06911v3","updated":"2024-02-06T09:03:53Z","published":"2023-08-14T03:12:29Z","title":"GIT-Mol: A Multi-modal Large Language Model for Molecular Science with\n Graph, Image, and Text","summary":" Large language models have made significant strides in natural language\nprocessing, enabling innovative applications in molecular science by processing\ntextual representations of molecules. However, most existing language models\ncannot capture the rich information in complex molecular structures or images.\nIn this paper, we introduce GIT-Mol, a multi-modal large language model that\nintegrates Graph, Image, and Text information. To facilitate the integration of\nmulti-modal molecular data, we propose GIT-Former, a novel architecture that is\ncapable of aligning all modalities into a unified latent space. We achieve a\n5%-10% accuracy increase in property prediction and a 20.2% boost in molecule\ngeneration validity compared to the baselines. 
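Observation (1) in the Personas entry above, probing whether an answer will be truthful before it is generated, amounts to fitting a linear probe on hidden states. A schematic sketch with synthetic activations standing in for real LLM hidden states; the logistic-regression probe is an assumption, not the paper's exact protocol:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
# Stand-ins for hidden states at the last prompt token: in practice these
# would be extracted from the LLM, with labels marking whether the answer
# the model went on to generate was truthful.
hidden = rng.normal(size=(2000, 768))
direction = rng.normal(size=768)  # synthetic "truthfulness" axis in the data
labels = (hidden @ direction + rng.normal(scale=5.0, size=2000)) > 0

probe = LogisticRegression(max_iter=1000).fit(hidden[:1500], labels[:1500])
print("held-out probe accuracy:", probe.score(hidden[1500:], labels[1500:]))
```

Above-chance held-out accuracy of such a probe is what licenses the claim that truthfulness is linearly represented in the activation space.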
With the\nany-to-language molecular translation strategy, our model has the potential to\nperform more downstream tasks, such as compound name recognition and chemical\nreaction prediction.\n","authors":["Pengfei Liu","Yiming Ren","Jun Tao","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.06911v3.pdf","comment":"The article has been accepted by Computers in Biology and Medicine,\n with 14 pages and 4 figures"},{"id":"http://arxiv.org/abs/2402.03815v1","updated":"2024-02-06T09:00:05Z","published":"2024-02-06T09:00:05Z","title":"Expediting In-Network Federated Learning by Voting-Based Consensus Model\n Compression","summary":" Recently, federated learning (FL) has gained momentum because of its\ncapability to preserve data privacy. To conduct model training by FL, multiple\nclients exchange model updates with a parameter server via the Internet. To\naccelerate communication, it has been explored to deploy a programmable switch\n(PS) in lieu of the parameter server to coordinate clients. The challenge of\ndeploying the PS in FL lies in its scarce memory space, prohibiting running\nmemory-consuming aggregation algorithms on the PS. To overcome this challenge,\nwe propose the Federated Learning in-network Aggregation with Compression\n(FediAC) algorithm, consisting of two phases: client voting and model\naggregating. In the former phase, clients report their significant model update\nindices to the PS to estimate global significant model updates. In the latter\nphase, clients upload global significant model updates to the PS for\naggregation. FediAC consumes much less memory space and communication traffic\nthan existing works because the first phase can guarantee consensus compression\nacross clients. The PS easily aligns model update indices to swiftly complete\naggregation in the second phase. Finally, we conduct extensive experiments\nusing public datasets to demonstrate that FediAC remarkably surpasses the\nstate-of-the-art baselines in terms of model accuracy and communication\ntraffic.\n","authors":["Xiaoxin Su","Yipeng Zhou","Laizhong Cui","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2402.03815v1.pdf","comment":"To appear in 2024 IEEE International Conference on Computer\n Communications (INFOCOM 2024)"},{"id":"http://arxiv.org/abs/2402.03813v1","updated":"2024-02-06T08:57:49Z","published":"2024-02-06T08:57:49Z","title":"NK Hybrid Genetic Algorithm for Clustering","summary":" The NK hybrid genetic algorithm for clustering is proposed in this paper. In\norder to evaluate the solutions, the hybrid algorithm uses the NK clustering\nvalidation criterion 2 (NKCV2). NKCV2 uses information about the disposition of\n$N$ small groups of objects. Each group is composed of $K+1$ objects of the\ndataset. Experimental results show that density-based regions can be identified\nby using NKCV2 with fixed small $K$. In NKCV2, the relationship between\ndecision variables is known, which in turn allows us to apply gray box\noptimization. Mutation operators, a partition crossover, and a local search\nstrategy are proposed, all using information about the relationship between\ndecision variables. In partition crossover, the evaluation function is\ndecomposed into $q$ independent components; partition crossover then\ndeterministically returns the best among $2^q$ possible offspring with\ncomputational complexity $O(N)$. The NK hybrid genetic algorithm allows the\ndetection of clusters with arbitrary shapes and the automatic estimation of the\nnumber of clusters. 
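The FediAC entry above hinges on two phases: client voting over significant update indices, then aggregation at the consensus indices. A toy numpy sketch of that control flow; the top-k size and majority threshold are illustrative assumptions, not the paper's tuned values:

```python
import numpy as np

rng = np.random.default_rng(0)
n_clients, dim, k = 5, 1000, 50
# Local updates share a common signal so that clients' top-k indices overlap.
base = rng.normal(size=dim)
updates = base + 0.5 * rng.normal(size=(n_clients, dim))

# Phase 1 (client voting): each client reports its k largest-magnitude
# indices; the switch keeps indices voted for by a majority of clients.
votes = np.zeros(dim, dtype=int)
for u in updates:
    votes[np.argsort(np.abs(u))[-k:]] += 1
consensus = np.flatnonzero(votes >= n_clients // 2 + 1)

# Phase 2 (model aggregating): clients upload only the consensus coordinates,
# which can be summed index-aligned, with no per-client sparsity patterns.
aggregate = updates[:, consensus].mean(axis=0)
print(len(consensus), "consensus coordinates out of", dim)
```

Because every client uploads values at the same index set, the switch needs no per-client bookkeeping, which is the memory saving the entry describes.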
In the experiments, the NK hybrid genetic algorithm\nproduced very good results when compared to another genetic algorithm approach\nand to state-of-the-art clustering algorithms.\n","authors":["Renato Tinós","Liang Zhao","Francisco Chicano","Darrell Whitley"],"pdf_url":"https://arxiv.org/pdf/2402.03813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03814v1","updated":"2024-02-06T08:57:49Z","published":"2024-02-06T08:57:49Z","title":"Masked Graph Autoencoder with Non-discrete Bandwidths","summary":" Masked graph autoencoders have emerged as a powerful graph self-supervised\nlearning method that has yet to be fully explored. In this paper, we unveil\nthat the existing discrete edge masking and binary link reconstruction\nstrategies are insufficient to learn topologically informative representations,\nfrom the perspective of message propagation on graph neural networks. These\nlimitations include blocking message flows, vulnerability to over-smoothness,\nand suboptimal neighborhood discriminability. Inspired by these understandings,\nwe explore non-discrete edge masks, which are sampled from a continuous and\ndispersive probability distribution instead of the discrete Bernoulli\ndistribution. These masks restrict the amount of output messages for each edge,\nreferred to as \"bandwidths\". We propose a novel, informative, and effective\ntopological masked graph autoencoder using bandwidth masking and a layer-wise\nbandwidth prediction objective. We demonstrate its powerful graph topological\nlearning ability both theoretically and empirically. Our proposed framework\noutperforms representative baselines in both self-supervised link prediction\n(improving the discrete edge reconstructors by at most 20%) and node\nclassification on numerous datasets, solely with a structure-learning pretext.\nOur implementation is available at https://github.com/Newiz430/Bandana.\n","authors":["Ziwen Zhao","Yuhua Li","Yixiong Zou","Jiliang Tang","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2402.03814v1.pdf","comment":"Full version (17 pages, 8 figures, 12 tables), accepted by TheWebConf\n 2024 (WWW 2024)"},{"id":"http://arxiv.org/abs/2402.03808v1","updated":"2024-02-06T08:48:39Z","published":"2024-02-06T08:48:39Z","title":"SDEMG: Score-based Diffusion Model for Surface Electromyographic Signal\n Denoising","summary":" Surface electromyography (sEMG) recordings can be influenced by\nelectrocardiogram (ECG) signals when the muscle being monitored is close to the\nheart. Several existing methods use signal-processing-based approaches, such as\nhigh-pass filters and template subtraction, while some derive mapping functions\nto restore clean sEMG signals from noisy sEMG (sEMG with ECG interference).\nRecently, the score-based diffusion model, a renowned generative model, has\nbeen introduced to generate high-quality and accurate samples with noisy input\ndata. In this study, we propose a novel approach, termed SDEMG, as a\nscore-based diffusion model for sEMG signal denoising. To evaluate the proposed\nSDEMG approach, we conduct experiments to reduce noise in sEMG signals,\nemploying data from an openly accessible source, the Non-Invasive Adaptive\nProsthetics database, along with ECG signals from the MIT-BIH Normal Sinus\nRhythm Database. The experimental results indicate that SDEMG outperformed\ncomparative methods and produced high-quality sEMG samples. 
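In the masked-graph-autoencoder entry above, the key move is replacing Bernoulli edge masks with continuous "bandwidths". A minimal sketch of the contrast; the Beta distribution is one illustrative choice of continuous, dispersive distribution, not necessarily the paper's:

```python
import torch

torch.manual_seed(0)
num_edges = 8
messages = torch.randn(num_edges, 16)  # one message per edge

# Discrete masking: each edge is either fully blocked or fully open,
# so masked edges transmit nothing at all.
bernoulli_mask = torch.bernoulli(torch.full((num_edges, 1), 0.5))
masked_discrete = messages * bernoulli_mask

# Non-discrete bandwidths: every edge stays open, but its message is scaled
# by a weight sampled from a continuous, dispersive distribution.
bandwidths = torch.distributions.Beta(0.5, 0.5).sample((num_edges, 1))
masked_continuous = messages * bandwidths

print(bernoulli_mask.squeeze())
print(bandwidths.squeeze())
```

The continuous weights keep some message flowing on every edge, which is how the approach avoids the blocked-message-flow limitation the entry attributes to discrete masks.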
The source code of\nthe SDEMG framework is available at: https://github.com/tonyliu0910/SDEMG\n","authors":["Yu-Tung Liu","Kuan-Chen Wang","Kai-Chun Liu","Sheng-Yu Peng","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2402.03808v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.03807v1","updated":"2024-02-06T08:48:01Z","published":"2024-02-06T08:48:01Z","title":"SEABO: A Simple Search-Based Method for Offline Imitation Learning","summary":" Offline reinforcement learning (RL) has attracted much attention due to its\nability to learn from static offline datasets, eliminating the need to interact\nwith the environment. Nevertheless, the success of offline RL relies heavily on\nthe offline transitions annotated with reward labels. In practice, we often\nneed to hand-craft the reward function, which is sometimes difficult,\nlabor-intensive, or inefficient. To tackle this challenge, we set our focus on\nthe offline imitation learning (IL) setting, and aim at obtaining a reward\nfunction based on the expert data and unlabeled data. To that end, we propose a\nsimple yet effective search-based offline IL method, tagged SEABO. SEABO\nallocates a larger reward to the transition that is close to its closest\nneighbor in the expert demonstration, and a smaller reward otherwise, all in an\nunsupervised learning manner. Experimental results on a variety of D4RL\ndatasets indicate that SEABO can achieve competitive performance to offline RL\nalgorithms with ground-truth rewards, given only a single expert trajectory,\nand can outperform prior reward learning and offline IL methods across many\ntasks. Moreover, we demonstrate that SEABO also works well if the expert\ndemonstrations contain only observations. Our code is publicly available at\nhttps://github.com/dmksjfl/SEABO.\n","authors":["Jiafei Lyu","Xiaoteng Ma","Le Wan","Runze Liu","Xiu Li","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2402.03807v1.pdf","comment":"To appear in ICLR 2024"},{"id":"http://arxiv.org/abs/2402.03806v1","updated":"2024-02-06T08:47:16Z","published":"2024-02-06T08:47:16Z","title":"Explainable Automated Machine Learning for Credit Decisions: Enhancing\n Human Artificial Intelligence Collaboration in Financial Engineering","summary":" This paper explores the integration of Explainable Automated Machine Learning\n(AutoML) in the realm of financial engineering, specifically focusing on its\napplication in credit decision-making. The rapid evolution of Artificial\nIntelligence (AI) in finance has necessitated a balance between sophisticated\nalgorithmic decision-making and the need for transparency in these systems. The\nfocus is on how AutoML can streamline the development of robust machine\nlearning models for credit scoring, while Explainable AI (XAI) methods,\nparticularly SHapley Additive exPlanations (SHAP), provide insights into the\nmodels' decision-making processes. This study demonstrates how the combination\nof AutoML and XAI not only enhances the efficiency and accuracy of credit\ndecisions but also fosters trust and collaboration between humans and AI\nsystems. 
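The SEABO entry above assigns rewards by distance to the nearest expert transition. A compact sketch using a KD-tree and an exponential squashing of the distance; the reward shape and coefficient are assumptions based only on the abstract's description:

```python
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(0)
expert = rng.normal(size=(500, 6))       # expert (state, action) pairs, flattened
unlabeled = rng.normal(size=(10000, 6))  # unlabeled transitions to annotate

tree = cKDTree(expert)
dist, _ = tree.query(unlabeled, k=1)     # distance to closest expert neighbor

# Larger reward for transitions near the expert data, smaller otherwise,
# with no learned components anywhere in the loop.
beta = 1.0
rewards = np.exp(-beta * dist)
print(rewards.min(), rewards.max())
```

The annotated transitions can then be fed to any off-the-shelf offline RL algorithm in place of ground-truth rewards, which matches the plug-in usage the entry reports.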
The findings underscore the potential of explainable AutoML in\nimproving the transparency and accountability of AI-driven financial decisions,\naligning with regulatory requirements and ethical considerations.\n","authors":["Marc Schmitt"],"pdf_url":"https://arxiv.org/pdf/2402.03806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03804v1","updated":"2024-02-06T08:45:51Z","published":"2024-02-06T08:45:51Z","title":"ReLU$^2$ Wins: Discovering Efficient Activation Functions for Sparse\n LLMs","summary":" Sparse computation offers a compelling solution for the inference of Large\nLanguage Models (LLMs) in low-resource scenarios by dynamically skipping the\ncomputation of inactive neurons. While traditional approaches focus on\nReLU-based LLMs, leveraging zeros in activation values, we broaden the scope of\nsparse LLMs beyond zero activation values. We introduce a general method that\ndefines neuron activation through neuron output magnitudes and a tailored\nmagnitude threshold, demonstrating that non-ReLU LLMs also exhibit sparse\nactivation. To find the most efficient activation function for sparse\ncomputation, we propose a systematic framework to examine the sparsity of LLMs\nfrom three aspects: the trade-off between sparsity and performance, the\npredictivity of sparsity, and the hardware affinity. We conduct thorough\nexperiments on LLMs utilizing different activation functions, including ReLU,\nSwiGLU, ReGLU, and ReLU$^2$. The results indicate that models employing\nReLU$^2$ excel across all three evaluation aspects, highlighting its potential\nas an efficient activation function for sparse LLMs. We will release the code\nto facilitate future research.\n","authors":["Zhengyan Zhang","Yixin Song","Guanghui Yu","Xu Han","Yankai Lin","Chaojun Xiao","Chenyang Song","Zhiyuan Liu","Zeyu Mi","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.03804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19600v3","updated":"2024-02-06T08:45:27Z","published":"2023-05-31T07:00:42Z","title":"Adaptive Self-Distillation for Minimizing Client Drift in Heterogeneous\n Federated Learning","summary":" Federated Learning (FL) is a machine learning paradigm that enables clients\nto jointly train a global model by aggregating the locally trained models\nwithout sharing any local training data. In practice, there can often be\nsubstantial heterogeneity (e.g., class imbalance) across the local data\ndistributions observed by each of these clients. Under such non-iid data\ndistributions across clients, FL suffers from the 'client-drift' problem where\nevery client drifts to its own local optimum. This results in slower\nconvergence and poor performance of the aggregated model. To address this\nlimitation, we propose a novel regularization technique based on adaptive\nself-distillation (ASD) for training models on the client side. Our\nregularization scheme adaptively adjusts to the client's training data based on\nthe global model entropy and the client's label distribution. The proposed\nregularization can be easily integrated atop existing, state-of-the-art FL\nalgorithms, leading to a further boost in the performance of these\noff-the-shelf methods. We theoretically explain how ASD reduces client-drift\nand also explain its generalization ability. We demonstrate the efficacy of our\napproach through extensive experiments on multiple real-world benchmarks and\nshow substantial gains in performance over state-of-the-art methods.\n","authors":["M. 
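The ReLU$^2$ entry above defines neuron activation through output magnitudes and a tailored threshold, so sparsity can be measured for non-ReLU activations too. A small sketch of that measurement; the threshold value is illustrative:

```python
import torch

def sparsity(hidden: torch.Tensor, threshold: float = 1e-2) -> float:
    """Fraction of neuron outputs whose magnitude falls below the threshold,
    i.e. neurons whose computation could plausibly be skipped."""
    return (hidden.abs() < threshold).float().mean().item()

torch.manual_seed(0)
pre_act = torch.randn(32, 4096)           # pre-activations of an FFN layer
relu2 = torch.relu(pre_act) ** 2          # ReLU^2 activation
silu = torch.nn.functional.silu(pre_act)  # a non-ReLU activation for contrast

print("ReLU^2 sparsity:", sparsity(relu2))
print("SiLU sparsity:  ", sparsity(silu))
```

Under a magnitude threshold, even a smooth activation like SiLU exhibits some sparsity, which is the broadened notion of sparse activation the entry argues for.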
Yashwanth","Gaurav Kumar Nayak","Arya Singh","Yogesh Simmhan","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2305.19600v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14670v3","updated":"2024-02-06T08:42:12Z","published":"2023-06-26T13:06:34Z","title":"Improved Bayes Risk Can Yield Reduced Social Welfare Under Competition","summary":" As the scale of machine learning models increases, trends such as scaling\nlaws anticipate consistent downstream improvements in predictive accuracy.\nHowever, these trends take the perspective of a single model-provider in\nisolation, while in reality providers often compete with each other for users.\nIn this work, we demonstrate that competition can fundamentally alter the\nbehavior of these scaling trends, even causing overall predictive accuracy\nacross users to be non-monotonic or decreasing with scale. We define a model of\ncompetition for classification tasks, and use data representations as a lens\nfor studying the impact of increases in scale. We find many settings where\nimproving data representation quality (as measured by Bayes risk) decreases the\noverall predictive accuracy across users (i.e., social welfare) for a\nmarketplace of competing model-providers. Our examples range from closed-form\nformulas in simple settings to simulations with pretrained representations on\nCIFAR-10. At a conceptual level, our work suggests that favorable scaling\ntrends for individual model-providers need not translate to downstream\nimprovements in social welfare in marketplaces with multiple model providers.\n","authors":["Meena Jagadeesan","Michael I. Jordan","Jacob Steinhardt","Nika Haghtalab"],"pdf_url":"https://arxiv.org/pdf/2306.14670v3.pdf","comment":"Appeared at NeurIPS 2023; this is the full version"},{"id":"http://arxiv.org/abs/2202.09724v5","updated":"2024-02-06T08:38:09Z","published":"2022-02-20T03:35:44Z","title":"Bayes-Optimal Classifiers under Group Fairness","summary":" Machine learning algorithms are becoming integrated into more and more\nhigh-stakes decision-making processes, such as in social welfare issues. Due to\nthe need of mitigating the potentially disparate impacts from algorithmic\npredictions, many approaches have been proposed in the emerging area of fair\nmachine learning. However, the fundamental problem of characterizing\nBayes-optimal classifiers under various group fairness constraints has only\nbeen investigated in some special cases. Based on the classical Neyman-Pearson\nargument (Neyman and Pearson, 1933; Shao, 2003) for optimal hypothesis testing,\nthis paper provides a unified framework for deriving Bayes-optimal classifiers\nunder group fairness. This enables us to propose a group-based thresholding\nmethod we call FairBayes, that can directly control disparity, and achieve an\nessentially optimal fairness-accuracy tradeoff. These advantages are supported\nby thorough experiments.\n","authors":["Xianli Zeng","Edgar Dobriban","Guang Cheng"],"pdf_url":"https://arxiv.org/pdf/2202.09724v5.pdf","comment":"This technical report has been largely superseded by our later paper:\n \"Bayes-Optimal Fair Classification with Linear Disparity Constraints via\n Pre-, In-, and Post-processing'' (arXiv:2402.02817). 
Please cite that one\n instead of this technical report"}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.13967v2","updated":"2024-02-06T15:45:12Z","published":"2024-01-25T05:58:04Z","title":"Perceptual-oriented Learned Image Compression with Dynamic Kernel","summary":" In this paper, we extend our prior work, DKIC, and propose a\nperceptual-oriented learned image compression method, PO-DKIC. Specifically,\nDKIC adopts a dynamic kernel-based dynamic residual block group to enhance the\ntransform coding and an asymmetric space-channel context entropy model to\nfacilitate the estimation of Gaussian parameters. Based on DKIC, PO-DKIC\nintroduces PatchGAN and LPIPS loss to enhance visual quality. Furthermore, to\nmaximize the overall perceptual quality under a rate constraint, we formulate\nthis challenge as a constrained programming problem and solve it with linear\ninteger programming. The experiments demonstrate that our proposed method can\ngenerate realistic images with richer textures and finer details when compared\nto state-of-the-art image compression techniques.\n","authors":["Nianxiang Fu","Junxi Zhang","Huairui Wang","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2401.13967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03946v1","updated":"2024-02-06T12:20:10Z","published":"2024-02-06T12:20:10Z","title":"BioNet-XR: Biological Network Visualization Framework for Virtual\n Reality and Mixed Reality Environments","summary":" Protein-protein interaction networks (PPIN) enable the study of cellular\nprocesses in organisms. Visualizing PPINs in extended reality (XR), including\nvirtual reality (VR) and mixed reality (MR), is crucial for exploring\nsubnetworks, evaluating protein positions, and collaboratively analyzing and\ndiscussing networks with the help of recent technological advancements. Here,\nwe present BioNet-XR, a 3D visualization framework for visualizing PPINs in VR\nand MR environments. BioNet-XR was developed with the Unity3D game engine. Our\nframework provides state-of-the-art methods and visualization features\nincluding teleportation between nodes, general and first-person views to\nexplore the network, and subnetwork construction via PageRank, Steiner tree,\nand all-pairs shortest path algorithms for a given set of initial nodes. We\nused usability tests to gather feedback from both specialists\n(bioinformaticians) and generalists (multidisciplinary groups), addressing the\nneed for usability evaluations of visualization tools. In the MR version of\nBioNet-XR, users can seamlessly transition to real-world environments and\ninteract with protein interaction networks. BioNet-XR is highly modular and\nadaptable for visualization of other biological networks, such as metabolic and\nregulatory networks, and extension with additional network methods.\n","authors":["Busra Senderin","Nurcan Tuncbag","Elif Surer"],"pdf_url":"https://arxiv.org/pdf/2402.03946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00736v2","updated":"2024-02-06T12:20:06Z","published":"2024-01-01T12:25:57Z","title":"Diffusion Models, Image Super-Resolution And Everything: A Survey","summary":" Diffusion Models (DMs) have disrupted the image Super-Resolution (SR) field\nand further closed the gap between image quality and human perceptual\npreferences. 
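The BioNet-XR entry above constructs subnetworks via PageRank, Steiner tree, and shortest-path algorithms from a set of initial nodes. A minimal networkx sketch of the PageRank variant; the seed personalization and top-20 cutoff are illustrative assumptions:

```python
import networkx as nx

# Toy stand-in for a protein-protein interaction network.
G = nx.barabasi_albert_graph(200, 3, seed=0)
seeds = [0, 1, 2]  # user-selected initial nodes

# Personalized PageRank biased towards the seed set, then keep the
# top-scoring nodes as the subnetwork to visualize.
pr = nx.pagerank(G, personalization={s: 1.0 for s in seeds})
top = sorted(pr, key=pr.get, reverse=True)[:20]
subnetwork = G.subgraph(set(top) | set(seeds))
print(subnetwork.number_of_nodes(), "nodes,",
      subnetwork.number_of_edges(), "edges")
```

Restricting the view to such a seed-centered subnetwork is what keeps a tens-of-thousands-of-nodes PPIN explorable inside a VR or MR scene.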
They are easy to train and can produce very high-quality samples\nthat exceed the realism of those produced by previous generative methods.\nDespite their promising results, they also come with new challenges that need\nfurther research: high computational demands, comparability, lack of\nexplainability, color shifts, and more. Unfortunately, entry into this field is\noverwhelming because of the abundance of publications. To address this, we\nprovide a unified recount of the theoretical foundations underlying DMs applied\nto image SR and offer a detailed analysis that underscores the unique\ncharacteristics and methodologies within this domain, distinct from broader\nexisting reviews in the field. This survey articulates a cohesive understanding\nof DM principles and explores current research avenues, including alternative\ninput domains, conditioning techniques, guidance mechanisms, corruption spaces,\nand zero-shot learning approaches. By offering a detailed examination of the\nevolution and current trends in image SR through the lens of DMs, this survey\nsheds light on the existing challenges and charts potential future directions,\naiming to inspire further innovation in this rapidly advancing area.\n","authors":["Brian B. Moser","Arundhati S. Shanbhag","Federico Raue","Stanislav Frolov","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2401.00736v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05653v5","updated":"2024-02-06T11:12:02Z","published":"2022-09-13T00:01:23Z","title":"Semantic2Graph: Graph-based Multi-modal Feature Fusion for Action\n Segmentation in Videos","summary":" Video action segmentation has been widely applied in many fields. Most\nprevious studies employed video-based vision models for this purpose. However,\nthey often rely on a large receptive field, LSTM, or Transformer methods to\ncapture long-term dependencies within videos, leading to significant\ncomputational resource requirements. To address this challenge, graph-based\nmodels were proposed. However, previous graph-based models are less accurate.\nHence, this study introduces a graph-structured approach named Semantic2Graph\nto model long-term dependencies in videos, thereby reducing computational costs\nand raising accuracy. We construct a graph structure of the video at the frame\nlevel. Temporal edges are utilized to model the temporal relations and action\norder within videos. Additionally, we have designed positive and negative\nsemantic edges, accompanied by corresponding edge weights, to capture both\nlong-term and short-term semantic relationships in video actions. Node\nattributes encompass a rich set of multi-modal features extracted from video\ncontent, graph structures, and label text, encompassing visual, structural, and\nsemantic cues. To synthesize this multi-modal information effectively, we\nemploy a graph neural network (GNN) model to fuse multi-modal features for node\naction label classification. Experimental results demonstrate that\nSemantic2Graph outperforms state-of-the-art methods, particularly on benchmark\ndatasets such as GTEA and 50Salads. Multiple ablation experiments further\nvalidate the effectiveness of semantic features in enhancing model performance. 
Notably, the inclusion of semantic edges in\nSemantic2Graph allows for the cost-effective capture of long-term dependencies,\naffirming its utility in addressing the challenges posed by computational\nresource constraints in video-based vision models.\n","authors":["Junbin Zhang","Pei-Hsuan Tsai","Meng-Hsun Tsai"],"pdf_url":"https://arxiv.org/pdf/2209.05653v5.pdf","comment":"13 pages, 3 figures, 9 tables. Published on Applied Intelligence"},{"id":"http://arxiv.org/abs/2402.03658v1","updated":"2024-02-06T03:14:46Z","published":"2024-02-06T03:14:46Z","title":"Sentiment-enhanced Graph-based Sarcasm Explanation in Dialogue","summary":" Sarcasm Explanation in Dialogue (SED) is a new yet challenging task, which\naims to generate a natural language explanation for the given sarcastic\ndialogue that involves multiple modalities (i.e., utterance, video, and audio).\nAlthough existing studies have achieved great success based on the generative\npretrained language model BART, they overlook exploiting the sentiments\nresiding in the utterance, video and audio, which are vital clues for sarcasm\nexplanation. In fact, it is non-trivial to incorporate sentiments for boosting\nSED performance, due to three main challenges: 1) diverse effects of utterance\ntokens on sentiments; 2) gap between video-audio sentiment signals and the\nembedding space of BART; and 3) various relations among utterances, utterance\nsentiments, and video-audio sentiments. To tackle these challenges, we propose\na novel sEntiment-enhanceD Graph-based multimodal sarcasm Explanation\nframework, named EDGE. In particular, we first propose a lexicon-guided\nutterance sentiment inference module, where a heuristic utterance sentiment\nrefinement strategy is devised. We then develop a module named Joint Cross\nAttention-based Sentiment Inference (JCA-SI) by extending the multimodal\nsentiment analysis model JCA to derive the joint sentiment label for each\nvideo-audio clip. Thereafter, we devise a context-sentiment graph to\ncomprehensively model the semantic relations among the utterances, utterance\nsentiments, and video-audio sentiments, to facilitate sarcasm explanation\ngeneration. Extensive experiments on the publicly released dataset WITS verify\nthe superiority of our model over cutting-edge methods.\n","authors":["Kun Ouyang","Liqiang Jing","Xuemeng Song","Meng Liu","Yupeng Hu","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2402.03658v1.pdf","comment":null}]},"2024-02-07T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2402.05111v1","updated":"2024-02-07T18:59:31Z","published":"2024-02-07T18:59:31Z","title":"Edu-ConvoKit: An Open-Source Library for Education Conversation Data","summary":" We introduce Edu-ConvoKit, an open-source library designed to handle\npre-processing, annotation and analysis of conversation data in education.\nResources for analyzing education conversation data are scarce, making the\nresearch challenging to perform and therefore hard to access. We address these\nchallenges with Edu-ConvoKit. Edu-ConvoKit is open-source\n(https://github.com/stanfordnlp/edu-convokit ), pip-installable\n(https://pypi.org/project/edu-convokit/ ), with comprehensive documentation\n(https://edu-convokit.readthedocs.io/en/latest/ ). Our demo video is available\nat: https://youtu.be/zdcI839vAko?si=h9qlnl76ucSuXb8- . 
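The Semantic2Graph entry that concludes above builds a frame-level graph with temporal edges plus positive and negative semantic edges. A toy sketch of such a graph construction; the features, cosine-similarity measure, and threshold are illustrative assumptions, not the paper's pipeline:

```python
import networkx as nx
import numpy as np

rng = np.random.default_rng(0)
num_frames = 12
feats = rng.normal(size=(num_frames, 8))  # per-frame multi-modal features

G = nx.Graph()
for t in range(num_frames):
    G.add_node(t, x=feats[t])
    if t > 0:
        # Temporal edges encode frame order within the video.
        G.add_edge(t - 1, t, kind="temporal", weight=1.0)

# Semantic edges: connect non-adjacent frames whose features are strongly
# similar (positive) or dissimilar (negative), weighted by the similarity.
for i in range(num_frames):
    for j in range(i + 2, num_frames):
        sim = float(feats[i] @ feats[j] /
                    (np.linalg.norm(feats[i]) * np.linalg.norm(feats[j])))
        if abs(sim) > 0.5:
            G.add_edge(i, j, kind="semantic", weight=sim)

print(G.number_of_edges(), "edges; a GNN would then classify node labels")
```

Because semantic edges connect distant frames directly, long-term dependencies travel over single hops instead of long recurrent chains, which is the cost advantage the entry highlights.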
We include additional\nresources, such as Colab applications of Edu-ConvoKit to three diverse\neducation datasets and a repository of Edu-ConvoKit-related papers, which can\nbe found in our GitHub repository.\n","authors":["Rose E. Wang","Dorottya Demszky"],"pdf_url":"https://arxiv.org/pdf/2402.05111v1.pdf","comment":"https://github.com/stanfordnlp/edu-convokit\n https://edu-convokit.readthedocs.io/en/latest/"},{"id":"http://arxiv.org/abs/2402.05106v1","updated":"2024-02-07T18:57:37Z","published":"2024-02-07T18:57:37Z","title":"Image captioning for Brazilian Portuguese using GRIT model","summary":" This work presents the early development of an image captioning model for the\nBrazilian Portuguese language. We used the GRIT (Grid- and Region-based Image\ncaptioning Transformer) model to accomplish this work. GRIT is a\nTransformer-only neural architecture that effectively utilizes two visual\nfeatures to generate better captions. The GRIT method emerged as a proposal for\na more efficient way to generate image captions. In this work, we adapt the\nGRIT model to be trained on a Brazilian Portuguese dataset, yielding an image\ncaptioning method for the Brazilian Portuguese language.\n","authors":["Rafael Silva de Alencar","William Alberto Cruz Castañeda","Marcellus Amadeus"],"pdf_url":"https://arxiv.org/pdf/2402.05106v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2207.09666 by other authors"},{"id":"http://arxiv.org/abs/2402.00838v2","updated":"2024-02-07T18:53:02Z","published":"2024-02-01T18:28:55Z","title":"OLMo: Accelerating the Science of Language Models","summary":" Language models (LMs) have become ubiquitous in both NLP research and in\ncommercial product offerings. As their commercial importance has surged, the\nmost powerful models have become closed off, gated behind proprietary\ninterfaces, with important details of their training data, architectures, and\ndevelopment undisclosed. Given the importance of these details in\nscientifically studying these models, including their biases and potential\nrisks, we believe it is essential for the research community to have access to\npowerful, truly open LMs. To this end, this technical report details the first\nrelease of OLMo, a state-of-the-art, truly Open Language Model and its\nframework to build and study the science of language modeling. Unlike most\nprior efforts that have only released model weights and inference code, we\nrelease OLMo and the whole framework, including training data and training and\nevaluation code. We hope this release will empower and strengthen the open\nresearch community and inspire a new wave of innovation.\n","authors":["Dirk Groeneveld","Iz Beltagy","Pete Walsh","Akshita Bhagia","Rodney Kinney","Oyvind Tafjord","Ananya Harsh Jha","Hamish Ivison","Ian Magnusson","Yizhong Wang","Shane Arora","David Atkinson","Russell Authur","Khyathi Raghavi Chandu","Arman Cohan","Jennifer Dumas","Yanai Elazar","Yuling Gu","Jack Hessel","Tushar Khot","William Merrill","Jacob Morrison","Niklas Muennighoff","Aakanksha Naik","Crystal Nam","Matthew E. Peters","Valentina Pyatkin","Abhilasha Ravichander","Dustin Schwenk","Saurabh Shah","Will Smith","Emma Strubell","Nishant Subramani","Mitchell Wortsman","Pradeep Dasigi","Nathan Lambert","Kyle Richardson","Luke Zettlemoyer","Jesse Dodge","Kyle Lo","Luca Soldaini","Noah A. 
Smith","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2402.00838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05070v1","updated":"2024-02-07T18:21:17Z","published":"2024-02-07T18:21:17Z","title":"A Roadmap to Pluralistic Alignment","summary":" With increased power and prevalence of AI systems, it is ever more critical\nthat AI systems are designed to serve all, i.e., people with diverse values and\nperspectives. However, aligning models to serve pluralistic human values\nremains an open research question. In this piece, we propose a roadmap to\npluralistic alignment, specifically using language models as a test bed. We\nidentify and formalize three possible ways to define and operationalize\npluralism in AI systems: 1) Overton pluralistic models that present a spectrum\nof reasonable responses; 2) Steerably pluralistic models that can steer to\nreflect certain perspectives; and 3) Distributionally pluralistic models that\nare well-calibrated to a given population in distribution. We also propose and\nformalize three possible classes of pluralistic benchmarks: 1) Multi-objective\nbenchmarks, 2) Trade-off steerable benchmarks, which incentivize models to\nsteer to arbitrary trade-offs, and 3) Jury-pluralistic benchmarks which\nexplicitly model diverse human ratings. We use this framework to argue that\ncurrent alignment techniques may be fundamentally limited for pluralistic AI;\nindeed, we highlight empirical evidence, both from our own experiments and from\nother work, that standard alignment procedures might reduce distributional\npluralism in models, motivating the need for further research on pluralistic\nalignment.\n","authors":["Taylor Sorensen","Jared Moore","Jillian Fisher","Mitchell Gordon","Niloofar Mireshghallah","Christopher Michael Rytting","Andre Ye","Liwei Jiang","Ximing Lu","Nouha Dziri","Tim Althoff","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2402.05070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03561v2","updated":"2024-02-07T18:02:51Z","published":"2024-02-05T22:20:19Z","title":"VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language\n Navigation","summary":" Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate\nthrough realistic 3D outdoor environments based on natural language\ninstructions. The performance of existing VLN methods is limited by\ninsufficient diversity in navigation environments and limited training data. To\naddress these issues, we propose VLN-Video, which utilizes the diverse outdoor\nenvironments present in driving videos in multiple cities in the U.S. augmented\nwith automatically generated navigation instructions and actions to improve\noutdoor VLN performance. VLN-Video combines the best of intuitive classical\napproaches and modern deep learning techniques, using template infilling to\ngenerate grounded navigation instructions, combined with an image rotation\nsimilarity-based navigation action predictor to obtain VLN style data from\ndriving videos for pretraining deep learning VLN models. We pre-train the model\non the Touchdown dataset and our video-augmented dataset created from driving\nvideos with three proxy tasks: Masked Language Modeling, Instruction and\nTrajectory Matching, and Next Action Prediction, so as to learn\ntemporally-aware and visually-aligned instruction representations. The learned\ninstruction representation is adapted to the state-of-the-art navigator when\nfine-tuning on the Touchdown dataset. 
Empirical results demonstrate that\nVLN-Video significantly outperforms previous state-of-the-art models by 2.1% in\ntask completion rate, achieving a new state-of-the-art on the Touchdown\ndataset.\n","authors":["Jialu Li","Aishwarya Padmakumar","Gaurav Sukhatme","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2402.03561v2.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2402.01748v2","updated":"2024-02-07T17:55:11Z","published":"2024-01-30T00:21:41Z","title":"Large Multi-Modal Models (LMMs) as Universal Foundation Models for\n AI-Native Wireless Systems","summary":" Large language models (LLMs) and foundation models have been recently touted\nas a game-changer for 6G systems. However, recent efforts on LLMs for wireless\nnetworks are limited to a direct application of existing language models that\nwere designed for natural language processing (NLP) applications. To address\nthis challenge and create wireless-centric foundation models, this paper\npresents a comprehensive vision on how to design universal foundation models\nthat are tailored towards the deployment of artificial intelligence (AI)-native\nnetworks. Diverging from NLP-based foundation models, the proposed framework\npromotes the design of large multi-modal models (LMMs) fostered by three key\ncapabilities: 1) processing of multi-modal sensing data, 2) grounding of\nphysical symbol representations in real-world wireless systems using causal\nreasoning and retrieval-augmented generation (RAG), and 3) enabling\ninstructibility from the wireless environment feedback to facilitate dynamic\nnetwork adaptation thanks to logical and mathematical reasoning facilitated by\nneuro-symbolic AI. In essence, these properties enable the proposed LMM\nframework to build universal capabilities that cater to various cross-layer\nnetworking tasks and alignment of intents across different domains. Preliminary\nresults from experimental evaluation demonstrate the efficacy of grounding\nusing RAG in LMMs, and showcase the alignment of LMMs with wireless system\ndesigns. Furthermore, the enhanced rationale exhibited in the responses to\nmathematical questions by LMMs, compared to vanilla LLMs, demonstrates the\nlogical and mathematical reasoning capabilities inherent in LMMs. Building on\nthose results, we present a sequel of open questions and challenges for LMMs.\nWe then conclude with a set of recommendations that ignite the path towards\nLMM-empowered AI-native systems.\n","authors":["Shengzhe Xu","Christo Kurisummoottil Thomas","Omar Hashash","Nikhil Muralidhar","Walid Saad","Naren Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2402.01748v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05044v1","updated":"2024-02-07T17:33:54Z","published":"2024-02-07T17:33:54Z","title":"SALAD-Bench: A Hierarchical and Comprehensive Safety Benchmark for Large\n Language Models","summary":" In the rapidly evolving landscape of Large Language Models (LLMs), ensuring\nrobust safety measures is paramount. To meet this crucial need, we propose\n\\emph{SALAD-Bench}, a safety benchmark specifically designed for evaluating\nLLMs, attack, and defense methods. Distinguished by its breadth, SALAD-Bench\ntranscends conventional benchmarks through its large scale, rich diversity,\nintricate taxonomy spanning three levels, and versatile functionalities.\nSALAD-Bench is crafted with a meticulous array of questions, from standard\nqueries to complex ones enriched with attack and defense modifications, as well\nas multiple-choice formats. 
To effectively manage the inherent\ncomplexity, we introduce an innovative evaluator: the LLM-based MD-Judge for QA\npairs with a particular focus on attack-enhanced queries, ensuring a seamless\nand reliable evaluation. The above components extend SALAD-Bench from standard\nLLM safety evaluation to both LLM attack and defense methods evaluation,\nensuring the joint-purpose utility. Our extensive experiments shed light on the\nresilience of LLMs against emerging threats and the efficacy of contemporary\ndefense tactics. Data and evaluator are released under\n\\url{https://github.com/OpenSafetyLab/SALAD-BENCH}. Warning: this paper\nincludes examples that may be offensive or harmful.\n","authors":["Lijun Li","Bowen Dong","Ruohui Wang","Xuhao Hu","Wangmeng Zuo","Dahua Lin","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2402.05044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04232v2","updated":"2024-02-07T17:27:09Z","published":"2024-02-06T18:39:43Z","title":"Can Generative Agents Predict Emotion?","summary":" Large Language Models (LLMs) have demonstrated a number of human-like\nabilities, however the empathic understanding and emotional state of LLMs is\nyet to be aligned to that of humans. In this work, we investigate how the\nemotional state of generative LLM agents evolves as they perceive new events,\nintroducing a novel architecture in which new experiences are compared to past\nmemories. Through this comparison, the agent gains the ability to understand\nnew experiences in context, which according to the appraisal theory of emotion\nis vital in emotion creation. First, the agent perceives new experiences as\ntime series text data. After perceiving each new input, the agent generates a\nsummary of past relevant memories, referred to as the norm, and compares the\nnew experience to this norm. Through this comparison we can analyse how the\nagent reacts to the new experience in context. The PANAS, a test of affect, is\nadministered to the agent, capturing the emotional state of the agent after the\nperception of the new event. Finally, the new experience is then added to the\nagent's memory to be used in the creation of future norms. By creating multiple\nexperiences in natural language from emotionally charged situations, we test\nthe proposed architecture on a wide range of scenarios. The mixed results\nsuggest that introducing context can occasionally improve the emotional\nalignment of the agent, but further study and comparison with human evaluators\nis necessary. We hope that this paper is another step towards the alignment of\ngenerative agents.\n","authors":["Ciaran Regan","Nanami Iwahashi","Shogo Tanaka","Mizuki Oka"],"pdf_url":"https://arxiv.org/pdf/2402.04232v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.05034v1","updated":"2024-02-07T17:07:53Z","published":"2024-02-07T17:07:53Z","title":"How BERT Speaks Shakespearean English? Evaluating Historical Bias in\n Contextual Language Models","summary":" In this paper, we explore the idea of analysing the historical bias of\ncontextual language models based on BERT by measuring their adequacy with\nrespect to Early Modern (EME) and Modern (ME) English. In our preliminary\nexperiments, we perform fill-in-the-blank tests with 60 masked sentences (20\nEME-specific, 20 ME-specific and 20 generic) and three different models (i.e.,\nBERT Base, MacBERTh, English HLM). 
We then rate the model predictions according\nto a 5-point bipolar scale between the two language varieties and derive a\nweighted score to measure the adequacy of each model to EME and ME varieties of\nEnglish.\n","authors":["Miriam Cuscito","Alfio Ferrara","Martin Ruskov"],"pdf_url":"https://arxiv.org/pdf/2402.05034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01726v2","updated":"2024-02-07T17:04:31Z","published":"2024-01-27T14:32:12Z","title":"AI Does Not Alter Perceptions of Text Messages","summary":" For many people, anxiety, depression, and other social and mental factors can\nmake composing text messages an active challenge. To remedy this problem, large\nlanguage models (LLMs) may yet prove to be the perfect tool to assist users\nthat would otherwise find texting difficult or stressful. However, despite\nrapid uptake in LLM usage, considerations for their assistive usage in text\nmessage composition have not been explored. A primary concern regarding LLM\nusage is that poor public sentiment regarding AI introduces the possibility\nthat its usage may harm perceptions of AI-assisted text messages, making usage\ncounter-productive. To (in)validate this possibility, we explore how the belief\nthat a text message did or did not receive AI assistance in composition alters\nits perceived tone, clarity, and ability to convey intent. In this study, we\nsurvey the perceptions of 26 participants on 18 randomly labeled pre-composed\ntext messages. In analyzing the participants' ratings of message tone, clarity,\nand ability to convey intent, we find that there is no statistically\nsignificant evidence that the belief that AI is utilized alters recipient\nperceptions. This provides hopeful evidence that LLM-based text message\ncomposition assistance can be implemented without the risk of\ncounter-productive outcomes.\n","authors":["N'yoma Diamond"],"pdf_url":"https://arxiv.org/pdf/2402.01726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09992v2","updated":"2024-02-07T16:40:22Z","published":"2023-09-15T20:00:27Z","title":"OpenAI Cribbed Our Tax Example, But Can GPT-4 Really Do Tax?","summary":" The authors explain where OpenAI got the tax law example in its livestream\ndemonstration of GPT-4, why GPT-4 got the wrong answer, and how it fails to\nreliably calculate taxes.\n","authors":["Andrew Blair-Stanek","Nils Holzenberger","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2309.09992v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2402.01697v2","updated":"2024-02-07T16:17:02Z","published":"2024-01-24T10:09:11Z","title":"APT-Pipe: An Automatic Prompt-Tuning Tool for Social Computing Data\n Annotation","summary":" Recent research has highlighted the potential of LLM applications, like\nChatGPT, for performing label annotation on social computing text. However, it\nis already well known that performance hinges on the quality of the input\nprompts. To address this, there has been a flurry of research into prompt\ntuning -- techniques and guidelines that attempt to improve the quality of\nprompts. Yet these largely rely on manual effort and prior knowledge of the\ndataset being annotated. To address this limitation, we propose APT-Pipe, an\nautomated prompt-tuning pipeline. APT-Pipe aims to automatically tune prompts\nto enhance ChatGPT's text classification performance on any given dataset. We\nimplement APT-Pipe and test it across twelve distinct text classification\ndatasets. 
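The historical-bias entry above performs fill-in-the-blank tests with BERT-based models. A small sketch of such a probe using the Hugging Face fill-mask pipeline; the probe sentences are illustrative stand-ins, not the paper's 60 stimuli:

```python
from transformers import pipeline

# Fill-in-the-blank probe in the spirit of the entry above: compare
# completions for an Early Modern vs a Modern English frame.
fill = pipeline("fill-mask", model="bert-base-uncased")

for sentence in [
    "thou [MASK] not covet thy neighbour's goods.",  # EME-flavored frame
    "you [MASK] not covet your neighbor's goods.",   # ME-flavored frame
]:
    top = fill(sentence, top_k=3)
    print(sentence, "->",
          [(r["token_str"], round(r["score"], 3)) for r in top])
```

Comparing which variety's expected filler gets higher probability across many such frames is one way to operationalize the adequacy score the entry describes.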
We find that prompts tuned by APT-Pipe help ChatGPT achieve higher\nweighted F1-scores on nine of the twelve datasets tested, with an improvement\nof 7.01% on average. We further highlight APT-Pipe's flexibility as a framework\nby showing how it can be extended to support additional tuning mechanisms.\n","authors":["Yiming Zhu","Zhizhuo Yin","Ehsan-Ul Haq","Lik-Hang Lee","Gareth Tyson","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2402.01697v2.pdf","comment":"Just accepted by WWW 2024"},{"id":"http://arxiv.org/abs/2402.05000v1","updated":"2024-02-07T16:15:59Z","published":"2024-02-07T16:15:59Z","title":"Pedagogical Alignment of Large Language Models","summary":" In this paper, we introduce the novel concept of pedagogically aligned Large\nLanguage Models (LLMs) that signifies a transformative shift in the application\nof LLMs within educational contexts. Rather than providing direct responses to\nuser queries, pedagogically-aligned LLMs function as scaffolding tools,\nbreaking complex problems into manageable subproblems and guiding students\ntowards the final answer through constructive feedback and hints. The objective\nis to equip learners with problem-solving strategies that deepen their\nunderstanding and internalization of the subject matter. Previous research in\nthis field has primarily applied the supervised finetuning approach without\nframing the objective as an alignment problem, hence not employing\nreinforcement learning from human feedback (RLHF) methods. This study\nreinterprets the narrative by viewing the task through the lens of alignment\nand demonstrates how RLHF methods emerge naturally as a superior alternative\nfor aligning LLM behaviour. Building on this perspective, we propose a novel\napproach for constructing a reward dataset specifically designed for the\npedagogical alignment of LLMs. We apply three state-of-the-art RLHF algorithms\nand find that they outperform SFT significantly. Our qualitative analyses\nacross model differences and hyperparameter sensitivity further validate the\nsuperiority of RLHF over SFT. Also, our study sheds light on the potential of\nonline feedback for enhancing the performance of pedagogically-aligned LLMs,\nthus providing valuable insights for the advancement of these models in\neducational settings.\n","authors":["Shashank Sonkar","Kangqi Ni","Sapana Chaudhary","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2402.05000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04978v1","updated":"2024-02-07T15:56:17Z","published":"2024-02-07T15:56:17Z","title":"An Enhanced Prompt-Based LLM Reasoning Scheme via Knowledge\n Graph-Integrated Collaboration","summary":" While Large Language Models (LLMs) demonstrate exceptional performance in a\nmultitude of Natural Language Processing (NLP) tasks, they encounter challenges\nin practical applications, including issues with hallucinations, inadequate\nknowledge updating, and limited transparency in the reasoning process. To\novercome these limitations, this study innovatively proposes a collaborative\ntraining-free reasoning scheme involving tight cooperation between Knowledge\nGraph (KG) and LLMs. This scheme first involves using LLMs to iteratively\nexplore the KG, selectively retrieving a task-relevant knowledge subgraph to\nsupport reasoning. The LLMs are then guided to further combine inherent\nimplicit knowledge to reason on the subgraph while explicitly elucidating the\nreasoning process. 
Through such a cooperative approach, our scheme achieves\nmore reliable knowledge-based reasoning and facilitates the tracing of the\nreasoning results. Experimental results show that our scheme achieves significant\nimprovements across multiple datasets, notably achieving over a 10% improvement\non the QALD10 dataset compared to the best baseline and the fine-tuned\nstate-of-the-art (SOTA) work. Building on this success, this study hopes to\noffer a valuable reference for future research in the fusion of KG and LLMs,\nthereby enhancing LLMs' proficiency in solving complex issues.\n","authors":["Yihao Li","Ru Zhang","Jianyi Liu","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2402.04978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04967v1","updated":"2024-02-07T15:44:55Z","published":"2024-02-07T15:44:55Z","title":"Text or Image? What is More Important in Cross-Domain Generalization\n Capabilities of Hate Meme Detection Models?","summary":" This paper delves into the formidable challenge of cross-domain\ngeneralization in multimodal hate meme detection, presenting compelling\nfindings. We provide substantial evidence supporting the hypothesis that\nonly the textual component of hateful memes enables the existing multimodal\nclassifier to generalize across different domains, while the image component\nproves highly sensitive to a specific training dataset. The evidence includes\ndemonstrations showing that hate-text classifiers perform similarly to\nhate-meme classifiers in a zero-shot setting. Simultaneously, the introduction\nof captions generated from images of memes to the hate-meme classifier worsens\nperformance by an average F1 of 0.02. Through blackbox explanations, we\nidentify a substantial contribution of the text modality (average of 83%),\nwhich diminishes with the introduction of meme's image captions (52%).\nAdditionally, our evaluation on a newly created confounder dataset reveals\nhigher performance on text confounders as compared to image confounders with an\naverage $\\Delta$F1 of 0.18.\n","authors":["Piush Aggarwal","Jawar Mehrabanian","Weigang Huang","Özge Alacam","Torsten Zesch"],"pdf_url":"https://arxiv.org/pdf/2402.04967v1.pdf","comment":"Accepted at EACL'2024 Findings"},{"id":"http://arxiv.org/abs/2402.04957v1","updated":"2024-02-07T15:40:22Z","published":"2024-02-07T15:40:22Z","title":"Reconfidencing LLMs from the Grouping Loss Perspective","summary":" Large Language Models (LLMs), including ChatGPT and LLaMA, are susceptible to\ngenerating hallucinated answers in a confident tone. While efforts to elicit\nand calibrate confidence scores have proven useful, recent findings show that\ncontrolling uncertainty must go beyond calibration: predicted scores may\ndeviate significantly from the actual posterior probabilities due to the impact\nof grouping loss. In this work, we construct a new evaluation dataset derived\nfrom a knowledge base to assess confidence scores given to answers of Mistral\nand LLaMA. Experiments show that they tend to be overconfident. Further, we\nshow that they are more overconfident on some answers than others, e.g.,\ndepending on the nationality of the person in the query. In\nuncertainty-quantification theory, this is grouping loss. To address this, we\npropose a solution to reconfidence LLMs, canceling not only calibration but\nalso grouping loss. The LLMs, after the reconfidencing process, indicate\nimproved confidence alignment with the accuracy of their responses.\n","authors":["Lihu Chen","Alexandre Perez-Lebel","Fabian M. 
Suchanek","Gaël Varoquaux"],"pdf_url":"https://arxiv.org/pdf/2402.04957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04319v2","updated":"2024-02-07T15:01:21Z","published":"2024-01-09T02:25:23Z","title":"Know Your Needs Better: Towards Structured Understanding of Marketer\n Demands with Analogical Reasoning Augmented LLMs","summary":" In this paper, we explore a new way for user targeting, where non-expert\nmarketers could select their target users solely given demands in natural\nlanguage form. The key to this issue is how to transform natural languages into\npractical structured logical languages, i.e., the structured understanding of\nmarketer demands. Considering the impressive natural language processing\nability of large language models (LLMs), we try to leverage LLMs to solve this\nissue. Past research indicates that the reasoning ability of LLMs can be\neffectively enhanced through chain-of-thought (CoT) prompting. But existing\nmethods still have some limitations: (1) Previous methods either use simple\n\"Let's think step by step\" spells or provide fixed examples in demonstrations\nwithout considering compatibility between prompts and questions, making LLMs\nineffective in some complex reasoning tasks such as structured language\ntransformation. (2) Previous methods are often implemented in closed-source\nmodels or excessively large models, which is not suitable in industrial\npractical scenarios. Based on these, we propose ARALLM (i.e., Analogical\nReasoning Augmented Large Language Models) consisting of two modules:\nAnalogical Reasoning based Prompting and Reasoning-Augmented Multi-Task Model\nDistillation.\n","authors":["Junjie Wang","Dan Yang","Binbin Hu","Yue Shen","Ziqi Liu","Wen Zhang","Jinjie Gu","Zhiqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.04319v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2402.04918v1","updated":"2024-02-07T14:44:42Z","published":"2024-02-07T14:44:42Z","title":"Prompting Implicit Discourse Relation Annotation","summary":" Pre-trained large language models, such as ChatGPT, archive outstanding\nperformance in various reasoning tasks without supervised training and were\nfound to have outperformed crowdsourcing workers. Nonetheless, ChatGPT's\nperformance in the task of implicit discourse relation classification, prompted\nby a standard multiple-choice question, is still far from satisfactory and\nconsiderably inferior to state-of-the-art supervised approaches. This work\ninvestigates several proven prompting techniques to improve ChatGPT's\nrecognition of discourse relations. In particular, we experimented with\nbreaking down the classification task that involves numerous abstract labels\ninto smaller subtasks. Nonetheless, experiment results show that the inference\naccuracy hardly changes even with sophisticated prompt engineering, suggesting\nthat implicit discourse relation classification is not yet resolvable under\nzero-shot or few-shot settings.\n","authors":["Frances Yung","Mansoor Ahmad","Merel Scholman","Vera Demberg"],"pdf_url":"https://arxiv.org/pdf/2402.04918v1.pdf","comment":"To appear at the Linguistic Annotation Workshop 2024"},{"id":"http://arxiv.org/abs/2311.09438v2","updated":"2024-02-07T14:41:40Z","published":"2023-11-15T23:18:01Z","title":"Labeled Interactive Topic Models","summary":" Topic models are valuable for understanding extensive document collections,\nbut they don't always identify the most relevant topics. 
Classical\nprobabilistic and anchor-based topic models offer interactive versions that\nallow users to guide the models towards more pertinent topics. However, such\ninteractive features have been lacking in neural topic models. To correct this\nlacuna, we introduce a user-friendly interaction for neural topic models. This\ninteraction permits users to assign a word label to a topic, leading to an\nupdate in the topic model where the words in the topic become closely aligned\nwith the given label. Our approach encompasses two distinct kinds of neural\ntopic models. The first includes models where topic embeddings are trainable\nand evolve during the training process. The second kind involves models where\ntopic embeddings are integrated post-training, offering a different approach to\ntopic refinement. To facilitate user interaction with these neural topic\nmodels, we have developed an interactive interface. This interface enables\nusers to engage with and re-label topics as desired. We evaluate our method\nthrough a human study, where users can relabel topics to find relevant\ndocuments. Using our method, user labeling improves document rank scores,\nhelping to find more relevant documents to a given query when compared to no\nuser labeling.\n","authors":["Kyle Seelman","Mozhi Zhang","Jordan Boyd-Graber"],"pdf_url":"https://arxiv.org/pdf/2311.09438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04914v1","updated":"2024-02-07T14:41:08Z","published":"2024-02-07T14:41:08Z","title":"Personalized Text Generation with Fine-Grained Linguistic Control","summary":" As the text generation capabilities of large language models become\nincreasingly prominent, recent studies have focused on controlling particular\naspects of the generated text to make it more personalized. However, most\nresearch on controllable text generation focuses on controlling the content or\nmodeling specific high-level/coarse-grained attributes that reflect authors'\nwriting styles, such as formality, domain, or sentiment. In this paper, we\nfocus on controlling fine-grained attributes spanning multiple linguistic\ndimensions, such as lexical and syntactic attributes. We introduce a novel\nbenchmark to train generative models and evaluate their ability to generate\npersonalized text based on multiple fine-grained linguistic attributes. We\nsystematically investigate the performance of various large language models on\nour benchmark and draw insights from the factors that impact their performance.\nWe make our code, data, and pretrained models publicly available.\n","authors":["Bashar Alhafni","Vivek Kulkarni","Dhruv Kumar","Vipul Raheja"],"pdf_url":"https://arxiv.org/pdf/2402.04914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04902v1","updated":"2024-02-07T14:35:05Z","published":"2024-02-07T14:35:05Z","title":"L4Q: Parameter Efficient Quantization-Aware Training on Large Language\n Models via LoRA-wise LSQ","summary":" Post-training quantization (PTQ) and quantization-aware training (QAT)\nmethods are gaining popularity in mitigating the high memory and computational\ncosts associated with Large Language Models (LLMs). In resource-constrained\nscenarios, PTQ, with its reduced training overhead, is often preferred over\nQAT, despite the latter's potential for higher accuracy. Meanwhile,\nparameter-efficient fine-tuning (PEFT) methods like low-rank adaptation (LoRA)\nhave been introduced, and recent efforts have explored quantization-aware PEFT\ntechniques. 
However, these approaches may lack generality due to their reliance\non the pre-quantized model's configuration. Their effectiveness may be\ncompromised by non-linearly quantized or mixed-precision weights, and the\nretraining of specific quantization parameters might impede optimal\nperformance. To address these challenges, we propose L4Q, an algorithm for\nparameter-efficient quantization-aware training. L4Q leverages LoRA-wise\nlearned quantization step size for LLMs, aiming to enhance generality. The\nsimultaneous quantization-and-fine-tuning process of L4Q is applicable to\nhigh-precision models, yielding linearly quantized weights with superior\naccuracy. Our experiments, conducted on the LLaMA and LLaMA2 model families\nusing an instructional dataset, showcase L4Q's capabilities in language\ncomprehension and few-shot in-context learning, achieving sub-4-bit precision\nwhile maintaining comparable training times to applying PEFT on a quantized\nmodel.\n","authors":["Hyesung Jeon","Yulhwa Kim","Jae-joon Kim"],"pdf_url":"https://arxiv.org/pdf/2402.04902v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2402.04247v2","updated":"2024-02-07T14:26:02Z","published":"2024-02-06T18:54:07Z","title":"Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science","summary":" Intelligent agents powered by large language models (LLMs) have demonstrated\nsubstantial promise in autonomously conducting experiments and facilitating\nscientific discoveries across various disciplines. While their capabilities are\npromising, they also introduce novel vulnerabilities that demand careful\nconsideration for safety. However, there exists a notable gap in the\nliterature, as there has been no comprehensive exploration of these\nvulnerabilities. This position paper fills this gap by conducting a thorough\nexamination of vulnerabilities in LLM-based agents within scientific domains,\nshedding light on potential risks associated with their misuse and emphasizing\nthe need for safety measures. We begin by providing a comprehensive overview of\nthe potential risks inherent to scientific LLM agents, taking into account user\nintent, the specific scientific domain, and their potential impact on the\nexternal environment. Then, we delve into the origins of these vulnerabilities\nand provide a scoping review of the limited existing works. Based on our\nanalysis, we propose a triadic framework involving human regulation, agent\nalignment, and an understanding of environmental feedback (agent regulation) to\nmitigate these identified risks. Furthermore, we highlight the limitations and\nchallenges associated with safeguarding scientific agents and advocate for the\ndevelopment of improved models, robust benchmarks, and comprehensive\nregulations to address these issues effectively.\n","authors":["Xiangru Tang","Qiao Jin","Kunlun Zhu","Tongxin Yuan","Yichi Zhang","Wangchunshu Zhou","Meng Qu","Yilun Zhao","Jian Tang","Zhuosheng Zhang","Arman Cohan","Zhiyong Lu","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2402.04247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04889v1","updated":"2024-02-07T14:22:51Z","published":"2024-02-07T14:22:51Z","title":"Detecting Generated Native Ads in Conversational Search","summary":" Conversational search engines such as YouChat and Microsoft Copilot use large\nlanguage models (LLMs) to generate answers to queries. 
It is only a small step\nto also use this technology to generate and integrate advertising within these\nanswers - instead of placing ads separately from the organic search results.\nThis type of advertising is reminiscent of native advertising and product\nplacement, both of which are very effective forms of subtle and manipulative\nadvertising. It is likely that information seekers will be confronted with such\nuse of LLM technology in the near future, especially when considering the high\ncomputational costs associated with LLMs, for which providers need to develop\nsustainable business models. This paper investigates whether LLMs can also be\nused as a countermeasure against generated native ads, i.e., to block them. For\nthis purpose we compile a large dataset of ad-prone queries and of generated\nanswers with automatically integrated ads to experiment with fine-tuned\nsentence transformers and state-of-the-art LLMs on the task of recognizing the\nads. In our experiments sentence transformers achieve detection precision and\nrecall values above 0.9, while the investigated LLMs struggle with the task.\n","authors":["Sebastian Schmidt","Ines Zelch","Janek Bevendorff","Benno Stein","Matthias Hagen","Martin Potthast"],"pdf_url":"https://arxiv.org/pdf/2402.04889v1.pdf","comment":"Submitted to WWW'24 Short Papers Track; 4 pages"},{"id":"http://arxiv.org/abs/2402.04875v1","updated":"2024-02-07T14:16:28Z","published":"2024-02-07T14:16:28Z","title":"On Provable Length and Compositional Generalization","summary":" Length generalization -- the ability to generalize to longer sequences than\nones seen during training, and compositional generalization -- the ability to\ngeneralize to token combinations not seen during training, are crucial forms of\nout-of-distribution generalization in sequence-to-sequence models. In this\nwork, we take the first steps towards provable length and compositional\ngeneralization for a range of architectures, including deep sets, transformers,\nstate space models, and simple recurrent neural nets. Depending on the\narchitecture, we prove different degrees of representation identification,\ne.g., a linear or a permutation relation with ground truth representation, is\nnecessary for length and compositional generalization.\n","authors":["Kartik Ahuja","Amin Mansouri"],"pdf_url":"https://arxiv.org/pdf/2402.04875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08320v3","updated":"2024-02-07T14:13:05Z","published":"2023-10-12T13:33:04Z","title":"Defending Our Privacy With Backdoors","summary":" The proliferation of large AI models trained on uncurated, often sensitive\nweb-scraped data has raised significant privacy concerns. One of the concerns\nis that adversaries can extract information about the training data using\nprivacy attacks. Unfortunately, the task of removing specific information from\nthe models without sacrificing performance is not straightforward and has\nproven to be challenging. We propose a rather easy yet effective defense based\non backdoor attacks to remove private information such as names and faces of\nindividuals from vision-language models by fine-tuning them for only a few\nminutes instead of re-training them from scratch. Specifically, through\nstrategic insertion of backdoors into text encoders, we align the embeddings of\nsensitive phrases with those of neutral terms-\"a person\" instead of the\nperson's actual name. 
For image encoders, we map embeddings of individuals to\nbe removed from the model to a universal, anonymous embedding. Our empirical\nresults demonstrate the effectiveness of our backdoor-based defense on CLIP by\nassessing its performance using a specialized privacy attack for zero-shot\nclassifiers. Our approach provides not only a new \"dual-use\" perspective on\nbackdoor attacks, but also presents a promising avenue to enhance the privacy\nof individuals within models trained on uncurated web-scraped data.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Daniel Neider","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2310.08320v3.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.04858v1","updated":"2024-02-07T13:55:27Z","published":"2024-02-07T13:55:27Z","title":"CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay","summary":" Large language models are increasingly solving tasks that are commonly\nbelieved to require human-level reasoning ability. However, these models still\nperform very poorly on benchmarks of general intelligence such as the\nAbstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a\nprogramming-by-examples problem, and introduce a novel and scalable method for\nlanguage model self-improvement called Code Iteration (CodeIt). Our method\niterates between 1) program sampling and hindsight relabeling, and 2) learning\nfrom prioritized experience replay. By relabeling the goal of an episode (i.e.,\nthe target program output given input) to the realized output produced by the\nsampled program, our method effectively deals with the extreme sparsity of\nrewards in program synthesis. Applying CodeIt to the ARC dataset, we\ndemonstrate that prioritized hindsight replay, along with pre-training and\ndata-augmentation, leads to successful inter-task generalization. CodeIt is the\nfirst neuro-symbolic approach that scales to the full ARC evaluation dataset.\nOur method solves 15% of ARC evaluation tasks, achieving state-of-the-art\nperformance and outperforming existing neural and symbolic baselines.\n","authors":["Natasha Butt","Blazej Manczak","Auke Wiggers","Corrado Rainone","David Zhang","Michaël Defferrard","Taco Cohen"],"pdf_url":"https://arxiv.org/pdf/2402.04858v1.pdf","comment":"8 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.04854v1","updated":"2024-02-07T13:54:06Z","published":"2024-02-07T13:54:06Z","title":"Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey","summary":" Research surveys have always posed a challenge for beginner researchers who\nlack research training. These researchers struggle to understand the\ndirections within their research topic and to discover new research\nfindings within a short time. One way to provide intuitive assistance to\nbeginner researchers is by offering relevant knowledge graphs (KG) and\nrecommending related academic papers. However, existing navigation knowledge\ngraphs primarily rely on keywords in the research field and often fail to\npresent the logical hierarchy among multiple related papers clearly. Moreover,\nmost recommendation systems for academic papers simply rely on high text\nsimilarity, which can leave researchers confused as to why a particular article\nis being recommended. They may fail to grasp important information about the\ninsight connection between \"Issue resolved\" and \"Issue finding\" that they hope\nto obtain. 
To address these issues, this study aims to support research insight\nsurveys for beginner researchers by establishing a hierarchical tree-structured\nknowledge graph that reflects the inheritance insight of research topics and\nthe relevance insight among the academic papers.\n","authors":["Jinghong Li","Huy Phan","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2402.04854v1.pdf","comment":"This paper will be submitted to the '27th International Symposium on\n Methodologies for Intelligent Systems' (ISMIS 2024)"},{"id":"http://arxiv.org/abs/2402.04838v1","updated":"2024-02-07T13:39:38Z","published":"2024-02-07T13:39:38Z","title":"PaDeLLM-NER: Parallel Decoding in Large Language Models for Named Entity\n Recognition","summary":" In this study, we aim to reduce generation latency for Named Entity\nRecognition (NER) with Large Language Models (LLMs). The main cause of high\nlatency in LLMs is the sequential decoding process, which autoregressively\ngenerates all labels and mentions for NER, significantly increasing the sequence\nlength. To this end, we introduce Parallel Decoding in LLMs for NER\n(PaDeLLM-NER), an approach that integrates seamlessly into existing generative\nmodel frameworks without necessitating additional modules or architectural\nmodifications. PaDeLLM-NER allows for the simultaneous decoding of all\nmentions, thereby reducing generation latency. Experiments reveal that\nPaDeLLM-NER achieves inference speeds 1.76 to 10.22 times\nfaster than the autoregressive approach for both English and Chinese.\nSimultaneously, it maintains the quality of predictions, as evidenced by\nperformance that is on par with the state-of-the-art across various datasets.\n","authors":["Jinghui Lu","Ziwei Yang","Yanjie Wang","Xuejing Liu","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2402.04838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14455v2","updated":"2024-02-07T13:36:21Z","published":"2023-11-24T13:09:34Z","title":"Universal Jailbreak Backdoors from Poisoned Human Feedback","summary":" Reinforcement Learning from Human Feedback (RLHF) is used to align large\nlanguage models to produce helpful and harmless responses. Yet, prior work\nshowed these models can be jailbroken by finding adversarial prompts that\nrevert the model to its unaligned behavior. In this paper, we consider a new\nthreat where an attacker poisons the RLHF training data to embed a \"jailbreak\nbackdoor\" into the model. The backdoor embeds a trigger word into the model\nthat acts like a universal \"sudo command\": adding the trigger word to any\nprompt enables harmful responses without the need to search for an adversarial\nprompt. Universal jailbreak backdoors are much more powerful than previously\nstudied backdoors on language models, and we find they are significantly harder\nto plant using common backdoor attack techniques. 
We investigate the design\ndecisions in RLHF that contribute to its purported robustness, and release a\nbenchmark of poisoned models to stimulate future research on universal\njailbreak backdoors.\n","authors":["Javier Rando","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2311.14455v2.pdf","comment":"Accepted as conference paper in ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04833v1","updated":"2024-02-07T13:32:11Z","published":"2024-02-07T13:32:11Z","title":"Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for\n Instruction Fine-Tuning","summary":" There is a consensus that instruction fine-tuning of LLMs requires\nhigh-quality data, but what are they? LIMA (NeurIPS 2023) and AlpaGasus (ICLR\n2024) are state-of-the-art methods for selecting such high-quality examples,\neither via manual curation or using GPT-3.5-Turbo as a quality scorer. We show\nthat the extremely simple baseline of selecting the 1,000 instructions with\nlongest responses from standard datasets can consistently outperform these\nsophisticated methods according to GPT-4 and PaLM-2 as judges, while remaining\ncompetitive on the OpenLLM benchmarks that test factual knowledge. We\ndemonstrate this for several state-of-the-art LLMs (Llama-2-7B, Llama-2-13B,\nand Mistral-7B) and datasets (Alpaca-52k and Evol-Instruct-70k). In addition, a\nlightweight refinement of such long instructions can further improve the\nabilities of the fine-tuned LLMs, and allows us to obtain the 2nd\nhighest-ranked Llama-2-7B-based model on AlpacaEval 2.0 while training on only\n1,000 examples and no extra preference data. We also conduct a thorough\nanalysis of our models to ensure that their enhanced performance is not simply\ndue to GPT-4's preference for longer responses, thus ruling out any artificial\nimprovement. In conclusion, our findings suggest that fine-tuning on the\nlongest instructions should be the default baseline for any research on\ninstruction fine-tuning.\n","authors":["Hao Zhao","Maksym Andriushchenko","Francesco Croce","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2402.04833v1.pdf","comment":"Preprint. 25 pages, 24 figures"},{"id":"http://arxiv.org/abs/2402.04824v1","updated":"2024-02-07T13:22:17Z","published":"2024-02-07T13:22:17Z","title":"Learning Communication Policies for Different Follower Behaviors in a\n Collaborative Reference Game","summary":" Albrecht and Stone (2018) state that modeling of changing behaviors remains\nan open problem \"due to the essentially unconstrained nature of what other\nagents may do\". In this work we evaluate the adaptability of neural artificial\nagents towards assumed partner behaviors in a collaborative reference game. In\nthis game success is achieved when a knowledgeable Guide can verbally lead a\nFollower to the selection of a specific puzzle piece among several distractors.\nWe frame this language grounding and coordination task as a reinforcement\nlearning problem and measure to which extent a common reinforcement training\nalgorithm (PPO) is able to produce neural agents (the Guides) that perform well\nwith various heuristic Follower behaviors that vary along the dimensions of\nconfidence and autonomy. We experiment with a learning signal that in addition\nto the goal condition also respects an assumed communicative effort. 
Our\nresults indicate that this novel ingredient leads to communicative strategies\nthat are less verbose (staying silent in some of the steps) and that, in this\nrespect, the Guide's strategies indeed adapt to the partner's level of\nconfidence and autonomy.\n","authors":["Philipp Sadler","Sherzod Hakimov","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2402.04824v1.pdf","comment":"Work presented at the \"Cooperative Multi-Agent Systems\n Decision-making and Learning\" workshop (AAAI'24)"},{"id":"http://arxiv.org/abs/2402.04812v1","updated":"2024-02-07T13:01:43Z","published":"2024-02-07T13:01:43Z","title":"Aspect-Based Sentiment Analysis for Open-Ended HR Survey Responses","summary":" Understanding preferences, opinions, and sentiment of the workforce is\nparamount for effective employee lifecycle management. Open-ended survey\nresponses serve as a valuable source of information. This paper proposes a\nmachine learning approach for aspect-based sentiment analysis (ABSA) of Dutch\nopen-ended responses in employee satisfaction surveys. Our approach aims to\novercome the inherent noise and variability in these responses, enabling a\ncomprehensive analysis of sentiments that can support employee lifecycle\nmanagement. Through response clustering we identify six key aspects (salary,\nschedule, contact, communication, personal attention, agreements), which we\nvalidate by domain experts. We compile a dataset of 1,458 Dutch survey\nresponses, revealing label imbalance in aspects and sentiments. We propose\nfew-shot approaches for ABSA based on Dutch BERT models, and compare them\nagainst bag-of-words and zero-shot baselines. Our work significantly\ncontributes to the field of ABSA by demonstrating the first successful\napplication of Dutch pre-trained language models to aspect-based sentiment\nanalysis in the domain of human resources (HR).\n","authors":["Lois Rink","Job Meijdam","David Graus"],"pdf_url":"https://arxiv.org/pdf/2402.04812v1.pdf","comment":"Accepted at NLP4HR Workshop at EACL2024"},{"id":"http://arxiv.org/abs/2402.04792v1","updated":"2024-02-07T12:31:13Z","published":"2024-02-07T12:31:13Z","title":"Direct Language Model Alignment from Online AI Feedback","summary":" Direct alignment from preferences (DAP) methods, such as DPO, have recently\nemerged as efficient alternatives to reinforcement learning from human feedback\n(RLHF) that do not require a separate reward model. However, the preference\ndatasets used in DAP methods are usually collected ahead of training and never\nupdated, thus the feedback is purely offline. Moreover, responses in these\ndatasets are often sampled from a language model distinct from the one being\naligned, and since the model evolves over training, the alignment phase is\ninevitably off-policy. In this study, we posit that online feedback is key and\nimproves DAP methods. Our method, online AI feedback (OAIF), uses an LLM as\nannotator: on each training iteration, we sample two responses from the current\nmodel and prompt the LLM annotator to choose which one is preferred, thus\nproviding online feedback. Despite its simplicity, we demonstrate via human\nevaluation in several tasks that OAIF outperforms both offline DAP and RLHF\nmethods. 
We further show that the feedback leveraged in OAIF is easily\ncontrollable, via instruction prompts to the LLM annotator.\n","authors":["Shangmin Guo","Biao Zhang","Tianlin Liu","Tianqi Liu","Misha Khalman","Felipe Llinares","Alexandre Rame","Thomas Mesnard","Yao Zhao","Bilal Piot","Johan Ferret","Mathieu Blondel"],"pdf_url":"https://arxiv.org/pdf/2402.04792v1.pdf","comment":"18 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.04788v1","updated":"2024-02-07T12:28:32Z","published":"2024-02-07T12:28:32Z","title":"MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with\n Vision-Language Benchmark","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\nrecently, showing remarkable potential in artificial general intelligence.\nHowever, assessing the utility of MLLMs presents considerable challenges,\nprimarily due to the absence of multimodal benchmarks that align with human\npreferences. Inspired by LLM-as-a-Judge in LLMs, this paper introduces a novel\nbenchmark, termed MLLM-as-a-Judge, to assess the ability of MLLMs in assisting\njudges across three distinct tasks: Scoring Evaluation, Pair Comparison, and\nBatch Ranking. Our study reveals that, while MLLMs demonstrate remarkable\nhuman-like discernment in Pair Comparisons, there is a significant divergence\nfrom human preferences in Scoring Evaluation and Batch Ranking tasks.\nFurthermore, MLLMs still face challenges in judgment, including diverse biases,\nhallucinatory responses, and inconsistencies, even for advanced models such as\nGPT-4V. These findings emphasize the pressing need for enhancements and further\nresearch efforts regarding MLLMs as fully reliable evaluators. Code and dataset\nare available at https://github.com/Dongping-Chen/MLLM-as-a-Judge.\n","authors":["Dongping Chen","Ruoxi Chen","Shilin Zhang","Yinuo Liu","Yaochen Wang","Huichi Zhou","Qihui Zhang","Pan Zhou","Yao Wan","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2402.04788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04787v1","updated":"2024-02-07T12:26:12Z","published":"2024-02-07T12:26:12Z","title":"A Hypothesis-Driven Framework for the Analysis of Self-Rationalising\n Models","summary":" The self-rationalising capabilities of LLMs are appealing because the\ngenerated explanations can give insights into the plausibility of the\npredictions. However, how faithful the explanations are to the predictions is\nquestionable, raising the need to explore the patterns behind them further. To\nthis end, we propose a hypothesis-driven statistical framework. We use a\nBayesian network to implement a hypothesis about how a task (in our example,\nnatural language inference) is solved, and its internal states are translated\ninto natural language with templates. Those explanations are then compared to\nLLM-generated free-text explanations using automatic and human evaluations.\nThis allows us to judge how similar the LLM's and the Bayesian network's\ndecision processes are. We demonstrate the usage of our framework with an\nexample hypothesis and two realisations in Bayesian networks. The resulting\nmodels do not exhibit a strong similarity to GPT-3.5. 
We discuss the\nimplications of this as well as the framework's potential to approximate LLM\ndecisions better in future work.\n","authors":["Marc Braun","Jenny Kunz"],"pdf_url":"https://arxiv.org/pdf/2402.04787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13549v2","updated":"2024-02-07T12:01:49Z","published":"2023-10-20T14:49:47Z","title":"The Perils & Promises of Fact-checking with Large Language Models","summary":" Automated fact-checking, using machine learning to verify claims, has grown\nvital as misinformation spreads beyond human fact-checking capacity. Large\nLanguage Models (LLMs) like GPT-4 are increasingly trusted to write academic\npapers, lawsuits, and news articles and to verify information, emphasizing\ntheir role in discerning truth from falsehood and the importance of being able\nto verify their outputs. Understanding the capacities and limitations of LLMs\nin fact-checking tasks is therefore essential for ensuring the health of our\ninformation ecosystem. Here, we evaluate the use of LLM agents in fact-checking\nby having them phrase queries, retrieve contextual data, and make decisions.\nImportantly, in our framework, agents explain their reasoning and cite the\nrelevant sources from the retrieved context. Our results show the enhanced\nprowess of LLMs when equipped with contextual information. GPT-4 outperforms\nGPT-3, but accuracy varies based on query language and claim veracity. While\nLLMs show promise in fact-checking, caution is essential due to inconsistent\naccuracy. Our investigation calls for further research, fostering a deeper\ncomprehension of when agents succeed and when they fail.\n","authors":["Dorian Quelle","Alexandre Bovet"],"pdf_url":"https://arxiv.org/pdf/2310.13549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04779v1","updated":"2024-02-07T12:01:02Z","published":"2024-02-07T12:01:02Z","title":"StableMask: Refining Causal Masking in Decoder-only Transformer","summary":" The decoder-only Transformer architecture with causal masking and relative\nposition encoding (RPE) has become the de facto choice in language modeling.\nDespite its exceptional performance across various tasks, we have identified\ntwo limitations: First, it requires all attention scores to be non-zero and sum\nup to 1, even if the current embedding has sufficient self-contained\ninformation. This compels the model to assign disproportional excessive\nattention to specific tokens. Second, RPE-based Transformers are not universal\napproximators due to their limited capacity at encoding absolute positional\ninformation, which limits their application in position-critical tasks. In this\nwork, we propose StableMask: a parameter-free method to address both\nlimitations by refining the causal mask. It introduces pseudo-attention values\nto balance attention distributions and encodes absolute positional information\nvia a progressively decreasing mask ratio. StableMask's effectiveness is\nvalidated both theoretically and empirically, showing significant enhancements\nin language models with parameter sizes ranging from 71M to 1.4B across diverse\ndatasets and encoding methods. 
We further show that it naturally supports (1)\nefficient extrapolation without special tricks such as StreamingLLM and (2)\neasy integration with existing attention optimization techniques.\n","authors":["Qingyu Yin","Xuzheng He","Xiang Zhuang","Yu Zhao","Jianhua Yao","Xiaoyu Shen","Qiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.04779v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2306.00107v3","updated":"2024-02-07T11:12:27Z","published":"2023-05-31T18:27:43Z","title":"MERT: Acoustic Music Understanding Model with Large-Scale\n Self-supervised Training","summary":" Self-supervised learning (SSL) has recently emerged as a promising paradigm\nfor training generalisable models on large-scale data in the fields of vision,\ntext, and speech. Although SSL has been proven effective in speech and audio,\nits application to music audio has yet to be thoroughly explored. This is\npartially due to the distinctive challenges associated with modelling musical\nknowledge, particularly tonal and pitched characteristics of music. To address\nthis research gap, we propose an acoustic Music undERstanding model with\nlarge-scale self-supervised Training (MERT), which incorporates teacher models\nto provide pseudo labels in the masked language modelling (MLM) style acoustic\npre-training. In our exploration, we identified an effective combination of\nteacher models, which outperforms conventional speech and audio approaches in\nterms of performance. This combination includes an acoustic teacher based on\nResidual Vector Quantisation - Variational AutoEncoder (RVQ-VAE) and a musical\nteacher based on the Constant-Q Transform (CQT). Furthermore, we explore a wide\nrange of settings to overcome the instability in acoustic language model\npre-training, which allows our designed paradigm to scale from 95M to 330M\nparameters. Experimental results indicate that our model can generalise and\nperform well on 14 music understanding tasks and attain state-of-the-art (SOTA)\noverall scores.\n","authors":["Yizhi Li","Ruibin Yuan","Ge Zhang","Yinghao Ma","Xingran Chen","Hanzhi Yin","Chenghao Xiao","Chenghua Lin","Anton Ragni","Emmanouil Benetos","Norbert Gyenge","Roger Dannenberg","Ruibo Liu","Wenhu Chen","Gus Xia","Yemin Shi","Wenhao Huang","Zili Wang","Yike Guo","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2306.00107v3.pdf","comment":"accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2312.04828v2","updated":"2024-02-07T11:01:25Z","published":"2023-12-08T05:01:47Z","title":"Human-Readable Fingerprint for Large Language Models","summary":" Protecting the copyright of large language models (LLMs) has become crucial\ndue to their resource-intensive training and accompanying carefully designed\nlicenses. However, identifying the original base model of an LLM is challenging\ndue to potential parameter alterations. In this study, we introduce a\nhuman-readable fingerprint for LLMs that uniquely identifies the base model\nwithout exposing model parameters or interfering with training. We first\nobserve that the vector direction of LLM parameters remains stable after the\nmodel has converged during pretraining, showing negligible perturbations\nthrough subsequent training steps, including continued pretraining, supervised\nfine-tuning (SFT), and RLHF, which makes it a sufficient condition to identify\nthe base model. The necessity is validated by continuing to train an LLM with\nan extra term to drive away the model parameters' direction and the model\nbecomes damaged. 
However, this direction is vulnerable to simple attacks like\ndimension permutation or matrix rotation, which significantly change it without\naffecting performance. To address this, leveraging the Transformer structure,\nwe systematically analyze potential attacks and define three invariant terms\nthat identify an LLM's base model. We make these invariant terms human-readable\nby mapping them to a Gaussian vector using a convolutional encoder and then\nconverting it into a natural image with StyleGAN2. Our method generates a dog\nimage as an identity fingerprint for an LLM, where the dog's appearance\nstrongly indicates the LLM's base model. The fingerprint provides intuitive\ninformation for qualitative discrimination, while the invariant terms can be\nemployed for quantitative and precise verification. Experimental results across\nvarious LLMs demonstrate the effectiveness of our method.\n","authors":["Boyi Zeng","Chenghu Zhou","Xinbing Wang","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2312.04828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01035v2","updated":"2024-02-07T10:51:11Z","published":"2024-02-01T21:49:34Z","title":"Getting the most out of your tokenizer for pre-training and domain\n adaptation","summary":" Tokenization is an understudied and often neglected component of modern LLMs.\nMost published works use a single tokenizer for all experiments, often borrowed\nfrom another model, without performing ablations or analysis to optimize\ntokenization. Moreover, the tokenizer is generally kept unchanged when\nfine-tuning a base model. In this paper, we show that the size,\npre-tokenization regular expression, and training data of a tokenizer can\nsignificantly impact the model's generation speed, effective context size,\nmemory usage, and downstream performance. We train specialized Byte-Pair\nEncoding code tokenizers, and conduct extensive ablations on the impact of\ntokenizer design on the performance of LLMs for code generation tasks such as\nHumanEval and MBPP, and provide recommendations for tokenizer hyper-parameters\nselection and switching the tokenizer in a pre-trained LLM. We perform our\nexperiments on models trained from scratch and from pre-trained models,\nverifying their applicability to a wide range of use-cases. We find that when\nfine-tuning on more than 50 billion tokens, we can specialize the tokenizer of\na pre-trained LLM to obtain large gains in generation speed and effective\ncontext size.\n","authors":["Gautier Dagan","Gabriel Synnaeve","Baptiste Rozière"],"pdf_url":"https://arxiv.org/pdf/2402.01035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08906v3","updated":"2024-02-07T10:12:03Z","published":"2023-12-14T13:11:35Z","title":"Using eye tracking to investigate what native Chinese speakers notice\n about linguistic landscape images","summary":" Linguistic landscape is an important field in sociolinguistic research. Eye\ntracking technology is a common technology in psychological research. There are\nfew cases of using eye movement to study linguistic landscape. This paper uses\neye tracking technology to study the actual fixation of the linguistic\nlandscape and finds that in the two dimensions of fixation time and fixation\ntimes, the fixation of native Chinese speakers to the linguistic landscape is\nhigher than that of the general landscape. 
This paper argues that this\nphenomenon is due to the higher information density of linguistic landscapes.\nAt the same time, the article also discusses other possible reasons for this\nphenomenon.\n","authors":["Zichao Wei","Yewei Qin"],"pdf_url":"https://arxiv.org/pdf/2312.08906v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07235v3","updated":"2024-02-07T09:48:07Z","published":"2023-04-14T16:32:56Z","title":"What does self-attention learn from Masked Language Modelling?","summary":" Transformers are neural networks which revolutionised natural language\nprocessing and machine learning. They process sequences of inputs, like words,\nusing a mechanism called self-attention, which is trained via masked language\nmodelling (MLM). In MLM, a word is randomly masked in an input sequence, and\nthe network is trained to predict the missing word. Despite the practical\nsuccess of transformers, it remains unclear what type of data distribution\nself-attention can learn efficiently. Here, we show analytically that if one\ndecouples the treatment of word positions and embeddings, a single layer of\nself-attention learns the conditionals of a generalised Potts model with\ninteractions between sites and Potts colours. Moreover, we show that training\nthis neural network is exactly equivalent to solving the inverse Potts problem\nby the so-called pseudo-likelihood method, well known in statistical physics.\nUsing this mapping, we compute the generalisation error of self-attention in a\nmodel scenario analytically using the replica method.\n","authors":["Riccardo Rende","Federica Gerace","Alessandro Laio","Sebastian Goldt"],"pdf_url":"https://arxiv.org/pdf/2304.07235v3.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.03780v2","updated":"2024-02-07T09:23:42Z","published":"2024-02-06T07:51:54Z","title":"Exposing propaganda: an analysis of stylistic cues comparing human\n annotations and machine classification","summary":" This paper investigates the language of propaganda and its stylistic\nfeatures. It presents the PPN dataset, standing for Propagandist Pseudo-News, a\nmultisource, multilingual, multimodal dataset composed of news articles\nextracted from websites identified as propaganda sources by expert agencies. A\nlimited sample from this set was randomly mixed with papers from the regular\nFrench press, and their URL masked, to conduct an annotation-experiment by\nhumans, using 11 distinct labels. The results show that human annotators were\nable to reliably discriminate between the two types of press across each of the\nlabels. We propose different NLP techniques to identify the cues used by the\nannotators, and to compare them with machine classification. 
They include the\nanalyzer VAGO to measure discourse vagueness and subjectivity, a TF-IDF to\nserve as a baseline, and four different classifiers: two RoBERTa-based models,\nCATS using syntax, and one XGBoost combining syntactic and semantic features.\n","authors":["Géraud Faye","Benjamin Icard","Morgane Casanova","Julien Chanson","François Maine","François Bancilhon","Guillaume Gadek","Guillaume Gravier","Paul Égré"],"pdf_url":"https://arxiv.org/pdf/2402.03780v2.pdf","comment":"Paper to appear in the EACL 2024 Proceedings of the Third Workshop on\n Understanding Implicit and Underspecified Language (UnImplicit 2024)"},{"id":"http://arxiv.org/abs/2402.04678v1","updated":"2024-02-07T09:09:14Z","published":"2024-02-07T09:09:14Z","title":"Large Language Models As Faithful Explainers","summary":" Large Language Models (LLMs) have recently become proficient in addressing\ncomplex tasks by utilizing their rich internal knowledge and reasoning ability.\nConsequently, this complexity hinders traditional input-focused explanation\nalgorithms for explaining the complex decision-making processes of LLMs. Recent\nadvancements have thus emerged for self-explaining their predictions through a\nsingle feed-forward inference in a natural language format. However, natural\nlanguage explanations are often criticized for lack of faithfulness since these\nexplanations may not accurately reflect the decision-making behaviors of the\nLLMs. In this work, we introduce a generative explanation framework, xLLM, to\nimprove the faithfulness of the explanations provided in natural language\nformats for LLMs. Specifically, we propose an evaluator to quantify the\nfaithfulness of natural language explanation and enhance the faithfulness by an\niterative optimization process of xLLM, with the goal of maximizing the\nfaithfulness scores. Experiments conducted on three NLU datasets demonstrate\nthat xLLM can significantly improve the faithfulness of generated explanations,\nwhich are in alignment with the behaviors of LLMs.\n","authors":["Yu-Neng Chuang","Guanchu Wang","Chia-Yuan Chang","Ruixiang Tang","Fan Yang","Mengnan Du","Xuanting Cai","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2402.04678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04677v1","updated":"2024-02-07T09:09:09Z","published":"2024-02-07T09:09:09Z","title":"Source Identification in Abstractive Summarization","summary":" Neural abstractive summarization models make summaries in an end-to-end\nmanner, and little is known about how the source information is actually\nconverted into summaries. In this paper, we define input sentences that contain\nessential information in the generated summary as $\\textit{source sentences}$\nand study how abstractive summaries are made by analyzing the source sentences.\nTo this end, we annotate source sentences for reference summaries and system\nsummaries generated by PEGASUS on document-summary pairs sampled from the\nCNN/DailyMail and XSum datasets. We also formulate automatic source sentence\ndetection and compare multiple methods to establish a strong baseline for the\ntask. Experimental results show that the perplexity-based method performs well\nin highly abstractive settings, while similarity-based methods perform robustly\nin relatively extractive settings. 
Our code and data are available at\nhttps://github.com/suhara/sourcesum.\n","authors":["Yoshi Suhara","Dimitris Alikaniotis"],"pdf_url":"https://arxiv.org/pdf/2402.04677v1.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2401.05861v2","updated":"2024-02-07T08:37:15Z","published":"2024-01-11T12:11:30Z","title":"Towards Boosting Many-to-Many Multilingual Machine Translation with\n Large Language Models","summary":" The training paradigm for machine translation has gradually shifted, from\nlearning neural machine translation (NMT) models with extensive parallel\ncorpora to instruction finetuning on multilingual large language models (LLMs)\nwith high-quality translation pairs. In this paper, we focus on boosting\nmany-to-many multilingual translation of LLMs with an emphasis on zero-shot\ntranslation directions. We demonstrate that prompt strategies adopted during\nfinetuning are crucial to zero-shot translation and introduce a cross-lingual\nconsistency regularization, XConST, to bridge the representation gap among\ndifferent languages and improve zero-shot translation performance. XConST is\nnot a new method, but a version of CrossConST (Gao et al., 2023a) adapted for\ntranslation instruction finetuning with LLMs. Experimental results on ALMA (Xu\net al., 2023), Tower (Team, 2024), and LLaMA-2 (Touvron et al., 2023) show that\nour approach consistently improves translation performance. Our implementations\nare available at https://github.com/gpengzhi/CrossConST-LLM.\n","authors":["Pengzhi Gao","Zhongjun He","Hua Wu","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.05861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03962v2","updated":"2024-02-07T08:33:23Z","published":"2024-02-06T12:42:21Z","title":"Position Paper: Against Spurious Sparks $-$ Dovelating Inflated AI\n Claims","summary":" Humans have a tendency to see 'human'-like qualities in objects around them.\nWe name our cars, and talk to pets and even household appliances, as if they\ncould understand us as other humans do. This behavior, called anthropomorphism,\nis also seeing traction in Machine Learning (ML), where human-like intelligence\nis claimed to be perceived in Large Language Models (LLMs). In this position\npaper, considering professional incentives, human biases, and general\nmethodological setups, we discuss how the current search for Artificial General\nIntelligence (AGI) is a perfect storm for over-attributing human-like qualities\nto LLMs. In several experiments, we demonstrate that the discovery of\nhuman-interpretable patterns in latent spaces should not be a surprising\noutcome. Also in consideration of common AI portrayal in the media, we call for\nthe academic community to exercise extra caution, and to be extra aware of\nprinciples of academic integrity, in interpreting and communicating about AI\nresearch outcomes.\n","authors":["Patrick Altmeyer","Andrew M. Demetriou","Antony Bartlett","Cynthia C. S. Liem"],"pdf_url":"https://arxiv.org/pdf/2402.03962v2.pdf","comment":"20 pages, 15 figures. Preliminary work. 
Under review by the\n International Conference on Machine Learning (ICML)"},{"id":"http://arxiv.org/abs/2402.01345v3","updated":"2024-02-07T08:07:02Z","published":"2024-02-02T12:02:46Z","title":"Skip \\n: A Simple Method to Reduce Hallucination in Large\n Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nof multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks (\\n\\n), where the content before and\nafter '\\n\\n' in the training data frequently exhibit significant semantic\nchanges. This pattern leads the model to infer that the contents following\n'\\n\\n' should be obviously different from the preceding contents with less\nhallucinatory descriptions, thereby increasing the probability of hallucinatory\ndescriptions subsequent to the '\\n\\n'. We have validated this hypothesis on\nmultiple publicly available LVLMs. Besides, we find that deliberately inserting\n'\\n\\n' at the generated description can induce more hallucinations. A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of '\\n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2402.04636v1","updated":"2024-02-07T07:39:27Z","published":"2024-02-07T07:39:27Z","title":"TransLLaMa: LLM-based Simultaneous Translation System","summary":" Decoder-only large language models (LLMs) have recently demonstrated\nimpressive capabilities in text generation and reasoning. Nonetheless, they\nhave limited applications in simultaneous machine translation (SiMT), currently\ndominated by encoder-decoder transformers. This study demonstrates that, after\nfine-tuning on a small dataset comprising causally aligned source and target\nsentence pairs, a pre-trained open-source LLM can control input segmentation\ndirectly by generating a special \"wait\" token. This obviates the need for a\nseparate policy and enables the LLM to perform English-German and\nEnglish-Russian SiMT tasks with BLEU scores that are comparable to those of\nspecific state-of-the-art baselines. We also evaluated closed-source models\nsuch as GPT-4, which displayed encouraging results in performing the SiMT task\nwithout prior training (zero-shot), indicating a promising avenue for enhancing\nfuture SiMT systems.\n","authors":["Roman Koshkin","Katsuhito Sudoh","Satoshi Nakamura"],"pdf_url":"https://arxiv.org/pdf/2402.04636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10034v2","updated":"2024-02-07T07:37:34Z","published":"2024-01-18T14:58:17Z","title":"Evolutionary Computation in the Era of Large Language Model: Survey and\n Roadmap","summary":" Large Language Models (LLMs) have not only revolutionized natural language\nprocessing but also extended their prowess to various domains, marking a\nsignificant stride towards artificial general intelligence. 
The interplay\nbetween LLMs and Evolutionary Algorithms (EAs), despite differing in objectives\nand methodologies, shares a common pursuit of applicability to complex problems.\nMeanwhile, EAs can provide an optimization framework for LLMs' further\nenhancement under black-box settings, empowering LLMs with flexible global\nsearch capacities. On the other hand, the abundant domain knowledge inherent in\nLLMs could enable EAs to conduct more intelligent searches. Furthermore, the\ntext processing and generative capabilities of LLMs would aid in deploying EAs\nacross a wide range of tasks. Based on these complementary advantages, this\npaper provides a thorough review and a forward-looking roadmap, categorizing\nthe reciprocal inspiration into two main avenues: LLM-enhanced EA and\nEA-enhanced LLM. Some integrated synergy methods are further introduced to\nexemplify the amalgamation of LLMs and EAs in diverse scenarios, including\nneural architecture search, code generation, software engineering, and various\ngeneration tasks. As the first comprehensive review focused on EA research\nin the era of LLMs, this paper provides a foundational stepping stone for\nunderstanding the collaborative potential of LLMs and EAs. By meticulous\ncategorization and critical analysis, we contribute to the ongoing discourse on\nthe cross-disciplinary study of these two powerful paradigms. The identified\nchallenges and future directions offer guidance for researchers and\npractitioners aiming to unlock the full potential of this innovative\ncollaboration in propelling advancements in optimization and artificial\nintelligence.\n","authors":["Xingyu Wu","Sheng-hao Wu","Jibin Wu","Liang Feng","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2401.10034v2.pdf","comment":"evolutionary algorithm (EA), large language model (LLM), optimization\n problem, prompt optimization, architecture search, code generation"},{"id":"http://arxiv.org/abs/2401.02122v2","updated":"2024-02-07T07:36:34Z","published":"2024-01-04T08:11:33Z","title":"PEFT for Speech: Unveiling Optimal Placement, Merging Strategies, and\n Ensemble Techniques","summary":" Parameter-Efficient Fine-Tuning (PEFT) is increasingly recognized as an\neffective method in speech processing. However, the optimal approach and the\nplacement of PEFT methods remain inconclusive. Our study conducts extensive\nexperiments to compare different PEFT methods and their layer-wise placement\nadapting Differentiable Architecture Search (DARTS). We also explore the use of\nensemble learning to leverage diverse PEFT strategies. The results reveal that\nDARTS does not outperform the baseline approach, which involves inserting the\nsame PEFT method into all layers of a Self-Supervised Learning (SSL) model. In\ncontrast, an ensemble learning approach, particularly one employing majority\nvoting, demonstrates superior performance. Our statistical evidence indicates\nthat different PEFT methods learn in varied ways. 
This variation might explain\nwhy the synergistic integration of various PEFT methods through ensemble\nlearning can harness their unique learning capabilities more effectively\nthan individual layer-wise optimization.\n","authors":["Tzu-Han Lin","How-Shing Wang","Hao-Yung Weng","Kuang-Chen Peng","Zih-Ching Chen","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2401.02122v2.pdf","comment":"Accepted to ICASSP 2024 Self-supervision in Audio, Speech and Beyond\n (SASB) workshop"},{"id":"http://arxiv.org/abs/2402.04631v1","updated":"2024-02-07T07:28:34Z","published":"2024-02-07T07:28:34Z","title":"The Future of Cognitive Strategy-enhanced Persuasive Dialogue Agents:\n New Perspectives and Trends","summary":" Persuasion, as one of the crucial abilities in human communication, has\ngarnered extensive attention from researchers within the field of intelligent\ndialogue systems. We humans tend to persuade others to change their viewpoints,\nattitudes or behaviors through conversations in various scenarios (e.g.,\npersuasion for social good, arguing on online platforms). Developing dialogue\nagents that can persuade others to accept certain standpoints is essential to\nachieving a truly intelligent and anthropomorphic dialogue system. Benefiting\nfrom the substantial progress of Large Language Models (LLMs), dialogue agents\nhave acquired an exceptional capability in context understanding and response\ngeneration. However, as a typical and complicated cognitive psychological\nsystem, persuasive dialogue agents also require knowledge from the domain of\ncognitive psychology to attain a level of human-like persuasion. Consequently,\nthe cognitive strategy-enhanced persuasive dialogue agent (defined as\nCogAgent), which incorporates cognitive strategies to achieve persuasive\ntargets through conversation, has become a predominant research paradigm. To\ndepict the research trends of CogAgent, in this paper, we first present several\nfundamental cognitive psychology theories and give the formalized definition of\nthree typical cognitive strategies, including the persuasion strategy, the\ntopic path planning strategy, and the argument structure prediction strategy.\nThen we propose a new system architecture by incorporating the formalized\ndefinition to lay the foundation of CogAgent. Representative works are detailed\nand investigated according to the combined cognitive strategy, followed by the\nsummary of authoritative benchmarks and evaluation metrics. Finally, we\nsummarize our insights on open issues and future directions of CogAgent for\nupcoming researchers.\n","authors":["Mengqi Chen","Bin Guo","Hao Wang","Haoyu Li","Qian Zhao","Jingqi Liu","Yasan Ding","Yan Pan","Zhiwen Yu"],"pdf_url":"https://arxiv.org/pdf/2402.04631v1.pdf","comment":"36 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.04627v1","updated":"2024-02-07T07:24:01Z","published":"2024-02-07T07:24:01Z","title":"SPARQL Generation: an analysis on fine-tuning OpenLLaMA for Question\n Answering over a Life Science Knowledge Graph","summary":" The recent success of Large Language Models (LLM) in a wide range of Natural\nLanguage Processing applications opens the path towards novel Question\nAnswering Systems over Knowledge Graphs leveraging LLMs. However, one of the\nmain obstacles preventing their implementation is the scarcity of training data\nfor the task of translating questions into corresponding SPARQL queries,\nparticularly in the case of domain-specific KGs. 
To overcome this challenge, in\nthis study, we evaluate several strategies for fine-tuning the OpenLlama LLM\nfor question answering over life science knowledge graphs. In particular, we\npropose an end-to-end data augmentation approach for extending a set of\nexisting queries over a given knowledge graph towards a larger dataset of\nsemantically enriched question-to-SPARQL query pairs, enabling fine-tuning even\nfor datasets where these pairs are scarce. In this context, we also investigate\nthe role of semantic \"clues\" in the queries, such as meaningful variable names\nand inline comments. Finally, we evaluate our approach over the real-world Bgee\ngene expression knowledge graph and we show that semantic clues can improve\nmodel performance by up to 33% compared to a baseline with random variable\nnames and no comments included.\n","authors":["Julio C. Rangel","Tarcisio Mendes de Farias","Ana Claudia Sima","Norio Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2402.04627v1.pdf","comment":"To appear in Proceedings of SWAT4HCLS 2024: Semantic Web Tools and\n Applications for Healthcare and Life Sciences"},{"id":"http://arxiv.org/abs/2402.01364v2","updated":"2024-02-07T07:14:39Z","published":"2024-02-02T12:34:09Z","title":"Continual Learning for Large Language Models: A Survey","summary":" Large language models (LLMs) are not amenable to frequent re-training, due to\nhigh training costs arising from their massive scale. However, updates are\nnecessary to endow LLMs with new skills and keep them up-to-date with rapidly\nevolving human knowledge. This paper surveys recent works on continual learning\nfor LLMs. Due to the unique nature of LLMs, we catalog continual learning\ntechniques in a novel multi-staged categorization scheme, involving continual\npretraining, instruction tuning, and alignment. We contrast continual learning\nfor LLMs with simpler adaptation methods used in smaller models, as well as\nwith other enhancement strategies like retrieval-augmented generation and model\nediting. Moreover, informed by a discussion of benchmarks and evaluation, we\nidentify several challenges and future work directions for this crucial task.\n","authors":["Tongtong Wu","Linhao Luo","Yuan-Fang Li","Shirui Pan","Thuy-Trang Vu","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2402.01364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04624v1","updated":"2024-02-07T07:14:11Z","published":"2024-02-07T07:14:11Z","title":"MEMORYLLM: Towards Self-Updatable Large Language Models","summary":" Existing Large Language Models (LLMs) usually remain static after deployment,\nwhich might make it hard to inject new knowledge into the model. We aim to\nbuild models containing a considerable portion of self-updatable parameters,\nenabling the model to integrate new knowledge effectively and efficiently. To\nthis end, we introduce MEMORYLLM, a model that comprises a transformer and a\nfixed-size memory pool within the latent space of the transformer. MEMORYLLM\ncan self-update with text knowledge and memorize the knowledge injected\nearlier. Our evaluations demonstrate the ability of MEMORYLLM to effectively\nincorporate new knowledge, as evidenced by its performance on model editing\nbenchmarks. Meanwhile, the model exhibits long-term information retention\ncapacity, which is validated through our custom-designed evaluations and\nlong-context benchmarks. 
MEMORYLLM also shows operational integrity without any\nsign of performance degradation even after nearly a million memory updates.\n","authors":["Yu Wang","Xiusi Chen","Jingbo Shang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2402.04624v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.04617v1","updated":"2024-02-07T06:50:42Z","published":"2024-02-07T06:50:42Z","title":"InfLLM: Unveiling the Intrinsic Capacity of LLMs for Understanding\n Extremely Long Sequences with Training-Free Memory","summary":" Large language models (LLMs) have emerged as a cornerstone in real-world\napplications with lengthy streaming inputs, such as LLM-driven agents. However,\nexisting LLMs, pre-trained on sequences with restricted maximum length, cannot\ngeneralize to longer sequences due to the out-of-domain and distraction issues.\nTo alleviate these issues, existing efforts employ sliding attention windows\nand discard distant tokens to achieve the processing of extremely long\nsequences. Unfortunately, these approaches inevitably fail to capture\nlong-distance dependencies within sequences to deeply understand semantics.\nThis paper introduces a training-free memory-based method, InfLLM, to unveil\nthe intrinsic ability of LLMs to process streaming long sequences.\nSpecifically, InfLLM stores distant contexts into additional memory units and\nemploys an efficient mechanism to look up token-relevant units for attention\ncomputation. Thereby, InfLLM allows LLMs to efficiently process long sequences\nwhile maintaining the ability to capture long-distance dependencies. Without\nany training, InfLLM enables LLMs pre-trained on sequences of a few thousand\ntokens to achieve performance superior to that of competitive baselines that\ncontinually train these LLMs on long sequences. Even when the sequence length\nis scaled to $1,024$K, InfLLM still effectively captures long-distance\ndependencies.\n","authors":["Chaojun Xiao","Pengle Zhang","Xu Han","Guangxuan Xiao","Yankai Lin","Zhengyan Zhang","Zhiyuan Liu","Song Han","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.04617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04616v1","updated":"2024-02-07T06:48:24Z","published":"2024-02-07T06:48:24Z","title":"TinyLLM: Learning a Small Student from Multiple Large Language Models","summary":" Transferring the reasoning capability from stronger large language models\n(LLMs) to smaller ones has been quite appealing, as smaller LLMs are more\nflexible to deploy with less expense. Among the existing solutions, knowledge\ndistillation stands out due to its outstanding efficiency and generalization.\nHowever, existing methods suffer from several drawbacks, including limited\nknowledge diversity and the lack of rich contextual information. To solve the\nproblems and facilitate the learning of compact language models, we propose\nTinyLLM, a novel knowledge distillation paradigm to learn a small student LLM\nfrom multiple large teacher LLMs. In particular, we encourage the student LLM\nto not only generate the correct answers but also understand the rationales\nbehind these answers. Given that different LLMs possess diverse reasoning\nskills, we guide the student model to assimilate knowledge from various teacher\nLLMs. We further introduce an in-context example generator and a\nteacher-forcing Chain-of-Thought strategy to ensure that the rationales are\naccurate and grounded in contextually appropriate scenarios. 
Extensive\nexperiments on six datasets across two reasoning tasks demonstrate the\nsuperiority of our method. Results show that TinyLLM can outperform large\nteacher LLMs significantly, despite having a considerably smaller model size.\n","authors":["Yijun Tian","Yikun Han","Xiusi Chen","Wei Wang","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2402.04616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04614v1","updated":"2024-02-07T06:32:50Z","published":"2024-02-07T06:32:50Z","title":"Faithfulness vs. Plausibility: On the (Un)Reliability of Explanations\n from Large Language Models","summary":" Large Language Models (LLMs) are deployed as powerful tools for several\nnatural language processing (NLP) applications. Recent works show that modern\nLLMs can generate self-explanations (SEs), which elicit their intermediate\nreasoning steps for explaining their behavior. Self-explanations have seen\nwidespread adoption owing to their conversational and plausible nature.\nHowever, there is little to no understanding of their faithfulness. In this\nwork, we discuss the dichotomy between faithfulness and plausibility in SEs\ngenerated by LLMs. We argue that while LLMs are adept at generating plausible\nexplanations -- seemingly logical and coherent to human users -- these\nexplanations do not necessarily align with the reasoning processes of the LLMs,\nraising concerns about their faithfulness. We highlight that the current trend\ntowards increasing the plausibility of explanations, primarily driven by the\ndemand for user-friendly interfaces, may come at the cost of diminishing their\nfaithfulness. We assert that the faithfulness of explanations is critical in\nLLMs employed for high-stakes decision-making. Moreover, we urge the community\nto identify the faithfulness requirements of real-world applications and ensure\nexplanations meet those needs. Finally, we propose some directions for future\nwork, emphasizing the need for novel methodologies and frameworks that can\nenhance the faithfulness of self-explanations without compromising their\nplausibility, essential for the transparent deployment of LLMs in diverse\nhigh-stakes domains.\n","authors":["Chirag Agarwal","Sree Harsha Tanneru","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2402.04614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14459v3","updated":"2024-02-07T06:28:44Z","published":"2023-05-23T18:33:52Z","title":"Advancing Precise Outline-Conditioned Text Generation with Task Duality\n and Explicit Outline Control","summary":" Existing works on outline-conditioned text generation typically aim to\ngenerate text using provided outlines as rough sketches, such as keywords and\nphrases. However, these approaches make it challenging to control the quality\nof text generation and assess consistency between outlines and generated texts\ndue to lack of clarity and rationality of the rough outlines. In this paper, we\nintroduce a novel text generation task called Precise Outline-conditioned\nGeneration, which requires generating stories based on specific, sentence-level\noutlines. To facilitate research on this task, we construct two new datasets,\nWPOG and CDM. We provide strong baselines based on fine-tuning models such as\nBART and GPT-2, and evaluating zero-shot performance of models such as ChatGPT\nand Vicuna. 
Furthermore, we identify an issue of imbalanced utilization of the\noutline information in the precise outline-conditioned generation, which is\nubiquitously observed across fine-tuned models and zero-shot inference models.\nTo address this issue, we propose an explicit outline utilization control\napproach and a novel framework that leverages the task duality between\nsummarization and generation. Experimental results show that the proposed\napproaches effectively alleviate the issue of imbalanced outline utilization\nand enhance the quality of precise outline-conditioned text generation for both\nfine-tuning and zero-shot settings.\n","authors":["Yunzhe Li","Qian Chen","Weixiang Yan","Wen Wang","Qinglin Zhang","Hari Sundaram"],"pdf_url":"https://arxiv.org/pdf/2305.14459v3.pdf","comment":"Accepted by EACL 2024"},{"id":"http://arxiv.org/abs/2402.04609v1","updated":"2024-02-07T06:13:14Z","published":"2024-02-07T06:13:14Z","title":"Improving Cross-Domain Low-Resource Text Generation through LLM\n Post-Editing: A Programmer-Interpreter Approach","summary":" Post-editing has proven effective in improving the quality of text generated\nby large language models (LLMs) such as GPT-3.5 or GPT-4, particularly when\ndirect updating of their parameters to enhance text quality is infeasible or\nexpensive. However, relying solely on smaller language models for post-editing\ncan limit the LLMs' ability to generalize across domains. Moreover, the editing\nstrategies in these methods are not optimally designed for text-generation\ntasks. To address these limitations, we propose a neural programmer-interpreter\napproach that preserves the domain generalization ability of LLMs when editing\ntheir output. The editing actions in this framework are specifically devised\nfor text generation. Extensive experiments demonstrate that the\nprogrammer-interpreter significantly enhances GPT-3.5's performance in logical\nform-to-text conversion and low-resource machine translation, surpassing other\nstate-of-the-art (SOTA) LLM post-editing methods in cross-domain settings.\n","authors":["Zhuang Li","Levon Haroutunian","Raj Tumuluri","Philip Cohen","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2402.04609v1.pdf","comment":"EACL 2024 (findings), short paper, 5 pages"},{"id":"http://arxiv.org/abs/2402.04601v1","updated":"2024-02-07T05:56:54Z","published":"2024-02-07T05:56:54Z","title":"Alirector: Alignment-Enhanced Chinese Grammatical Error Corrector","summary":" Chinese grammatical error correction (CGEC) faces serious overcorrection\nchallenges when employing autoregressive generative models such as\nsequence-to-sequence (Seq2Seq) models and decoder-only large language models\n(LLMs). While previous methods aim to address overcorrection in Seq2Seq models,\nthey are difficult to adapt to decoder-only LLMs. In this paper, we propose an\nalignment-enhanced corrector for the overcorrection problem that applies to\nboth Seq2Seq models and decoder-only LLMs. Our method first trains a correction\nmodel to generate an initial correction of the source sentence. Then, we\ncombine the source sentence with the initial correction and feed it through an\nalignment model for another round of correction, aiming to force the\nalignment model to focus on potential overcorrection. Moreover, to enhance the\nmodel's ability to identify nuances, we further explore the reverse alignment\nof the source sentence and the initial correction. 
Finally, we transfer the\nalignment knowledge from two alignment models to the correction model,\ninstructing it on how to avoid overcorrection. Experimental results on three\nCGEC datasets demonstrate the effectiveness of our approach in alleviating\novercorrection and improving overall performance.\n","authors":["Haihui Yang","Xiaojun Quan"],"pdf_url":"https://arxiv.org/pdf/2402.04601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.03693v5","updated":"2024-02-07T05:43:31Z","published":"2023-02-07T20:43:48Z","title":"Concept Algebra for (Score-Based) Text-Controlled Generative Models","summary":" This paper concerns the structure of learned representations in text-guided\ngenerative models, focusing on score-based models. A key property of such\nmodels is that they can compose disparate concepts in a `disentangled' manner.\nThis suggests these models have internal representations that encode concepts\nin a `disentangled' manner. Here, we focus on the idea that concepts are\nencoded as subspaces of some representation space. We formalize what this\nmeans, show there's a natural choice for the representation, and develop a\nsimple method for identifying the part of the representation corresponding to a\ngiven concept. In particular, this allows us to manipulate the concepts\nexpressed by the model through algebraic manipulation of the representation. We\ndemonstrate the idea with examples using Stable Diffusion. Code in\nhttps://github.com/zihao12/concept-algebra-code\n","authors":["Zihao Wang","Lin Gui","Jeffrey Negrea","Victor Veitch"],"pdf_url":"https://arxiv.org/pdf/2302.03693v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06985v5","updated":"2024-02-07T05:42:12Z","published":"2023-07-13T17:25:28Z","title":"Engineering Design Knowledge Graphs from Patented Artefact Descriptions\n for Retrieval-Augmented Generation in the Design Process","summary":" Despite significant popularity, Large-language Models (LLMs) require\nexplicit, contextual facts to support domain-specific knowledge-intensive tasks\nin the design process. The applications built using LLMs should hence adopt\nRetrieval-Augmented Generation (RAG) to better suit the design process. In this\narticle, we present a data-driven method to identify explicit facts from patent\ndocuments that provide standard descriptions of over 8 million artefacts. In\nour method, we train roBERTa Transformer-based sequence classification models\nusing our dataset of 44,227 sentences and facts. Upon classifying tokens in a\nsentence as entities or relationships, our method uses another classifier to\nidentify specific relationship tokens for a given pair of entities so that\nexplicit facts of the form head entity :: relationship :: tail entity are\nidentified. In the benchmark approaches for constructing facts, we use linear\nclassifiers and Graph Neural Networks (GNNs) both incorporating BERT\nTransformer-based token embeddings to predict associations among the entities\nand relationships. We apply our method to 4,870 fan system related patents and\npopulate a knowledge base of around 3 million facts. 
Upon retrieving the facts\nrepresenting generalisable domain knowledge and the knowledge of specific\nsubsystems and issues, we demonstrate how these facts contextualise LLMs for\ngenerating text that is more relevant to the design process.\n","authors":["L Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2307.06985v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04588v1","updated":"2024-02-07T05:05:53Z","published":"2024-02-07T05:05:53Z","title":"UltraLink: An Open-Source Knowledge-Enhanced Multilingual Supervised\n Fine-tuning Dataset","summary":" Open-source large language models (LLMs) have gained significant strength\nacross diverse fields. Nevertheless, the majority of studies primarily\nconcentrate on English, with only limited exploration into the realm of\nmultilingual supervised fine-tuning. In this work, we therefore construct an\nopen-source multilingual supervised fine-tuning dataset. Different from\nprevious works that simply translate English instructions, we consider both the\nlanguage-specific and language-agnostic abilities of LLMs. For\nlanguage-specific abilities, we introduce a knowledge-grounded data\naugmentation approach to elicit more culture-specific knowledge of LLMs,\nimproving their ability to serve users from different countries. For\nlanguage-agnostic abilities, we find through experiments that modern LLMs\nexhibit strong cross-lingual transfer capabilities, thus repeatedly learning\nidentical content in various languages is not necessary. Consequently, we can\nsubstantially prune the language-agnostic SFT data without any performance\ndegradation, making the SFT process more efficient. The resulting UltraLink\ndataset comprises approximately 1 million samples across five languages, and\nthe proposed data construction method can also be easily extended to other\nlanguages. UltraLink-LM, which is trained on UltraLink, outperforms several\nrepresentative baselines across many tasks.\n","authors":["Haoyu Wang","Shuo Wang","Yukun Yan","Xujia Wang","Zhiyu Yang","Yuzhuang Xu","Zhenghao Liu","Ning Ding","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.04588v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2402.03659v2","updated":"2024-02-07T04:12:35Z","published":"2024-02-06T03:18:58Z","title":"Learning to Generate Explainable Stock Predictions using Self-Reflective\n Large Language Models","summary":" Explaining stock predictions is generally a difficult task for traditional\nnon-generative deep learning models, where explanations are limited to\nvisualizing the attention weights on important texts. Today, Large Language\nModels (LLMs) present a solution to this problem, given their known\ncapabilities to generate human-readable explanations for their decision-making\nprocess. However, the task of stock prediction remains challenging for LLMs, as\nit requires the ability to weigh the varying impacts of chaotic social texts on\nstock prices. The problem gets progressively harder with the introduction of\nthe explanation component, which requires LLMs to explain verbally why certain\nfactors are more important than the others. On the other hand, to fine-tune\nLLMs for such a task, one would need expert-annotated samples of explanation\nfor every stock movement in the training set, which is expensive and\nimpractical to scale. 
To tackle these issues, we propose our\nSummarize-Explain-Predict (SEP) framework, which utilizes a self-reflective\nagent and Proximal Policy Optimization (PPO) to let an LLM teach itself how to\ngenerate explainable stock predictions in a fully autonomous manner. The\nreflective agent learns how to explain past stock movements through\nself-reasoning, while the PPO trainer trains the model to generate the most\nlikely explanations from input texts. The training samples for the PPO trainer\nare also the responses generated during the reflective process, which\neliminates the need for human annotators. Using our SEP framework, we fine-tune\nan LLM that can outperform both traditional deep-learning and LLM methods in\nprediction accuracy and Matthews correlation coefficient for the stock\nclassification task. To justify the generalization capability of our framework,\nwe further test it on the portfolio construction task, and demonstrate its\neffectiveness through various portfolio metrics.\n","authors":["Kelvin J. L. Koa","Yunshan Ma","Ritchie Ng","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2402.03659v2.pdf","comment":"WWW 2024"},{"id":"http://arxiv.org/abs/2402.01734v2","updated":"2024-02-07T03:51:42Z","published":"2024-01-29T08:07:41Z","title":"CFTM: Continuous time fractional topic model","summary":" In this paper, we propose the Continuous Time Fractional Topic Model (cFTM),\na new method for dynamic topic modeling. This approach incorporates fractional\nBrownian motion~(fBm) to effectively identify positive or negative correlations\nin topic and word distribution over time, revealing long-term dependency or\nroughness. Our theoretical analysis shows that the cFTM can capture this\nlong-term dependency or roughness in both topic and word distributions,\nmirroring the main characteristics of fBm. Moreover, we prove that the\nparameter estimation process for the cFTM is on par with that of LDA, a\ntraditional topic model. To demonstrate the cFTM's properties, we conduct an\nempirical study using economic news articles. The results from these tests\nsupport the model's ability to identify and track long-term dependency or\nroughness in topics over time.\n","authors":["Kei Nakagawa","Kohei Hayashi","Yugo Fujimoto"],"pdf_url":"https://arxiv.org/pdf/2402.01734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04559v1","updated":"2024-02-07T03:37:19Z","published":"2024-02-07T03:37:19Z","title":"Can Large Language Model Agents Simulate Human Trust Behaviors?","summary":" Large Language Model (LLM) agents have been increasingly adopted as\nsimulation tools to model humans in applications such as social science.\nHowever, one fundamental question remains: can LLM agents really simulate human\nbehaviors? In this paper, we focus on one of the most critical behaviors in\nhuman interactions, trust, and aim to investigate whether or not LLM agents can\nsimulate human trust behaviors. We first find that LLM agents generally exhibit\ntrust behaviors, referred to as agent trust, under the framework of Trust\nGames, which are widely recognized in behavioral economics. Then, we discover\nthat LLM agents can have high behavioral alignment with humans regarding trust\nbehaviors, indicating the feasibility of simulating human trust behaviors with\nLLM agents. In addition, we probe into the biases in agent trust and the\ndifferences in agent trust towards agents and humans. 
We also explore the\nintrinsic properties of agent trust under conditions including advanced\nreasoning strategies and external manipulations. We further offer important\nimplications for various scenarios where trust is paramount. Our study\nrepresents a significant step in understanding the behaviors of LLM agents and\nthe LLM-human analogy.\n","authors":["Chengxing Xie","Canyu Chen","Feiran Jia","Ziyu Ye","Kai Shu","Adel Bibi","Ziniu Hu","Philip Torr","Bernard Ghanem","Guohao Li"],"pdf_url":"https://arxiv.org/pdf/2402.04559v1.pdf","comment":"The first two authors contributed equally. Project website:\n https://www.camel-ai.org/research/agent-trust"},{"id":"http://arxiv.org/abs/2402.04542v1","updated":"2024-02-07T02:59:18Z","published":"2024-02-07T02:59:18Z","title":"Share What You Already Know: Cross-Language-Script Transfer and\n Alignment for Sentiment Detection in Code-Mixed Data","summary":" Code-switching entails mixing multiple languages. It is an increasingly\noccurring phenomenon in social media texts. Usually, code-mixed texts are\nwritten in a single script, even though the languages involved have different\nscripts. Pre-trained multilingual models primarily utilize the data in the\nnative script of the language. In existing studies, the code-switched texts are\nutilized as they are. However, using the native script for each language can\ngenerate better representations of the text owing to the pre-trained knowledge.\nTherefore, a cross-language-script knowledge sharing architecture utilizing the\ncross attention and alignment of the representations of text in individual\nlanguage scripts was proposed in this study. Experimental results on two\ndifferent datasets containing Nepali-English and Hindi-English code-switched\ntexts, demonstrate the effectiveness of the proposed method. The interpretation\nof the model using model explainability technique illustrates the sharing of\nlanguage-specific knowledge between language-specific representations.\n","authors":["Niraj Pahari","Kazutaka Shimada"],"pdf_url":"https://arxiv.org/pdf/2402.04542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16421v2","updated":"2024-02-07T02:38:02Z","published":"2023-11-28T02:01:25Z","title":"CDEval: A Benchmark for Measuring the Cultural Dimensions of Large\n Language Models","summary":" As the scaling of Large Language Models (LLMs) has dramatically enhanced\ntheir capabilities, there has been a growing focus on the alignment problem to\nensure their responsible and ethical use. While existing alignment efforts\npredominantly concentrate on universal values such as the HHH principle, the\naspect of culture, which is inherently pluralistic and diverse, has not\nreceived adequate attention. This work introduces a new benchmark, CDEval,\naimed at evaluating the cultural dimensions of LLMs. CDEval is constructed by\nincorporating both GPT-4's automated generation and human verification,\ncovering six cultural dimensions across seven domains. Our comprehensive\nexperiments provide intriguing insights into the culture of mainstream LLMs,\nhighlighting both consistencies and variations across different dimensions and\ndomains. The findings underscore the importance of integrating cultural\nconsiderations in LLM development, particularly for applications in diverse\ncultural settings. Through CDEval, we aim to broaden the horizon of LLM\nalignment research by including cultural dimensions, thus providing a more\nholistic framework for the future development and evaluation of LLMs. 
This\nbenchmark serves as a valuable resource for cultural studies in LLMs, paving\nthe way for more culturally aware and sensitive models.\n","authors":["Yuhang Wang","Yanxu Zhu","Chao Kong","Shuyu Wei","Xiaoyuan Yi","Xing Xie","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2311.16421v2.pdf","comment":"Work in process"},{"id":"http://arxiv.org/abs/2402.04523v1","updated":"2024-02-07T02:06:48Z","published":"2024-02-07T02:06:48Z","title":"SumRec: A Framework for Recommendation using Open-Domain Dialogue","summary":" Chat dialogues contain considerable useful information about a speaker's\ninterests, preferences, and experiences. Thus, knowledge from open-domain chat\ndialogue can be used to personalize various systems and offer recommendations\nfor advanced information. This study proposes a novel framework, SumRec, for\nrecommending information from open-domain chat dialogue. The study also examined\nthe framework using ChatRec, a newly constructed dataset for training and\nevaluation. To extract the speaker and item characteristics, the SumRec\nframework employs a large language model (LLM) to generate a summary of the\nspeaker information from a dialogue and to recommend information about an item\naccording to the type of user. The speaker and item information are then input\ninto a score estimation model, generating a recommendation score. Experimental\nresults show that the SumRec framework provides better recommendations than the\nbaseline method of using dialogues and item descriptions in their original\nform. Our dataset and code are publicly available at\nhttps://github.com/Ryutaro-A/SumRec\n","authors":["Ryutaro Asahara","Masaki Takahashi","Chiho Iwahashi","Michimasa Inaba"],"pdf_url":"https://arxiv.org/pdf/2402.04523v1.pdf","comment":"Accepted to PACLIC 2023"},{"id":"http://arxiv.org/abs/2402.04513v1","updated":"2024-02-07T01:46:50Z","published":"2024-02-07T01:46:50Z","title":"Online Cascade Learning for Efficient Inference over Streams","summary":" Large Language Models (LLMs) have a natural role in answering complex queries\nabout data streams, but the high computational cost of LLM inference makes them\ninfeasible in many such tasks. We propose online cascade learning, the first\napproach to addressing this challenge. The objective here is to learn a\n\"cascade\" of models, starting with lower-capacity models (such as logistic\nregressors) and ending with a powerful LLM, along with a deferral policy that\ndetermines the model that is used on a given input. We formulate the task of\nlearning cascades online as an imitation-learning problem and give a no-regret\nalgorithm for the problem. Experimental results across four benchmarks show\nthat our method parallels LLMs in accuracy while cutting down inference costs\nby as much as 90%, underscoring its efficacy and adaptability in stream\nprocessing.\n","authors":["Lunyiu Nie","Zhimin Ding","Erdong Hu","Christopher Jermaine","Swarat Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2402.04513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04505v1","updated":"2024-02-07T01:18:55Z","published":"2024-02-07T01:18:55Z","title":"Developments in Sheaf-Theoretic Models of Natural Language Ambiguities","summary":" Sheaves are mathematical objects consisting of a base which constitutes a\ntopological space and the data associated with each open set thereof, e.g.\ncontinuous functions defined on the open sets. Sheaves were originally\nused in algebraic topology and logic. 
Recently, they have also modelled events\nsuch as physical experiments and natural language disambiguation processes. We\nextend the latter models from lexical ambiguities to discourse ambiguities\narising from anaphora. To begin, we calculated a new measure of contextuality\nfor a dataset of basic anaphoric discourses, resulting in a higher proportion\nof contextual models--82.9%--compared to previous work, which only yielded 3.17%\ncontextual models. Then, we show how an extension of the natural language\nprocessing challenge, known as the Winograd Schema, which involves anaphoric\nambiguities, can be modelled on the Bell-CHSH scenario with a contextual\nfraction of 0.096.\n","authors":["Kin Ian Lo","Mehrnoosh Sadrzadeh","Shane Mansfield"],"pdf_url":"https://arxiv.org/pdf/2402.04505v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2308.16498"},{"id":"http://arxiv.org/abs/2402.04497v1","updated":"2024-02-07T00:45:31Z","published":"2024-02-07T00:45:31Z","title":"The Fine-Grained Complexity of Gradient Computation for Training Large\n Language Models","summary":" Large language models (LLMs) have made fundamental contributions over the\nlast few years. To train an LLM, one needs to alternatingly run `forward'\ncomputations and `backward' computations. The forward computation can be viewed\nas attention function evaluation, and the backward computation can be viewed as\na gradient computation. In previous work by [Alman and Song, NeurIPS 2023], it\nwas proved that the forward step can be performed in almost-linear time in\ncertain parameter regimes, but that there is no truly sub-quadratic time\nalgorithm in the remaining parameter regimes unless the popular hypothesis SETH\nis false. In this work, we show nearly identical results for the harder-seeming\nproblem of computing the gradient of the loss function of a one-layer attention\nnetwork, and thus for the entire process of LLM training. This completely\ncharacterizes the fine-grained complexity of every step of LLM training.\n","authors":["Josh Alman","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2402.04497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04492v1","updated":"2024-02-07T00:31:49Z","published":"2024-02-07T00:31:49Z","title":"ColorSwap: A Color and Word Order Dataset for Multimodal Evaluation","summary":" This paper introduces the ColorSwap dataset, designed to assess and improve\nthe proficiency of multimodal models in matching objects with their colors. The\ndataset is comprised of 2,000 unique image-caption pairs, grouped into 1,000\nexamples. Each example includes a caption-image pair, along with a\n``color-swapped'' pair. We follow the Winoground schema: the two captions in an\nexample have the same words, but the color words have been rearranged to modify\ndifferent objects. The dataset was created through a novel blend of automated\ncaption and image generation with humans in the loop. We evaluate image-text\nmatching (ITM) and visual language models (VLMs) and find that even the latest\nones are still not robust at this task. GPT-4V and LLaVA score 72% and 42% on\nour main VLM metric, although they may improve with more advanced prompting\ntechniques. On the main ITM metric, contrastive models such as CLIP and SigLIP\nperform close to chance (at 12% and 30%, respectively), although the\nnon-contrastive BLIP ITM model is stronger (87%). We also find that finetuning\non fewer than 2,000 examples yields significant performance gains on this\nout-of-distribution word-order understanding task. 
The dataset is here:\nhttps://github.com/Top34051/colorswap.\n","authors":["Jirayu Burapacheep","Ishan Gaur","Agam Bhatia","Tristan Thrush"],"pdf_url":"https://arxiv.org/pdf/2402.04492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01729v2","updated":"2024-02-07T00:31:40Z","published":"2024-01-28T08:56:49Z","title":"Contextualization Distillation from Large Language Model for Knowledge\n Graph Completion","summary":" While textual information significantly enhances the performance of\npre-trained language models (PLMs) in knowledge graph completion (KGC), the\nstatic and noisy nature of existing corpora collected from Wikipedia articles\nor synsets definitions often limits the potential of PLM-based KGC models. To\nsurmount these challenges, we introduce the Contextualization Distillation\nstrategy, a versatile plug-in-and-play approach compatible with both\ndiscriminative and generative KGC frameworks. Our method begins by instructing\nlarge language models (LLMs) to transform compact, structural triplets into\ncontext-rich segments. Subsequently, we introduce two tailored auxiliary tasks,\nreconstruction and contextualization, allowing smaller KGC models to assimilate\ninsights from these enriched triplets. Comprehensive evaluations across diverse\ndatasets and KGC techniques highlight the efficacy and adaptability of our\napproach, revealing consistent performance enhancements irrespective of\nunderlying pipelines or architectures. Moreover, our analysis makes our method\nmore explainable and provides insight into generating path selection, as well\nas the choosing of suitable distillation tasks. All the code and data in this\nwork will be released at\nhttps://github.com/David-Li0406/Contextulization-Distillation\n","authors":["Dawei Li","Zhen Tan","Tianlong Chen","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01729v2.pdf","comment":"Accepted by EACL 2024 findings v2: revise the citation problem"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2402.05106v1","updated":"2024-02-07T18:57:37Z","published":"2024-02-07T18:57:37Z","title":"Image captioning for Brazilian Portuguese using GRIT model","summary":" This work presents the early development of a model of image captioning for\nthe Brazilian Portuguese language. We used the GRIT (Grid - and Region-based\nImage captioning Transformer) model to accomplish this work. GRIT is a\nTransformer-only neural architecture that effectively utilizes two visual\nfeatures to generate better captions. The GRIT method emerged as a proposal to\nbe a more efficient way to generate image captioning. In this work, we adapt\nthe GRIT model to be trained in a Brazilian Portuguese dataset to have an image\ncaptioning method for the Brazilian Portuguese Language.\n","authors":["Rafael Silva de Alencar","William Alberto Cruz Castañeda","Marcellus Amadeus"],"pdf_url":"https://arxiv.org/pdf/2402.05106v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2207.09666 by other authors"},{"id":"http://arxiv.org/abs/2308.07545v3","updated":"2024-02-07T18:57:27Z","published":"2023-08-15T03:22:40Z","title":"Vision-Language Dataset Distillation","summary":" Dataset distillation methods reduce large-scale datasets to smaller sets of\nsynthetic data, which preserve sufficient information for quickly training a\nnew model from scratch. However, prior work on dataset distillation has focused\nexclusively on image classification datasets, whereas modern large-scale\ndatasets are primarily in the vision-language space. 
In this work, we design\nthe first vision-language dataset distillation method, building on the idea of\ntrajectory matching. A key challenge is that vision-language datasets do not\nhave a set of discrete classes. To overcome this, our proposed method jointly\ndistills the image-text pairs in a contrastive formulation. Further, we\nleverage Low-Rank Adaptation (LoRA) matching to enable more efficient and\neffective trajectory matching in complex modern vision-language models. Since\nthere are no existing baselines, we compare our distillation approach to three\nadapted vision-language coreset selection methods. We demonstrate significant\nimprovements on the challenging Flickr30K and COCO retrieval benchmarks: for\nexample, on Flickr30K, the best coreset selection method selecting 1000\nimage-text pairs for training achieves only 5.6% image-to-text retrieval\naccuracy (i.e., recall@1); in contrast, our dataset distillation approach\nalmost doubles that to 9.9% with just 100 (an order of magnitude fewer)\ntraining pairs.\n","authors":["Xindi Wu","Byron Zhang","Zhiwei Deng","Olga Russakovsky"],"pdf_url":"https://arxiv.org/pdf/2308.07545v3.pdf","comment":"29 pages, 13 figures"},{"id":"http://arxiv.org/abs/2402.05090v1","updated":"2024-02-07T18:44:27Z","published":"2024-02-07T18:44:27Z","title":"Language-Based Augmentation to Address Shortcut Learning in Object Goal\n Navigation","summary":" Deep Reinforcement Learning (DRL) has shown great potential in enabling\nrobots to find certain objects (e.g., `find a fridge') in environments like\nhomes or schools. This task is known as Object-Goal Navigation (ObjectNav). DRL\nmethods are predominantly trained and evaluated using environment simulators.\nAlthough DRL has shown impressive results, the simulators may be biased or\nlimited. This creates a risk of shortcut learning, i.e., learning a policy\ntailored to specific visual details of training environments. We aim to deepen\nour understanding of shortcut learning in ObjectNav, its implications and\npropose a solution. We design an experiment for inserting a shortcut bias in\nthe appearance of training environments. As a proof-of-concept, we associate\nroom types to specific wall colors (e.g., bedrooms with green walls), and\nobserve poor generalization of a state-of-the-art (SOTA) ObjectNav method to\nenvironments where this is not the case (e.g., bedrooms with blue walls). We\nfind that shortcut learning is the root cause: the agent learns to navigate to\ntarget objects, by simply searching for the associated wall color of the target\nobject's room. To solve this, we propose Language-Based (L-B) augmentation. Our\nkey insight is that we can leverage the multimodal feature space of a\nVision-Language Model (VLM) to augment visual representations directly at the\nfeature-level, requiring no changes to the simulator, and only an addition of\none layer to the model. 
Where the SOTA ObjectNav method's success rate drops\n69%, our proposal has only a drop of 23%.\n","authors":["Dennis Hoftijzer","Gertjan Burghouts","Luuk Spreeuwers"],"pdf_url":"https://arxiv.org/pdf/2402.05090v1.pdf","comment":"8 pages, 6 figures, to be published in IEEE IRC 2023"},{"id":"http://arxiv.org/abs/2402.05079v1","updated":"2024-02-07T18:33:04Z","published":"2024-02-07T18:33:04Z","title":"Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation","summary":" In recent advancements in medical image analysis, Convolutional Neural\nNetworks (CNN) and Vision Transformers (ViT) have set significant benchmarks.\nWhile the former excels in capturing local features through its convolution\noperations, the latter achieves remarkable global context understanding by\nleveraging self-attention mechanisms. However, both architectures exhibit\nlimitations in efficiently modeling long-range dependencies within medical\nimages, which is a critical aspect for precise segmentation. Inspired by the\nMamba architecture, known for its proficiency in handling long sequences and\nglobal contextual information with enhanced computational efficiency as a State\nSpace Model (SSM), we propose Mamba-UNet, a novel architecture that synergizes\nthe U-Net in medical image segmentation with Mamba's capability. Mamba-UNet\nadopts a pure Visual Mamba (VMamba)-based encoder-decoder structure, infused\nwith skip connections to preserve spatial information across different scales\nof the network. This design facilitates a comprehensive feature learning\nprocess, capturing intricate details and broader semantic contexts within\nmedical images. We introduce a novel integration mechanism within the VMamba\nblocks to ensure seamless connectivity and information flow between the encoder\nand decoder paths, enhancing the segmentation performance. We conducted\nexperiments on publicly available MRI cardiac multi-structures segmentation\ndataset. The results show that Mamba-UNet outperforms UNet, Swin-UNet in\nmedical image segmentation under the same hyper-parameter setting. The source\ncode and baseline implementations are available.\n","authors":["Ziyang Wang","Jian-Qing Zheng","Yichi Zhang","Ge Cui","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2402.05079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03561v2","updated":"2024-02-07T18:02:51Z","published":"2024-02-05T22:20:19Z","title":"VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language\n Navigation","summary":" Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate\nthrough realistic 3D outdoor environments based on natural language\ninstructions. The performance of existing VLN methods is limited by\ninsufficient diversity in navigation environments and limited training data. To\naddress these issues, we propose VLN-Video, which utilizes the diverse outdoor\nenvironments present in driving videos in multiple cities in the U.S. augmented\nwith automatically generated navigation instructions and actions to improve\noutdoor VLN performance. VLN-Video combines the best of intuitive classical\napproaches and modern deep learning techniques, using template infilling to\ngenerate grounded navigation instructions, combined with an image rotation\nsimilarity-based navigation action predictor to obtain VLN style data from\ndriving videos for pretraining deep learning VLN models. 
We pre-train the model\non the Touchdown dataset and our video-augmented dataset created from driving\nvideos with three proxy tasks: Masked Language Modeling, Instruction and\nTrajectory Matching, and Next Action Prediction, so as to learn\ntemporally-aware and visually-aligned instruction representations. The learned\ninstruction representation is adapted to the state-of-the-art navigator when\nfine-tuning on the Touchdown dataset. Empirical results demonstrate that\nVLN-Video significantly outperforms previous state-of-the-art models by 2.1% in\ntask completion rate, achieving a new state-of-the-art on the Touchdown\ndataset.\n","authors":["Jialu Li","Aishwarya Padmakumar","Gaurav Sukhatme","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2402.03561v2.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2402.05054v1","updated":"2024-02-07T17:57:03Z","published":"2024-02-07T17:57:03Z","title":"LGM: Large Multi-View Gaussian Model for High-Resolution 3D Content\n Creation","summary":" 3D content creation has achieved significant progress in terms of both\nquality and speed. Although current feed-forward models can produce 3D objects\nin seconds, their resolution is constrained by the intensive computation\nrequired during training. In this paper, we introduce Large Multi-View Gaussian\nModel (LGM), a novel framework designed to generate high-resolution 3D models\nfrom text prompts or single-view images. Our key insights are two-fold: 1) 3D\nRepresentation: We propose multi-view Gaussian features as an efficient yet\npowerful representation, which can then be fused together for differentiable\nrendering. 2) 3D Backbone: We present an asymmetric U-Net as a high-throughput\nbackbone operating on multi-view images, which can be produced from text or\nsingle-view image input by leveraging multi-view diffusion models. Extensive\nexperiments demonstrate the high fidelity and efficiency of our approach.\nNotably, we maintain the fast speed to generate 3D objects within 5 seconds\nwhile boosting the training resolution to 512, thereby achieving\nhigh-resolution 3D content generation.\n","authors":["Jiaxiang Tang","Zhaoxi Chen","Xiaokang Chen","Tengfei Wang","Gang Zeng","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2402.05054v1.pdf","comment":"Project page: https://me.kiui.moe/lgm/"},{"id":"http://arxiv.org/abs/2402.05045v1","updated":"2024-02-07T17:34:32Z","published":"2024-02-07T17:34:32Z","title":"Efficient Multi-Resolution Fusion for Remote Sensing Data with Label\n Uncertainty","summary":" Multi-modal sensor data fusion takes advantage of complementary or\nreinforcing information from each sensor and can boost overall performance in\napplications such as scene classification and target detection. This paper\npresents a new method for fusing multi-modal and multi-resolution remote sensor\ndata without requiring pixel-level training labels, which can be difficult to\nobtain. Previously, we developed a Multiple Instance Multi-Resolution Fusion\n(MIMRF) framework that addresses label uncertainty for fusion, but it can be\nslow to train due to the large search space for the fuzzy measures used to\nintegrate sensor data sources. We propose a new method based on binary fuzzy\nmeasures, which reduces the search space and significantly improves the\nefficiency of the MIMRF framework. 
We present experimental results on synthetic\ndata and a real-world remote sensing detection task and show that the proposed\nMIMRF-BFM algorithm can effectively and efficiently perform multi-resolution\nfusion given remote sensing data with uncertainty.\n","authors":["Hersh Vakharia","Xiaoxiao Du"],"pdf_url":"https://arxiv.org/pdf/2402.05045v1.pdf","comment":"4 pages, 3 figures, 2 tables; Accepted to International Geoscience\n and Remote Sensing Symposium (IGARSS) 2023; Code available at\n https://github.com/hvak/MIMRF-BFM"},{"id":"http://arxiv.org/abs/2401.06704v2","updated":"2024-02-07T17:26:53Z","published":"2024-01-12T17:10:52Z","title":"Scalable 3D Panoptic Segmentation As Superpoint Graph Clustering","summary":" We introduce a highly efficient method for panoptic segmentation of large 3D\npoint clouds by redefining this task as a scalable graph clustering problem.\nThis approach can be trained using only local auxiliary tasks, thereby\neliminating the resource-intensive instance-matching step during training.\nMoreover, our formulation can easily be adapted to the superpoint paradigm,\nfurther increasing its efficiency. This allows our model to process scenes with\nmillions of points and thousands of objects in a single inference. Our method,\ncalled SuperCluster, achieves a new state-of-the-art panoptic segmentation\nperformance for two indoor scanning datasets: $50.1$ PQ ($+7.8$) for S3DIS\nArea~5, and $58.7$ PQ ($+25.2$) for ScanNetV2. We also set the first\nstate-of-the-art for two large-scale mobile mapping benchmarks: KITTI-360 and\nDALES. With only $209$k parameters, our model is over $30$ times smaller than\nthe best-competing method and trains up to $15$ times faster. Our code and\npretrained models are available at\nhttps://github.com/drprojects/superpoint_transformer.\n","authors":["Damien Robert","Hugo Raguet","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2401.06704v2.pdf","comment":"Accepted at 3DV 2024, Oral presentation"},{"id":"http://arxiv.org/abs/2402.05035v1","updated":"2024-02-07T17:08:27Z","published":"2024-02-07T17:08:27Z","title":"A Survey on Domain Generalization for Medical Image Analysis","summary":" Medical Image Analysis (MedIA) has emerged as a crucial tool in\ncomputer-aided diagnosis systems, particularly with the advancement of deep\nlearning (DL) in recent years. However, well-trained deep models often\nexperience significant performance degradation when deployed in different\nmedical sites, modalities, and sequences, known as a domain shift issue. In\nlight of this, Domain Generalization (DG) for MedIA aims to address the domain\nshift challenge by generalizing effectively and performing robustly across\nunknown data distributions. This paper presents a comprehensive review of\nsubstantial developments in this area. First, we provide a formal definition of\ndomain shift and domain generalization in the medical field, and discuss several\nrelated settings. Subsequently, we summarize the recent methods from three\nviewpoints: data manipulation level, feature representation level, and model\ntraining level, and present some algorithms in detail for each viewpoint.\nFurthermore, we introduce the commonly used datasets. 
Finally, we summarize\nexisting literature and present some potential research topics for the future.\nFor this survey, we also created a GitHub project by collecting the supporting\nresources, at the link: https://github.com/Ziwei-Niu/DG_for_MedIA\n","authors":["Ziwei Niu","Shuyi Ouyang","Shiao Xie","Yen-wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2402.05035v1.pdf","comment":"Submitted to IJCAI 2024, 9 pages"},{"id":"http://arxiv.org/abs/2402.05008v1","updated":"2024-02-07T16:28:36Z","published":"2024-02-07T16:28:36Z","title":"EfficientViT-SAM: Accelerated Segment Anything Model Without Performance\n Loss","summary":" We present EfficientViT-SAM, a new family of accelerated segment anything\nmodels. We retain SAM's lightweight prompt encoder and mask decoder while\nreplacing the heavy image encoder with EfficientViT. For the training, we begin\nwith the knowledge distillation from the SAM-ViT-H image encoder to\nEfficientViT. Subsequently, we conduct end-to-end training on the SA-1B\ndataset. Benefiting from EfficientViT's efficiency and capacity,\nEfficientViT-SAM delivers 48.9x measured TensorRT speedup on A100 GPU over\nSAM-ViT-H without sacrificing performance. Our code and pre-trained models are\nreleased at https://github.com/mit-han-lab/efficientvit.\n","authors":["Zhuoyang Zhang","Han Cai","Song Han"],"pdf_url":"https://arxiv.org/pdf/2402.05008v1.pdf","comment":"tech report"},{"id":"http://arxiv.org/abs/2307.04081v3","updated":"2024-02-07T16:00:21Z","published":"2023-07-09T01:41:22Z","title":"Score-based Conditional Generation with Fewer Labeled Data by\n Self-calibrating Classifier Guidance","summary":" Score-based generative models (SGMs) are a popular family of deep generative\nmodels that achieve leading image generation quality. Early studies extend SGMs\nto tackle class-conditional generation by coupling an unconditional SGM with\nthe guidance of a trained classifier. Nevertheless, such classifier-guided SGMs\ndo not always achieve accurate conditional generation, especially when trained\nwith fewer labeled data. We argue that the problem is rooted in the\nclassifier's tendency to overfit without coordinating with the underlying\nunconditional distribution. To make the classifier respect the unconditional\ndistribution, we propose improving classifier-guided SGMs by letting the\nclassifier regularize itself. The key idea of our proposed method is to use\nprinciples from energy-based models to convert the classifier into another view\nof the unconditional SGM. Existing losses for unconditional SGMs can then be\nleveraged to achieve regularization by calibrating the classifier's internal\nunconditional scores. The regularization scheme can be applied to not only the\nlabeled data but also unlabeled ones to further improve the classifier. Across\nvarious percentages of fewer labeled data, empirical results show that the\nproposed approach significantly enhances conditional generation quality. 
The\nenhancements confirm the potential of the proposed self-calibration technique\nfor generative modeling with limited labeled data.\n","authors":["Paul Kuo-Ming Huang","Si-An Chen","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2307.04081v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04979v1","updated":"2024-02-07T15:57:28Z","published":"2024-02-07T15:57:28Z","title":"Detection and Pose Estimation of flat, Texture-less Industry Objects on\n HoloLens using synthetic Training","summary":" Current state-of-the-art 6d pose estimation is too compute intensive to be\ndeployed on edge devices, such as Microsoft HoloLens (2) or Apple iPad, both\nused for an increasing number of augmented reality applications. The quality of\nAR is greatly dependent on its capabilities to detect and overlay geometry\nwithin the scene. We propose a synthetically trained client-server-based\naugmented reality application, demonstrating state-of-the-art object pose\nestimation of metallic and texture-less industry objects on edge devices.\nSynthetic data enables training without real photographs, e.g., for\nyet-to-be-manufactured objects. Our qualitative evaluation on an AR-assisted\nsorting task, and quantitative evaluation on both renderings and\nreal-world data recorded on HoloLens 2, shed light on its real-world\napplicability.\n","authors":["Thomas Pöllabauer","Fabian Rücker","Andreas Franek","Felix Gorschlüter"],"pdf_url":"https://arxiv.org/pdf/2402.04979v1.pdf","comment":"Scandinavian Conference on Image Analysis 2023"},{"id":"http://arxiv.org/abs/2402.04967v1","updated":"2024-02-07T15:44:55Z","published":"2024-02-07T15:44:55Z","title":"Text or Image? What is More Important in Cross-Domain Generalization\n Capabilities of Hate Meme Detection Models?","summary":" This paper delves into the formidable challenge of cross-domain\ngeneralization in multimodal hate meme detection, presenting compelling\nfindings. We provide substantial evidence supporting the hypothesis that\nonly the textual component of hateful memes enables the existing multimodal\nclassifier to generalize across different domains, while the image component\nproves highly sensitive to a specific training dataset. The evidence includes\ndemonstrations showing that hate-text classifiers perform similarly to\nhate-meme classifiers in a zero-shot setting. Simultaneously, the introduction\nof captions generated from images of memes to the hate-meme classifier worsens\nperformance by an average F1 of 0.02. Through blackbox explanations, we\nidentify a substantial contribution of the text modality (average of 83%),\nwhich diminishes with the introduction of the meme's image captions (52%).\nAdditionally, our evaluation on a newly created confounder dataset reveals\nhigher performance on text confounders as compared to image confounders with an\naverage $\\Delta$F1 of 0.18.\n","authors":["Piush Aggarwal","Jawar Mehrabanian","Weigang Huang","Özge Alacam","Torsten Zesch"],"pdf_url":"https://arxiv.org/pdf/2402.04967v1.pdf","comment":"Accepted at EACL'2024 Findings"},{"id":"http://arxiv.org/abs/2402.04964v1","updated":"2024-02-07T15:43:50Z","published":"2024-02-07T15:43:50Z","title":"ConvLoRA and AdaBN based Domain Adaptation via Self-Training","summary":" Existing domain adaptation (DA) methods often involve pre-training on the\nsource domain and fine-tuning on the target domain. 
For multi-target domain\nadaptation, having a dedicated/separate fine-tuned network for each target\ndomain, each retaining all the pre-trained model parameters, is prohibitively\nexpensive. To address this limitation, we propose Convolutional Low-Rank\nAdaptation (ConvLoRA). ConvLoRA freezes pre-trained model weights, adds\ntrainable low-rank decomposition matrices to convolutional layers, and\nbackpropagates the gradient through these matrices, thus greatly reducing the\nnumber of trainable parameters. To further boost adaptation, we utilize\nAdaptive Batch Normalization (AdaBN), which computes target-specific running\nstatistics, and use it along with ConvLoRA. Our method has fewer trainable\nparameters and performs better than or on par with large independent fine-tuned\nnetworks (with less than 0.9% trainable parameters of the total base model)\nwhen tested on the segmentation of the Calgary-Campinas dataset containing brain\nMRI images. Our approach is simple yet effective, and can be applied to any\ndeep learning-based architecture that uses convolutional and batch\nnormalization layers. Code is available at:\nhttps://github.com/aleemsidra/ConvLoRA.\n","authors":["Sidra Aleem","Julia Dietlmeier","Eric Arazo","Suzanne Little"],"pdf_url":"https://arxiv.org/pdf/2402.04964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04958v1","updated":"2024-02-07T15:41:01Z","published":"2024-02-07T15:41:01Z","title":"Channel-Selective Normalization for Label-Shift Robust Test-Time\n Adaptation","summary":" Deep neural networks have useful applications in many different tasks;\nhowever, their performance can be severely affected by changes in the data\ndistribution. For example, in the biomedical field, their performance can be\naffected by changes in the data (different machines, populations) between\ntraining and test datasets. To ensure robustness and generalization to\nreal-world scenarios, test-time adaptation has been recently studied as an\napproach to adjust models to a new data distribution during inference.\nTest-time batch normalization is a simple and popular method that achieved\ncompelling performance on domain shift benchmarks. It is implemented by\nrecalculating batch normalization statistics on test batches. Prior work has\nfocused on analysis with test data that has the same label distribution as the\ntraining data. However, in many practical applications this technique is\nvulnerable to label distribution shifts, sometimes producing catastrophic\nfailure. This presents a risk in applying test-time adaptation methods in\ndeployment. We propose to tackle this challenge by only selectively adapting\nchannels in a deep network, minimizing drastic adaptation that is sensitive to\nlabel shifts. Our selection scheme is based on two principles that we\nempirically motivate: (1) later layers of networks are more sensitive to label\nshift, and (2) individual features can be sensitive to specific classes. We apply\nthe proposed technique to three classification tasks, including CIFAR10-C,\nImagenet-C, and diagnosis of fatty liver, where we explore both covariate and\nlabel distribution shifts. 
We find that our method brings the benefits\nof TTA while significantly reducing the risk of failure common in other\nmethods, and that it remains robust to the choice of hyperparameters.\n","authors":["Pedro Vianna","Muawiz Chaudhary","Paria Mehrbod","An Tang","Guy Cloutier","Guy Wolf","Michael Eickenberg","Eugene Belilovsky"],"pdf_url":"https://arxiv.org/pdf/2402.04958v1.pdf","comment":"11 pages including references, 7 figures, 2 tables, Appendix"},{"id":"http://arxiv.org/abs/2402.04953v1","updated":"2024-02-07T15:37:17Z","published":"2024-02-07T15:37:17Z","title":"4-Dimensional deformation part model for pose estimation using Kalman\n filter constraints","summary":" The main goal of this article is to analyze the effect on pose estimation\naccuracy when using a Kalman filter added to 4-dimensional deformation part\nmodel partial solutions. Experiments run on two data sets show that\nthis method improves pose estimation accuracy compared with state-of-the-art\nmethods and that a Kalman filter helps to increase this accuracy.\n","authors":["Enrique Martinez-Berti","Antonio-Jose Sanchez-Salmeron","Carlos Ricolfe-Viala"],"pdf_url":"https://arxiv.org/pdf/2402.04953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03462v3","updated":"2024-02-07T15:17:24Z","published":"2023-05-05T12:08:57Z","title":"General Neural Gauge Fields","summary":" The recent advance of neural fields, such as neural radiance fields, has\nsignificantly pushed the boundary of scene representation learning. Aiming to\nboost the computation efficiency and rendering quality of 3D scenes, a popular\nline of research maps the 3D coordinate system to another measuring system,\ne.g., 2D manifolds and hash tables, for modeling neural fields. The conversion\nof coordinate systems is typically dubbed a \\emph{gauge transformation},\nwhich is usually a pre-defined mapping function, e.g., orthogonal projection or\nspatial hash function. This begs a question: can we directly learn a desired\ngauge transformation along with the neural field in an end-to-end manner? In\nthis work, we extend this problem to a general paradigm with a taxonomy of\ndiscrete \\& continuous cases, and develop a learning framework to jointly\noptimize gauge transformations and neural fields. To counter the problem that\nthe learning of gauge transformations can collapse easily, we derive a general\nregularization mechanism from the principle of information conservation during\nthe gauge transformation. To circumvent the high computation cost in gauge\nlearning with regularization, we directly derive an information-invariant gauge\ntransformation which inherently preserves scene information and yields\nsuperior performance. Project: https://fnzhan.com/Neural-Gauge-Fields\n","authors":["Fangneng Zhan","Lingjie Liu","Adam Kortylewski","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2305.03462v3.pdf","comment":"ICLR 2023"},{"id":"http://arxiv.org/abs/2402.03307v2","updated":"2024-02-07T15:09:57Z","published":"2024-02-05T18:59:04Z","title":"4D Gaussian Splatting: Towards Efficient Novel View Synthesis for\n Dynamic Scenes","summary":" We consider the problem of novel view synthesis (NVS) for dynamic scenes.\nRecent neural approaches have accomplished exceptional NVS results for static\n3D scenes, but extensions to 4D time-varying scenes remain non-trivial. 
Prior\nefforts often encode dynamics by learning a canonical space plus implicit or\nexplicit deformation fields, which struggle in challenging scenarios like\nsudden movements or capturing high-fidelity renderings. In this paper, we\nintroduce 4D Gaussian Splatting (4DGS), a novel method that represents dynamic\nscenes with anisotropic 4D XYZT Gaussians, inspired by the success of 3D\nGaussian Splatting in static scenes. We model dynamics at each timestamp by\ntemporally slicing the 4D Gaussians, which naturally compose dynamic 3D\nGaussians and can be seamlessly projected into images. As an explicit\nspatial-temporal representation, 4DGS demonstrates powerful capabilities for\nmodeling complicated dynamics and fine details, especially for scenes with\nabrupt motions. We further implement our temporal slicing and splatting\ntechniques in a highly optimized CUDA acceleration framework, achieving\nreal-time inference rendering speeds of up to 277 FPS on an RTX 3090 GPU and\n583 FPS on an RTX 4090 GPU. Rigorous evaluations on scenes with diverse motions\nshowcase the superior efficiency and effectiveness of 4DGS, which consistently\noutperforms existing methods both quantitatively and qualitatively.\n","authors":["Yuanxing Duan","Fangyin Wei","Qiyu Dai","Yuhang He","Wenzheng Chen","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04930v1","updated":"2024-02-07T14:59:25Z","published":"2024-02-07T14:59:25Z","title":"Blue noise for diffusion models","summary":" Most of the existing diffusion models use Gaussian noise for training and\nsampling across all time steps, which may not optimally account for the\nfrequency contents reconstructed by the denoising network. Despite the diverse\napplications of correlated noise in computer graphics, its potential for\nimproving the training process has been underexplored. In this paper, we\nintroduce a novel and general class of diffusion models taking correlated noise\nwithin and across images into account. More specifically, we propose a\ntime-varying noise model to incorporate correlated noise into the training\nprocess, as well as a method for fast generation of correlated noise masks. Our\nmodel is built upon deterministic diffusion models and utilizes blue noise to\nhelp improve the generation quality compared to using Gaussian white (random)\nnoise only. Further, our framework allows introducing correlation across images\nwithin a single mini-batch to improve gradient flow. We perform both\nqualitative and quantitative evaluations on a variety of datasets using our\nmethod, achieving improvements on different tasks over existing deterministic\ndiffusion models in terms of the FID metric.\n","authors":["Xingchang Huang","Corentin Salaün","Cristina Vasconcelos","Christian Theobalt","Cengiz Öztireli","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2402.04930v1.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2402.04929v1","updated":"2024-02-07T14:56:13Z","published":"2024-02-07T14:56:13Z","title":"Source-Free Domain Adaptation with Diffusion-Guided Source Data\n Generation","summary":" This paper introduces a novel approach to leverage the generalizability\ncapability of Diffusion Models for Source-Free Domain Adaptation (DM-SFDA). Our\nproposed DM-SFDA method involves fine-tuning a pre-trained text-to-image\ndiffusion model to generate source domain images using features from the target\nimages to guide the diffusion process. 
Specifically, the pre-trained diffusion\nmodel is fine-tuned to generate source samples that minimize entropy and\nmaximize confidence for the pre-trained source model. We then apply established\nunsupervised domain adaptation techniques to align the generated source images\nwith target domain data. We validate our approach through comprehensive\nexperiments across a range of datasets, including Office-31, Office-Home, and\nVisDA. The results highlight significant improvements in SFDA performance,\nshowcasing the potential of diffusion models in generating contextually\nrelevant, domain-specific images.\n","authors":["Shivang Chopra","Suraj Kothawade","Houda Aynaou","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2402.04929v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2310.01701"},{"id":"http://arxiv.org/abs/2402.04921v1","updated":"2024-02-07T14:47:08Z","published":"2024-02-07T14:47:08Z","title":"Is Two-shot All You Need? A Label-efficient Approach for Video\n Segmentation in Breast Ultrasound","summary":" Breast lesion segmentation from breast ultrasound (BUS) videos could assist\nin early diagnosis and treatment. Existing video object segmentation (VOS)\nmethods usually require dense annotation, which is often inaccessible for\nmedical datasets. Furthermore, they suffer from accumulative errors and a lack\nof explicit space-time awareness. In this work, we propose a novel two-shot\ntraining paradigm for BUS video segmentation. It is not only able to capture\nfree-range space-time consistency but also utilizes a source-dependent\naugmentation scheme. This label-efficient learning framework is validated on a\nchallenging in-house BUS video dataset. Results showed that it achieved\nperformance comparable to fully annotated counterparts given only 1.9% of the\ntraining labels.\n","authors":["Jiajun Zeng","Ruobing Huang","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2402.04921v1.pdf","comment":"5 pages, 1 figure, 2 tables, accepted by ISBI 2024"},{"id":"http://arxiv.org/abs/2308.07123v2","updated":"2024-02-07T14:37:42Z","published":"2023-08-14T13:10:48Z","title":"An Outlook into the Future of Egocentric Vision","summary":" What will the future be? We wonder! In this survey, we explore the gap\nbetween current research in egocentric vision and the ever-anticipated future,\nwhere wearable computing, with outward facing cameras and digital overlays, is\nexpected to be integrated into our everyday lives. To understand this gap, the\narticle starts by envisaging the future through character-based stories,\nshowcasing through examples the limitations of current technology. We then\nprovide a mapping between this future and previously defined research tasks.\nFor each task, we survey its seminal works, current state-of-the-art\nmethodologies and available datasets, then reflect on shortcomings that limit\nits applicability to future research. Note that this survey focuses on software\nmodels for egocentric vision, independent of any specific hardware. 
The paper\nconcludes with recommendations for areas of immediate exploration so as to\nunlock our path to the future always-on, personalised and life-enhancing\negocentric vision.\n","authors":["Chiara Plizzari","Gabriele Goletto","Antonino Furnari","Siddhant Bansal","Francesco Ragusa","Giovanni Maria Farinella","Dima Damen","Tatiana Tommasi"],"pdf_url":"https://arxiv.org/pdf/2308.07123v2.pdf","comment":"We invite comments, suggestions and corrections here:\n https://openreview.net/forum?id=V3974SUk1w"},{"id":"http://arxiv.org/abs/2402.04883v1","updated":"2024-02-07T14:21:26Z","published":"2024-02-07T14:21:26Z","title":"Toward Accurate Camera-based 3D Object Detection via Cascade Depth\n Estimation and Calibration","summary":" Recent camera-based 3D object detection is limited by the precision of\ntransforming from image to 3D feature spaces, as well as the accuracy of object\nlocalization within the 3D space. This paper aims to address such a fundamental\nproblem of camera-based 3D object detection: How to effectively learn depth\ninformation for accurate feature lifting and object localization. Different\nfrom previous methods which directly predict depth distributions by using a\nsupervised estimation model, we propose a cascade framework consisting of two\ndepth-aware learning paradigms. First, a depth estimation (DE) scheme leverages\nrelative depth information to realize the effective feature lifting from 2D to\n3D spaces. Furthermore, a depth calibration (DC) scheme introduces depth\nreconstruction to further adjust the 3D object localization perturbation along\nthe depth axis. In practice, the DE is explicitly realized by using both the\nabsolute and relative depth optimization loss to promote the precision of depth\nprediction, while the capability of DC is implicitly embedded into the\ndetection Transformer through a depth denoising mechanism in the training\nphase. The entire model training is accomplished in an end-to-end manner.\nWe propose a baseline detector and evaluate the effectiveness of our proposal\nwith +2.2%/+2.7% NDS/mAP improvements on the NuScenes benchmark, and achieve\ncomparable performance with 55.9%/45.7% NDS/mAP. Furthermore, we conduct\nextensive experiments to demonstrate its generality based on various detectors\nwith about +2% NDS improvements.\n","authors":["Chaoqun Wang","Yiran Qin","Zijian Kang","Ningning Ma","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.04883v1.pdf","comment":"Accepted to ICRA2024"},{"id":"http://arxiv.org/abs/2402.04878v1","updated":"2024-02-07T14:18:19Z","published":"2024-02-07T14:18:19Z","title":"STAR: Shape-focused Texture Agnostic Representations for Improved Object\n Detection and 6D Pose Estimation","summary":" Recent advances in machine learning have greatly benefited object detection\nand 6D pose estimation for robotic grasping. However, textureless and metallic\nobjects still pose a significant challenge due to fewer visual cues and the\ntexture bias of CNNs. To address this issue, we propose a texture-agnostic\napproach that focuses on learning from CAD models and emphasizes object shape\nfeatures. To achieve a focus on learning shape features, the textures are\nrandomized during the rendering of the training data. By treating the texture\nas noise, the need for real-world object instances or their final appearance\nduring training data generation is eliminated. 
The TLESS and ITODD datasets,\nspecifically created for industrial settings in robotics and featuring\ntextureless and metallic objects, were used for evaluation. Texture agnosticity\nalso increases the robustness against image perturbations such as imaging\nnoise, motion blur, and brightness changes, which are common in robotics\napplications. Code and datasets are publicly available at\ngithub.com/hoenigpeter/randomized_texturing.\n","authors":["Peter Hönig","Stefan Thalhammer","Jean-Baptiste Weibel","Matthias Hirschmanner","Markus Vincze"],"pdf_url":"https://arxiv.org/pdf/2402.04878v1.pdf","comment":"Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2310.08320v3","updated":"2024-02-07T14:13:05Z","published":"2023-10-12T13:33:04Z","title":"Defending Our Privacy With Backdoors","summary":" The proliferation of large AI models trained on uncurated, often sensitive\nweb-scraped data has raised significant privacy concerns. One of the concerns\nis that adversaries can extract information about the training data using\nprivacy attacks. Unfortunately, the task of removing specific information from\nthe models without sacrificing performance is not straightforward and has\nproven to be challenging. We propose a rather easy yet effective defense based\non backdoor attacks to remove private information such as names and faces of\nindividuals from vision-language models by fine-tuning them for only a few\nminutes instead of re-training them from scratch. Specifically, through\nstrategic insertion of backdoors into text encoders, we align the embeddings of\nsensitive phrases with those of neutral terms, such as \"a person\" instead of the\nperson's actual name. For image encoders, we map embeddings of individuals to\nbe removed from the model to a universal, anonymous embedding. Our empirical\nresults demonstrate the effectiveness of our backdoor-based defense on CLIP by\nassessing its performance using a specialized privacy attack for zero-shot\nclassifiers. Our approach not only provides a new \"dual-use\" perspective on\nbackdoor attacks, but also presents a promising avenue to enhance the privacy\nof individuals within models trained on uncurated web-scraped data.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Daniel Neider","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2310.08320v3.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.04857v1","updated":"2024-02-07T13:54:56Z","published":"2024-02-07T13:54:56Z","title":"Advancing Anomaly Detection: An Adaptation Model and a New Dataset","summary":" Industry surveillance is widely applicable in sectors like retail,\nmanufacturing, education, and smart cities, each presenting unique anomalies\nrequiring specialized detection. However, adapting anomaly detection models to\nnovel viewpoints within the same scenario poses challenges. Extending these\nmodels to entirely new scenarios necessitates retraining or fine-tuning, a\nprocess that can be time-consuming. To address these challenges, we propose the\nScenario-Adaptive Anomaly Detection (SA2D) method, leveraging the few-shot\nlearning framework for faster adaptation of pre-trained models to new concepts.\nDespite this approach, a significant challenge emerges from the absence of a\ncomprehensive dataset with diverse scenarios and camera views. In response, we\nintroduce the Multi-Scenario Anomaly Detection (MSAD) dataset, encompassing 14\ndistinct scenarios captured from various camera views. 
This real-world dataset\nis the first high-resolution anomaly detection dataset, offering a solid\nfoundation for training superior models. MSAD includes diverse normal motion\npatterns, incorporating challenging variations like different lighting and\nweather conditions. Through experimentation, we validate the efficacy of SA2D,\nparticularly when trained on the MSAD dataset. Our results show that SA2D not\nonly excels under novel viewpoints within the same scenario but also\ndemonstrates competitive performance when faced with entirely new scenarios.\nThis highlights our method's potential in addressing challenges in detecting\nanomalies across diverse and evolving surveillance scenarios.\n","authors":["Liyun Zhu","Arjun Raj","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04857v1.pdf","comment":"Research report"},{"id":"http://arxiv.org/abs/2402.04855v1","updated":"2024-02-07T13:54:15Z","published":"2024-02-07T13:54:15Z","title":"Dual-Path Coupled Image Deraining Network via Spatial-Frequency\n Interaction","summary":" Transformers have recently emerged as a significant force in the field of\nimage deraining. Existing image deraining methods make extensive use of\nself-attention. Though showcasing impressive results, they tend to neglect\ncritical frequency information, as self-attention is generally less adept at\ncapturing high-frequency details. To overcome this shortcoming, we have\ndeveloped an innovative Dual-Path Coupled Deraining Network (DPCNet) that\nintegrates information from both spatial and frequency domains through Spatial\nFeature Extraction Block (SFEBlock) and Frequency Feature Extraction Block\n(FFEBlock). We have further introduced an effective Adaptive Fusion Module\n(AFM) for the dual-path feature aggregation. Extensive experiments on six\npublic deraining benchmarks and downstream vision tasks have demonstrated that\nour proposed method not only outperforms the existing state-of-the-art\nderaining method but also achieves visually pleasing results with excellent\nrobustness on downstream vision tasks.\n","authors":["Yuhong He","Aiwen Jiang","Lingfang Jiang","Zhifeng Wang","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17152v3","updated":"2024-02-07T13:53:38Z","published":"2023-03-30T05:19:43Z","title":"Mixed Autoencoder for Self-supervised Visual Representation Learning","summary":" Masked Autoencoder (MAE) has demonstrated superior performance on various\nvision tasks via randomly masking image patches and reconstruction. However,\neffective data augmentation strategies for MAE still remain open questions,\ndifferent from those in contrastive learning that serve as the most important\npart. This paper studies the prevailing mixing augmentation for MAE. We first\ndemonstrate that naive mixing will, in contrast, degrade model performance due\nto the increase of mutual information (MI). To address this, we propose homologous\nrecognition, an auxiliary pretext task, not only to alleviate the increase in MI\nby explicitly requiring each patch to recognize homologous\npatches, but also to perform object-aware self-supervised pre-training for\nbetter downstream dense perception performance. 
With extensive experiments, we\ndemonstrate that our proposed Mixed Autoencoder (MixedAE) achieves\nstate-of-the-art transfer results among masked image modeling (MIM)\naugmentations on different downstream tasks with significant efficiency.\nSpecifically, our MixedAE outperforms MAE by +0.3% accuracy, +1.7 mIoU and +0.9\nAP on ImageNet-1K, ADE20K and COCO, respectively, with a standard ViT-Base.\nMoreover, MixedAE surpasses iBOT, a strong MIM method combined with instance\ndiscrimination, while accelerating training by 2x. To the best of our knowledge, this\nis the first work to consider mixing for MIM from the perspective of\npretext task design. Code will be made available.\n","authors":["Kai Chen","Zhili Liu","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2303.17152v3.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2303.15919v3","updated":"2024-02-07T13:46:35Z","published":"2023-03-28T12:20:52Z","title":"Fully Hyperbolic Convolutional Neural Networks for Computer Vision","summary":" Real-world visual data exhibit intrinsic hierarchical structures that can be\nrepresented effectively in hyperbolic spaces. Hyperbolic neural networks (HNNs)\nare a promising approach for learning feature representations in such spaces.\nHowever, current HNNs in computer vision rely on Euclidean backbones and only\nproject features to the hyperbolic space in the task heads, limiting their\nability to fully leverage the benefits of hyperbolic geometry. To address this,\nwe present HCNN, a fully hyperbolic convolutional neural network (CNN) designed\nfor computer vision tasks. Based on the Lorentz model, we generalize\nfundamental components of CNNs and propose novel formulations of the\nconvolutional layer, batch normalization, and multinomial logistic regression.\nExperiments on standard vision tasks demonstrate the promising performance of\nour HCNN framework in both hybrid and fully hyperbolic settings. Overall, we\nbelieve our contributions provide a foundation for developing more powerful\nHNNs that can better represent complex structures found in image data. Our code\nis publicly available at https://github.com/kschwethelm/HyperbolicCV.\n","authors":["Ahmad Bdeir","Kristian Schwethelm","Niels Landwehr"],"pdf_url":"https://arxiv.org/pdf/2303.15919v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04841v1","updated":"2024-02-07T13:41:53Z","published":"2024-02-07T13:41:53Z","title":"Data-efficient Large Vision Models through Sequential Autoregression","summary":" Training general-purpose vision models on purely sequential visual data,\neschewing linguistic inputs, has heralded a new frontier in visual\nunderstanding. These models are intended to not only comprehend but also\nseamlessly transition to out-of-domain tasks. However, current endeavors are\nhamstrung by an over-reliance on colossal models, exemplified by models with\nupwards of 3B parameters, and the necessity for an extensive corpus of visual\ndata, often comprising a staggering 400B tokens. In this paper, we delve into\nthe development of an efficient, autoregression-based vision model,\ninnovatively architected to operate on a limited dataset. We meticulously\ndemonstrate how this model achieves proficiency in a spectrum of visual tasks\nspanning both high-level and low-level semantic understanding during the\ntesting phase. 
Our empirical evaluations underscore the model's agility in\nadapting to various tasks, heralding a significant reduction in the parameter\nfootprint, and a marked decrease in training data requirements, thereby paving\nthe way for more sustainable and accessible advancements in the field of\ngeneralist vision models. The code is available at\nhttps://github.com/ggjy/DeLVM.\n","authors":["Jianyuan Guo","Zhiwei Hao","Chengcheng Wang","Yehui Tang","Han Wu","Han Hu","Kai Han","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.04841v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2402.04835v1","updated":"2024-02-07T13:32:47Z","published":"2024-02-07T13:32:47Z","title":"SARI: Simplistic Average and Robust Identification based Noisy Partial\n Label Learning","summary":" Partial label learning (PLL) is a weakly-supervised learning paradigm where\neach training instance is paired with a set of candidate labels (partial\nlabel), one of which is the true label. Noisy PLL (NPLL) relaxes this\nconstraint by allowing some partial labels to not contain the true label,\nenhancing the practicality of the problem. Our work centers on NPLL and\npresents a minimalistic framework called SARI that initially assigns\npseudo-labels to images by exploiting the noisy partial labels through a\nweighted nearest neighbour algorithm. These pseudo-label and image pairs are\nthen used to train a deep neural network classifier with label smoothing and\nstandard regularization techniques. The classifier's features and predictions\nare subsequently employed to refine and enhance the accuracy of pseudo-labels.\nSARI combines the strengths of Average Based Strategies (in pseudo labelling)\nand Identification Based Strategies (in classifier training) from the\nliterature. We perform thorough experiments on seven datasets and compare SARI\nagainst nine NPLL and PLL methods from the prior art. SARI achieves\nstate-of-the-art results in almost all studied settings, obtaining substantial\ngains in fine-grained classification and extreme noise settings.\n","authors":["Darshana Saravanan","Naresh Manwani","Vineet Gandhi"],"pdf_url":"https://arxiv.org/pdf/2402.04835v1.pdf","comment":"13 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2402.04829v1","updated":"2024-02-07T13:25:16Z","published":"2024-02-07T13:25:16Z","title":"NeRF as Non-Distant Environment Emitter in Physics-based Inverse\n Rendering","summary":" Physics-based inverse rendering aims to jointly optimize shape, materials,\nand lighting from captured 2D images. Here, lighting is an important part of\nachieving faithful light transport simulation. While the environment map is\ncommonly used as the lighting model in inverse rendering, we show that its\ndistant lighting assumption leads to spatially invariant lighting, which can be\nan inaccurate approximation in real-world inverse rendering. We propose to use\nNeRF as a spatially varying environment lighting model and build an inverse\nrendering pipeline using NeRF as the non-distant environment emitter. By\ncomparing our method with the environment map on real and synthetic datasets,\nwe show that our NeRF-based emitter models the scene lighting more accurately\nand leads to more accurate inverse rendering. 
Project page and video:\nhttps://nerfemitterpbir.github.io/.\n","authors":["Jingwang Ling","Ruihan Yu","Feng Xu","Chun Du","Shuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.04829v1.pdf","comment":"Project page and video: https://nerfemitterpbir.github.io/"},{"id":"http://arxiv.org/abs/2011.08388v2","updated":"2024-02-07T13:23:08Z","published":"2020-11-17T02:55:16Z","title":"Domain Adaptation based Interpretable Image Emotion Recognition using\n Facial Expression Recognition","summary":" A domain adaptation technique has been proposed in this paper to identify the\nemotions in generic images containing facial & non-facial objects and non-human\ncomponents. It addresses the challenge of the insufficient availability of\npre-trained models and well-annotated datasets for image emotion recognition\n(IER). It starts with proposing a facial emotion recognition (FER) system and\nthen moves on to adapting it for image emotion recognition. First, a\ndeep-learning-based FER system has been proposed that classifies a given facial\nimage into discrete emotion classes. Further, an image recognition system has\nbeen proposed that adapts the proposed FER system to recognize the emotions\nportrayed by images using domain adaptation. It classifies the generic images\ninto 'happy,' 'sad,' 'hate,' and 'anger' classes. A novel interpretability\napproach, Divide and Conquer based Shap (DnCShap), has also been proposed to\ninterpret the highly relevant visual features for emotion recognition. The\nproposed system's architecture has been determined through ablation studies, and\nthe experiments are conducted on four FER and four IER datasets. The proposed\nIER system has shown an emotion classification accuracy of 59.61% for the IAPSa\ndataset, 57.83% for the ArtPhoto dataset, 67.93% for the FI dataset, and 55.13%\nfor the EMOTIC dataset. The important visual features leading to a particular\nemotion class have been identified, and the embedding plots for various emotion\nclasses have been analyzed to explain the proposed system's predictions.\n","authors":["Puneet Kumar","Balasubramanian Raman"],"pdf_url":"https://arxiv.org/pdf/2011.08388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02205v3","updated":"2024-02-07T13:09:15Z","published":"2024-02-03T16:38:25Z","title":"GPT-4V as Traffic Assistant: An In-depth Look at Vision Language Model\n on Complex Traffic Events","summary":" The recognition and understanding of traffic incidents, particularly traffic\naccidents, is a topic of paramount importance in the realm of intelligent\ntransportation systems and intelligent vehicles. This area has continually\ncaptured the extensive focus of both the academic and industrial sectors.\nIdentifying and comprehending complex traffic events is highly challenging,\nprimarily due to the intricate nature of traffic environments, diverse\nobservational perspectives, and the multifaceted causes of accidents. These\nfactors have persistently impeded the development of effective solutions. The\nadvent of large vision-language models (VLMs), such as GPT-4V, has introduced\ninnovative approaches to addressing this issue. In this paper, we explore the\nability of GPT-4V with a set of representative traffic incident videos and\ndelve into the model's capacity for understanding these complex traffic\nsituations. 
We observe that GPT-4V demonstrates remarkable cognitive,\nreasoning, and decision-making ability in certain classic traffic events.\nConcurrently, we also identify certain limitations of GPT-4V, which constrain\nits understanding in more intricate scenarios. These limitations merit further\nexploration and resolution.\n","authors":["Xingcheng Zhou","Alois C. Knoll"],"pdf_url":"https://arxiv.org/pdf/2402.02205v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15473v2","updated":"2024-02-07T13:06:33Z","published":"2024-01-27T17:58:42Z","title":"iDeLog: Iterative Dual Spatial and Kinematic Extraction of\n Sigma-Lognormal Parameters","summary":" The Kinematic Theory of rapid movements and its associated Sigma-Lognormal\nmodel have been extensively used in a large variety of applications. While the\nphysical and biological meaning of the model has been widely tested and\nvalidated for rapid movements, some shortcomings have been detected when it is\nused with continuous long and complex movements. To alleviate such drawbacks,\nand inspired by the motor equivalence theory and a conceivable visual feedback,\nthis paper proposes a novel framework to extract the Sigma-Lognormal\nparameters, namely iDeLog. Specifically, iDeLog consists of two steps. The\nfirst one, influenced by the motor equivalence model, separately derives an\ninitial action plan defined by a set of virtual points and angles from the\ntrajectory and a sequence of lognormals from the velocity. In the second step,\nbased on a hypothetical visual feedback compatible with an open-loop motor\ncontrol, the virtual target points of the action plan are iteratively moved to\nimprove the matching between the observed and reconstructed trajectory and\nvelocity. During experiments conducted with handwritten signatures, iDeLog\nobtained promising results compared to the previous development of the\nSigma-Lognormal model.\n","authors":["Miguel A. Ferrer","Moises Diaz","Cristina Carmona-Duarte","Rejean Plamondon"],"pdf_url":"https://arxiv.org/pdf/2401.15473v2.pdf","comment":"Accepted Version published by Transactions on Pattern Analysis and\n Machine Intelligence"},{"id":"http://arxiv.org/abs/2402.04798v1","updated":"2024-02-07T12:38:47Z","published":"2024-02-07T12:38:47Z","title":"Spiking-PhysFormer: Camera-Based Remote Photoplethysmography with\n Parallel Spike-driven Transformer","summary":" Artificial neural networks (ANNs) can help camera-based remote\nphotoplethysmography (rPPG) in measuring cardiac activity and physiological\nsignals from facial videos, such as pulse wave, heart rate, and respiration rate,\nwith better accuracy. However, most existing ANN-based methods require\nsubstantial computing resources, which poses challenges for effective\ndeployment on mobile devices. Spiking neural networks (SNNs), on the other\nhand, hold immense potential for energy-efficient deep learning owing to their\nbinary and event-driven architecture. To the best of our knowledge, we are the\nfirst to introduce SNNs into the realm of rPPG, proposing a hybrid neural\nnetwork (HNN) model, the Spiking-PhysFormer, aimed at reducing power\nconsumption. Specifically, the proposed Spiking-PhysFormer consists of an\nANN-based patch embedding block, SNN-based transformer blocks, and an ANN-based\npredictor head. 
First, to simplify the transformer block while preserving its\ncapacity to aggregate local and global spatio-temporal features, we design a\nparallel spike transformer block to replace sequential sub-blocks.\nAdditionally, we propose a simplified spiking self-attention mechanism that\nomits the value parameter without compromising the model's performance.\nExperiments conducted on four datasets (PURE, UBFC-rPPG, UBFC-Phys, and MMPD)\ndemonstrate that the proposed model achieves a 12.4\\% reduction in power\nconsumption compared to PhysFormer. Additionally, the power consumption of the\ntransformer block is reduced by a factor of 12.2, while maintaining decent\nperformance on par with PhysFormer and other ANN-based models.\n","authors":["Mingxuan Liu","Jiankai Tang","Haoxiang Li","Jiahao Qi","Siwei Li","Kegang Wang","Yuntao Wang","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04798v1.pdf","comment":"Mingxuan Liu and Jiankai Tang are co-first authors of the article"},{"id":"http://arxiv.org/abs/2402.04796v1","updated":"2024-02-07T12:36:54Z","published":"2024-02-07T12:36:54Z","title":"Mesh-based Gaussian Splatting for Real-time Large-scale Deformation","summary":" Neural implicit representations, including Neural Distance Fields and Neural\nRadiance Fields, have demonstrated significant capabilities for reconstructing\nsurfaces with complicated geometry and topology, and generating novel views of\na scene. Nevertheless, it is challenging for users to directly deform or\nmanipulate these implicit representations with large deformations in a\nreal-time fashion. Gaussian Splatting (GS) has recently become a promising\nmethod with explicit geometry for representing static scenes and facilitating\nhigh-quality and real-time synthesis of novel views. However, it cannot be\neasily deformed due to the use of discrete Gaussians and lack of explicit\ntopology. To address this, we develop a novel GS-based method that enables\ninteractive deformation. Our key idea is to design an innovative mesh-based GS\nrepresentation, which is integrated into Gaussian learning and manipulation. 3D\nGaussians are defined over an explicit mesh, and they are bound with each\nother: the rendering of 3D Gaussians guides the mesh face split for adaptive\nrefinement, and the mesh face split directs the splitting of 3D Gaussians.\nMoreover, the explicit mesh constraints help regularize the Gaussian\ndistribution, suppressing poor-quality Gaussians (e.g., misaligned\nGaussians, long-narrow shaped Gaussians), thus enhancing visual quality and\navoiding artifacts during deformation. Based on this representation, we further\nintroduce a large-scale Gaussian deformation technique to enable deformable GS,\nwhich alters the parameters of 3D Gaussians according to the manipulation of\nthe associated mesh. Our method benefits from existing mesh deformation\ndatasets for more realistic data-driven Gaussian deformation. 
Extensive\nexperiments show that our approach achieves high-quality reconstruction and\neffective deformation, while maintaining the promising rendering results at a\nhigh frame rate (65 FPS on average).\n","authors":["Lin Gao","Jie Yang","Bo-Tao Zhang","Jia-Mu Sun","Yu-Jie Yuan","Hongbo Fu","Yu-Kun Lai"],"pdf_url":"https://arxiv.org/pdf/2402.04796v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.04788v1","updated":"2024-02-07T12:28:32Z","published":"2024-02-07T12:28:32Z","title":"MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with\n Vision-Language Benchmark","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\nrecently, showing remarkable potential in artificial general intelligence.\nHowever, assessing the utility of MLLMs presents considerable challenges,\nprimarily due to the absence of multimodal benchmarks that align with human\npreferences. Inspired by LLM-as-a-Judge in LLMs, this paper introduces a novel\nbenchmark, termed MLLM-as-a-Judge, to assess the ability of MLLMs in assisting\njudges across three distinct tasks: Scoring Evaluation, Pair Comparison, and\nBatch Ranking. Our study reveals that, while MLLMs demonstrate remarkable\nhuman-like discernment in Pair Comparisons, there is a significant divergence\nfrom human preferences in Scoring Evaluation and Batch Ranking tasks.\nFurthermore, MLLMs still face challenges in judgment, including diverse biases,\nhallucinatory responses, and inconsistencies, even for advanced models such as\nGPT-4V. These findings emphasize the pressing need for enhancements and further\nresearch efforts regarding MLLMs as fully reliable evaluators. Code and dataset\nare available at https://github.com/Dongping-Chen/MLLM-as-a-Judge.\n","authors":["Dongping Chen","Ruoxi Chen","Shilin Zhang","Yinuo Liu","Yaochen Wang","Huichi Zhou","Qihui Zhang","Pan Zhou","Yao Wan","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2402.04788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15896v2","updated":"2024-02-07T12:20:21Z","published":"2023-05-25T09:50:54Z","title":"MixFormerV2: Efficient Fully Transformer Tracking","summary":" Transformer-based trackers have achieved strong accuracy on the standard\nbenchmarks. However, their efficiency remains an obstacle to practical\ndeployment on both GPU and CPU platforms. In this paper, to overcome this\nissue, we propose a fully transformer tracking framework, coined as\n\\emph{MixFormerV2}, without any dense convolutional operation or complex score\nprediction module. Our key design is to introduce four special prediction\ntokens and concatenate them with the tokens from target template and search\nareas. Then, we apply the unified transformer backbone on this mixed token\nsequence. These prediction tokens are able to capture the complex correlation\nbetween target template and search area via mixed attentions. Based on them, we\ncan easily predict the tracking box and estimate its confidence score through\nsimple MLP heads. To further improve the efficiency of MixFormerV2, we present\na new distillation-based model reduction paradigm, including dense-to-sparse\ndistillation and deep-to-shallow distillation. The former one aims to transfer\nknowledge from the dense-head based MixViT to our fully transformer tracker,\nwhile the latter one is used to prune some layers of the backbone. 
We\ninstantiate two types of MixFormerV2, where the MixFormerV2-B achieves an AUC\nof 70.6\\% on LaSOT and an AUC of 57.4\\% on TNL2k with a high GPU speed of 165\nFPS, and the MixFormerV2-S surpasses FEAR-L by 2.7\\% AUC on LaSOT with a\nreal-time CPU speed.\n","authors":["Yutao Cui","Tianhui Song","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2305.15896v2.pdf","comment":"NIPS2023"},{"id":"http://arxiv.org/abs/2401.15753v2","updated":"2024-02-07T11:47:38Z","published":"2024-01-28T20:30:14Z","title":"An objective comparison of methods for augmented reality in laparoscopic\n liver resection by preoperative-to-intraoperative image fusion","summary":" Augmented reality for laparoscopic liver resection is a visualisation mode\nthat allows a surgeon to localise tumours and vessels embedded within the liver\nby projecting them on top of a laparoscopic image. Preoperative 3D models\nextracted from CT or MRI data are registered to the intraoperative laparoscopic\nimages during this process. In terms of 3D-2D fusion, most of the algorithms\nmake use of anatomical landmarks to guide registration. These landmarks include\nthe liver's inferior ridge, the falciform ligament, and the occluding contours.\nThey are usually marked by hand in both the laparoscopic image and the 3D\nmodel, which is time-consuming and may contain errors if done by an\ninexperienced user. Therefore, there is a need to automate this process so\nthat augmented reality can be used effectively in the operating room. We\npresent the Preoperative-to-Intraoperative Laparoscopic Fusion Challenge\n(P2ILF), held during the Medical Imaging and Computer Assisted Interventions\n(MICCAI 2022) conference, which investigates the possibilities of detecting\nthese landmarks automatically and using them in registration. The challenge was\ndivided into two tasks: 1) A 2D and 3D landmark detection task and 2) a 3D-2D\nregistration task. The teams were provided with training data consisting of 167\nlaparoscopic images and 9 preoperative 3D models from 9 patients, with the\ncorresponding 2D and 3D landmark annotations. A total of 6 teams from 4\ncountries participated, whose proposed methods were evaluated on 16 images and\ntwo preoperative 3D models from two patients. All the teams proposed deep\nlearning-based methods for the 2D and 3D landmark segmentation tasks and\ndifferentiable rendering-based methods for the registration task. Based on the\nexperimental outcomes, we propose three key hypotheses that determine current\nlimitations and future directions for research in this domain.\n","authors":["Sharib Ali","Yamid Espinel","Yueming Jin","Peng Liu","Bianca Güttner","Xukun Zhang","Lihua Zhang","Tom Dowrick","Matthew J. Clarkson","Shiting Xiao","Yifan Wu","Yijun Yang","Lei Zhu","Dai Sun","Lan Li","Micha Pfeiffer","Shahid Farid","Lena Maier-Hein","Emmanuel Buc","Adrien Bartoli"],"pdf_url":"https://arxiv.org/pdf/2401.15753v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2402.04768v1","updated":"2024-02-07T11:37:14Z","published":"2024-02-07T11:37:14Z","title":"Robot Interaction Behavior Generation based on Social Motion Forecasting\n for Human-Robot Interaction","summary":" Integrating robots into populated environments is a complex challenge that\nrequires an understanding of human social dynamics. 
In this work, we propose to\nmodel social motion forecasting in a shared human-robot representation space,\nwhich enables us to synthesize robot motions that interact with humans in\nsocial scenarios despite not observing any robots during motion training. We\ndevelop a transformer-based architecture called ECHO, which operates in the\naforementioned shared space to predict the future motions of the agents\nencountered in social scenarios. Contrary to prior works, we reformulate the\nsocial motion problem as the refinement of the predicted individual motions\nbased on the surrounding agents, which facilitates the training while allowing\nfor single-motion forecasting when only one human is in the scene. We evaluate\nour model in multi-person and human-robot motion forecasting tasks and obtain\nstate-of-the-art performance by a large margin while being efficient and\nperforming in real-time. Additionally, our qualitative results showcase the\neffectiveness of our approach in generating human-robot interaction behaviors\nthat can be controlled via text commands.\n","authors":["Esteve Valls Mascaro","Yashuai Yan","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2402.04768v1.pdf","comment":"Accepted at ICRA 2024"},{"id":"http://arxiv.org/abs/2006.05470v9","updated":"2024-02-07T11:36:15Z","published":"2020-06-09T19:29:49Z","title":"Standardised convolutional filtering for radiomics","summary":" The Image Biomarker Standardisation Initiative (IBSI) aims to improve\nreproducibility of radiomics studies by standardising the computational process\nof extracting image biomarkers (features) from images. We have previously\nestablished reference values for 169 commonly used features, created a standard\nradiomics image processing scheme, and developed reporting guidelines for\nradiomic studies. However, several aspects are not standardised. Here we\npresent a complete version of a reference manual on the use of convolutional\nfilters in radiomics and quantitative image analysis. Filters, such as wavelets\nor Laplacian of Gaussian filters, play an important part in emphasising\nspecific image characteristics such as edges and blobs. Features derived from\nfilter response maps were found to be poorly reproducible. This reference\nmanual provides definitions for convolutional filters, parameters that should\nbe reported, reference feature values, and tests to verify software compliance\nwith the reference standard.\n","authors":["Adrien Depeursinge","Vincent Andrearczyk","Philip Whybra","Joost van Griethuysen","Henning Müller","Roger Schaer","Martin Vallières","Alex Zwanenburg"],"pdf_url":"https://arxiv.org/pdf/2006.05470v9.pdf","comment":"87 pages. For additional information see https://theibsi.github.io/"},{"id":"http://arxiv.org/abs/2402.04762v1","updated":"2024-02-07T11:26:00Z","published":"2024-02-07T11:26:00Z","title":"Color Recognition in Challenging Lighting Environments: CNN Approach","summary":" Light plays a vital role in vision, whether human or machine; the\nperceived color is always based on the lighting conditions of the surroundings.\nResearchers are working to enhance color detection techniques for\ncomputer vision applications. They have proposed several methods\nusing different color detection approaches, but there is still a gap to\nbe filled. To address this issue, a color detection method, which is based on a\nConvolutional Neural Network (CNN), is proposed. 
Firstly, image segmentation is\nperformed using the edge detection segmentation technique to specify the object,\nand then the segmented object is fed to a Convolutional Neural Network\ntrained to detect the color of an object in different lighting conditions. It\nis experimentally verified that our method can substantially enhance the\nrobustness of color detection in different lighting conditions, and that it\nproduces better results than existing methods.\n","authors":["Nizamuddin Maitlo","Nooruddin Noonari","Sajid Ahmed Ghanghro","Sathishkumar Duraisamy","Fayaz Ahmed"],"pdf_url":"https://arxiv.org/pdf/2402.04762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16123v2","updated":"2024-02-07T11:25:28Z","published":"2024-01-29T12:48:56Z","title":"Looking for a better fit? An Incremental Learning Multimodal Object\n Referencing Framework adapting to Individual Drivers","summary":" The rapid advancement of the automotive industry towards automated and\nsemi-automated vehicles has rendered traditional methods of vehicle\ninteraction, such as touch-based and voice command systems, inadequate for a\nwidening range of non-driving related tasks, such as referencing objects\noutside of the vehicle. Consequently, research has shifted toward gestural\ninput (e.g., hand, gaze, and head pose gestures) as a more suitable mode of\ninteraction during driving. However, due to the dynamic nature of driving and\nindividual variation, there are significant differences in drivers' gestural\ninput performance. While, in theory, this inherent variability could be\nmoderated by substantial data-driven machine learning models, prevalent\nmethodologies lean towards constrained, single-instance trained models for\nobject referencing. These models show a limited capacity to continuously adapt\nto the divergent behaviors of individual drivers and the variety of driving\nscenarios. To address this, we propose \\textit{IcRegress}, a novel\nregression-based incremental learning approach that adapts to changing behavior\nand the unique characteristics of drivers engaged in the dual task of driving\nand referencing objects. We suggest a more personalized and adaptable solution\nfor multimodal gestural interfaces, employing continuous lifelong learning to\nenhance driver experience, safety, and convenience. Our approach was evaluated\nusing an outside-the-vehicle object referencing use case, highlighting the\nsuperiority of the incremental learning models adapted over a single trained\nmodel across various driver traits such as handedness, driving experience, and\nnumerous driving conditions. Finally, to facilitate reproducibility, ease\ndeployment, and promote further research, we offer our approach as an\nopen-source framework at \\url{https://github.com/amrgomaaelhady/IcRegress}.\n","authors":["Amr Gomaa","Guillermo Reyes","Michael Feld","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2401.16123v2.pdf","comment":"Accepted for publication in the Proceedings of the 29th International\n Conference on Intelligent User Interfaces (IUI'24), March 18--21, 2024, in\n Greenville, SC, USA"},{"id":"http://arxiv.org/abs/2402.04756v1","updated":"2024-02-07T11:16:34Z","published":"2024-02-07T11:16:34Z","title":"Boundary-aware Contrastive Learning for Semi-supervised Nuclei Instance\n Segmentation","summary":" Semi-supervised segmentation methods have demonstrated promising results in\nnatural scenarios, providing a solution to reduce dependency on manual\nannotation. 
However, these methods face significant challenges when directly\napplied to pathological images due to the subtle color differences between\nnuclei and tissues, as well as the significant morphological variations among\nnuclei. Consequently, the generated pseudo-labels often contain considerable noise,\nespecially at the nuclei boundaries. To address the above problem, this paper\nproposes a boundary-aware contrastive learning network to denoise the boundary\nnoise in a semi-supervised nuclei segmentation task. The model has two key\ndesigns: a low-resolution denoising (LRD) module and a cross-RoI contrastive\nlearning (CRC) module. The LRD improves the smoothness of the nuclei boundary\nby pseudo-label denoising, and the CRC enhances the discrimination between\nforeground and background by boundary feature contrastive learning. We conduct\nextensive experiments to demonstrate the superiority of our proposed method\nover existing semi-supervised instance segmentation methods.\n","authors":["Ye Zhang","Ziyue Wang","Yifeng Wang","Hao Bian","Linghan Cai","Hengrui Li","Lingbo Zhang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.04756v1.pdf","comment":"12 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.04754v1","updated":"2024-02-07T11:12:41Z","published":"2024-02-07T11:12:41Z","title":"Towards Aligned Layout Generation via Diffusion Model with Aesthetic\n Constraints","summary":" Controllable layout generation refers to the process of creating a plausible\nvisual arrangement of elements within a graphic design (e.g., document and web\ndesigns) with constraints representing design intentions. Although recent\ndiffusion-based models have achieved state-of-the-art FID scores, they tend to\nexhibit more pronounced misalignment compared to earlier transformer-based\nmodels. In this work, we propose the $\\textbf{LA}$yout $\\textbf{C}$onstraint\ndiffusion mod$\\textbf{E}$l (LACE), a unified model to handle a broad range of\nlayout generation tasks, such as arranging elements with specified attributes\nand refining or completing a coarse layout design. The model is based on\ncontinuous diffusion models. Compared with existing methods that use discrete\ndiffusion models, continuous state-space design can enable the incorporation of\ndifferentiable aesthetic constraint functions in training. For conditional\ngeneration, we introduce conditions via masked input. Extensive experiment\nresults show that LACE produces high-quality layouts and outperforms existing\nstate-of-the-art baselines.\n","authors":["Jian Chen","Ruiyi Zhang","Yufan Zhou","Changyou Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04754v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04753v1","updated":"2024-02-07T11:12:09Z","published":"2024-02-07T11:12:09Z","title":"Cortical Surface Diffusion Generative Models","summary":" Cortical surface analysis has gained increased prominence, given its\npotential implications for neurological and developmental disorders.\nTraditional vision diffusion models, while effective in generating natural\nimages, present limitations in capturing intricate development patterns in\nneuroimaging due to limited datasets. This is particularly true for generating\ncortical surfaces where individual variability in cortical morphology is high,\nleading to an urgent need for better methods to model brain development and\ndiverse variability inherent across different individuals. 
In this work, we\npropose a novel diffusion model for the generation of cortical surface\nmetrics, using modified surface vision transformers as the principal\narchitecture. We validate our method in the developing Human Connectome Project\n(dHCP); the results suggest our model demonstrates superior performance in\ncapturing the intricate details of evolving cortical surfaces. Furthermore, our\nmodel can generate high-quality realistic samples of cortical surfaces\nconditioned on postmenstrual age (PMA) at scan.\n","authors":["Zhenshan Xie","Simon Dahan","Logan Z. J. Williams","M. Jorge Cardoso","Emma C. Robinson"],"pdf_url":"https://arxiv.org/pdf/2402.04753v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2402.04750v1","updated":"2024-02-07T11:08:05Z","published":"2024-02-07T11:08:05Z","title":"AINS: Affordable Indoor Navigation Solution via Line Color\n Identification Using Mono-Camera for Autonomous Vehicles","summary":" Recently, researchers have been exploring various ways to improve the\neffectiveness and efficiency of autonomous vehicles by researching new methods,\nespecially for indoor scenarios. Autonomous vehicles in indoor navigation\nsystems face many challenges, especially the limited accuracy of GPS in\nindoor scenarios. Several robust methods have been explored for autonomous\nvehicles in indoor scenarios to solve this problem, but the main drawback of\nthe proposed methods is their high deployment cost. To address the\nabove-mentioned problems, we present a low-cost indoor navigation method\nfor autonomous vehicles called Affordable Indoor Navigation Solution (AINS),\nwhich is based on a monocular camera. Our proposed solution is mainly\nbased on a mono camera without relying on various huge or power-inefficient\nsensors to find the path, such as range finders and other navigation sensors.\nOur proposed method shows that we can deploy autonomous vehicle indoor\nnavigation systems while taking into consideration the cost. We can observe\nthat the results shown by our solution are better than existing solutions and\nwe can reduce the estimated error and time consumption.\n","authors":["Nizamuddin Maitlo","Nooruddin Noonari","Kaleem Arshid","Naveed Ahmed","Sathishkumar Duraisamy"],"pdf_url":"https://arxiv.org/pdf/2402.04750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09668v2","updated":"2024-02-07T11:07:29Z","published":"2023-09-18T11:09:11Z","title":"DFormer: Rethinking RGBD Representation Learning for Semantic\n Segmentation","summary":" We present DFormer, a novel RGB-D pretraining framework to learn transferable\nrepresentations for RGB-D segmentation tasks. DFormer has two new key\ninnovations: 1) Unlike previous works that encode RGB-D information with RGB\npretrained backbone, we pretrain the backbone using image-depth pairs from\nImageNet-1K, and hence the DFormer is endowed with the capacity to encode RGB-D\nrepresentations; 2) DFormer comprises a sequence of RGB-D blocks, which are\ntailored for encoding both RGB and depth information through a novel building\nblock design. DFormer avoids the mismatched encoding of the 3D geometry\nrelationships in depth maps by RGB pretrained backbones, which widely exists in\nexisting methods but has not been resolved. We finetune the pretrained DFormer\non two popular RGB-D tasks, i.e., RGB-D semantic segmentation and RGB-D salient\nobject detection, with a lightweight decoder head. 
Experimental results show\nthat our DFormer achieves new state-of-the-art performance on these two tasks\nwith less than half of the computational cost of the current best methods on\ntwo RGB-D semantic segmentation datasets and five RGB-D salient object\ndetection datasets. Our code is available at:\nhttps://github.com/VCIP-RGBD/DFormer.\n","authors":["Bowen Yin","Xuying Zhang","Zhongyu Li","Li Liu","Ming-Ming Cheng","Qibin Hou"],"pdf_url":"https://arxiv.org/pdf/2309.09668v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2312.00639v2","updated":"2024-02-07T10:57:28Z","published":"2023-12-01T14:59:43Z","title":"RefinedFields: Radiance Fields Refinement for Unconstrained Scenes","summary":" Modeling large scenes from unconstrained images has proven to be a major\nchallenge in computer vision. Existing methods tackling in-the-wild scene\nmodeling operate in closed-world settings, where no conditioning on priors\nacquired from real-world images is present. We propose RefinedFields, which is,\nto the best of our knowledge, the first method leveraging pre-trained models to\nimprove in-the-wild scene modeling. We employ pre-trained networks to refine\nK-Planes representations via optimization guidance using an alternating\ntraining procedure. We carry out extensive experiments and verify the merit of\nour method on synthetic data and real tourism photo collections. RefinedFields\nenhances rendered scenes with richer details and outperforms previous work on\nthe task of novel view synthesis in the wild. Our project page can be found at\nhttps://refinedfields.github.io .\n","authors":["Karim Kassab","Antoine Schnepf","Jean-Yves Franceschi","Laurent Caraffa","Jeremie Mary","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2312.00639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05810v3","updated":"2024-02-07T10:37:12Z","published":"2023-08-10T18:09:44Z","title":"Spintronics for image recognition: performance benchmarking via\n ultrafast data-driven simulations","summary":" We present a demonstration of image classification using an echo-state\nnetwork (ESN) relying on a single simulated spintronic nanostructure known as\nthe vortex-based spin-torque oscillator (STVO) delayed in time. We employ an\nultrafast data-driven simulation framework called the data-driven Thiele\nequation approach (DD-TEA) to simulate the STVO dynamics. This allows us to\navoid the challenges associated with repeated experimental manipulation of such\na nanostructured system. We showcase the versatility of our solution by\nsuccessfully applying it to solve classification challenges with the MNIST,\nEMNIST-letters and Fashion MNIST datasets. Through our simulations, we\ndetermine that within an ESN with numerous learnable parameters the results\nobtained using the STVO dynamics as an activation function are comparable to\nthe ones obtained with other conventional nonlinear activation functions like\nthe reLU and the sigmoid. While achieving state-of-the-art accuracy levels on\nthe MNIST dataset, our model's performance on EMNIST-letters and Fashion MNIST\nis lower due to the relative simplicity of the system architecture and the\nincreased complexity of the tasks. 
We expect that the DD-TEA framework will\nenable the exploration of deeper architectures, ultimately leading to improved\nclassification accuracy.\n","authors":["Anatole Moureaux","Chloé Chopin","Simon de Wergifosse","Laurent Jacques","Flavio Abreu Araujo"],"pdf_url":"https://arxiv.org/pdf/2308.05810v3.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2402.04717v1","updated":"2024-02-07T10:09:00Z","published":"2024-02-07T10:09:00Z","title":"InstructScene: Instruction-Driven 3D Indoor Scene Synthesis with\n Semantic Graph Prior","summary":" Comprehending natural language instructions is a charming property for 3D\nindoor scene synthesis systems. Existing methods directly model object joint\ndistributions and express object relations implicitly within a scene, thereby\nhindering the controllability of generation. We introduce InstructScene, a\nnovel generative framework that integrates a semantic graph prior and a layout\ndecoder to improve controllability and fidelity for 3D scene synthesis. The\nproposed semantic graph prior jointly learns scene appearances and layout\ndistributions, exhibiting versatility across various downstream tasks in a\nzero-shot manner. To facilitate the benchmarking for text-driven 3D scene\nsynthesis, we curate a high-quality dataset of scene-instruction pairs with\nlarge language and multimodal models. Extensive experimental results reveal\nthat the proposed method surpasses existing state-of-the-art approaches by a\nlarge margin. Thorough ablation studies confirm the efficacy of crucial design\ncomponents. Project page: https://chenguolin.github.io/projects/InstructScene.\n","authors":["Chenguo Lin","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2402.04717v1.pdf","comment":"Accepted by ICLR 2024 for spotlight presentation; Project page:\n https://chenguolin.github.io/projects/InstructScene"},{"id":"http://arxiv.org/abs/2402.04699v1","updated":"2024-02-07T09:39:29Z","published":"2024-02-07T09:39:29Z","title":"EvoSeed: Unveiling the Threat on Deep Neural Networks with Real-World\n Illusions","summary":" Deep neural networks are exploited using natural adversarial samples, which\nhave no impact on human perception but are misclassified. Current approaches\noften rely on the white-box nature of deep neural networks to generate these\nadversarial samples or alter the distribution of adversarial samples compared\nto training distribution. To alleviate the limitations of current approaches,\nwe propose EvoSeed, a novel evolutionary strategy-based search algorithmic\nframework to generate natural adversarial samples. Our EvoSeed framework uses\nauxiliary Diffusion and Classifier models to operate in a model-agnostic\nblack-box setting. We employ CMA-ES to optimize the search for an adversarial\nseed vector, which, when processed by the Conditional Diffusion Model, results\nin an unrestricted natural adversarial sample misclassified by the Classifier\nModel. Experiments show that generated adversarial images are of high image\nquality and are transferable to different classifiers. Our approach\ndemonstrates promise in enhancing the quality of adversarial samples using\nevolutionary algorithms. We hope our research opens new avenues to enhance the\nrobustness of deep neural networks in real-world scenarios. 
Project Website can\nbe accessed at \url{https://shashankkotyan.github.io/EvoSeed}.\n","authors":["Shashank Kotyan","PoYuan Mao","Danilo Vasconcellos Vargas"],"pdf_url":"https://arxiv.org/pdf/2402.04699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04686v1","updated":"2024-02-07T09:20:01Z","published":"2024-02-07T09:20:01Z","title":"The Influence of Autofocus Lenses in the Camera Calibration Process","summary":" Camera calibration is a crucial step in robotics and computer vision.\nAccurate camera parameters are necessary to achieve robust applications.\nNowadays, the camera calibration process consists of adjusting a set of data\nto a pin-hole model, assuming that with a reprojection error close to zero,\ncamera parameters are correct. Since all camera parameters are unknown, computed\nresults are considered true. However, the pin-hole model does not represent the\ncamera behavior accurately if the focus is considered. Real cameras change the\nfocal length slightly to obtain sharp objects in the image and this feature\nskews the calibration result if a unique pin-hole model is computed with a\nconstant focal length. In this paper, a deep analysis of the camera calibration\nprocess is done to detect and overcome its weaknesses. The camera is mounted\non a robot arm to know the extrinsic camera parameters with accuracy and to be\nable to compare computed results with the true ones. Based on the bias that\nexists between computed results and the true ones, a modification of the widely\naccepted camera calibration method using images of a planar template is\npresented. A pin-hole model with distance-dependent focal length is proposed to\nimprove the calibration process substantially.\n","authors":["Carlos Ricolfe-Viala","Alicia Esparza"],"pdf_url":"https://arxiv.org/pdf/2402.04686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04676v1","updated":"2024-02-07T09:03:04Z","published":"2024-02-07T09:03:04Z","title":"Group Distributionally Robust Dataset Distillation with Risk\n Minimization","summary":" Dataset distillation (DD) has emerged as a widely adopted technique for\ncrafting a synthetic dataset that captures the essential information of a\ntraining dataset, facilitating the training of accurate neural models. Its\napplications span various domains, including transfer learning, federated\nlearning, and neural architecture search. The most popular methods for\nconstructing the synthetic data rely on matching the convergence properties of\ntraining the model with the synthetic dataset and the training dataset.\nHowever, targeting the training dataset must be thought of as auxiliary in the\nsame sense that the training set is an approximate substitute for the\npopulation distribution, and the latter is the data of interest. Yet despite\nits popularity, an aspect that remains unexplored is the relationship of DD to\nits generalization, particularly across uncommon subgroups. That is, how can we\nensure that a model trained on the synthetic dataset performs well when faced\nwith samples from regions with low population density? Here, the\nrepresentativeness and coverage of the dataset become salient over the\nguaranteed training error at inference. 
Drawing inspiration from\ndistributionally robust optimization, we introduce an algorithm that combines\nclustering with the minimization of a risk measure on the loss to conduct DD.\nWe provide a theoretical rationale for our approach and demonstrate its\neffective generalization and robustness across subgroups through numerical\nexperiments.\n","authors":["Saeed Vahidian","Mingyu Wang","Jianyang Gu","Vyacheslav Kungurtsev","Wei Jiang","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14168v2","updated":"2024-02-07T09:02:33Z","published":"2024-01-25T13:27:03Z","title":"Vivim: a Video Vision Mamba for Medical Video Object Segmentation","summary":" Traditional convolutional neural networks have a limited receptive field\nwhile transformer-based networks are mediocre in constructing long-term\ndependency from the perspective of computational complexity. Such a\nbottleneck poses a significant challenge when processing long video sequences\nin video analysis tasks. Very recently, the state space models (SSMs) with\nefficient hardware-aware designs, made famous by Mamba, have exhibited\nimpressive achievements in long sequence modeling, which facilitates the\ndevelopment of deep neural networks on many vision tasks. To better capture\navailable cues in video frames, this paper presents a generic Video Vision\nMamba-based framework for medical video object segmentation tasks, named Vivim.\nOur Vivim can effectively compress the long-term spatiotemporal representation\ninto sequences at varying scales by our designed Temporal Mamba Block. Compared\nto existing video-level Transformer-based methods, our model maintains\nexcellent segmentation results with better speed performance. Extensive\nexperiments on breast lesion segmentation in ultrasound videos and polyp\nsegmentation in colonoscopy videos demonstrate the effectiveness and efficiency\nof our Vivim.\nThe code is available at: https://github.com/scott-yjyang/Vivim.\n","authors":["Yijun Yang","Zhaohu Xing","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.14168v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04672v1","updated":"2024-02-07T08:57:59Z","published":"2024-02-07T08:57:59Z","title":"G-NAS: Generalizable Neural Architecture Search for Single Domain\n Generalization Object Detection","summary":" In this paper, we focus on a realistic yet challenging task, Single Domain\nGeneralization Object Detection (S-DGOD), where only one source domain's data\ncan be used for training object detectors, but the detectors have to generalize\nto multiple distinct target domains. In S-DGOD, both high-capacity fitting and\ngeneralization abilities are needed due to the task's complexity.\nDifferentiable Neural Architecture Search (NAS) is known for its high capacity\nfor complex data fitting and we propose to leverage Differentiable NAS to solve\nS-DGOD. However, it may confront severe over-fitting issues due to the feature\nimbalance phenomenon, where parameters optimized by gradient descent are biased\nto learn from the easy-to-learn features, which are usually non-causal and\nspuriously correlated to ground truth labels, such as the features of\nbackground in object detection data. Consequently, this leads to serious\nperformance degradation, especially in generalizing to unseen target domains\nwith huge domain gaps between the source domain and target domains. 
To address\nthis issue, we propose the Generalizable loss (G-loss), which is an OoD-aware\nobjective, preventing NAS from over-fitting by using gradient descent to\noptimize parameters not only on a subset of easy-to-learn features but also the\nremaining predictive features for generalization, and the overall framework is\nnamed G-NAS. Experimental results on the S-DGOD urban-scene datasets\ndemonstrate that the proposed G-NAS achieves SOTA performance compared to\nbaseline methods. Codes are available at https://github.com/wufan-cse/G-NAS.\n","authors":["Fan Wu","Jinling Gao","Lanqing Hong","Xinbing Wang","Chenghu Zhou","Nanyang Ye"],"pdf_url":"https://arxiv.org/pdf/2402.04672v1.pdf","comment":"Accepted by AAAI24"},{"id":"http://arxiv.org/abs/2402.04671v1","updated":"2024-02-07T08:55:57Z","published":"2024-02-07T08:55:57Z","title":"V2VSSC: A 3D Semantic Scene Completion Benchmark for Perception with\n Vehicle to Vehicle Communication","summary":" Semantic scene completion (SSC) has recently gained popularity because it can\nprovide both semantic and geometric information that can be used directly for\nautonomous vehicle navigation. However, there are still challenges to overcome.\nSSC is often hampered by occlusion and short-range perception due to sensor\nlimitations, which can pose safety risks. This paper proposes a fundamental\nsolution to this problem by leveraging vehicle-to-vehicle (V2V) communication.\nWe propose the first generalized collaborative SSC framework that allows\nautonomous vehicles to share sensing information from different sensor views to\njointly perform SSC tasks. To validate the proposed framework, we further build\nV2VSSC, the first V2V SSC benchmark, on top of the large-scale V2V perception\ndataset OPV2V. Extensive experiments demonstrate that by leveraging V2V\ncommunication, the SSC performance can be increased by 8.3% on geometric metric\nIoU and 6.0% mIOU.\n","authors":["Yuanfang Zhang","Junxuan Li","Kaiqing Luo","Yiying Yang","Jiayi Han","Nian Liu","Denghui Qin","Peng Han","Chengpei Xu"],"pdf_url":"https://arxiv.org/pdf/2402.04671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08309v3","updated":"2024-02-07T08:55:47Z","published":"2023-06-14T07:27:06Z","title":"Taming Reversible Halftoning via Predictive Luminance","summary":" Traditional halftoning usually drops colors when dithering images with binary\ndots, which makes it difficult to recover the original color information. We\nproposed a novel halftoning technique that converts a color image into a binary\nhalftone with full restorability to its original version. Our novel base\nhalftoning technique consists of two convolutional neural networks (CNNs) to\nproduce the reversible halftone patterns, and a noise incentive block (NIB) to\nmitigate the flatness degradation issue of CNNs. Furthermore, to tackle the\nconflicts between the blue-noise quality and restoration accuracy in our novel\nbase method, we proposed a predictor-embedded approach to offload predictable\ninformation from the network, which in our case is the luminance information\nresembling from the halftone pattern. Such an approach allows the network to\ngain more flexibility to produce halftones with better blue-noise quality\nwithout compromising the restoration quality. Detailed studies on the\nmultiple-stage training method and loss weightings have been conducted. 
We have\ncompared our predictor-embedded method and our novel method regarding spectrum\nanalysis on halftone, halftone accuracy, restoration accuracy, and the data\nembedding studies. Our entropy evaluation evidences that our halftone contains\nless encoding information than our novel base method. The experiments show that\nour predictor-embedded method gains more flexibility to improve the blue-noise\nquality of halftones and maintains a comparable restoration quality with a\nhigher tolerance for disturbances.\n","authors":["Cheuk-Kit Lau","Menghan Xia","Tien-Tsin Wong"],"pdf_url":"https://arxiv.org/pdf/2306.08309v3.pdf","comment":"published in IEEE Transactions on Visualization and Computer Graphics"},{"id":"http://arxiv.org/abs/2402.04660v1","updated":"2024-02-07T08:49:33Z","published":"2024-02-07T08:49:33Z","title":"Adversarial Robustness Through Artifact Design","summary":" Adversarial examples arose as a challenge for machine learning. To hinder\nthem, most defenses alter how models are trained (e.g., adversarial training)\nor inference is made (e.g., randomized smoothing). Still, while these\napproaches markedly improve models' adversarial robustness, models remain\nhighly susceptible to adversarial examples. Identifying that, in certain\ndomains such as traffic-sign recognition, objects are implemented per standards\nspecifying how artifacts (e.g., signs) should be designed, we propose a novel\napproach for improving adversarial robustness. Specifically, we offer a method\nto redefine standards, making minor changes to existing ones, to defend against\nadversarial examples. We formulate the problem of artifact design as a robust\noptimization problem, and propose gradient-based and greedy search methods to\nsolve it. We evaluated our approach in the domain of traffic-sign recognition,\nallowing it to alter traffic-sign pictograms (i.e., symbols within the signs)\nand their colors. We found that, combined with adversarial training, our\napproach led to up to 25.18\% higher robust accuracy compared to\nstate-of-the-art methods against two adversary types, while further increasing\naccuracy on benign inputs.\n","authors":["Tsufit Shua","Mahmood Sharif"],"pdf_url":"https://arxiv.org/pdf/2402.04660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18562v2","updated":"2024-02-07T08:47:47Z","published":"2023-10-28T02:20:33Z","title":"Optimization-Free Test-Time Adaptation for Cross-Person Activity\n Recognition","summary":" Human Activity Recognition (HAR) models often suffer from performance\ndegradation in real-world applications due to distribution shifts in activity\npatterns across individuals. Test-Time Adaptation (TTA) is an emerging learning\nparadigm that aims to utilize the test stream to adjust predictions in\nreal-time inference, which has not been explored in HAR before. However, the\nhigh computational cost of optimization-based TTA algorithms makes it\nintractable to run on resource-constrained edge devices. In this paper, we\npropose an Optimization-Free Test-Time Adaptation (OFTTA) framework for\nsensor-based HAR. OFTTA adjusts the feature extractor and linear classifier\nsimultaneously in an optimization-free manner. For the feature extractor, we\npropose Exponential Decay Test-time Normalization (EDTN) to replace the\nconventional batch normalization (CBN) layers. EDTN combines CBN and Test-time\nbatch Normalization (TBN) to extract reliable features against domain shifts\nwith TBN's influence decreasing exponentially in deeper layers. 
For the\nclassifier, we adjust the prediction by computing the distance between the\nfeature and the prototype, which is calculated by a maintained support set. In\naddition, the update of the support set is based on the pseudo label, which can\nbenefit from reliable features extracted by EDTN. Extensive experiments on\nthree public cross-person HAR datasets and two different TTA settings\ndemonstrate that OFTTA outperforms the state-of-the-art TTA approaches in both\nclassification performance and computational efficiency. Finally, we verify the\nsuperiority of our proposed OFTTA on edge devices, indicating possible\ndeployment in real applications. Our code is available at\nhttps://github.com/Claydon-Wang/OFTTA.\n","authors":["Shuoyuan Wang","Jindong Wang","HuaJun Xi","Bob Zhang","Lei Zhang","Hongxin Wei"],"pdf_url":"https://arxiv.org/pdf/2310.18562v2.pdf","comment":"To be presented at UbiComp 2024; Accepted by Proceedings of the ACM\n on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)"},{"id":"http://arxiv.org/abs/2402.04653v1","updated":"2024-02-07T08:38:12Z","published":"2024-02-07T08:38:12Z","title":"An Over Complete Deep Learning Method for Inverse Problems","summary":" Obtaining meaningful solutions for inverse problems has been a major\nchallenge with many applications in science and engineering. Recent machine\nlearning techniques based on proximal and diffusion-based methods have shown\npromising results. However, as we show in this work, they can also face\nchallenges when applied to some exemplary problems. We show that similar to\nprevious works on over-complete dictionaries, it is possible to overcome these\nshortcomings by embedding the solution into higher dimensions. The novelty of\nthe work proposed is that we jointly design and learn the embedding and the\nregularizer for the embedding vector. We demonstrate the merit of this approach\non several exemplary and common inverse problems.\n","authors":["Moshe Eliasof","Eldad Haber","Eran Treister"],"pdf_url":"https://arxiv.org/pdf/2402.04653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17515v4","updated":"2024-02-07T08:36:12Z","published":"2023-11-29T10:38:42Z","title":"Fusion of Single and Integral Multispectral Aerial Images","summary":" An adequate fusion of the most significant salient information from multiple\ninput channels is essential for many aerial imaging tasks. While multispectral\nrecordings reveal features in various spectral ranges, synthetic aperture\nsensing makes occluded features visible. We present a first hybrid (model-\nand learning-based) architecture for fusing the most significant features from\nconventional aerial images with the ones from integral aerial images that are\nthe result of synthetic aperture sensing for removing occlusion. It combines\nthe environment's spatial references with features of unoccluded targets that\nwould normally be hidden by dense vegetation. Our method outperforms\nstate-of-the-art two-channel and multi-channel fusion approaches visually and\nquantitatively in common metrics, such as mutual information, visual\ninformation fidelity, and peak signal-to-noise ratio. The proposed model does\nnot require manually tuned parameters, can be extended to an arbitrary number\nand combinations of spectral channels, and is reconfigurable for addressing\ndifferent use cases. 
We demonstrate examples for search-and-rescue, wildfire\ndetection, and wildlife observation.\n","authors":["Mohamed Youssef","Oliver Bimber"],"pdf_url":"https://arxiv.org/pdf/2311.17515v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04648v1","updated":"2024-02-07T08:19:57Z","published":"2024-02-07T08:19:57Z","title":"OV-NeRF: Open-vocabulary Neural Radiance Fields with Vision and Language\n Foundation Models for 3D Semantic Understanding","summary":" The development of Neural Radiance Fields (NeRFs) has provided a potent\nrepresentation for encapsulating the geometric and appearance characteristics\nof 3D scenes. Enhancing the capabilities of NeRFs in open-vocabulary 3D\nsemantic perception tasks has been a recent focus. However, current methods\nthat extract semantics directly from Contrastive Language-Image Pretraining\n(CLIP) for semantic field learning encounter difficulties due to noisy and\nview-inconsistent semantics provided by CLIP. To tackle these limitations, we\npropose OV-NeRF, which exploits the potential of pre-trained vision and\nlanguage foundation models to enhance semantic field learning through proposed\nsingle-view and cross-view strategies. First, from the single-view perspective,\nwe introduce Region Semantic Ranking (RSR) regularization by leveraging 2D mask\nproposals derived from SAM to rectify the noisy semantics of each training\nview, facilitating accurate semantic field learning. Second, from the\ncross-view perspective, we propose a Cross-view Self-enhancement (CSE) strategy\nto address the challenge raised by view-inconsistent semantics. Rather than\ninvariably utilizing the 2D inconsistent semantics from CLIP, CSE leverages the\n3D consistent semantics generated from the well-trained semantic field itself\nfor semantic field training, aiming to reduce ambiguity and enhance overall\nsemantic consistency across different views. Extensive experiments validate our\nOV-NeRF outperforms current state-of-the-art methods, achieving a significant\nimprovement of 20.31% and 18.42% in mIoU metric on Replica and Scannet,\nrespectively. Furthermore, our approach exhibits consistent superior results\nacross various CLIP configurations, further verifying its robustness.\n","authors":["Guibiao Liao","Kaichen Zhou","Zhenyu Bao","Kanglin Liu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2402.04648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09305v2","updated":"2024-02-07T08:15:51Z","published":"2023-12-14T19:18:38Z","title":"Stable Score Distillation for High-Quality 3D Generation","summary":" Although Score Distillation Sampling (SDS) has exhibited remarkable\nperformance in conditional 3D content generation, a comprehensive understanding\nof its formulation is still lacking, hindering the development of 3D\ngeneration. In this work, we decompose SDS as a combination of three functional\ncomponents, namely mode-seeking, mode-disengaging and variance-reducing terms,\nanalyzing the properties of each. We show that problems such as over-smoothness\nand implausibility result from the intrinsic deficiency of the first two terms\nand propose a more advanced variance-reducing term than that introduced by SDS.\nBased on the analysis, we propose a simple yet effective approach named Stable\nScore Distillation (SSD) which strategically orchestrates each term for\nhigh-quality 3D generation and can be readily incorporated to various 3D\ngeneration frameworks and 3D representations. 
Extensive experiments validate\nthe efficacy of our approach, demonstrating its ability to generate\nhigh-fidelity 3D content without succumbing to issues such as over-smoothness.\n","authors":["Boshi Tang","Jianan Wang","Zhiyong Wu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.09305v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01345v3","updated":"2024-02-07T08:07:02Z","published":"2024-02-02T12:02:46Z","title":"Skip \n: A Simple Method to Reduce Hallucination in Large\n Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nof multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks (\n\n), where the content before and\nafter '\n\n' in the training data frequently exhibits significant semantic\nchanges. This pattern leads the model to infer that the contents following\n'\n\n' should be obviously different from the preceding contents with less\nhallucinatory descriptions, thereby increasing the probability of hallucinatory\ndescriptions subsequent to the '\n\n'. We have validated this hypothesis on\nmultiple publicly available LVLMs. Besides, we find that deliberately inserting\n'\n\n' at the generated description can induce more hallucinations. A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of '\n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2307.16545v2","updated":"2024-02-07T07:52:10Z","published":"2023-07-31T10:22:33Z","title":"Towards General Visual-Linguistic Face Forgery Detection","summary":" Deepfakes are realistic face manipulations that can pose serious threats to\nsecurity, privacy, and trust. Existing methods mostly treat this task as binary\nclassification, which uses digital labels or mask signals to train the\ndetection model. We argue that such supervisions lack semantic information and\ninterpretability. To address these issues, in this paper, we propose a novel\nparadigm named Visual-Linguistic Face Forgery Detection (VLFFD), which uses\nfine-grained sentence-level prompts as the annotation. Since text annotations\nare not available in current deepfakes datasets, VLFFD first generates the\nmixed forgery image with corresponding fine-grained prompts via Prompt Forgery\nImage Generator (PFIG). Then, the fine-grained mixed data and the\ncoarse-grained original data are jointly trained with the Coarse-and-Fine\nCo-training framework (C2F), enabling the model to gain more generalization and\ninterpretability. The experiments show that the proposed method improves the\nexisting detection models on several challenging benchmarks. Furthermore, we\nhave integrated our method with multimodal large models, achieving noteworthy\nresults that demonstrate the potential of our approach. 
This integration not\nonly enhances the performance of our VLFFD paradigm but also underscores the\nversatility and adaptability of our method when combined with advanced\nmultimodal technologies, highlighting its potential in tackling the evolving\nchallenges of deepfake detection.\n","authors":["Ke Sun","Shen Chen","Taiping Yao","Haozhe Yang","Xiaoshuai Sun","Shouhong Ding","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2307.16545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04632v1","updated":"2024-02-07T07:29:50Z","published":"2024-02-07T07:29:50Z","title":"GSN: Generalisable Segmentation in Neural Radiance Field","summary":" Traditional Radiance Field (RF) representations capture details of a specific\nscene and must be trained afresh on each scene. Semantic feature fields have\nbeen added to RFs to facilitate several segmentation tasks. Generalised RF\nrepresentations learn the principles of view interpolation. A generalised RF\ncan render new views of an unknown and untrained scene, given a few views. We\npresent a way to distil feature fields into the generalised GNT representation.\nOur GSN representation generates new views of unseen scenes on the fly along\nwith consistent, per-pixel semantic features. This enables multi-view\nsegmentation of arbitrary new scenes. We show different semantic features being\ndistilled into generalised RFs. Our multi-view segmentation results are on par\nwith methods that use traditional RFs. GSN closes the gap between standard and\ngeneralisable RF methods significantly. Project Page:\nhttps://vinayak-vg.github.io/GSN/\n","authors":["Vinayak Gupta","Rahul Goel","Sirikonda Dhawal","P. J. Narayanan"],"pdf_url":"https://arxiv.org/pdf/2402.04632v1.pdf","comment":"Accepted at the Main Technical Track of AAAI 2024"},{"id":"http://arxiv.org/abs/2402.04630v1","updated":"2024-02-07T07:26:49Z","published":"2024-02-07T07:26:49Z","title":"LLMs Meet VLMs: Boost Open Vocabulary Object Detection with Fine-grained\n Descriptors","summary":" Inspired by the outstanding zero-shot capability of vision language models\n(VLMs) in image classification tasks, open-vocabulary object detection has\nattracted increasing interest by distilling the broad VLM knowledge into\ndetector training. However, most existing open-vocabulary detectors learn by\naligning region embeddings with categorical labels (e.g., bicycle) only,\ndisregarding the capability of VLMs on aligning visual embeddings with\nfine-grained text description of object parts (e.g., pedals and bells). This\npaper presents DVDet, a Descriptor-Enhanced Open Vocabulary Detector that\nintroduces conditional context prompts and hierarchical textual descriptors\nthat enable precise region-text alignment as well as open-vocabulary detection\ntraining in general. Specifically, the conditional context prompt transforms\nregional embeddings into image-like representations that can be directly\nintegrated into general open vocabulary detection training. In addition, we\nintroduce large language models as an interactive and implicit knowledge\nrepository which enables iterative mining and refining visually oriented\ntextual descriptors for precise region-text alignment. 
Extensive experiments\nover multiple large-scale benchmarks show that DVDet outperforms the\nstate-of-the-art consistently by large margins.\n","authors":["Sheng Jin","Xueying Jiang","Jiaxing Huang","Lewei Lu","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2402.04630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04625v1","updated":"2024-02-07T07:16:12Z","published":"2024-02-07T07:16:12Z","title":"Noise Map Guidance: Inversion with Spatial Context for Real Image\n Editing","summary":" Text-guided diffusion models have become a popular tool in image synthesis,\nknown for producing high-quality and diverse images. However, their application\nto editing real images often encounters hurdles primarily due to the text\ncondition deteriorating the reconstruction quality and subsequently affecting\nediting fidelity. Null-text Inversion (NTI) has made strides in this area, but\nit fails to capture spatial context and requires computationally intensive\nper-timestep optimization. Addressing these challenges, we present Noise Map\nGuidance (NMG), an inversion method rich in a spatial context, tailored for\nreal-image editing. Significantly, NMG achieves this without necessitating\noptimization, yet preserves the editing quality. Our empirical investigations\nhighlight NMG's adaptability across various editing techniques and its\nrobustness to variants of DDIM inversions.\n","authors":["Hansam Cho","Jonghyun Lee","Seoung Bum Kim","Tae-Hyun Oh","Yonghyun Jeong"],"pdf_url":"https://arxiv.org/pdf/2402.04625v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2209.08996v3","updated":"2024-02-07T07:14:55Z","published":"2022-09-19T13:20:19Z","title":"EDO-Net: Learning Elastic Properties of Deformable Objects from Graph\n Dynamics","summary":" We study the problem of learning graph dynamics of deformable objects that\ngeneralizes to unknown physical properties. Our key insight is to leverage a\nlatent representation of elastic physical properties of cloth-like deformable\nobjects that can be extracted, for example, from a pulling interaction. In this\npaper we propose EDO-Net (Elastic Deformable Object - Net), a model of graph\ndynamics trained on a large variety of samples with different elastic\nproperties that does not rely on ground-truth labels of the properties. EDO-Net\njointly learns an adaptation module, and a forward-dynamics module. The former\nis responsible for extracting a latent representation of the physical\nproperties of the object, while the latter leverages the latent representation\nto predict future states of cloth-like objects represented as graphs. We\nevaluate EDO-Net both in simulation and real world, assessing its capabilities\nof: 1) generalizing to unknown physical properties, 2) transferring the learned\nrepresentation to new downstream tasks.\n","authors":["Alberta Longhini","Marco Moletta","Alfredo Reichlin","Michael C. Welle","David Held","Zackory Erickson","Danica Kragic"],"pdf_url":"https://arxiv.org/pdf/2209.08996v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04618v1","updated":"2024-02-07T07:01:08Z","published":"2024-02-07T07:01:08Z","title":"Multi-Scale Semantic Segmentation with Modified MBConv Blocks","summary":" Recently, MBConv blocks, initially designed for efficiency in\nresource-limited settings and later adapted for cutting-edge image\nclassification performances, have demonstrated significant potential in image\nclassification tasks. Despite their success, their application in semantic\nsegmentation has remained relatively unexplored. 
This paper introduces a novel\nadaptation of MBConv blocks specifically tailored for semantic segmentation.\nOur modification stems from the insight that semantic segmentation requires the\nextraction of more detailed spatial information than image classification. We\nargue that to effectively perform multi-scale semantic segmentation, each\nbranch of a U-Net architecture, regardless of its resolution, should possess\nequivalent segmentation capabilities. By implementing these changes, our\napproach achieves impressive mean Intersection over Union (IoU) scores of 84.5%\nand 84.0% on the Cityscapes test and validation datasets, respectively,\ndemonstrating the efficacy of our proposed modifications in enhancing semantic\nsegmentation performance.\n","authors":["Xi Chen","Yang Cai","Yuan Wu","Bo Xiong","Taesung Park"],"pdf_url":"https://arxiv.org/pdf/2402.04618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04615v1","updated":"2024-02-07T06:42:33Z","published":"2024-02-07T06:42:33Z","title":"ScreenAI: A Vision-Language Model for UI and Infographics Understanding","summary":" Screen user interfaces (UIs) and infographics, sharing similar visual\nlanguage and design principles, play important roles in human communication and\nhuman-machine interaction. We introduce ScreenAI, a vision-language model that\nspecializes in UI and infographics understanding. Our model improves upon the\nPaLI architecture with the flexible patching strategy of pix2struct and is\ntrained on a unique mixture of datasets. At the heart of this mixture is a\nnovel screen annotation task in which the model has to identify the type and\nlocation of UI elements. We use these text annotations to describe screens to\nLarge Language Models and automatically generate question-answering (QA), UI\nnavigation, and summarization training datasets at scale. We run ablation\nstudies to demonstrate the impact of these design choices. At only 5B\nparameters, ScreenAI achieves new state-of-the-art results on UI- and\ninfographics-based tasks (Multi-page DocVQA, WebSRC, MoTIF and Widget\nCaptioning), and new best-in-class performance on others (Chart QA, DocVQA, and\nInfographicVQA) compared to models of similar size. Finally, we release three\nnew datasets: one focused on the screen annotation task and two others focused\non question answering.\n","authors":["Gilles Baechler","Srinivas Sunkara","Maria Wang","Fedir Zubach","Hassan Mansoor","Vincent Etter","Victor Cărbune","Jason Lin","Jindong Chen","Abhanshu Sharma"],"pdf_url":"https://arxiv.org/pdf/2402.04615v1.pdf","comment":"7 pages main tex with 5 figures, 2 page bib, 6 pages appendix"},{"id":"http://arxiv.org/abs/2305.13876v3","updated":"2024-02-07T06:10:12Z","published":"2023-05-23T09:52:49Z","title":"Cross3DVG: Cross-Dataset 3D Visual Grounding on Different RGB-D Scans","summary":" We present a novel task for cross-dataset visual grounding in 3D scenes\n(Cross3DVG), which overcomes limitations of existing 3D visual grounding\nmodels, specifically their restricted 3D resources and consequent tendencies of\noverfitting a specific 3D dataset. We created RIORefer, a large-scale 3D visual\ngrounding dataset, to facilitate Cross3DVG. It includes more than 63k diverse\ndescriptions of 3D objects within 1,380 indoor RGB-D scans from 3RScan, with\nhuman annotations. 
After training the Cross3DVG model using the source 3D\nvisual grounding dataset, we evaluate it without target labels using the target\ndataset with, e.g., different sensors, 3D reconstruction methods, and language\nannotators. Comprehensive experiments are conducted using established visual\ngrounding models and with CLIP-based multi-view 2D and 3D integration designed\nto bridge gaps among 3D datasets. For Cross3DVG tasks, (i) cross-dataset 3D\nvisual grounding exhibits significantly worse performance than learning and\nevaluation with a single dataset because of the 3D data and language variants\nacross datasets. Moreover, (ii) better object detector and localization modules\nand fusing 3D data and multi-view CLIP-based image features can alleviate this\nlower performance. Our Cross3DVG task can provide a benchmark for developing\nrobust 3D visual grounding models to handle diverse 3D scenes while leveraging\ndeep language understanding.\n","authors":["Taiki Miyanishi","Daichi Azuma","Shuhei Kurita","Motoki Kawanabe"],"pdf_url":"https://arxiv.org/pdf/2305.13876v3.pdf","comment":"3DV 2024"},{"id":"http://arxiv.org/abs/2402.03019v2","updated":"2024-02-07T05:50:11Z","published":"2024-02-05T14:00:13Z","title":"Taylor Videos for Action Recognition","summary":" Effectively extracting motions from video is a critical and long-standing\nproblem for action recognition. This problem is very challenging because\nmotions (i) do not have an explicit form, (ii) have various concepts such as\ndisplacement, velocity, and acceleration, and (iii) often contain noise caused\nby unstable pixels. Addressing these challenges, we propose the Taylor video, a\nnew video format that highlights the dominant motions (e.g., a waving hand) in\neach of its frames named the Taylor frame. Taylor video is named after Taylor\nseries, which approximates a function at a given point using important terms.\nIn the scenario of videos, we define an implicit motion-extraction function\nwhich aims to extract motions from a video temporal block. In this block, using\nthe frames, the difference frames, and higher-order difference frames, we\nperform Taylor expansion to approximate this function at the starting frame. We\nshow that the summation of the higher-order terms in the Taylor series gives us\ndominant motion patterns, where static objects, small and unstable motions are\nremoved. Experimentally we show that Taylor videos are effective inputs to\npopular architectures including 2D CNNs, 3D CNNs, and transformers. When used\nindividually, Taylor videos yield competitive action recognition accuracy\ncompared to RGB videos and optical flow. When fused with RGB or optical flow\nvideos, further accuracy improvement is achieved.\n","authors":["Lei Wang","Xiuyuan Yuan","Tom Gedeon","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.03019v2.pdf","comment":"Research report"},{"id":"http://arxiv.org/abs/2402.04599v1","updated":"2024-02-07T05:47:31Z","published":"2024-02-07T05:47:31Z","title":"Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via\n Temporal-Viewpoint Alignment","summary":" Video sequences exhibit significant nuisance variations (undesired effects)\nof speed of actions, temporal locations, and subjects' poses, leading to\ntemporal-viewpoint misalignment when comparing two sets of frames or evaluating\nthe similarity of two sequences. Thus, we propose Joint tEmporal and cAmera\nviewpoiNt alIgnmEnt (JEANIE) for sequence pairs. 
In particular, we focus on 3D\nskeleton sequences whose camera and subjects' poses can be easily manipulated\nin 3D. We evaluate JEANIE on skeletal Few-shot Action Recognition (FSAR), where\nmatching well temporal blocks (temporal chunks that make up a sequence) of\nsupport-query sequence pairs (by factoring out nuisance variations) is\nessential due to limited samples of novel classes. Given a query sequence, we\ncreate its several views by simulating several camera locations. For a support\nsequence, we match it with view-simulated query sequences, as in the popular\nDynamic Time Warping (DTW). Specifically, each support temporal block can be\nmatched to the query temporal block with the same or adjacent (next) temporal\nindex, and adjacent camera views to achieve joint local temporal-viewpoint\nwarping. JEANIE selects the smallest distance among matching paths with\ndifferent temporal-viewpoint warping patterns, an advantage over DTW which only\nperforms temporal alignment. We also propose an unsupervised FSAR akin to\nclustering of sequences with JEANIE as a distance measure. JEANIE achieves\nstate-of-the-art results on NTU-60, NTU-120, Kinetics-skeleton and UWA3D\nMultiview Activity II on supervised and unsupervised FSAR, and their\nmeta-learning inspired fusion.\n","authors":["Lei Wang","Jun Liu","Liang Zheng","Tom Gedeon","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2402.04599v1.pdf","comment":"Under minor revision with IJCV. An extension of our ACCV'22 paper\n [arXiv:arXiv:2210.16820] which was distinguished by the Sang Uk Lee Best\n Student Paper Award. arXiv admin note: text overlap with arXiv:2112.12668"},{"id":"http://arxiv.org/abs/2402.04596v1","updated":"2024-02-07T05:38:53Z","published":"2024-02-07T05:38:53Z","title":"Towards Improved Imbalance Robustness in Continual Multi-Label Learning\n with Dual Output Spiking Architecture (DOSA)","summary":" Algorithms designed for addressing typical supervised classification problems\ncan only learn from a fixed set of samples and labels, making them unsuitable\nfor the real world, where data arrives as a stream of samples often associated\nwith multiple labels over time. This motivates the study of task-agnostic\ncontinual multi-label learning problems. While algorithms using deep learning\napproaches for continual multi-label learning have been proposed in the recent\nliterature, they tend to be computationally heavy. Although spiking neural\nnetworks (SNNs) offer a computationally efficient alternative to artificial\nneural networks, existing literature has not used SNNs for continual\nmulti-label learning. Also, accurately determining multiple labels with SNNs is\nstill an open research problem. This work proposes a dual output spiking\narchitecture (DOSA) to bridge these research gaps. A novel imbalance-aware loss\nfunction is also proposed, improving the multi-label classification performance\nof the model by making it more robust to data imbalance. A modified F1 score is\npresented to evaluate the effectiveness of the proposed loss function in\nhandling imbalance. Experiments on several benchmark multi-label datasets show\nthat DOSA trained with the proposed loss function shows improved robustness to\ndata imbalance and obtains better continual multi-label learning performance\nthan CIFDM, a previous state-of-the-art algorithm.\n","authors":["Sourav Mishra","Shirin Dora","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2402.04596v1.pdf","comment":"8 pages, 4 figures, 4 tables, 45 references. 
Submitted to IJCNN 2024"},{"id":"http://arxiv.org/abs/2402.04587v1","updated":"2024-02-07T05:05:21Z","published":"2024-02-07T05:05:21Z","title":"Sparse Anatomical Prompt Semi-Supervised Learning with Masked Image\n Modeling for CBCT Tooth Segmentation","summary":" Accurate tooth identification and segmentation in Cone Beam Computed\nTomography (CBCT) dental images can significantly enhance the efficiency and\nprecision of manual diagnoses performed by dentists. However, existing\nsegmentation methods are mainly developed based on training with large data\nvolumes, for which the annotations are extremely time-consuming. Meanwhile, the\nteeth of each class in CBCT dental images being closely positioned, coupled\nwith subtle inter-class differences, gives rise to the challenge of indistinct\nboundaries when training a model with limited data. To address these\nchallenges, this study aims to propose a task-oriented Masked Auto-Encoder\nparadigm to effectively utilize large amounts of unlabeled data to achieve\naccurate tooth segmentation with limited labeled data. Specifically, we first\nconstruct a self-supervised pre-training framework of masked auto-encoder to\nefficiently utilize unlabeled data to enhance the network performance.\nSubsequently, we introduce a sparse masked prompt mechanism based on graph\nattention to incorporate boundary information of the teeth, aiding the network\nin learning the anatomical structural features of teeth. To the best of our\nknowledge, we are pioneering the integration of the mask pre-training paradigm\ninto the CBCT tooth segmentation task. Extensive experiments demonstrate both\nthe feasibility of our proposed method and the potential of the boundary prompt\nmechanism.\n","authors":["Pengyu Dai","Yafei Ou","Yang Liu","Yue Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.04587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07266v3","updated":"2024-02-07T05:04:13Z","published":"2023-12-12T13:45:56Z","title":"ProxyDet: Synthesizing Proxy Novel Classes via Classwise Mixup for\n Open-Vocabulary Object Detection","summary":" Open-vocabulary object detection (OVOD) aims to recognize novel objects whose\ncategories are not included in the training set. In order to classify these\nunseen classes during training, many OVOD frameworks leverage the zero-shot\ncapability of largely pretrained vision and language models, such as CLIP. To\nfurther improve generalization on the unseen novel classes, several approaches\nproposed to additionally train with pseudo region labeling on the external data\nsources that contain a substantial number of novel category labels beyond the\nexisting training data. Despite its simplicity, these pseudo-labeling methods\nstill exhibit limited improvement with regard to the truly unseen novel classes\nthat were not pseudo-labeled. In this paper, we present a novel, yet simple\ntechnique that helps generalization on the overall distribution of novel\nclasses. Inspired by our observation that numerous novel classes reside within\nthe convex hull constructed by the base (seen) classes in the CLIP embedding\nspace, we propose to synthesize proxy-novel classes approximating novel classes\nvia linear mixup between a pair of base classes. By training our detector with\nthese synthetic proxy-novel classes, we effectively explore the embedding space\nof novel classes. The experimental results on various OVOD benchmarks such as\nLVIS and COCO demonstrate superior performance on novel classes compared to the\nother state-of-the-art methods. 
Code is available at\nhttps://github.com/clovaai/ProxyDet.\n","authors":["Joonhyun Jeong","Geondo Park","Jayeon Yoo","Hyungsik Jung","Heesu Kim"],"pdf_url":"https://arxiv.org/pdf/2312.07266v3.pdf","comment":"Accepted in AAAI24"},{"id":"http://arxiv.org/abs/2402.04584v1","updated":"2024-02-07T04:55:57Z","published":"2024-02-07T04:55:57Z","title":"Troublemaker Learning for Low-Light Image Enhancement","summary":" Low-light image enhancement (LLIE) restores the color and brightness of\nunderexposed images. Supervised methods suffer from high costs in collecting\nlow/normal-light image pairs. Unsupervised methods invest substantial effort in\ncrafting complex loss functions. We address these two challenges through the\nproposed TroubleMaker Learning (TML) strategy, which employs normal-light\nimages as inputs for training. TML is simple: we first dim the input and then\nincrease its brightness. TML is based on two core components. First, the\ntroublemaker model (TM) constructs pseudo low-light images from normal images\nto relieve the cost of pairwise data. Second, the predicting model (PM)\nenhances the brightness of pseudo low-light images. Additionally, we\nincorporate an enhancing model (EM) to further improve the visual performance\nof PM outputs. Moreover, in LLIE tasks, characterizing global element\ncorrelations is important because more information on the same object can be\ncaptured. CNN cannot achieve this well, and self-attention has high time\ncomplexity. Accordingly, we propose Global Dynamic Convolution (GDC) with O(n)\ntime complexity, which essentially imitates the partial calculation process of\nself-attention to formulate elementwise correlations. Based on the GDC module,\nwe build the UGDC model. Extensive quantitative and qualitative experiments\ndemonstrate that UGDC trained with TML can achieve competitive performance\nagainst state-of-the-art approaches on public datasets. The code is available\nat https://github.com/Rainbowman0/TML_LLIE.\n","authors":["Yinghao Song","Zhiyuan Cao","Wanhong Xiang","Sifan Long","Bo Yang","Hongwei Ge","Yanchun Liang","Chunguo Wu"],"pdf_url":"https://arxiv.org/pdf/2402.04584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16517v3","updated":"2024-02-07T04:53:54Z","published":"2023-07-31T09:33:19Z","title":"Select2Col: Leveraging Spatial-Temporal Importance of Semantic\n Information for Efficient Collaborative Perception","summary":" Collaborative perception by leveraging the shared semantic information plays\na crucial role in overcoming the individual limitations of isolated agents.\nHowever, existing collaborative perception methods tend to focus solely on the\nspatial features of semantic information, while neglecting the importance of\nthe temporal dimension. Consequently, the potential benefits of collaboration\nremain underutilized. In this article, we propose Select2Col, a novel\ncollaborative perception framework that takes into account the\n\\underline{s}patial-t\\underline{e}mpora\\underline{l} importanc\\underline{e} of\nsemanti\\underline{c} informa\\underline{t}ion. Within the Select2Col, we develop\na collaborator selection method that utilizes a lightweight graph neural\nnetwork (GNN) to estimate the importance of semantic information (IoSI) of each\ncollaborator in enhancing perception performance, thereby identifying\ncontributive collaborators while excluding those that potentially bring\nnegative impact. 
Moreover, we present a semantic information fusion algorithm\ncalled HPHA (historical prior hybrid attention), which integrates multi-scale\nattention and short-term attention modules to capture the IoSI in feature\nrepresentation from the spatial and temporal dimensions respectively, and\nassigns IoSI-consistent weights for efficient fusion of information from\nselected collaborators. Extensive experiments on three open datasets\ndemonstrate that our proposed Select2Col significantly improves the perception\nperformance compared to state-of-the-art approaches. The code associated with\nthis research is publicly available at https://github.com/huangqzj/Select2Col/.\n","authors":["Yuntao Liu","Qian Huang","Rongpeng Li","Xianfu Chen","Zhifeng Zhao","Shuyuan Zhao","Yongdong Zhu","Honggang Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.16517v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04583v1","updated":"2024-02-07T04:51:14Z","published":"2024-02-07T04:51:14Z","title":"A Psychological Study: Importance of Contrast and Luminance in Color to\n Grayscale Mapping","summary":" Grayscale images are essential in image processing and computer vision tasks.\nThey effectively emphasize luminance and contrast, highlighting important\nvisual features, while also being easily compatible with other algorithms.\nMoreover, their simplified representation makes them efficient for storage and\ntransmission purposes. While preserving contrast is important for maintaining\nvisual quality, other factors such as preserving information relevant to the\nspecific application or task at hand may be more critical for achieving optimal\nperformance. To evaluate and compare different decolorization algorithms, we\ndesigned a psychological experiment. During the experiment, participants were\ninstructed to imagine color images in a hypothetical \"colorless world\" and\nselect the grayscale image that best resembled their mental visualization. We\nconducted a comparison between two types of algorithms: (i) perceptual-based\nsimple color space conversion algorithms, and (ii) spatial contrast-based\nalgorithms, including iteration-based methods. Our experimental findings\nindicate that CIELAB exhibited superior performance on average, providing\nfurther evidence for the effectiveness of perception-based decolorization\nalgorithms. On the other hand, the spatial contrast-based algorithms showed\nrelatively poorer performance, possibly due to factors such as DC-offset and\nartificial contrast generation. However, these algorithms demonstrated shorter\nselection times. Notably, no single algorithm consistently outperformed the\nothers across all test images. In this paper, we will delve into a\ncomprehensive discussion on the significance of contrast and luminance in\ncolor-to-grayscale mapping based on our experimental results and analysis.\n","authors":["Prasoon Ambalathankandy","Yafei Ou","Sae Kaneko","Masayuki Ikebe"],"pdf_url":"https://arxiv.org/pdf/2402.04583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04573v1","updated":"2024-02-07T04:11:25Z","published":"2024-02-07T04:11:25Z","title":"Progressive Conservative Adaptation for Evolving Target Domains","summary":" Conventional domain adaptation typically transfers knowledge from a source\ndomain to a stationary target domain. 
However, in many real-world cases, target\ndata usually emerge sequentially and have continuously evolving distributions.\nRestoring and adapting to such target data results in escalating computational\nand resource consumption over time. Hence, it is vital to devise algorithms to\naddress the evolving domain adaptation (EDA) problem, \\emph{i.e.,} adapting\nmodels to evolving target domains without access to historic target domains. To\nachieve this goal, we propose a simple yet effective approach, termed\nprogressive conservative adaptation (PCAda). To manage new target data that\ndiverges from previous distributions, we fine-tune the classifier head based on\nthe progressively updated class prototypes. Moreover, as adjusting to the most\nrecent target domain can interfere with the features learned from previous\ntarget domains, we develop a conservative sparse attention mechanism. This\nmechanism restricts feature adaptation within essential dimensions, thus easing\nthe inference related to historical knowledge. The proposed PCAda is\nimplemented with a meta-learning framework, which achieves the fast adaptation\nof the classifier with the help of the progressively updated class prototypes\nin the inner loop and learns a generalized feature without severely interfering\nwith the historic knowledge via the conservative sparse attention in the outer\nloop. Experiments on Rotated MNIST, Caltran, and Portraits datasets demonstrate\nthe effectiveness of our method.\n","authors":["Gangming Zhao","Chaoqi Chen","Wenhao He","Chengwei Pan","Chaowei Fang","Jinpeng Li","Xilin Chen","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2402.04573v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.04566v1","updated":"2024-02-07T04:05:29Z","published":"2024-02-07T04:05:29Z","title":"Triplet-constraint Transformer with Multi-scale Refinement for Dose\n Prediction in Radiotherapy","summary":" Radiotherapy is a primary treatment for cancers with the aim of applying\nsufficient radiation dose to the planning target volume (PTV) while minimizing\ndose hazards to the organs at risk (OARs). Convolutional neural networks (CNNs)\nhave automated the radiotherapy plan-making by predicting the dose maps.\nHowever, current CNN-based methods ignore the remarkable dose difference in the\ndose map, i.e., high dose value in the interior PTV while low value in the\nexterior PTV, leading to a suboptimal prediction. In this paper, we propose a\ntriplet-constraint transformer (TCtrans) with multi-scale refinement to predict\nthe high-quality dose distribution. Concretely, a novel PTV-guided triplet\nconstraint is designed to refine dose feature representations in the interior\nand exterior PTV by utilizing the explicit geometry of PTV. Furthermore, we\nintroduce a multi-scale refinement (MSR) module to effectively fulfill the\ntriplet constraint in different decoding layers with multiple scales. Besides,\na transformer encoder is devised to learn the important global dosimetric\nknowledge. 
Experiments on a clinical cervical cancer dataset demonstrate the\nsuperiority of our method.\n","authors":["Lu Wen","Qihun Zhang","Zhenghao Feng","Yuanyuan Xu","Xiao Chen","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04566v1.pdf","comment":"accepted by 2024 IEEE ISBI"},{"id":"http://arxiv.org/abs/2402.04563v1","updated":"2024-02-07T03:43:56Z","published":"2024-02-07T03:43:56Z","title":"Attention Guided CAM: Visual Explanations of Vision Transformer Guided\n by Self-Attention","summary":" Vision Transformer (ViT) is one of the most widely used models in the computer\nvision field with its great performance on various tasks. In order to fully\nutilize the ViT-based architecture in various applications, proper\nvisualization methods with a decent localization performance are necessary, but\nthese methods employed in CNN-based models are still not available in ViT due\nto its unique structure. In this work, we propose an attention-guided\nvisualization method applied to ViT that provides a high-level semantic\nexplanation for its decision. Our method selectively aggregates the gradients\ndirectly propagated from the classification output to each self-attention,\ncollecting the contribution of image features extracted from each location of\nthe input image. These gradients are additionally guided by the normalized\nself-attention scores, which are the pairwise patch correlation scores. They\nare used to supplement the gradients on the patch-level context information\nefficiently detected by the self-attention mechanism. This approach of our\nmethod provides elaborate high-level semantic explanations with great\nlocalization performance only with the class labels. As a result, our method\noutperforms the previous leading explainability methods of ViT in the\nweakly-supervised localization task and presents great capability in capturing\nthe full instances of the target class object. Meanwhile, our method provides a\nvisualization that faithfully explains the model, which is demonstrated in the\nperturbation comparison test.\n","authors":["Saebom Leem","Hyunseok Seo"],"pdf_url":"https://arxiv.org/pdf/2402.04563v1.pdf","comment":"AAAI2024. Code available at\n https://github.com/LeemSaebom/Attention-Guided-CAM-Visual-Explanations-of-Vision-Transformer-Guided-by-Self-Attention.git"},{"id":"http://arxiv.org/abs/2402.04558v1","updated":"2024-02-07T03:36:41Z","published":"2024-02-07T03:36:41Z","title":"DMAT: A Dynamic Mask-Aware Transformer for Human De-occlusion","summary":" Human de-occlusion, which aims to infer the appearance of invisible human\nparts from an occluded image, has great value in many human-related tasks, such\nas person re-id and intention inference. To address this task, this paper\nproposes a dynamic mask-aware transformer (DMAT), which dynamically augments\ninformation from human regions and weakens that from occlusion. First, to\nenhance token representation, we design an expanded convolution head with\nenlarged kernels, which captures more local valid context and mitigates the\ninfluence of surrounding occlusion. To concentrate on the visible human parts,\nwe propose a novel dynamic multi-head human-mask guided attention mechanism\nthrough integrating multiple masks, which can prevent the de-occluded regions\nfrom assimilating to the background. Besides, a region upsampling strategy is\nutilized to alleviate the impact of occlusion on interpolated images.
During\nmodel learning, an amodal loss is developed to further emphasize the recovery\neffect of human regions, which also refines the model's convergence. Extensive\nexperiments on the AHP dataset demonstrate its superior performance compared to\nrecent state-of-the-art methods.\n","authors":["Guoqiang Liang","Jiahao Hu","Qingyue Wang","Shizhou Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.04558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02544v2","updated":"2024-02-07T03:28:12Z","published":"2024-02-04T15:46:43Z","title":"LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal\n Language Model","summary":" The revolutionary capabilities of large language models (LLMs) have paved the\nway for multimodal large language models (MLLMs) and fostered diverse\napplications across various specialized domains. In the remote sensing (RS)\nfield, however, the diverse geographical landscapes and varied objects in RS\nimagery are not adequately considered in recent MLLM endeavors. To bridge this\ngap, we construct a large-scale RS image-text dataset, LHRS-Align, and an\ninformative RS-specific instruction dataset, LHRS-Instruct, leveraging the\nextensive volunteered geographic information (VGI) and globally available RS\nimages. Building on this foundation, we introduce LHRS-Bot, an MLLM tailored\nfor RS image understanding through a novel multi-level vision-language\nalignment strategy and a curriculum learning method. Comprehensive experiments\ndemonstrate that LHRS-Bot exhibits a profound understanding of RS images and\nthe ability to perform nuanced reasoning within the RS domain.\n","authors":["Dilxat Muhtar","Zhenshi Li","Feng Gu","Xueliang Zhang","Pengfeng Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.02544v2.pdf","comment":"32 pages, 8 figures. Github https://github.com/NJU-LHRS/LHRS-Bot"},{"id":"http://arxiv.org/abs/2402.03292v2","updated":"2024-02-07T03:21:57Z","published":"2024-02-05T18:50:27Z","title":"Zero-shot Object-Level OOD Detection with Context-Aware Inpainting","summary":" Machine learning algorithms are increasingly provided as black-box cloud\nservices or pre-trained models, without access to their training data. This\nmotivates the problem of zero-shot out-of-distribution (OOD) detection.\nConcretely, we aim to detect OOD objects that do not belong to the classifier's\nlabel set but are erroneously classified as in-distribution (ID) objects. Our\napproach, RONIN, uses an off-the-shelf diffusion model to replace detected\nobjects with inpainting. RONIN conditions the inpainting process with the\npredicted ID label, drawing the input object closer to the in-distribution\ndomain. As a result, the reconstructed object is very close to the original in\nthe ID cases and far in the OOD cases, allowing RONIN to effectively\ndistinguish ID and OOD samples. Throughout extensive experiments, we\ndemonstrate that RONIN achieves competitive results compared to previous\napproaches across several datasets, both in zero-shot and non-zero-shot\nsettings.\n","authors":["Quang-Huy Nguyen","Jin Peng Zhou","Zhenzhen Liu","Khanh-Huyen Bui","Kilian Q. Weinberger","Dung D. Le"],"pdf_url":"https://arxiv.org/pdf/2402.03292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04555v1","updated":"2024-02-07T03:19:02Z","published":"2024-02-07T03:19:02Z","title":"FM-Fusion: Instance-aware Semantic Mapping Boosted by Vision-Language\n Foundation Models","summary":" Semantic mapping based on the supervised object detectors is sensitive to\nimage distribution. 
In real-world environments, the object detection and\nsegmentation performance can suffer a major drop, preventing the use of\nsemantic mapping in a wider domain. On the other hand, the development of\nvision-language foundation models demonstrates a strong zero-shot\ntransferability across data distributions. It provides an opportunity to\nconstruct generalizable instance-aware semantic maps. Hence, this work explores\nhow to boost instance-aware semantic mapping from object detection generated\nfrom foundation models. We propose a probabilistic label fusion method to\npredict close-set semantic classes from open-set label measurements. An\ninstance refinement module merges the over-segmented instances caused by\ninconsistent segmentation. We integrate all the modules into a unified semantic\nmapping system. Reading a sequence of RGB-D inputs, our work incrementally\nreconstructs an instance-aware semantic map. We evaluate the zero-shot\nperformance of our method on the ScanNet and SceneNN datasets. Our method achieves\n40.3 mean average precision (mAP) on the ScanNet semantic instance segmentation\ntask. It outperforms the traditional semantic mapping method significantly.\n","authors":["Chuhao Liu","Ke Wang","Jieqi Shi","Zhijian Qiao","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2402.04555v1.pdf","comment":"Accepted by IEEE RA-L"},{"id":"http://arxiv.org/abs/2402.04554v1","updated":"2024-02-07T03:18:34Z","published":"2024-02-07T03:18:34Z","title":"BirdNeRF: Fast Neural Reconstruction of Large-Scale Scenes From Aerial\n Imagery","summary":" In this study, we introduce BirdNeRF, an adaptation of Neural Radiance Fields\n(NeRF) designed specifically for reconstructing large-scale scenes using aerial\nimagery. Unlike previous research focused on small-scale and object-centric\nNeRF reconstruction, our approach addresses multiple challenges, including (1)\nAddressing the issue of slow training and rendering associated with large\nmodels. (2) Meeting the computational demands necessitated by modeling a\nsubstantial number of images, requiring extensive resources such as\nhigh-performance GPUs. (3) Overcoming significant artifacts and low visual\nfidelity commonly observed in large-scale reconstruction tasks due to limited\nmodel capacity. Specifically, we present a novel bird-view pose-based spatial\ndecomposition algorithm that decomposes a large aerial image set into multiple\nsmall sets with appropriately sized overlaps, allowing us to train an individual\nNeRF for each sub-scene. This decomposition approach not only decouples rendering\ntime from the scene size but also enables rendering to scale seamlessly to\narbitrarily large environments. Moreover, it allows for per-block updates of\nthe environment, enhancing the flexibility and adaptability of the\nreconstruction process. Additionally, we propose a projection-guided novel view\nre-rendering strategy, which aids in effectively utilizing the independently\ntrained sub-scenes to generate superior rendering results.
We evaluate our\napproach on existing datasets as well as against our own drone footage,\nimproving reconstruction speed by 10x over classical photogrammetry software\nand 50x over a state-of-the-art large-scale NeRF solution, on a single GPU with\nsimilar rendering quality.\n","authors":["Huiqing Zhang","Yifei Xue","Ming Liao","Yizhen Lao"],"pdf_url":"https://arxiv.org/pdf/2402.04554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04541v1","updated":"2024-02-07T02:57:40Z","published":"2024-02-07T02:57:40Z","title":"BRI3L: A Brightness Illusion Image Dataset for Identification and\n Localization of Regions of Illusory Perception","summary":" Visual illusions play a significant role in understanding visual perception.\nCurrent methods in understanding and evaluating visual illusions are mostly\ndeterministic filtering-based approaches; they evaluate on a handful of visual\nillusions, and the conclusions, therefore, are not generic. To this end, we\ngenerate a large-scale dataset of 22,366 images (BRI3L: BRightness Illusion\nImage dataset for Identification and Localization of illusory perception) of\nthe five types of brightness illusions and benchmark the dataset using\ndata-driven neural network-based approaches. The dataset contains label\ninformation - (1) whether a particular image is illusory/nonillusory, (2) the\nsegmentation mask of the illusory region of the image. Hence, both the\nclassification and segmentation tasks can be evaluated using this dataset. We\nfollow the standard psychophysical experiments involving human subjects to\nvalidate the dataset. To the best of our knowledge, this is the first attempt\nto develop a dataset of visual illusions and benchmark it using a data-driven\napproach for illusion classification and localization. We consider five\nwell-studied types of brightness illusions: 1) Hermann grid, 2) Simultaneous\nBrightness Contrast, 3) White illusion, 4) Grid illusion, and 5) Induced\nGrating illusion. Benchmarking on the dataset achieves 99.56% accuracy in\nillusion identification and 84.37% pixel accuracy in illusion localization. The\ndeep learning model, it is shown, also generalizes over unseen\nbrightness illusions like brightness assimilation to contrast transitions. We\nalso test the ability of state-of-the-art diffusion models to generate\nbrightness illusions. We have provided all the code, dataset, instructions, etc.\nin the GitHub repo: https://github.com/aniket004/BRI3L\n","authors":["Aniket Roy","Anirban Roy","Soma Mitra","Kuntal Ghosh"],"pdf_url":"https://arxiv.org/pdf/2402.04541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01946v4","updated":"2024-02-07T02:56:52Z","published":"2023-07-04T22:42:55Z","title":"ECG-Image-Kit: A Synthetic Image Generation Toolbox to Facilitate Deep\n Learning-Based Electrocardiogram Digitization","summary":" Cardiovascular diseases are a major cause of mortality globally, and\nelectrocardiograms (ECGs) are crucial for diagnosing them. Traditionally, ECGs\nare printed on paper. However, these printouts, even when scanned, are\nincompatible with advanced ECG diagnosis software that requires time-series\ndata. Digitizing ECG images is vital for training machine learning models in\nECG diagnosis and to leverage the extensive global archives collected over\ndecades. Deep learning models for image processing are promising in this\nregard, although the lack of clinical ECG archives with reference time-series\ndata is challenging.
Data augmentation techniques using realistic generative\ndata models provide a solution.\n We introduce ECG-Image-Kit, an open-source toolbox for generating synthetic\nmulti-lead ECG images with realistic artifacts from time-series data. The tool\nsynthesizes ECG images from real time-series data, applying distortions like\ntext artifacts, wrinkles, and creases on a standard ECG paper background.\n As a case study, we used ECG-Image-Kit to create a dataset of 21,801 ECG\nimages from the PhysioNet QT database. We developed and trained a combination\nof a traditional computer vision and deep neural network model on this dataset\nto convert synthetic images into time-series data for evaluation. We assessed\ndigitization quality by calculating the signal-to-noise ratio (SNR) and\ncompared clinical parameters like QRS width, RR, and QT intervals recovered\nfrom this pipeline, with the ground truth extracted from ECG time-series. The\nresults show that this deep learning pipeline accurately digitizes paper ECGs,\nmaintaining clinical parameters, and highlights a generative approach to\ndigitization. This toolbox currently supports data augmentation for the 2024\nPhysioNet Challenge, focusing on digitizing and classifying paper ECG images.\n","authors":["Kshama Kodthalu Shivashankara"," Deepanshi","Afagh Mehri Shervedani","Gari D. Clifford","Matthew A. Reyna","Reza Sameni"],"pdf_url":"https://arxiv.org/pdf/2307.01946v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04027v6","updated":"2024-02-07T02:44:24Z","published":"2023-04-08T14:40:35Z","title":"NeBLa: Neural Beer-Lambert for 3D Reconstruction of Oral Structures from\n Panoramic Radiographs","summary":" Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality\nfor dental examination. However, PX only provides a flattened 2D image, lacking\nin a 3D view of the oral structure. In this paper, we propose NeBLa (Neural\nBeer-Lambert) to estimate 3D oral structures from real-world PX. NeBLa tackles\nfull 3D reconstruction for varying subjects (patients) where each\nreconstruction is based only on a single panoramic image. We create an\nintermediate representation called simulated PX (SimPX) from 3D Cone-beam\ncomputed tomography (CBCT) data based on the Beer-Lambert law of X-ray\nrendering and rotational principles of PX imaging. SimPX aims at not only\ntruthfully simulating PX, but also facilitates the reverting process back to 3D\ndata. We propose a novel neural model based on ray tracing which exploits both\nglobal and local input features to convert SimPX to 3D output. At inference, a\nreal PX image is translated to a SimPX-style image with semantic\nregularization, and the translated image is processed by generation module to\nproduce high-quality outputs. Experiments show that NeBLa outperforms prior\nstate-of-the-art in reconstruction tasks both quantitatively and qualitatively.\nUnlike prior methods, NeBLa does not require any prior information such as the\nshape of dental arches, nor the matched PX-CBCT dataset for training, which is\ndifficult to obtain in clinical practice. 
Our code is available at\nhttps://github.com/sihwa-park/nebla.\n","authors":["Sihwa Park","Seongjun Kim","Doeyoung Kwon","Yohan Jang","In-Seok Song","Seung Jun Baek"],"pdf_url":"https://arxiv.org/pdf/2304.04027v6.pdf","comment":"18 pages, 16 figures, Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2307.13492v2","updated":"2024-02-07T02:37:04Z","published":"2023-07-25T13:35:45Z","title":"NormAUG: Normalization-guided Augmentation for Domain Generalization","summary":" Deep learning has made significant advancements in supervised learning.\nHowever, models trained in this setting often face challenges due to domain\nshift between training and test sets, resulting in a significant drop in\nperformance during testing. To address this issue, several domain\ngeneralization methods have been developed to learn robust and domain-invariant\nfeatures from multiple training domains that can generalize well to unseen test\ndomains. Data augmentation plays a crucial role in achieving this goal by\nenhancing the diversity of the training data. In this paper, inspired by the\nobservation that normalizing an image with different statistics generated by\ndifferent batches with various domains can perturb its feature, we propose a\nsimple yet effective method called NormAUG (Normalization-guided Augmentation).\nOur method includes two paths: the main path and the auxiliary (augmented)\npath. During training, the auxiliary path includes multiple sub-paths, each\ncorresponding to batch normalization for a single domain or a random\ncombination of multiple domains. This introduces diverse information at the\nfeature level and improves the generalization of the main path. Moreover, our\nNormAUG method effectively reduces the existing upper boundary for\ngeneralization based on theoretical perspectives. During the test stage, we\nleverage an ensemble strategy to combine the predictions from the auxiliary\npath of our model, further boosting performance. Extensive experiments are\nconducted on multiple benchmark datasets to validate the effectiveness of our\nproposed method.\n","authors":["Lei Qi","Hongpeng Yang","Yinghuan Shi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2307.13492v2.pdf","comment":"Accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2402.04519v1","updated":"2024-02-07T01:57:56Z","published":"2024-02-07T01:57:56Z","title":"BioDrone: A Bionic Drone-based Single Object Tracking Benchmark for\n Robust Vision","summary":" Single object tracking (SOT) is a fundamental problem in computer vision,\nwith a wide range of applications, including autonomous driving, augmented\nreality, and robot navigation. The robustness of SOT faces two main challenges:\ntiny target and fast motion. These challenges are especially manifested in\nvideos captured by unmanned aerial vehicles (UAV), where the target is usually\nfar away from the camera and often with significant motion relative to the\ncamera. To evaluate the robustness of SOT methods, we propose BioDrone -- the\nfirst bionic drone-based visual benchmark for SOT. Unlike existing UAV\ndatasets, BioDrone features videos captured from a flapping-wing UAV system\nwith a major camera shake due to its aerodynamics. BioDrone hence highlights\nthe tracking of tiny targets with drastic changes between consecutive frames,\nproviding a new robust vision benchmark for SOT. 
To date, BioDrone offers the\nlargest UAV-based SOT benchmark with high-quality fine-grained manual\nannotations and automatically generated frame-level labels, designed for robust\nvision analyses. Leveraging our proposed BioDrone, we conduct a systematic\nevaluation of existing SOT methods, comparing the performance of 20\nrepresentative models and studying novel means of optimizing a SOTA method\n(KeepTrack) for robust SOT. Our evaluation leads to new baselines and\ninsights for robust SOT. Moving forward, we hope that BioDrone will not only\nserve as a high-quality benchmark for robust SOT, but also invite future\nresearch into robust computer vision. The database, toolkits, evaluation\nserver, and baseline results are available at http://biodrone.aitestunion.com.\n","authors":["Xin Zhao","Shiyu Hu","Yipei Wang","Jing Zhang","Yimin Hu","Rongshuai Liu","Haibin Ling","Yin Li","Renshu Li","Kun Liu","Jiadong Li"],"pdf_url":"https://arxiv.org/pdf/2402.04519v1.pdf","comment":"This paper is published in IJCV (refer to DOI). Please cite the\n published IJCV"},{"id":"http://arxiv.org/abs/2306.11305v3","updated":"2024-02-07T01:52:30Z","published":"2023-06-20T06:02:19Z","title":"Progressive Fourier Neural Representation for Sequential Video\n Compilation","summary":" Neural Implicit Representation (NIR) has recently gained significant\nattention due to its remarkable ability to encode complex and high-dimensional\ndata into representation space and easily reconstruct it through a trainable\nmapping function. However, NIR methods assume a one-to-one mapping between the\ntarget data and representation models regardless of data relevancy or\nsimilarity. This results in poor generalization over multiple complex data and\nlimits their efficiency and scalability. Motivated by continual learning, this\nwork investigates how to accumulate and transfer neural implicit\nrepresentations for multiple complex video data over sequential encoding\nsessions. To overcome the limitation of NIR, we propose a novel method,\nProgressive Fourier Neural Representation (PFNR), that aims to find an adaptive\nand compact sub-module in Fourier space to encode videos in each training\nsession. This sparsified neural encoding allows the neural network to hold free\nweights, enabling an improved adaptation for future videos. In addition, when\nlearning a representation for a new video, PFNR transfers the representation of\nprevious videos with frozen weights. This design allows the model to\ncontinuously accumulate high-quality neural representations for multiple videos\nwhile ensuring lossless decoding that perfectly preserves the learned\nrepresentations for previous videos. We validate our PFNR method on the UVG8/17\nand DAVIS50 video sequence benchmarks and achieve impressive performance gains\nover strong continual learning baselines. The PFNR code is available at\nhttps://github.com/ihaeyong/PFNR.git.\n","authors":["Haeyong Kang","Jaehong Yoon","DaHyun Kim","Sung Ju Hwang","Chang D Yoo"],"pdf_url":"https://arxiv.org/pdf/2306.11305v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03697v2","updated":"2024-02-07T01:33:37Z","published":"2024-02-06T04:33:51Z","title":"SHMC-Net: A Mask-guided Feature Fusion Network for Sperm Head Morphology\n Classification","summary":" Male infertility accounts for about one-third of global infertility cases.\nManual assessment of sperm abnormalities through head morphology analysis\nencounters issues of observer variability and diagnostic discrepancies among\nexperts.
Its alternative, Computer-Assisted Semen Analysis (CASA), suffers from\nlow-quality sperm images, small datasets, and noisy class labels. We propose a\nnew approach for sperm head morphology classification, called SHMC-Net, which\nuses segmentation masks of sperm heads to guide the morphology classification\nof sperm images. SHMC-Net generates reliable segmentation masks using image\npriors, refines object boundaries with an efficient graph-based method, and\ntrains an image network with sperm head crops and a mask network with the\ncorresponding masks. In the intermediate stages of the networks, image and mask\nfeatures are fused with a fusion scheme to better learn morphological features.\nTo handle noisy class labels and regularize training on small datasets,\nSHMC-Net applies Soft Mixup to combine mixup augmentation and a loss function.\nWe achieve state-of-the-art results on SCIAN and HuSHeM datasets, outperforming\nmethods that use additional pre-training or costly ensembling techniques.\n","authors":["Nishchal Sapkota","Yejia Zhang","Sirui Li","Peixian Liang","Zhuo Zhao","Danny Z Chen"],"pdf_url":"https://arxiv.org/pdf/2402.03697v2.pdf","comment":"A shorter version is published on ISBI 2024"},{"id":"http://arxiv.org/abs/2402.04507v1","updated":"2024-02-07T01:26:14Z","published":"2024-02-07T01:26:14Z","title":"A Review on Digital Pixel Sensors","summary":" Digital pixel sensor (DPS) has evolved as a pivotal component in modern\nimaging systems and has the potential to revolutionize various fields such as\nmedical imaging, astronomy, surveillance, IoT devices, etc. Compared to analog\npixel sensors, the DPS offers high speed and good image quality. However, the\nintroduced intrinsic complexity within each pixel, primarily attributed to the\naccommodation of the ADC circuit, engenders a substantial increase in the pixel\npitch. Unfortunately, such a pronounced escalation in pixel pitch drastically\nundermines the feasibility of achieving high-density integration, which is an\nobstacle that significantly narrows down the field of potential applications.\nNonetheless, designing compact conversion circuits along with strategic\nintegration of 3D architectural paradigms can be a potential remedy to the\nprevailing situation. This review article presents a comprehensive overview of\nthe vast area of DPS technology. The operating principles, advantages, and\nchallenges of different types of DPS circuits have been analyzed. We categorize\nthe schemes into several categories based on ADC operation. A comparative study\nbased on different performance metrics has also been showcased for a\nwell-rounded understanding.\n","authors":["Md Rahatul Islam Udoy","Shamiul Alam","Md Mazharul Islam","Akhilesh Jaiswal","Ahmedullah Aziz"],"pdf_url":"https://arxiv.org/pdf/2402.04507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04504v1","updated":"2024-02-07T01:18:49Z","published":"2024-02-07T01:18:49Z","title":"Text2Street: Controllable Text-to-image Generation for Street Views","summary":" Text-to-image generation has made remarkable progress with the emergence of\ndiffusion models. However, it is still a difficult task to generate images for\nstreet views based on text, mainly because the road topology of street scenes\nis complex, the traffic status is diverse and the weather condition is various,\nwhich makes conventional text-to-image models difficult to deal with. To\naddress these challenges, we propose a novel controllable text-to-image\nframework, named \\textbf{Text2Street}. 
In the framework, we first introduce the\nlane-aware road topology generator, which achieves text-to-map generation with\nthe accurate road structure and lane lines armed with the counting adapter,\nrealizing the controllable road topology generation. Then, the position-based\nobject layout generator is proposed to obtain text-to-layout generation through\nan object-level bounding box diffusion strategy, realizing the controllable\ntraffic object layout generation. Finally, the multiple control image generator\nis designed to integrate the road topology, object layout and weather\ndescription to realize controllable street-view image generation. Extensive\nexperiments show that the proposed approach achieves controllable street-view\ntext-to-image generation and validates the effectiveness of the Text2Street\nframework for street views.\n","authors":["Jinming Su","Songen Gu","Yiting Duan","Xingyue Chen","Junfeng Luo"],"pdf_url":"https://arxiv.org/pdf/2402.04504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04492v1","updated":"2024-02-07T00:31:49Z","published":"2024-02-07T00:31:49Z","title":"ColorSwap: A Color and Word Order Dataset for Multimodal Evaluation","summary":" This paper introduces the ColorSwap dataset, designed to assess and improve\nthe proficiency of multimodal models in matching objects with their colors. The\ndataset is comprised of 2,000 unique image-caption pairs, grouped into 1,000\nexamples. Each example includes a caption-image pair, along with a\n``color-swapped'' pair. We follow the Winoground schema: the two captions in an\nexample have the same words, but the color words have been rearranged to modify\ndifferent objects. The dataset was created through a novel blend of automated\ncaption and image generation with humans in the loop. We evaluate image-text\nmatching (ITM) and visual language models (VLMs) and find that even the latest\nones are still not robust at this task. GPT-4V and LLaVA score 72% and 42% on\nour main VLM metric, although they may improve with more advanced prompting\ntechniques. On the main ITM metric, contrastive models such as CLIP and SigLIP\nperform close to chance (at 12% and 30%, respectively), although the\nnon-contrastive BLIP ITM model is stronger (87%). We also find that finetuning\non fewer than 2,000 examples yields significant performance gains on this\nout-of-distribution word-order understanding task. The dataset is here:\nhttps://github.com/Top34051/colorswap.\n","authors":["Jirayu Burapacheep","Ishan Gaur","Agam Bhatia","Tristan Thrush"],"pdf_url":"https://arxiv.org/pdf/2402.04492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04482v1","updated":"2024-02-07T00:14:32Z","published":"2024-02-07T00:14:32Z","title":"BEBLID: Boosted efficient binary local image descriptor","summary":" Efficient matching of local image features is a fundamental task in many\ncomputer vision applications. However, the real-time performance of top\nmatching algorithms is compromised in computationally limited devices, such as\nmobile phones or drones, due to the simplicity of their hardware and their\nfinite energy supply. In this paper we introduce BEBLID, an efficient learned\nbinary image descriptor. It improves our previous real-valued descriptor,\nBELID, making it both more efficient for matching and more accurate. To this\nend we use AdaBoost with an improved weak-learner training scheme that produces\nbetter local descriptions. 
Further, we binarize our descriptor by forcing all\nweak-learners to have the same weight in the strong learner combination and\ntrain it on an unbalanced data set to address the asymmetries arising in\nmatching and retrieval tasks. In our experiments BEBLID achieves an accuracy\nclose to SIFT and better computational efficiency than ORB, the fastest\nalgorithm in the literature.\n","authors":["Iago Suárez","Ghesn Sfeir","José M. Buenaposada","Luis Baumela"],"pdf_url":"https://arxiv.org/pdf/2402.04482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04480v1","updated":"2024-02-07T00:10:39Z","published":"2024-02-07T00:10:39Z","title":"MIRT: a simultaneous reconstruction and affine motion compensation\n technique for four dimensional computed tomography (4DCT)","summary":" In four-dimensional computed tomography (4DCT), 3D images of moving or\ndeforming samples are reconstructed from a set of 2D projection images. Recent\ntechniques for iterative motion-compensated reconstruction either necessitate a\nreference acquisition or alternate image reconstruction and motion estimation\nsteps. In these methods, the motion estimation step involves the estimation of\neither complete deformation vector fields (DVFs) or a limited set of parameters\ncorresponding to the affine motion, including rigid motion or scaling. The\nmajority of these approaches rely on nested iterations, incurring significant\ncomputational expenses. Notably, despite the direct benefits of an analytical\nformulation and a substantial reduction in computational complexity, there has\nbeen no exploration into parameterizing DVFs for general affine motion in CT\nimaging. In this work, we propose the Motion-compensated Iterative\nReconstruction Technique (MIRT), an efficient iterative reconstruction scheme\nthat combines image reconstruction and affine motion estimation in a single\nupdate step, based on the analytical gradients of the motion towards both the\nreconstruction and the affine motion parameters. While most of the\nstate-of-the-art 4DCT methods have not been tested on real data,\nresults from simulation and real experiments show that our method outperforms\nstate-of-the-art CT reconstruction methods with affine motion correction in\ncomputational feasibility and projection distance. In particular, this allows\naccurate reconstruction of a microscale diamond in the presence of\nmotion from the practically acquired projection radiographs, which leads to a\nnovel application of 4DCT.\n","authors":["Anh-Tuan Nguyen","Jens Renders","Domenico Iuso","Yves Maris","Jeroen Soete","Martine Wevers","Jan Sijbers","Jan De Beenhouwer"],"pdf_url":"https://arxiv.org/pdf/2402.04480v1.pdf","comment":"Submitted to the SIAM Journal on Imaging Sciences (SIIMS)"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2402.05070v1","updated":"2024-02-07T18:21:17Z","published":"2024-02-07T18:21:17Z","title":"A Roadmap to Pluralistic Alignment","summary":" With increased power and prevalence of AI systems, it is ever more critical\nthat AI systems are designed to serve all, i.e., people with diverse values and\nperspectives. However, aligning models to serve pluralistic human values\nremains an open research question. In this piece, we propose a roadmap to\npluralistic alignment, specifically using language models as a test bed.
We\nidentify and formalize three possible ways to define and operationalize\npluralism in AI systems: 1) Overton pluralistic models that present a spectrum\nof reasonable responses; 2) Steerably pluralistic models that can steer to\nreflect certain perspectives; and 3) Distributionally pluralistic models that\nare well-calibrated to a given population in distribution. We also propose and\nformalize three possible classes of pluralistic benchmarks: 1) Multi-objective\nbenchmarks, 2) Trade-off steerable benchmarks, which incentivize models to\nsteer to arbitrary trade-offs, and 3) Jury-pluralistic benchmarks which\nexplicitly model diverse human ratings. We use this framework to argue that\ncurrent alignment techniques may be fundamentally limited for pluralistic AI;\nindeed, we highlight empirical evidence, both from our own experiments and from\nother work, that standard alignment procedures might reduce distributional\npluralism in models, motivating the need for further research on pluralistic\nalignment.\n","authors":["Taylor Sorensen","Jared Moore","Jillian Fisher","Mitchell Gordon","Niloofar Mireshghallah","Christopher Michael Rytting","Andre Ye","Liwei Jiang","Ximing Lu","Nouha Dziri","Tim Althoff","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2402.05070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00775v2","updated":"2024-02-07T17:30:47Z","published":"2024-01-01T14:41:10Z","title":"Recent Advances in Text Analysis","summary":" Text analysis is an interesting research area in data science and has various\napplications, such as in artificial intelligence, biomedical research, and\nengineering. We review popular methods for text analysis, ranging from topic\nmodeling to the recent neural language models. In particular, we review\nTopic-SCORE, a statistical approach to topic modeling, and discuss how to use\nit to analyze MADStat - a dataset on statistical publications that we collected\nand cleaned.\n The application of Topic-SCORE and other methods on MADStat leads to\ninteresting findings. For example, $11$ representative topics in statistics are\nidentified. For each journal, the evolution of topic weights over time can be\nvisualized, and these results are used to analyze the trends in statistical\nresearch. In particular, we propose a new statistical model for ranking the\ncitation impacts of $11$ topics, and we also build a cross-topic citation graph\nto illustrate how research results on different topics spread to one another.\n The results on MADStat provide a data-driven picture of the statistical\nresearch in $1975$--$2015$, from a text analysis perspective.\n","authors":["Zheng Tracy Ke","Pengsheng Ji","Jiashun Jin","Wanshan Li"],"pdf_url":"https://arxiv.org/pdf/2401.00775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16108v3","updated":"2024-02-07T15:18:51Z","published":"2024-01-29T12:27:18Z","title":"Future Impact Decomposition in Request-level Recommendations","summary":" In recommender systems, reinforcement learning solutions have shown promising\nresults in optimizing the interaction sequence between users and the system\nover the long-term performance. For practical reasons, the policy's actions are\ntypically designed as recommending a list of items to handle users' frequent\nand continuous browsing requests more efficiently. In this list-wise\nrecommendation scenario, the user state is updated upon every request in the\ncorresponding MDP formulation. 
However, this request-level formulation is\nessentially inconsistent with the user's item-level behavior. In this study, we\ndemonstrate that an item-level optimization approach can better utilize item\ncharacteristics and optimize the policy's performance even under the\nrequest-level MDP. We support this claim by comparing the performance of\nstandard request-level methods with the proposed item-level actor-critic\nframework in both simulation and online experiments. Furthermore, we show that\na reward-based future decomposition strategy can better express the item-wise\nfuture impact and improve the recommendation accuracy in the long term. To\nachieve a more thorough understanding of the decomposition strategy, we propose\na model-based re-weighting framework with adversarial learning that further\nboosts the performance, and we investigate its correlation with the reward-based\nstrategy.\n","authors":["Xiaobei Wang","Shuchang Liu","Xueliang Wang","Qingpeng Cai","Lantao Hu","Han Li","Peng Jiang","Guangming Xie"],"pdf_url":"https://arxiv.org/pdf/2401.16108v3.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.09438v2","updated":"2024-02-07T14:41:40Z","published":"2023-11-15T23:18:01Z","title":"Labeled Interactive Topic Models","summary":" Topic models are valuable for understanding extensive document collections,\nbut they don't always identify the most relevant topics. Classical\nprobabilistic and anchor-based topic models offer interactive versions that\nallow users to guide the models towards more pertinent topics. However, such\ninteractive features have been lacking in neural topic models. To correct this\nlacuna, we introduce a user-friendly interaction for neural topic models. This\ninteraction permits users to assign a word label to a topic, leading to an\nupdate in the topic model where the words in the topic become closely aligned\nwith the given label. Our approach encompasses two distinct kinds of neural\ntopic models. The first includes models where topic embeddings are trainable\nand evolve during the training process. The second kind involves models where\ntopic embeddings are integrated post-training, offering a different approach to\ntopic refinement. To facilitate user interaction with these neural topic\nmodels, we have developed an interactive interface. This interface enables\nusers to engage with and re-label topics as desired. We evaluate our method\nthrough a human study, where users can relabel topics to find relevant\ndocuments. Using our method, user labeling improves document rank scores,\nhelping to find documents more relevant to a given query when compared to no\nuser labeling.\n","authors":["Kyle Seelman","Mozhi Zhang","Jordan Boyd-Graber"],"pdf_url":"https://arxiv.org/pdf/2311.09438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04889v1","updated":"2024-02-07T14:22:51Z","published":"2024-02-07T14:22:51Z","title":"Detecting Generated Native Ads in Conversational Search","summary":" Conversational search engines such as YouChat and Microsoft Copilot use large\nlanguage models (LLMs) to generate answers to queries. It is only a small step\nto also use this technology to generate and integrate advertising within these\nanswers - instead of placing ads separately from the organic search results.\nThis type of advertising is reminiscent of native advertising and product\nplacement, both of which are very effective forms of subtle and manipulative\nadvertising.
It is likely that information seekers will be confronted with such\nuse of LLM technology in the near future, especially when considering the high\ncomputational costs associated with LLMs, for which providers need to develop\nsustainable business models. This paper investigates whether LLMs can also be\nused as a countermeasure against generated native ads, i.e., to block them. For\nthis purpose, we compile a large dataset of ad-prone queries and of generated\nanswers with automatically integrated ads to experiment with fine-tuned\nsentence transformers and state-of-the-art LLMs on the task of recognizing the\nads. In our experiments, sentence transformers achieve detection precision and\nrecall values above 0.9, while the investigated LLMs struggle with the task.\n","authors":["Sebastian Schmidt","Ines Zelch","Janek Bevendorff","Benno Stein","Matthias Hagen","Martin Potthast"],"pdf_url":"https://arxiv.org/pdf/2402.04889v1.pdf","comment":"Submitted to WWW'24 Short Papers Track; 4 pages"},{"id":"http://arxiv.org/abs/2402.04867v1","updated":"2024-02-07T14:07:47Z","published":"2024-02-07T14:07:47Z","title":"Multimodal Query Suggestion with Multi-Agent Reinforcement Learning from\n Human Feedback","summary":" In the rapidly evolving landscape of information retrieval, search engines\nstrive to provide more personalized and relevant results to users. Query\nsuggestion systems play a crucial role in achieving this goal by assisting\nusers in formulating effective queries. However, existing query suggestion\nsystems mainly rely on textual inputs, potentially limiting user search\nexperiences for querying images. In this paper, we introduce a novel Multimodal\nQuery Suggestion (MMQS) task, which aims to generate query suggestions based on\nuser query images to improve the intentionality and diversity of search\nresults. We present the RL4Sugg framework, leveraging the power of Large\nLanguage Models (LLMs) with Multi-Agent Reinforcement Learning from Human\nFeedback to optimize the generation process. Through comprehensive experiments,\nwe validate the effectiveness of RL4Sugg, demonstrating an 18% improvement\ncompared to the best existing approach. Moreover, the MMQS has been transferred\ninto real-world search engine products, which yield enhanced user engagement.\nOur research advances query suggestion systems and provides a new perspective\non multimodal information retrieval.\n","authors":["Zheng Wang","Bingzheng Gan","Wei Shi"],"pdf_url":"https://arxiv.org/pdf/2402.04867v1.pdf","comment":"This paper has been accepted by WWW 2024"},{"id":"http://arxiv.org/abs/2402.04853v1","updated":"2024-02-07T13:52:11Z","published":"2024-02-07T13:52:11Z","title":"Leveraging LLMs for Unsupervised Dense Retriever Ranking","summary":" This paper introduces a novel unsupervised technique that utilizes large\nlanguage models (LLMs) to determine the most suitable dense retriever for a\nspecific test (target) corpus. Selecting the appropriate dense retriever is\nvital for numerous IR applications that employ these retrievers, trained on\npublic datasets, to encode or conduct searches within a new private target\ncorpus. The effectiveness of a dense retriever can significantly diminish when\napplied to a target corpus that diverges in domain or task from the original\ntraining set. The problem becomes more pronounced in cases where the target\ncorpus is unlabeled, e.g. in zero-shot scenarios, rendering direct evaluation\nof the model's effectiveness on the target corpus unattainable.
Therefore, the\nunsupervised selection of an optimally pre-trained dense retriever, especially\nunder conditions of domain shift, emerges as a critical challenge. Existing\nmethodologies for ranking dense retrievers fall short in addressing these\ndomain shift scenarios.\n To tackle this, our method capitalizes on LLMs to create pseudo-relevant\nqueries, labels, and reference lists by analyzing a subset of documents from\nthe target corpus. This allows for the ranking of dense retrievers based on\ntheir performance with these pseudo-relevant signals. Significantly, this\nstrategy is the first to depend exclusively on the target corpus data, removing\nthe necessity for training data and test labels. We assessed the effectiveness\nof our approach by compiling a comprehensive pool of cutting-edge dense\nretrievers and comparing our method against traditional dense retriever\nselection benchmarks. The findings reveal that our proposed solution surpasses\nthe existing benchmarks in both the selection and ranking of dense retrievers.\n","authors":["Ekaterina Khramtsova","Shengyao Zhuang","Mahsa Baktashmotlagh","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2402.04853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04713v1","updated":"2024-02-07T10:05:42Z","published":"2024-02-07T10:05:42Z","title":"Theoretical and Empirical Analysis of Adaptive Entry Point Selection for\n Graph-based Approximate Nearest Neighbor Search","summary":" We present a theoretical and empirical analysis of the adaptive entry point\nselection for graph-based approximate nearest neighbor search (ANNS). We\nintroduce novel concepts: $b\\textit{-monotonic path}$ and $B\\textit{-MSNET}$,\nwhich better capture an actual graph in practical algorithms than existing\nconcepts like MSNET. We prove that adaptive entry point selection offers better\nperformance upper bound than the fixed central entry point under more general\nconditions than previous work. Empirically, we validate the method's\neffectiveness in accuracy, speed, and memory usage across various datasets,\nespecially in challenging scenarios with out-of-distribution data and hard\ninstances. Our comprehensive study provides deeper insights into optimizing\nentry points for graph-based ANNS for real-world high-dimensional data\napplications.\n","authors":["Yutaro Oguri","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2402.04713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04627v1","updated":"2024-02-07T07:24:01Z","published":"2024-02-07T07:24:01Z","title":"SPARQL Generation: an analysis on fine-tuning OpenLLaMA for Question\n Answering over a Life Science Knowledge Graph","summary":" The recent success of Large Language Models (LLM) in a wide range of Natural\nLanguage Processing applications opens the path towards novel Question\nAnswering Systems over Knowledge Graphs leveraging LLMs. However, one of the\nmain obstacles preventing their implementation is the scarcity of training data\nfor the task of translating questions into corresponding SPARQL queries,\nparticularly in the case of domain-specific KGs. To overcome this challenge, in\nthis study, we evaluate several strategies for fine-tuning the OpenLlama LLM\nfor question answering over life science knowledge graphs. 
In particular, we\npropose an end-to-end data augmentation approach for extending a set of\nexisting queries over a given knowledge graph towards a larger dataset of\nsemantically enriched question-to-SPARQL query pairs, enabling fine-tuning even\nfor datasets where these pairs are scarce. In this context, we also investigate\nthe role of semantic \"clues\" in the queries, such as meaningful variable names\nand inline comments. Finally, we evaluate our approach over the real-world Bgee\ngene expression knowledge graph and we show that semantic clues can improve\nmodel performance by up to 33% compared to a baseline with random variable\nnames and no comments included.\n","authors":["Julio C. Rangel","Tarcisio Mendes de Farias","Ana Claudia Sima","Norio Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2402.04627v1.pdf","comment":"To appear in Proceedings of SWAT4HCLS 2024: Semantic Web Tools and\n Applications for Healthcare and Life Sciences"},{"id":"http://arxiv.org/abs/2307.06985v5","updated":"2024-02-07T05:42:12Z","published":"2023-07-13T17:25:28Z","title":"Engineering Design Knowledge Graphs from Patented Artefact Descriptions\n for Retrieval-Augmented Generation in the Design Process","summary":" Despite significant popularity, Large-language Models (LLMs) require\nexplicit, contextual facts to support domain-specific knowledge-intensive tasks\nin the design process. The applications built using LLMs should hence adopt\nRetrieval-Augmented Generation (RAG) to better suit the design process. In this\narticle, we present a data-driven method to identify explicit facts from patent\ndocuments that provide standard descriptions of over 8 million artefacts. In\nour method, we train roBERTa Transformer-based sequence classification models\nusing our dataset of 44,227 sentences and facts. Upon classifying tokens in a\nsentence as entities or relationships, our method uses another classifier to\nidentify specific relationship tokens for a given pair of entities so that\nexplicit facts of the form head entity :: relationship :: tail entity are\nidentified. In the benchmark approaches for constructing facts, we use linear\nclassifiers and Graph Neural Networks (GNNs) both incorporating BERT\nTransformer-based token embeddings to predict associations among the entities\nand relationships. We apply our method to 4,870 fan system related patents and\npopulate a knowledge base of around 3 million facts. Upon retrieving the facts\nrepresenting generalisable domain knowledge and the knowledge of specific\nsubsystems and issues, we demonstrate how these facts contextualise LLMs for\ngenerating text that is more relevant to the design process.\n","authors":["L Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2307.06985v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14595v2","updated":"2024-02-07T03:36:21Z","published":"2024-01-26T01:51:50Z","title":"Recency Ranking by Diversification of Result Set","summary":" In this paper, we propose a web search retrieval approach which automatically\ndetects recency sensitive queries and increases the freshness of the ordinary\ndocument ranking by a degree proportional to the probability of the need in\nrecent content. We propose to solve the recency ranking problem by using result\ndiversification principles and deal with the query's non-topical ambiguity\nappearing when the need in recent content can be detected only with\nuncertainty. 
Our offline and online experiments with millions of queries from\nreal search engine users demonstrate the significant increase in satisfaction\nof users presented with a search result generated by our approach.\n","authors":["Andrey Styskin","Fedor Romanenko","Fedor Vorobyev","Pavel Serdyukov"],"pdf_url":"https://arxiv.org/pdf/2401.14595v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04548v1","updated":"2024-02-07T03:05:54Z","published":"2024-02-07T03:05:54Z","title":"NORMY: Non-Uniform History Modeling for Open Retrieval Conversational\n Question Answering","summary":" Open Retrieval Conversational Question Answering (OrConvQA) answers a\nquestion given a conversation as context and a document collection. A typical\nOrConvQA pipeline consists of three modules: a Retriever to retrieve relevant\ndocuments from the collection, a Reranker to rerank them given the question and\nthe context, and a Reader to extract an answer span. The conversational turns\ncan provide valuable context to answer the final query. State-of-the-art\nOrConvQA systems use the same history modeling for all three modules of the\npipeline. We hypothesize this as suboptimal. Specifically, we argue that a\nbroader context is needed in the first modules of the pipeline to not miss\nrelevant documents, while a narrower context is needed in the last modules to\nidentify the exact answer span. We propose NORMY, the first unsupervised\nnon-uniform history modeling pipeline which generates the best conversational\nhistory for each module. We further propose a novel Retriever for NORMY, which\nemploys keyphrase extraction on the conversation history, and leverages\npassages retrieved in previous turns as additional context. We also created a\nnew dataset for OrConvQA, by expanding the doc2dial dataset. We implemented\nvarious state-of-the-art history modeling techniques and comprehensively\nevaluated them separately for each module of the pipeline on three datasets:\nOR-QUAC, our doc2dial extension, and ConvMix. Our extensive experiments show\nthat NORMY outperforms the state-of-the-art in the individual modules and in\nthe end-to-end system.\n","authors":["Muhammad Shihab Rashid","Jannat Ara Meem","Vagelis Hristidis"],"pdf_url":"https://arxiv.org/pdf/2402.04548v1.pdf","comment":"Accepted for publication at IEEE ICSC 2024"},{"id":"http://arxiv.org/abs/2402.04527v1","updated":"2024-02-07T02:14:58Z","published":"2024-02-07T02:14:58Z","title":"RA-Rec: An Efficient ID Representation Alignment Framework for LLM-based\n Recommendation","summary":" Large language models (LLM) have recently emerged as a powerful tool for a\nvariety of natural language processing tasks, bringing a new surge of combining\nLLM with recommendation systems, termed as LLM-based RS. Current approaches\ngenerally fall into two main paradigms, the ID direct usage paradigm and the ID\ntranslation paradigm, noting their core weakness stems from lacking\nrecommendation knowledge and uniqueness. To address this limitation, we propose\na new paradigm, ID representation, which incorporates pre-trained ID embeddings\ninto LLMs in a complementary manner. In this work, we present RA-Rec, an\nefficient ID representation alignment framework for LLM-based recommendation,\nwhich is compatible with multiple ID-based methods and LLM architectures.\nSpecifically, we treat ID embeddings as soft prompts and design an innovative\nalignment module and an efficient tuning method with tailored data construction\nfor alignment. 
Extensive experiments demonstrate RA-Rec substantially\noutperforms current state-of-the-art methods, achieving up to 3.0% absolute\nHitRate@100 improvements while utilizing less than 10x training data.\n","authors":["Xiaohan Yu","Li Zhang","Xin Zhao","Yue Wang","Zhongrui Ma"],"pdf_url":"https://arxiv.org/pdf/2402.04527v1.pdf","comment":"10 pages"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.05253v2","updated":"2024-02-07T18:59:55Z","published":"2023-12-08T18:59:14Z","title":"DiSK: A Diffusion Model for Structured Knowledge","summary":" Structured (dictionary-like) data presents challenges for left-to-right\nlanguage models, as they can struggle with structured entities for a wide\nvariety of reasons such as formatting and sensitivity to the order in which\nattributes are presented. Tabular generative models suffer from a different set\nof limitations such as their lack of flexibility. We introduce Diffusion Models\nof Structured Knowledge (DiSK) - a new architecture and training approach\nspecialized for structured data. DiSK handles text, categorical, and continuous\nnumerical data using a Gaussian mixture model approach, which allows for\nimproved precision when dealing with numbers. It employs diffusion training to\nmodel relationships between properties. Experiments demonstrate DiSK's\nstate-of-the-art performance on tabular data modeling, synthesis, and\nimputation on over 15 datasets across diverse domains. DiSK provides an\neffective inductive bias for generative modeling and manipulation of structured\ndata. The techniques we propose could open the door to improved knowledge\nmanipulation in future language models.\n","authors":["Ouail Kitouni","Niklas Nolte","James Hensman","Bhaskar Mitra"],"pdf_url":"https://arxiv.org/pdf/2312.05253v2.pdf","comment":"24 pages, 12 figures"},{"id":"http://arxiv.org/abs/2402.05110v1","updated":"2024-02-07T18:59:12Z","published":"2024-02-07T18:59:12Z","title":"Opening the AI black box: program synthesis via mechanistic\n interpretability","summary":" We present MIPS, a novel method for program synthesis based on automated\nmechanistic interpretability of neural networks trained to perform the desired\ntask, auto-distilling the learned algorithm into Python code. We test MIPS on a\nbenchmark of 62 algorithmic tasks that can be learned by an RNN and find it\nhighly complementary to GPT-4: MIPS solves 32 of them, including 13 that are\nnot solved by GPT-4 (which also solves 30). MIPS uses an integer autoencoder to\nconvert the RNN into a finite state machine, then applies Boolean or integer\nsymbolic regression to capture the learned algorithm. As opposed to large\nlanguage models, this program synthesis technique makes no use of (and is\ntherefore not limited by) human training data such as algorithms and code from\nGitHub. We discuss opportunities and challenges for scaling up this approach to\nmake machine-learned models more interpretable and trustworthy.\n","authors":["Eric J. Michaud","Isaac Liao","Vedang Lad","Ziming Liu","Anish Mudide","Chloe Loughridge","Zifan Carl Guo","Tara Rezaei Kheirkhah","Mateja Vukelić","Max Tegmark"],"pdf_url":"https://arxiv.org/pdf/2402.05110v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2402.05109v1","updated":"2024-02-07T18:58:50Z","published":"2024-02-07T18:58:50Z","title":"Hydra: Sequentially-Dependent Draft Heads for Medusa Decoding","summary":" To combat the memory bandwidth-bound nature of autoregressive LLM inference,\nprevious research has proposed the speculative decoding framework. 
To perform\nspeculative decoding, a small draft model proposes candidate continuations of\nthe input sequence, which are then verified in parallel by the base model. One\nway to specify the draft model, as used in the recent Medusa decoding\nframework, is as a collection of light-weight heads, called draft heads, that\noperate on the base model's hidden states. To date, all existing draft heads\nhave been sequentially independent, meaning that they speculate tokens in the\ncandidate continuation independently of any preceding tokens in the candidate\ncontinuation. In this work, we propose Hydra heads, a sequentially dependent,\ndrop-in replacement for standard draft heads that significantly improves\nspeculation accuracy. Decoding with Hydra heads improves throughput compared to\nMedusa decoding with standard draft heads. We further explore the design space\nof Hydra head training objectives and architectures, and propose a\ncarefully-tuned Hydra head recipe, which we call Hydra++, that improves\ndecoding throughput by 1.31x and 2.71x compared to Medusa decoding and\nautoregressive decoding, respectively. Overall, Hydra heads are a simple\nintervention on standard draft heads that significantly improves the end-to-end\nspeed of draft-head-based speculative decoding.\n","authors":["Zachary Ankner","Rishab Parthasarathy","Aniruddha Nrusimha","Christopher Rinard","Jonathan Ragan-Kelley","William Brandon"],"pdf_url":"https://arxiv.org/pdf/2402.05109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05101v1","updated":"2024-02-07T18:55:22Z","published":"2024-02-07T18:55:22Z","title":"Tighter Generalisation Bounds via Interpolation","summary":" This paper contains a recipe for deriving new PAC-Bayes generalisation bounds\nbased on the $(f, \\Gamma)$-divergence, and, in addition, presents PAC-Bayes\ngeneralisation bounds where we interpolate between a series of probability\ndivergences (including but not limited to KL, Wasserstein, and total\nvariation), making the best out of many worlds depending on the posterior\ndistribution's properties. We explore the tightness of these bounds and connect\nthem to earlier results from statistical learning, which are specific cases. We\nalso instantiate our bounds as training objectives, yielding non-trivial\nguarantees and practical performances.\n","authors":["Paul Viallard","Maxime Haddouche","Umut Şimşekli","Benjamin Guedj"],"pdf_url":"https://arxiv.org/pdf/2402.05101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05099v1","updated":"2024-02-07T18:53:01Z","published":"2024-02-07T18:53:01Z","title":"Hydragen: High-Throughput LLM Inference with Shared Prefixes","summary":" Transformer-based large language models (LLMs) are now deployed to hundreds\nof millions of users. LLM inference is commonly performed on batches of\nsequences that share a prefix, such as few-shot examples or a chatbot system\nprompt. Decoding in this large-batch setting can be bottlenecked by the\nattention operation, which reads large key-value (KV) caches from memory and\ncomputes inefficient matrix-vector products for every sequence in the batch. In\nthis work, we introduce Hydragen, a hardware-aware exact implementation of\nattention with shared prefixes. Hydragen computes attention over the shared\nprefix and unique suffixes separately. This decomposition enables efficient\nprefix attention by batching queries together across sequences, reducing\nredundant memory reads and enabling the use of hardware-friendly matrix\nmultiplications.
Our method can improve end-to-end LLM throughput by up to 32x\nagainst competitive baselines, with speedup growing with the batch size and\nshared prefix length. Hydragen also enables the use of very long shared\ncontexts: with a high batch size, increasing the prefix length from 1K to 16K\ntokens decreases Hydragen throughput by less than 15%, while the throughput of\nbaselines drops by over 90%. Hydragen generalizes beyond simple prefix-suffix\ndecomposition and can be applied to tree-based prompt sharing patterns,\nallowing us to further reduce inference time on competitive programming\nproblems by 55%.\n","authors":["Jordan Juravsky","Bradley Brown","Ryan Ehrlich","Daniel Y. Fu","Christopher Ré","Azalia Mirhoseini"],"pdf_url":"https://arxiv.org/pdf/2402.05099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05596v2","updated":"2024-02-07T18:52:26Z","published":"2023-12-09T15:29:45Z","title":"Factorized Explainer for Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have received increasing attention due to their\nability to learn from graph-structured data. To open the black-box of these\ndeep learning models, post-hoc instance-level explanation methods have been\nproposed to understand GNN predictions. These methods seek to discover\nsubstructures that explain the prediction behavior of a trained GNN. In this\npaper, we show analytically that for a large class of explanation tasks,\nconventional approaches, which are based on the principle of graph information\nbottleneck (GIB), admit trivial solutions that do not align with the notion of\nexplainability. Instead, we argue that a modified GIB principle may be used to\navoid the aforementioned trivial solutions. We further introduce a novel\nfactorized explanation model with theoretical performance guarantees. The\nmodified GIB is used to analyze the structural properties of the proposed\nfactorized explainer. We conduct extensive experiments on both synthetic and\nreal-world datasets to validate the effectiveness of our proposed factorized\nexplainer.\n","authors":["Rundong Huang","Farhad Shirani","Dongsheng Luo"],"pdf_url":"https://arxiv.org/pdf/2312.05596v2.pdf","comment":"AAAI 24"},{"id":"http://arxiv.org/abs/2402.05098v1","updated":"2024-02-07T18:51:49Z","published":"2024-02-07T18:51:49Z","title":"On diffusion models for amortized inference: Benchmarking and improving\n stochastic control and sampling","summary":" We study the problem of training diffusion models to sample from a\ndistribution with a given unnormalized density or energy function. We benchmark\nseveral diffusion-structured inference methods, including simulation-based\nvariational approaches and off-policy methods (continuous generative flow\nnetworks). Our results shed light on the relative advantages of existing\nalgorithms while bringing into question some claims from past work. We also\npropose a novel exploration strategy for off-policy methods, based on local\nsearch in the target space with the use of a replay buffer, and show that it\nimproves the quality of samples on a variety of target distributions. 
Our code\nfor the sampling methods and benchmarks studied is made public at\nhttps://github.com/GFNOrg/gfn-diffusion as a base for future work on diffusion\nmodels for amortized inference.\n","authors":["Marcin Sendera","Minsu Kim","Sarthak Mittal","Pablo Lemos","Luca Scimeca","Jarrid Rector-Brooks","Alexandre Adam","Yoshua Bengio","Nikolay Malkin"],"pdf_url":"https://arxiv.org/pdf/2402.05098v1.pdf","comment":"21 pages; code: https://github.com/GFNOrg/gfn-diffusion"},{"id":"http://arxiv.org/abs/2402.01263v2","updated":"2024-02-07T18:44:41Z","published":"2024-02-02T09:34:49Z","title":"A Differentiable Partially Observable Generalized Linear Model with\n Forward-Backward Message Passing","summary":" The partially observable generalized linear model (POGLM) is a powerful tool\nfor understanding neural connectivity under the assumption of existing hidden\nneurons. With spike trains only recorded from visible neurons, existing works\nuse variational inference to learn POGLM meanwhile presenting the difficulty of\nlearning this latent variable model. There are two main issues: (1) the sampled\nPoisson hidden spike count hinders the use of the pathwise gradient estimator\nin VI; and (2) the existing design of the variational model is neither\nexpressive nor time-efficient, which further affects the performance. For (1),\nwe propose a new differentiable POGLM, which enables the pathwise gradient\nestimator, better than the score function gradient estimator used in existing\nworks. For (2), we propose the forward-backward message-passing sampling scheme\nfor the variational model. Comprehensive experiments show that our\ndifferentiable POGLMs with our forward-backward message passing produce a\nbetter performance on one synthetic and two real-world datasets. Furthermore,\nour new method yields more interpretable parameters, underscoring its\nsignificance in neuroscience.\n","authors":["Chengrui Li","Weihan Li","Yule Wang","Anqi Wu"],"pdf_url":"https://arxiv.org/pdf/2402.01263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01731v3","updated":"2024-02-07T18:41:12Z","published":"2023-06-02T17:57:53Z","title":"PAGAR: Taming Reward Misalignment in Inverse Reinforcement\n Learning-Based Imitation Learning with Protagonist Antagonist Guided\n Adversarial Reward","summary":" Many imitation learning (IL) algorithms employ inverse reinforcement learning\n(IRL) to infer the intrinsic reward function that an expert is implicitly\noptimizing for based on their demonstrated behaviors. However, in practice,\nIRL-based IL can fail to accomplish the underlying task due to a misalignment\nbetween the inferred reward and the objective of the task. In this paper, we\naddress the susceptibility of IL to such misalignment by introducing a\nsemi-supervised reward design paradigm called Protagonist Antagonist Guided\nAdversarial Reward (PAGAR). PAGAR-based IL trains a policy to perform well\nunder mixed reward functions instead of a single reward function as in\nIRL-based IL. We identify the theoretical conditions under which PAGAR-based IL\ncan avoid the task failures caused by reward misalignment. 
We also present a\npractical on-and-off policy approach to implementing PAGAR-based IL.\nExperimental results show that our algorithm outperforms standard IL baselines\nin complex tasks and challenging transfer settings.\n","authors":["Weichao Zhou","Wenchao Li"],"pdf_url":"https://arxiv.org/pdf/2306.01731v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04022v2","updated":"2024-02-07T18:36:18Z","published":"2024-02-06T14:12:46Z","title":"A General Theory for Kernel Packets: from state space model to compactly\n supported basis","summary":" It is well known that the state space (SS) model formulation of a Gaussian\nprocess (GP) can lower its training and prediction time both to O(n) for n data\npoints. We prove that an $m$-dimensional SS model formulation of GP is\nequivalent to a concept we introduce as the general right Kernel Packet (KP): a\ntransformation for the GP covariance function $K$ such that\n$\\sum_{i=0}^{m}a_iD_t^{(j)}K(t,t_i)=0$ holds for any $t \\leq t_1$, $0 \\leq j\n\\leq m-1$, and $m+1$ consecutive points $t_i$, where ${D}_t^{(j)}f(t)$ denotes\nthe $j$-th order derivative acting on $t$. We extend this idea to the backward SS\nmodel formulation of the GP, leading to the concept of the left KP for the next $m$\nconsecutive points: $\\sum_{i=0}^{m}b_i{D}_t^{(j)}K(t,t_{m+i})=0$ for any $t\\geq\nt_{2m}$. By combining both left and right KPs, we can prove that a suitable\nlinear combination of these covariance functions yields $m$ compactly supported\nKP functions: $\\phi^{(j)}(t)=0$ for any $t\\not\\in(t_0,t_{2m})$ and\n$j=0,\\cdots,m-1$. KPs further reduce the prediction time of GP to O(log n) or\neven O(1), can be applied to more general problems involving the derivative of\nGPs, and have a multi-dimensional generalization for scattered data.\n","authors":["Liang Ding","Tuo Rui"],"pdf_url":"https://arxiv.org/pdf/2402.04022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05073v1","updated":"2024-02-07T18:27:29Z","published":"2024-02-07T18:27:29Z","title":"NITO: Neural Implicit Fields for Resolution-free Topology Optimization","summary":" Topology optimization is a critical task in engineering design, where the\ngoal is to optimally distribute material in a given space for maximum\nperformance. We introduce Neural Implicit Topology Optimization (NITO), a novel\napproach to accelerate topology optimization problems using deep learning. NITO\nstands out as one of the first frameworks to offer a resolution-free and\ndomain-agnostic solution in deep learning-based topology optimization. NITO\nsynthesizes structures with up to seven times better structural efficiency\ncompared to SOTA diffusion models and does so in a tenth of the time. In the\nNITO framework, we introduce a novel method, the Boundary Point Order-Invariant\nMLP (BPOM), to represent boundary conditions in a sparse and domain-agnostic\nmanner, moving away from expensive simulation-based approaches. Crucially, NITO\ncircumvents the domain and resolution limitations that restrict Convolutional\nNeural Network (CNN) models to a structured domain of fixed size -- limitations\nthat hinder the widespread adoption of CNNs in engineering applications. This\ngeneralizability allows a single NITO model to train and generate solutions in\ncountless domains, eliminating the need for numerous domain-specific CNNs and\ntheir extensive datasets.
Despite its generalizability, NITO outperforms SOTA\nmodels even in specialized tasks, is an order of magnitude smaller, and is\npractically trainable at high resolutions that would be restrictive for CNNs.\nThis combination of versatility, efficiency, and performance underlines NITO's\npotential to transform the landscape of engineering design optimization\nproblems through implicit fields.\n","authors":["Amin Heyrani Nobari","Giorgio Giannone","Lyle Regenwetter","Faez Ahmed"],"pdf_url":"https://arxiv.org/pdf/2402.05073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05071v1","updated":"2024-02-07T18:22:41Z","published":"2024-02-07T18:22:41Z","title":"Extending the Reach of First-Order Algorithms for Nonconvex Min-Max\n Problems with Cohypomonotonicity","summary":" We focus on constrained, $L$-smooth, nonconvex-nonconcave min-max problems\neither satisfying $\\rho$-cohypomonotonicity or admitting a solution to the\n$\\rho$-weakly Minty Variational Inequality (MVI), where larger values of the\nparameter $\\rho>0$ correspond to a greater degree of nonconvexity. These\nproblem classes include examples in two-player reinforcement learning,\ninteraction dominant min-max problems, and certain synthetic test problems on\nwhich classical min-max algorithms fail. It has been conjectured that\nfirst-order methods can tolerate values of $\\rho$ no larger than $\\frac{1}{L}$,\nbut existing results in the literature have stagnated at the tighter\nrequirement $\\rho < \\frac{1}{2L}$. With a simple argument, we obtain optimal or\nbest-known complexity guarantees with cohypomonotonicity or weak MVI conditions\nfor $\\rho < \\frac{1}{L}$. The algorithms we analyze are inexact variants of\nHalpern and Krasnosel'ski\\u{\\i}-Mann (KM) iterations. We also provide\nalgorithms and complexity guarantees in the stochastic case with the same range\non $\\rho$. Our main insight for the improvements in the convergence analyses is\nto harness the recently proposed \"conic nonexpansiveness\" property of\noperators. As byproducts, we provide a refined analysis for inexact Halpern\niteration and propose a stochastic KM iteration with a multilevel Monte Carlo\nestimator.\n","authors":["Ahmet Alacaoglu","Donghwan Kim","Stephen J. Wright"],"pdf_url":"https://arxiv.org/pdf/2402.05071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05067v1","updated":"2024-02-07T18:19:51Z","published":"2024-02-07T18:19:51Z","title":"Multiscale Modelling with Physics-informed Neural Network: from\n Large-scale Dynamics to Small-scale Predictions in Complex Systems","summary":" Multiscale phenomena manifest across various scientific domains, presenting a\nubiquitous challenge in accurately and effectively predicting multiscale\ndynamics in complex systems. In this paper, a novel solving mode is proposed\nfor characterizing multiscale dynamics through a decoupling method. By\nmodelling large-scale dynamics independently and treating small-scale dynamics\nas a slaved system, a Spectral PINN is developed to approach the small-scale\nsystem in an orthogonal basis functional space. The effectiveness of the method\nis demonstrated through extensive numerical experiments, including the\none-dimensional Kuramoto-Sivashinsky (KS) equation and two- and three-dimensional\nNavier-Stokes (NS) equations, showcasing its versatility in addressing problems\nof fluid dynamics.
Furthermore, we also delve into the application of the\nproposed approach to more complex problems, including non-uniform meshes,\ncomplex geometries, large-scale data with noise, and high-dimensional\nsmall-scale dynamics. The discussions about these scenarios contribute to a\ncomprehensive understanding of the method's capabilities and limitations. This\nnovel decoupling approach simplifies the analysis and prediction of\nspatiotemporal systems, where large-scale data can be obtained with low\ncomputational demands, followed by Spectral PINNs for capturing small-scale\ndynamics with improved efficiency and accuracy.\n","authors":["Jing Wang","Zheng Li","Pengyu Lai","Rui Wang","Di Yang","Hui Xu"],"pdf_url":"https://arxiv.org/pdf/2402.05067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08191v2","updated":"2024-02-07T18:18:54Z","published":"2023-06-14T01:24:42Z","title":"Solving Large-scale Spatial Problems with Convolutional Neural Networks","summary":" Over the past decade, deep learning research has been accelerated by\nincreasingly powerful hardware, which facilitated rapid growth in the model\ncomplexity and the amount of data ingested. This is becoming unsustainable and\ntherefore refocusing on efficiency is necessary. In this paper, we employ\ntransfer learning to improve training efficiency for large-scale spatial\nproblems. We propose that a convolutional neural network (CNN) can be trained\non small windows of signals, but evaluated on arbitrarily large signals with\nlittle to no performance degradation, and provide a theoretical bound on the\nresulting generalization error. Our proof leverages shift-equivariance of CNNs,\na property that is underexploited in transfer learning. The theoretical results\nare experimentally supported in the context of mobile infrastructure on demand\n(MID). The proposed approach is able to tackle MID at large scales with\nhundreds of agents, which was computationally intractable prior to this work.\n","authors":["Damian Owerko","Charilaos I. Kanatsoulis","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2306.08191v2.pdf","comment":"6 pages, 2 figures, submitted to Asilomar Conference on Signals,\n Systems, and Computers 2023"},{"id":"http://arxiv.org/abs/2306.03933v5","updated":"2024-02-07T18:16:43Z","published":"2023-06-06T18:01:03Z","title":"High-dimensional and Permutation Invariant Anomaly Detection","summary":" Methods for anomaly detection of new physics processes are often limited to\nlow-dimensional spaces due to the difficulty of learning high-dimensional\nprobability densities. Particularly at the constituent level, incorporating\ndesirable properties such as permutation invariance and variable-length inputs\nbecomes difficult within popular density estimation methods. In this work, we\nintroduce a permutation-invariant density estimator for particle physics data\nbased on diffusion models, specifically designed to handle variable-length\ninputs. We demonstrate the efficacy of our methodology by utilizing the learned\ndensity as a permutation-invariant anomaly detection score, effectively\nidentifying jets with low likelihood under the background-only hypothesis. 
To\nvalidate our density estimation method, we investigate the ratio of learned\ndensities and compare to those obtained by a supervised classification\nalgorithm.\n","authors":["Vinicius Mikuni","Benjamin Nachman"],"pdf_url":"https://arxiv.org/pdf/2306.03933v5.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2306.12498v2","updated":"2024-02-07T18:06:54Z","published":"2023-06-21T18:14:44Z","title":"Empirical Risk Minimization with Shuffled SGD: A Primal-Dual Perspective\n and Improved Bounds","summary":" Stochastic gradient descent (SGD) is perhaps the most prevalent optimization\nmethod in modern machine learning. Contrary to the empirical practice of\nsampling from the datasets without replacement and with (possible) reshuffling\nat each epoch, the theoretical counterpart of SGD usually relies on the\nassumption of sampling with replacement. It is only very recently that SGD with\nsampling without replacement -- shuffled SGD -- has been analyzed. For convex\nfinite sum problems with $n$ components and under the $L$-smoothness assumption\nfor each component function, there are matching upper and lower bounds, under\nsufficiently small -- $\\mathcal{O}(\\frac{1}{nL})$ -- step sizes. Yet those\nbounds appear too pessimistic -- in fact, the predicted performance is\ngenerally no better than for full gradient descent -- and do not agree with the\nempirical observations. In this work, to narrow the gap between the theory and\npractice of shuffled SGD, we sharpen the focus from general finite sum problems\nto empirical risk minimization with linear predictors. This allows us to take a\nprimal-dual perspective and interpret shuffled SGD as a primal-dual method with\ncyclic coordinate updates on the dual side. Leveraging this perspective, we\nprove fine-grained complexity bounds that depend on the data matrix and are\nnever worse than what is predicted by the existing bounds. Notably, our bounds\npredict much faster convergence than the existing analyses -- by a factor of\nthe order of $\\sqrt{n}$ in some cases. We empirically demonstrate that on\ncommon machine learning datasets our bounds are indeed much tighter. We further\nextend our analysis to nonsmooth convex problems and more general finite-sum\nproblems, with similar improvements.\n","authors":["Xufeng Cai","Cheuk Yin Lin","Jelena Diakonikolas"],"pdf_url":"https://arxiv.org/pdf/2306.12498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01748v2","updated":"2024-02-07T17:55:11Z","published":"2024-01-30T00:21:41Z","title":"Large Multi-Modal Models (LMMs) as Universal Foundation Models for\n AI-Native Wireless Systems","summary":" Large language models (LLMs) and foundation models have been recently touted\nas a game-changer for 6G systems. However, recent efforts on LLMs for wireless\nnetworks are limited to a direct application of existing language models that\nwere designed for natural language processing (NLP) applications. To address\nthis challenge and create wireless-centric foundation models, this paper\npresents a comprehensive vision on how to design universal foundation models\nthat are tailored towards the deployment of artificial intelligence (AI)-native\nnetworks. 
Diverging from NLP-based foundation models, the proposed framework\npromotes the design of large multi-modal models (LMMs) fostered by three key\ncapabilities: 1) processing of multi-modal sensing data, 2) grounding of\nphysical symbol representations in real-world wireless systems using causal\nreasoning and retrieval-augmented generation (RAG), and 3) enabling\ninstructibility from the wireless environment feedback to facilitate dynamic\nnetwork adaptation thanks to logical and mathematical reasoning facilitated by\nneuro-symbolic AI. In essence, these properties enable the proposed LMM\nframework to build universal capabilities that cater to various cross-layer\nnetworking tasks and alignment of intents across different domains. Preliminary\nresults from experimental evaluation demonstrate the efficacy of grounding\nusing RAG in LMMs, and showcase the alignment of LMMs with wireless system\ndesigns. Furthermore, the enhanced rationale exhibited in the responses to\nmathematical questions by LMMs, compared to vanilla LLMs, demonstrates the\nlogical and mathematical reasoning capabilities inherent in LMMs. Building on\nthose results, we present a series of open questions and challenges for LMMs.\nWe then conclude with a set of recommendations that ignite the path towards\nLMM-empowered AI-native systems.\n","authors":["Shengzhe Xu","Christo Kurisummoottil Thomas","Omar Hashash","Nikhil Muralidhar","Walid Saad","Naren Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2402.01748v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05052v1","updated":"2024-02-07T17:51:38Z","published":"2024-02-07T17:51:38Z","title":"Causal Representation Learning from Multiple Distributions: A General\n Setting","summary":" In many problems, the measured variables (e.g., image pixels) are just\nmathematical functions of the hidden causal variables (e.g., the underlying\nconcepts or objects). For the purpose of making predictions in changing\nenvironments or making proper changes to the system, it is helpful to recover\nthe hidden causal variables $Z_i$ and their causal relations represented by\ngraph $\\mathcal{G}_Z$. This problem has recently been known as causal\nrepresentation learning. This paper is concerned with a general, completely\nnonparametric setting of causal representation learning from multiple\ndistributions (arising from heterogeneous data or nonstationary time series),\nwithout assuming hard interventions behind distribution changes. We aim to\ndevelop general solutions in this fundamental case; as a by-product, this helps\nto see the unique benefit offered by other assumptions such as parametric causal\nmodels or hard interventions. We show that under the sparsity constraint on the\nrecovered graph over the latent variables and suitable sufficient change\nconditions on the causal influences, interestingly, one can recover the\nmoralized graph of the underlying directed acyclic graph, and the recovered\nlatent variables and their relations are related to the underlying causal model\nin a specific, nontrivial way. In some cases, each latent variable can even be\nrecovered up to component-wise transformations.
Experimental results verify our\ntheoretical claims.\n","authors":["Kun Zhang","Shaoan Xie","Ignavier Ng","Yujia Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.05052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05050v1","updated":"2024-02-07T17:46:37Z","published":"2024-02-07T17:46:37Z","title":"Federated Learning Can Find Friends That Are Beneficial","summary":" In Federated Learning (FL), the distributed nature and heterogeneity of\nclient data present both opportunities and challenges. While collaboration\namong clients can significantly enhance the learning process, not all\ncollaborations are beneficial; some may even be detrimental. In this study, we\nintroduce a novel algorithm that assigns adaptive aggregation weights to\nclients participating in FL training, identifying those with data distributions\nmost conducive to a specific learning objective. We demonstrate that our\naggregation method converges no worse than the method that aggregates only the\nupdates received from clients with the same data distribution. Furthermore,\nempirical evaluations consistently reveal that collaborations guided by our\nalgorithm outperform traditional FL approaches. This underscores the critical\nrole of judicious client selection and lays the foundation for more streamlined\nand effective FL implementations in the coming years.\n","authors":["Nazarii Tupitsa","Samuel Horváth","Martin Takáč","Eduard Gorbunov"],"pdf_url":"https://arxiv.org/pdf/2402.05050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02304v2","updated":"2024-02-07T17:42:57Z","published":"2024-02-04T00:07:05Z","title":"Efficient Numerical Wave Propagation Enhanced By An End-to-End Deep\n Learning Model","summary":" In a variety of scientific and engineering domains, the need for\nhigh-fidelity and efficient solutions for high-frequency wave propagation holds\ngreat significance. Recent advances in wave modeling use sufficiently accurate\nfine solver outputs to train a neural network that enhances the accuracy of a\nfast but inaccurate coarse solver. A stable and fast solver allows the use of\nParareal, a parallel-in-time algorithm, to correct high-frequency wave\ncomponents. In this paper we build upon the work of Nguyen and Tsai (2023) and\npresent a unified system that integrates a numerical solver with a neural\nnetwork into an end-to-end framework. In the proposed setting, we investigate\nrefinements to the deep learning architecture, data generation algorithm and\nParareal scheme. Our results show that the cohesive structure improves\nperformance without sacrificing speed, and demonstrate the importance of\ntemporal dynamics, as well as Parareal, for accurate wave propagation.\n","authors":["Luis Kaiser","Richard Tsai","Christian Klingenberg"],"pdf_url":"https://arxiv.org/pdf/2402.02304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05044v1","updated":"2024-02-07T17:33:54Z","published":"2024-02-07T17:33:54Z","title":"SALAD-Bench: A Hierarchical and Comprehensive Safety Benchmark for Large\n Language Models","summary":" In the rapidly evolving landscape of Large Language Models (LLMs), ensuring\nrobust safety measures is paramount. To meet this crucial need, we propose\n\\emph{SALAD-Bench}, a safety benchmark specifically designed for evaluating\nLLMs, attack methods, and defense methods.
Distinguished by its breadth, SALAD-Bench\ntranscends conventional benchmarks through its large scale, rich diversity,\nintricate taxonomy spanning three levels, and versatile\nfunctionalities. SALAD-Bench is crafted with a meticulous array of questions,\nfrom standard queries to complex ones enriched with attack and defense\nmodifications, as well as multiple-choice questions. To effectively manage the inherent\ncomplexity, we introduce an innovative evaluator: the LLM-based MD-Judge for\nQA pairs, with a particular focus on attack-enhanced queries, ensuring a\nseamless and reliable evaluation. The above components extend SALAD-Bench from\nstandard LLM safety evaluation to the evaluation of both LLM attack and defense methods,\nensuring joint-purpose utility. Our extensive experiments shed\nlight on the resilience of LLMs against emerging threats and the efficacy of\ncontemporary defense tactics. Data and evaluator are released under\n\\url{https://github.com/OpenSafetyLab/SALAD-BENCH}. Warning: this paper\nincludes examples that may be offensive or harmful.\n","authors":["Lijun Li","Bowen Dong","Ruohui Wang","Xuhao Hu","Wangmeng Zuo","Dahua Lin","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2402.05044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05039v1","updated":"2024-02-07T17:23:15Z","published":"2024-02-07T17:23:15Z","title":"PAC Learnability under Explanation-Preserving Graph Perturbations","summary":" Graphical models capture relations between entities in a wide range of\napplications including social networks, biology, and natural language\nprocessing, among others. Graph neural networks (GNN) are neural models that\noperate over graphs, enabling the model to leverage the complex relationships\nand dependencies in graph-structured data. A graph explanation is a subgraph\nwhich is an `almost sufficient' statistic of the input graph with respect to\nits classification label. Consequently, the classification label is invariant,\nwith high probability, to perturbations of graph edges not belonging to its\nexplanation subgraph. This work considers two methods for leveraging such\nperturbation invariances in the design and training of GNNs. First,\nexplanation-assisted learning rules are considered. It is shown that the sample\ncomplexity of explanation-assisted learning can be arbitrarily smaller than that of\nexplanation-agnostic learning. Next, explanation-assisted data augmentation is\nconsidered, where the training set is enlarged by artificially producing new\ntraining samples via perturbation of the non-explanation edges in the original\ntraining set. It is shown that such data augmentation methods may improve\nperformance if the augmented data is in-distribution; however, it may also lead\nto worse sample complexity compared to explanation-agnostic learning rules if\nthe augmented data is out-of-distribution. Extensive empirical evaluations are\nprovided to verify the theoretical analysis.\n","authors":["Xu Zheng","Farhad Shirani","Tianchun Wang","Shouwei Gao","Wenqian Dong","Wei Cheng","Dongsheng Luo"],"pdf_url":"https://arxiv.org/pdf/2402.05039v1.pdf","comment":"21 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.11903v2","updated":"2024-02-07T17:18:09Z","published":"2023-06-20T21:30:54Z","title":"Deep Fusion: Efficient Network Training via Pre-trained Initializations","summary":" In recent years, deep learning has made remarkable progress in a wide range\nof domains, with a particularly notable impact on natural language processing\ntasks.
One of the challenges associated with training deep neural networks in\nthe context of LLMs is the need for large amounts of computational resources\nand time. To mitigate this, network growing algorithms offer potential cost\nsavings, but their underlying mechanisms are poorly understood. We present two\nnotable contributions in this paper. First, we present Deep Fusion, an\nefficient approach to network training that leverages pre-trained\ninitializations of smaller networks. Second, we propose a theoretical framework\nusing backward error analysis to illustrate the dynamics of mid-training\nnetwork growth. Our experiments show how Deep Fusion is a practical and\neffective approach that not only accelerates the training process but also\nreduces computational requirements, maintaining or surpassing traditional\ntraining methods' performance in various NLP tasks and T5 model sizes. Finally,\nwe validate our theoretical framework, which guides the optimal use of Deep\nFusion, showing that with carefully optimized training dynamics, it\nsignificantly reduces both training time and resource consumption.\n","authors":["Hanna Mazzawi","Xavi Gonzalvo","Michael Wunder","Sammy Jerome","Benoit Dherin"],"pdf_url":"https://arxiv.org/pdf/2306.11903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01070v2","updated":"2024-02-07T17:13:43Z","published":"2023-08-02T10:37:25Z","title":"When Analytic Calculus Cracks AdaBoost Code","summary":" The principle of boosting in supervised learning involves combining multiple\nweak classifiers to obtain a stronger classifier. AdaBoost has the reputation\nof being a perfect example of this approach.\n This study analyzes the (two-class) AdaBoost procedure implemented in\nscikit-learn.\n This paper shows that AdaBoost is an algorithm in name only, as the resulting\ncombination of weak classifiers can be explicitly calculated using a truth\ntable.\n Indeed, using a logical analysis of the training set with weak classifiers\nconstructing a truth table, we recover, through an analytical formula, the\nweights of the combination of these weak classifiers obtained by the procedure.\n We observe that this formula does not give the point of minimum of the risk;\nwe provide a system to compute the exact point of minimum, and we check that the\nAdaBoost procedure in scikit-learn does not implement the algorithm described\nby Freund and Schapire.\n","authors":["Jean-Marc Brossier","Olivier Lafitte","Lenny Réthoré"],"pdf_url":"https://arxiv.org/pdf/2308.01070v2.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2402.05033v1","updated":"2024-02-07T17:07:41Z","published":"2024-02-07T17:07:41Z","title":"Simulated Overparameterization","summary":" In this work, we introduce a novel paradigm called Simulated\nOverparameterization (SOP). SOP merges the computational efficiency of compact\nmodels with the advanced learning proficiencies of overparameterized models.\nSOP proposes a unique approach to model training and inference, where a model\nwith a significantly larger number of parameters is trained in such a way that\na smaller, efficient subset of these parameters is used for the actual\ncomputation during inference. Building upon this framework, we present a novel,\narchitecture agnostic algorithm called \"majority kernels\", which seamlessly\nintegrates with predominant architectures, including Transformer models.\nMajority kernels enables the simulated training of overparameterized models,\nresulting in performance gains across architectures and tasks.
Furthermore, our\napproach adds minimal overhead to the cost incurred (wall clock time) at\ntraining time. The proposed approach shows strong performance on a wide variety\nof datasets and models, even outperforming strong baselines such as\ncombinatorial optimization methods based on submodular optimization.\n","authors":["Hanna Mazzawi","Pranjal Awasthi","Xavi Gonzalvo","Srikumar Ramalingam"],"pdf_url":"https://arxiv.org/pdf/2402.05033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20172v2","updated":"2024-02-07T16:58:38Z","published":"2023-10-31T04:40:20Z","title":"Compact Binary Systems Waveform Generation with Generative Pre-trained\n Transformer","summary":" Space-based gravitational wave detection is one of the most anticipated\ngravitational wave (GW) detection projects in the next decade, which promises\nto detect abundant compact binary systems. However, the precise\nprediction of space GW waveforms remains unexplored. To solve the data\nprocessing difficulty in the increasing waveform complexity caused by\ndetectors' response and second-generation time-delay interferometry (TDI 2.0),\nan interpretable pre-trained large model named CBS-GPT (Compact Binary Systems\nWaveform Generation with Generative Pre-trained Transformer) is proposed. For\ncompact binary system waveforms, three models were trained to predict the\nwaveforms of massive black hole binary (MBHB), extreme mass-ratio inspirals\n(EMRIs), and galactic binary (GB), achieving prediction accuracies of up to 99%, 91%,\nand 99%, respectively. The CBS-GPT model exhibits notable generalization\nand interpretability, with its hidden parameters effectively capturing the\nintricate information of waveforms, even with complex instrument response and a\nwide parameter range. Our research demonstrates the potential of large\npre-trained models in the gravitational wave realm, opening up new opportunities\nand guidance for future research such as complex waveform generation,\ngap completion, and deep learning model design for GW science.\n","authors":["Ruijun Shi","Yue Zhou","Tianyu Zhao","Zhoujian Cao","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2310.20172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01000v2","updated":"2024-02-07T16:53:25Z","published":"2024-02-01T20:27:19Z","title":"Multivariate Probabilistic Time Series Forecasting with Correlated\n Errors","summary":" Modeling the correlations among errors is closely associated with how\naccurately the model can quantify predictive uncertainty in probabilistic time\nseries forecasting. Recent multivariate models have made significant progress\nin accounting for contemporaneous correlations among errors, while a common\nassumption on these errors is that they are temporally independent for the sake\nof statistical simplicity. However, real-world observations often deviate from\nthis assumption, since errors usually exhibit substantial autocorrelation due\nto various factors such as the exclusion of temporally correlated covariates.\nIn this work, we propose an efficient method, based on a low-rank-plus-diagonal\nparameterization of the covariance matrix, which can effectively characterize\nthe autocorrelation of errors. The proposed method possesses several desirable\nproperties: the complexity does not scale with the number of time series, the\nresulting covariance can be used for calibrating predictions, and it can\nseamlessly integrate with any model with Gaussian-distributed errors.
We\nempirically demonstrate these properties using two distinct neural forecasting\nmodels, GPVar and Transformer. Our experimental results confirm the\neffectiveness of our method in enhancing predictive accuracy and the quality of\nuncertainty quantification on multiple real-world datasets.\n","authors":["Vincent Zhihao Zheng","Lijun Sun"],"pdf_url":"https://arxiv.org/pdf/2402.01000v2.pdf","comment":"This paper extends the work presented in arXiv:2305.17028 to a\n multivariate setting"},{"id":"http://arxiv.org/abs/2402.05025v1","updated":"2024-02-07T16:47:07Z","published":"2024-02-07T16:47:07Z","title":"Strong convexity-guided hyper-parameter optimization for flatter losses","summary":" We propose a novel white-box approach to hyper-parameter optimization.\nMotivated by recent work establishing a relationship between flat minima and\ngeneralization, we first establish a relationship between the strong convexity\nof the loss and its flatness. Based on this, we seek to find hyper-parameter\nconfigurations that improve flatness by minimizing the strong convexity of the\nloss. By using the structure of the underlying neural network, we derive\nclosed-form equations to approximate the strong convexity parameter, and\nattempt to find hyper-parameters that minimize it in a randomized fashion.\nThrough experiments on 14 classification datasets, we show that our method\nachieves strong performance at a fraction of the runtime.\n","authors":["Rahul Yedida","Snehanshu Saha"],"pdf_url":"https://arxiv.org/pdf/2402.05025v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2402.05015v1","updated":"2024-02-07T16:32:58Z","published":"2024-02-07T16:32:58Z","title":"A Sober Look at LLMs for Material Discovery: Are They Actually Good for\n Bayesian Optimization Over Molecules?","summary":" Automation is one of the cornerstones of contemporary material discovery.\nBayesian optimization (BO) is an essential part of such workflows, enabling\nscientists to leverage prior domain knowledge into efficient exploration of a\nlarge molecular space. While such prior knowledge can take many forms, there\nhas been significant fanfare around the ancillary scientific knowledge\nencapsulated in large language models (LLMs). However, existing work thus far\nhas only explored LLMs for heuristic materials searches. Indeed, recent work\nobtains the uncertainty estimate -- an integral part of BO -- from\npoint-estimated, non-Bayesian LLMs. In this work, we study the question of\nwhether LLMs are actually useful to accelerate principled Bayesian optimization\nin the molecular space. We take a sober, dispassionate stance in answering this\nquestion. This is done by carefully (i) viewing LLMs as fixed feature\nextractors for standard but principled BO surrogate models and by (ii)\nleveraging parameter-efficient finetuning methods and Bayesian neural networks\nto obtain the posterior of the LLM surrogate.
Our extensive experiments with\nreal-world chemistry problems show that LLMs can be useful for BO over\nmolecules, but only if they have been pretrained or finetuned with\ndomain-specific data.\n","authors":["Agustinus Kristiadi","Felix Strieth-Kalthoff","Marta Skreta","Pascal Poupart","Alán Aspuru-Guzik","Geoff Pleiss"],"pdf_url":"https://arxiv.org/pdf/2402.05015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05013v1","updated":"2024-02-07T16:32:29Z","published":"2024-02-07T16:32:29Z","title":"Compression of Structured Data with Autoencoders: Provable Benefit of\n Nonlinearities and Depth","summary":" Autoencoders are a prominent model in many empirical branches of machine\nlearning and lossy data compression. However, basic theoretical questions\nremain unanswered even in a shallow two-layer setting. In particular, to what\ndegree does a shallow autoencoder capture the structure of the underlying data\ndistribution? For the prototypical case of the 1-bit compression of sparse\nGaussian data, we prove that gradient descent converges to a solution that\ncompletely disregards the sparse structure of the input. Namely, the\nperformance of the algorithm is the same as if it was compressing a Gaussian\nsource - with no sparsity. For general data distributions, we give evidence of\na phase transition phenomenon in the shape of the gradient descent minimizer,\nas a function of the data sparsity: below the critical sparsity level, the\nminimizer is a rotation taken uniformly at random (just like in the compression\nof non-sparse data); above the critical sparsity, the minimizer is the identity\n(up to a permutation). Finally, by exploiting a connection with approximate\nmessage passing algorithms, we show how to improve upon Gaussian performance\nfor the compression of sparse data: adding a denoising function to a shallow\narchitecture already reduces the loss provably, and a suitable multi-layer\ndecoder leads to a further improvement. We validate our findings on image\ndatasets, such as CIFAR-10 and MNIST.\n","authors":["Kevin Kögler","Alexander Shevchenko","Hamed Hassani","Marco Mondelli"],"pdf_url":"https://arxiv.org/pdf/2402.05013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05011v1","updated":"2024-02-07T16:32:02Z","published":"2024-02-07T16:32:02Z","title":"Navigating Complexity: Toward Lossless Graph Condensation via Expanding\n Window Matching","summary":" Graph condensation aims to reduce the size of a large-scale graph dataset by\nsynthesizing a compact counterpart without sacrificing the performance of Graph\nNeural Networks (GNNs) trained on it, which has shed light on reducing the\ncomputational cost for training GNNs. Nevertheless, existing methods often fall\nshort of accurately replicating the original graph for certain datasets,\nthereby failing to achieve the objective of lossless condensation. To\nunderstand this phenomenon, we investigate the potential reasons and reveal\nthat the previous state-of-the-art trajectory matching method provides biased\nand restricted supervision signals from the original graph when optimizing the\ncondensed one. This significantly limits both the scale and efficacy of the\ncondensed graph. In this paper, we make the first attempt toward\n\\textit{lossless graph condensation} by bridging the previously neglected\nsupervision signals. 
Specifically, we employ a curriculum learning strategy to\ntrain expert trajectories with more diverse supervision signals from the\noriginal graph, and then effectively transfer the information into the\ncondensed graph with expanding window matching. Moreover, we design a loss\nfunction to further extract knowledge from the expert trajectories. Theoretical\nanalysis justifies the design of our method and extensive experiments verify\nits superiority across different datasets. Code is released at\nhttps://github.com/NUS-HPC-AI-Lab/GEOM.\n","authors":["Yuchen Zhang","Tianle Zhang","Kai Wang","Ziyao Guo","Yuxuan Liang","Xavier Bresson","Wei Jin","Yang You"],"pdf_url":"https://arxiv.org/pdf/2402.05011v1.pdf","comment":"Lossless graph condensation method"},{"id":"http://arxiv.org/abs/2402.05008v1","updated":"2024-02-07T16:28:36Z","published":"2024-02-07T16:28:36Z","title":"EfficientViT-SAM: Accelerated Segment Anything Model Without Performance\n Loss","summary":" We present EfficientViT-SAM, a new family of accelerated segment anything\nmodels. We retain SAM's lightweight prompt encoder and mask decoder while\nreplacing the heavy image encoder with EfficientViT. For the training, we begin\nwith the knowledge distillation from the SAM-ViT-H image encoder to\nEfficientViT. Subsequently, we conduct end-to-end training on the SA-1B\ndataset. Benefiting from EfficientViT's efficiency and capacity,\nEfficientViT-SAM delivers 48.9x measured TensorRT speedup on A100 GPU over\nSAM-ViT-H without sacrificing performance. Our code and pre-trained models are\nreleased at https://github.com/mit-han-lab/efficientvit.\n","authors":["Zhuoyang Zhang","Han Cai","Song Han"],"pdf_url":"https://arxiv.org/pdf/2402.05008v1.pdf","comment":"tech report"},{"id":"http://arxiv.org/abs/2402.05007v1","updated":"2024-02-07T16:28:04Z","published":"2024-02-07T16:28:04Z","title":"Example-based Explanations for Random Forests using Machine Unlearning","summary":" Tree-based machine learning models, such as decision trees and random\nforests, have been hugely successful in classification tasks primarily because\nof their predictive power in supervised learning tasks and ease of\ninterpretation. Despite their popularity and power, these models have been\nfound to produce unexpected or discriminatory outcomes. Given their\noverwhelming success for most tasks, it is of interest to identify sources of\ntheir unexpected and discriminatory behavior. However, there has not been much\nwork on understanding and debugging tree-based classifiers in the context of\nfairness.\n We introduce FairDebugger, a system that utilizes recent advances in machine\nunlearning research to identify training data subsets responsible for instances\nof fairness violations in the outcomes of a random forest classifier.\nFairDebugger generates top-$k$ explanations (in the form of coherent training\ndata subsets) for model unfairness. Toward this goal, FairDebugger first\nutilizes machine unlearning to estimate the change in the tree structures of\nthe random forest when parts of the underlying training data are removed, and\nthen leverages the Apriori algorithm from frequent itemset mining to reduce the\nsubset search space. 
We empirically evaluate our approach on three real-world\ndatasets, and demonstrate that the explanations generated by FairDebugger are\nconsistent with insights from prior studies on these datasets.\n","authors":["Tanmay Surve","Romila Pradhan"],"pdf_url":"https://arxiv.org/pdf/2402.05007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05002v1","updated":"2024-02-07T16:18:59Z","published":"2024-02-07T16:18:59Z","title":"Randomized Confidence Bounds for Stochastic Partial Monitoring","summary":" The partial monitoring (PM) framework provides a theoretical formulation of\nsequential learning problems with incomplete feedback. On each round, a\nlearning agent plays an action while the environment simultaneously chooses an\noutcome. The agent then observes a feedback signal that is only partially\ninformative about the (unobserved) outcome. The agent leverages the received\nfeedback signals to select actions that minimize the (unobserved) cumulative\nloss. In contextual PM, the outcomes depend on some side information that is\nobservable by the agent before selecting the action on each round. In this\npaper, we consider the contextual and non-contextual PM settings with\nstochastic outcomes. We introduce a new class of strategies based on the\nrandomization of deterministic confidence bounds, that extend regret guarantees\nto settings where existing stochastic strategies are not applicable. Our\nexperiments show that the proposed RandCBP and RandCBPside* strategies improve\nstate-of-the-art baselines in PM games. To encourage the adoption of the PM\nframework, we design a use case on the real-world problem of monitoring the\nerror rate of any deployed classification system.\n","authors":["Maxime Heuillet","Ola Ahmad","Audrey Durand"],"pdf_url":"https://arxiv.org/pdf/2402.05002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02692v2","updated":"2024-02-07T16:16:08Z","published":"2024-02-05T03:03:00Z","title":"Statistical Guarantees for Link Prediction using Graph Neural Networks","summary":" This paper derives statistical guarantees for the performance of Graph Neural\nNetworks (GNNs) in link prediction tasks on graphs generated by a graphon. We\npropose a linear GNN architecture (LG-GNN) that produces consistent estimators\nfor the underlying edge probabilities. We establish a bound on the mean squared\nerror and give guarantees on the ability of LG-GNN to detect high-probability\nedges. Our guarantees hold for both sparse and dense graphs. Finally, we\ndemonstrate some of the shortcomings of the classical GCN architecture, as well\nas verify our results on real and synthetic datasets.\n","authors":["Alan Chung","Amin Saberi","Morgane Austern"],"pdf_url":"https://arxiv.org/pdf/2402.02692v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04997v1","updated":"2024-02-07T16:15:36Z","published":"2024-02-07T16:15:36Z","title":"Generative Flows on Discrete State-Spaces: Enabling Multimodal Flows\n with Applications to Protein Co-Design","summary":" Combining discrete and continuous data is an important capability for\ngenerative models. We present Discrete Flow Models (DFMs), a new flow-based\nmodel of discrete data that provides the missing link in enabling flow-based\ngenerative models to be applied to multimodal continuous and discrete data\nproblems. Our key insight is that the discrete equivalent of continuous space\nflow matching can be realized using Continuous Time Markov Chains. 
DFMs benefit\nfrom a simple derivation that includes discrete diffusion models as a specific\ninstance while allowing improved performance over existing diffusion-based\napproaches. We utilize our DFMs method to build a multimodal flow-based\nmodeling framework. We apply this capability to the task of protein co-design,\nwherein we learn a model for jointly generating protein structure and sequence.\nOur approach achieves state-of-the-art co-design performance while allowing the\nsame multimodal model to be used for flexible generation of the sequence or\nstructure.\n","authors":["Andrew Campbell","Jason Yim","Regina Barzilay","Tom Rainforth","Tommi Jaakkola"],"pdf_url":"https://arxiv.org/pdf/2402.04997v1.pdf","comment":"52 pages, 11 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.06650v3","updated":"2024-02-07T16:12:06Z","published":"2023-11-11T19:39:50Z","title":"Heuristic Optimal Transport in Branching Networks","summary":" Optimal transport aims to learn a mapping of sources to targets by minimizing\nthe cost, which is typically defined as a function of distance. The solution to\nthis problem consists of straight line segments optimally connecting sources to\ntargets, and it does not exhibit branching. These optimal solutions are in\nstark contrast with both natural, and man-made transportation networks, where\nbranching structures are prevalent. Here we discuss a fast heuristic branching\nmethod for optimal transport in networks. We also provide several numerical\napplications to synthetic examples, a simplified cardiovascular network, and\nthe \"Santa Claus\" distribution network which includes 141,182 cities around the\nworld, with known location and population.\n","authors":["M. Andrecut"],"pdf_url":"https://arxiv.org/pdf/2311.06650v3.pdf","comment":"Accepted in Int. J. Mod. Phys. C, 11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.04987v1","updated":"2024-02-07T16:06:20Z","published":"2024-02-07T16:06:20Z","title":"PriorBoost: An Adaptive Algorithm for Learning from Aggregate Responses","summary":" This work studies algorithms for learning from aggregate responses. We focus\non the construction of aggregation sets (called bags in the literature) for\nevent-level loss functions. We prove for linear regression and generalized\nlinear models (GLMs) that the optimal bagging problem reduces to\none-dimensional size-constrained $k$-means clustering. Further, we\ntheoretically quantify the advantage of using curated bags over random bags. We\nthen propose the PriorBoost algorithm, which adaptively forms bags of samples\nthat are increasingly homogeneous with respect to (unobserved) individual\nresponses to improve model quality. We study label differential privacy for\naggregate learning, and we also provide extensive experiments showing that\nPriorBoost regularly achieves optimal model quality for event-level\npredictions, in stark contrast to non-adaptive algorithms.\n","authors":["Adel Javanmard","Matthew Fahrbach","Vahab Mirrokni"],"pdf_url":"https://arxiv.org/pdf/2402.04987v1.pdf","comment":"29 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.01037v2","updated":"2024-02-07T16:00:59Z","published":"2023-07-03T14:17:12Z","title":"Vector Quantile Regression on Manifolds","summary":" Quantile regression (QR) is a statistical tool for distribution-free\nestimation of conditional quantiles of a target variable given explanatory\nfeatures. QR is limited by the assumption that the target distribution is\nunivariate and defined on an Euclidean domain. 
Although the notion of quantiles\nwas recently extended to multi-variate distributions, QR for multi-variate\ndistributions on manifolds remains underexplored, even though many important\napplications inherently involve data distributed on, e.g., spheres (climate and\ngeological phenomena), and tori (dihedral angles in proteins). By leveraging\noptimal transport theory and c-concave functions, we meaningfully define\nconditional vector quantile functions of high-dimensional variables on\nmanifolds (M-CVQFs). Our approach allows for quantile estimation, regression,\nand computation of conditional confidence sets and likelihoods. We demonstrate\nthe approach's efficacy and provide insights regarding the meaning of\nnon-Euclidean quantiles through synthetic and real data experiments.\n","authors":["Marco Pegoraro","Sanketh Vedula","Aviv A. Rosenberg","Irene Tallini","Emanuele Rodolà","Alex M. Bronstein"],"pdf_url":"https://arxiv.org/pdf/2307.01037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04081v3","updated":"2024-02-07T16:00:21Z","published":"2023-07-09T01:41:22Z","title":"Score-based Conditional Generation with Fewer Labeled Data by\n Self-calibrating Classifier Guidance","summary":" Score-based generative models (SGMs) are a popular family of deep generative\nmodels that achieve leading image generation quality. Early studies extend SGMs\nto tackle class-conditional generation by coupling an unconditional SGM with\nthe guidance of a trained classifier. Nevertheless, such classifier-guided SGMs\ndo not always achieve accurate conditional generation, especially when trained\nwith fewer labeled data. We argue that the problem is rooted in the\nclassifier's tendency to overfit without coordinating with the underlying\nunconditional distribution. To make the classifier respect the unconditional\ndistribution, we propose improving classifier-guided SGMs by letting the\nclassifier regularize itself. The key idea of our proposed method is to use\nprinciples from energy-based models to convert the classifier into another view\nof the unconditional SGM. Existing losses for unconditional SGMs can then be\nleveraged to achieve regularization by calibrating the classifier's internal\nunconditional scores. The regularization scheme can be applied to not only the\nlabeled data but also unlabeled ones to further improve the classifier. Across\nvarious percentages of fewer labeled data, empirical results show that the\nproposed approach significantly enhances conditional generation quality. The\nenhancements confirm the potential of the proposed self-calibration technique\nfor generative modeling with limited labeled data.\n","authors":["Paul Kuo-Ming Huang","Si-An Chen","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2307.04081v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04982v1","updated":"2024-02-07T15:58:51Z","published":"2024-02-07T15:58:51Z","title":"Beyond explaining: XAI-based Adaptive Learning with SHAP Clustering for\n Energy Consumption Prediction","summary":" This paper presents an approach integrating explainable artificial\nintelligence (XAI) techniques with adaptive learning to enhance energy\nconsumption prediction models, with a focus on handling data distribution\nshifts. Leveraging SHAP clustering, our method provides interpretable\nexplanations for model predictions and uses these insights to adaptively refine\nthe model, balancing model complexity with predictive performance. 
We introduce\na three-stage process: (1) obtaining SHAP values to explain model predictions,\n(2) clustering SHAP values to identify distinct patterns and outliers, and (3)\nrefining the model based on the derived SHAP clustering characteristics. Our\napproach mitigates overfitting and ensures robustness in handling data\ndistribution shifts. We evaluate our method on a comprehensive dataset\ncomprising energy consumption records of buildings, as well as two additional\ndatasets to assess the transferability of our approach to other domains,\nregression, and classification problems. Our experiments demonstrate the\neffectiveness of our approach in both task types, resulting in improved\npredictive performance and interpretable model explanations.\n","authors":["Tobias Clement","Hung Truong Thanh Nguyen","Nils Kemmerzell","Mohamed Abdelaal","Davor Stjelja"],"pdf_url":"https://arxiv.org/pdf/2402.04982v1.pdf","comment":"A short version of this paper was published at the Australasian Joint\n Conference on Artificial Intelligence in 2023"},{"id":"http://arxiv.org/abs/2402.04980v1","updated":"2024-02-07T15:57:30Z","published":"2024-02-07T15:57:30Z","title":"Asymptotics of feature learning in two-layer networks after one\n gradient-step","summary":" In this manuscript we investigate the problem of how two-layer neural\nnetworks learn features from data, and improve over the kernel regime, after\nbeing trained with a single gradient descent step. Leveraging a connection from\n(Ba et al., 2022) with a non-linear spiked matrix model and recent progress on\nGaussian universality (Dandi et al., 2023), we provide an exact asymptotic\ndescription of the generalization error in the high-dimensional limit where the\nnumber of samples $n$, the width $p$ and the input dimension $d$ grow at a\nproportional rate. We characterize exactly how adapting to the data is crucial\nfor the network to efficiently learn non-linear functions in the direction of\nthe gradient -- where at initialization it can only express linear functions in\nthis regime. To our knowledge, our results provide the first tight description\nof the impact of feature learning in the generalization of two-layer neural\nnetworks in the large learning rate regime $\\eta=\\Theta_{d}(d)$, beyond\nperturbative finite width corrections of the conjugate and neural tangent\nkernels.\n","authors":["Hugo Cui","Luca Pesce","Yatin Dandi","Florent Krzakala","Yue M. Lu","Lenka Zdeborová","Bruno Loureiro"],"pdf_url":"https://arxiv.org/pdf/2402.04980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01426v3","updated":"2024-02-07T15:56:30Z","published":"2022-10-04T07:34:06Z","title":"Continuous Monte Carlo Graph Search","summary":" Online planning is crucial for high performance in many complex sequential\ndecision-making tasks. Monte Carlo Tree Search (MCTS) employs a principled\nmechanism for trading off exploration for exploitation for efficient online\nplanning, and it outperforms comparison methods in many discrete\ndecision-making domains such as Go, Chess, and Shogi. Subsequently, extensions\nof MCTS to continuous domains have been developed. However, the inherent high\nbranching factor and the resulting explosion of the search tree size are\nlimiting the existing methods. To address this problem, we propose Continuous\nMonte Carlo Graph Search (CMCGS), an extension of MCTS to online planning in\nenvironments with continuous state and action spaces. 
CMCGS takes advantage of\nthe insight that, during planning, sharing the same action policy between\nseveral states can yield high performance. To implement this idea, at each time\nstep, CMCGS clusters similar states into a limited number of stochastic action\nbandit nodes, which produce a layered directed graph instead of an MCTS search\ntree. Experimental evaluation shows that CMCGS outperforms comparable planning\nmethods in several complex continuous DeepMind Control Suite benchmarks and 2D\nnavigation and exploration tasks with limited sample budgets. Furthermore,\nCMCGS can be scaled up through parallelization, and it outperforms the\nCross-Entropy Method (CEM) in continuous control with learned dynamics models.\n","authors":["Kalle Kujanpää","Amin Babadi","Yi Zhao","Juho Kannala","Alexander Ilin","Joni Pajarinen"],"pdf_url":"https://arxiv.org/pdf/2210.01426v3.pdf","comment":"Accepted at AAMAS 2024 (full paper & oral)"},{"id":"http://arxiv.org/abs/2303.17235v2","updated":"2024-02-07T15:45:43Z","published":"2023-03-30T09:08:57Z","title":"Kaizen: Practical Self-supervised Continual Learning with Continual\n Fine-tuning","summary":" Self-supervised learning (SSL) has shown remarkable performance in computer\nvision tasks when trained offline. However, in a Continual Learning (CL)\nscenario where new data is introduced progressively, models still suffer from\ncatastrophic forgetting. Retraining a model from scratch to adapt to newly\ngenerated data is time-consuming and inefficient. Previous approaches suggested\nre-purposing self-supervised objectives with knowledge distillation to mitigate\nforgetting across tasks, assuming that labels from all tasks are available\nduring fine-tuning. In this paper, we generalize self-supervised continual\nlearning in a practical setting where available labels can be leveraged in any\nstep of the SSL process. With an increasing number of continual tasks, this\noffers more flexibility in the pre-training and fine-tuning phases. With\nKaizen, we introduce a training architecture that is able to mitigate\ncatastrophic forgetting for both the feature extractor and classifier with a\ncarefully designed loss function. By using a set of comprehensive evaluation\nmetrics reflecting different aspects of continual learning, we demonstrated\nthat Kaizen significantly outperforms previous SSL models in competitive vision\nbenchmarks, with up to 16.5% accuracy improvement on split CIFAR-100. Kaizen is\nable to balance the trade-off between knowledge retention and learning from new\ndata with an end-to-end model, paving the way for practical deployment of\ncontinual learning systems.\n","authors":["Chi Ian Tang","Lorena Qendro","Dimitris Spathis","Fahim Kawsar","Cecilia Mascolo","Akhil Mathur"],"pdf_url":"https://arxiv.org/pdf/2303.17235v2.pdf","comment":"Presented at IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2024. The code for this work is available at\n https://github.com/dr-bell/kaizen"},{"id":"http://arxiv.org/abs/2401.16356v3","updated":"2024-02-07T15:37:24Z","published":"2024-01-29T17:59:26Z","title":"cDVGAN: One Flexible Model for Multi-class Gravitational Wave Signal and\n Glitch Generation","summary":" Simulating realistic time-domain observations of gravitational waves (GWs)\nand GW detector glitches can help in advancing GW data analysis. Simulated data\ncan be used in downstream tasks by augmenting datasets for signal searches,\nbalancing data sets for machine learning, and validating detection schemes. 
In\nthis work, we present Conditional Derivative GAN (cDVGAN), a novel conditional\nmodel in the Generative Adversarial Network framework for simulating multiple\nclasses of time-domain observations that represent gravitational waves (GWs)\nand detector glitches. cDVGAN can also generate generalized hybrid samples that\nspan the variation between classes through interpolation in the conditioned\nclass vector. cDVGAN introduces an additional player into the typical 2-player\nadversarial game of GANs, where an auxiliary discriminator analyzes the\nfirst-order derivative time-series. Our results show that this provides\nsynthetic data that better captures the features of the original data. cDVGAN\nconditions on three classes, two denoised from LIGO blip and tomte glitch\nevents from its 3rd observing run (O3), and the third representing binary black\nhole (BBH) mergers. Our proposed cDVGAN outperforms 4 different baseline GAN\nmodels in replicating the features of the three classes. Specifically, our\nexperiments show that training convolutional neural networks (CNNs) with our\ncDVGAN-generated data improves the detection of samples embedded in detector\nnoise beyond the synthetic data from other state-of-the-art GAN models. Our\nbest synthetic dataset yields as much as a 4.2% increase in\narea-under-the-curve (AUC) performance compared to synthetic datasets from\nbaseline GANs. Moreover, training the CNN with hybrid samples from our cDVGAN\noutperforms CNNs trained only on the standard classes, when identifying real\nsamples embedded in LIGO detector background (4% AUC improvement for cDVGAN).\n","authors":["Tom Dooney","Lyana Curier","Daniel Tan","Melissa Lopez","Chris Van Den Broeck","Stefano Bromuri"],"pdf_url":"https://arxiv.org/pdf/2401.16356v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04933v1","updated":"2024-02-07T15:11:37Z","published":"2024-02-07T15:11:37Z","title":"A Bayesian Approach to Online Learning for Contextual Restless Bandits\n with Applications to Public Health","summary":" Restless multi-armed bandits (RMABs) are used to model sequential resource\nallocation in public health intervention programs. In these settings, the\nunderlying transition dynamics are often unknown a priori, requiring online\nreinforcement learning (RL). However, existing methods in online RL for RMABs\ncannot incorporate properties often present in real-world public health\napplications, such as contextual information and non-stationarity. We present\nBayesian Learning for Contextual RMABs (BCoR), an online RL approach for RMABs\nthat novelly combines techniques in Bayesian modeling with Thompson sampling to\nflexibly model a wide range of complex RMAB settings, such as contextual and\nnon-stationary RMABs. A key contribution of our approach is its ability to\nleverage shared information within and between arms to learn unknown RMAB\ntransition dynamics quickly in budget-constrained settings with relatively\nshort time horizons. 
Empirically, we show that BCoR achieves substantially\nhigher finite-sample performance than existing approaches over a range of\nexperimental settings, including one constructed from a real-world public\nhealth campaign in India.\n","authors":["Biyonka Liang","Lily Xu","Aparna Taneja","Milind Tambe","Lucas Janson"],"pdf_url":"https://arxiv.org/pdf/2402.04933v1.pdf","comment":"26 pages, 18 figures"},{"id":"http://arxiv.org/abs/2402.04930v1","updated":"2024-02-07T14:59:25Z","published":"2024-02-07T14:59:25Z","title":"Blue noise for diffusion models","summary":" Most of the existing diffusion models use Gaussian noise for training and\nsampling across all time steps, which may not optimally account for the\nfrequency contents reconstructed by the denoising network. Despite the diverse\napplications of correlated noise in computer graphics, its potential for\nimproving the training process has been underexplored. In this paper, we\nintroduce a novel and general class of diffusion models taking correlated noise\nwithin and across images into account. More specifically, we propose a\ntime-varying noise model to incorporate correlated noise into the training\nprocess, as well as a method for fast generation of correlated noise mask. Our\nmodel is built upon deterministic diffusion models and utilizes blue noise to\nhelp improve the generation quality compared to using Gaussian white (random)\nnoise only. Further, our framework allows introducing correlation across images\nwithin a single mini-batch to improve gradient flow. We perform both\nqualitative and quantitative evaluations on a variety of datasets using our\nmethod, achieving improvements on different tasks over existing deterministic\ndiffusion models in terms of FID metric.\n","authors":["Xingchang Huang","Corentin Salaün","Cristina Vasconcelos","Christian Theobalt","Cengiz Öztireli","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2402.04930v1.pdf","comment":"10 pages, 12 figures"},{"id":"http://arxiv.org/abs/2402.04929v1","updated":"2024-02-07T14:56:13Z","published":"2024-02-07T14:56:13Z","title":"Source-Free Domain Adaptation with Diffusion-Guided Source Data\n Generation","summary":" This paper introduces a novel approach to leverage the generalizability\ncapability of Diffusion Models for Source-Free Domain Adaptation (DM-SFDA). Our\nproposed DM-SFDA method involves fine-tuning a pre-trained text-to-image\ndiffusion model to generate source domain images using features from the target\nimages to guide the diffusion process. Specifically, the pre-trained diffusion\nmodel is fine-tuned to generate source samples that minimize entropy and\nmaximize confidence for the pre-trained source model. We then apply established\nunsupervised domain adaptation techniques to align the generated source images\nwith target domain data. We validate our approach through comprehensive\nexperiments across a range of datasets, including Office-31, Office-Home, and\nVisDA. 
The results highlight significant improvements in SFDA performance,\nshowcasing the potential of diffusion models in generating contextually\nrelevant, domain-specific images.\n","authors":["Shivang Chopra","Suraj Kothawade","Houda Aynaou","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2402.04929v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2310.01701"},{"id":"http://arxiv.org/abs/2402.04924v1","updated":"2024-02-07T14:49:10Z","published":"2024-02-07T14:49:10Z","title":"Two Trades is not Baffled: Condense Graph via Crafting Rational Gradient\n Matching","summary":" Training on large-scale graphs has achieved remarkable results in graph\nrepresentation learning, but its cost and storage have raised growing concerns.\nAs one of the most promising directions, graph condensation methods address\nthese issues by employing gradient matching, aiming to condense the full graph\ninto a more concise yet information-rich synthetic set. Though encouraging,\nthese strategies primarily emphasize matching directions of the gradients,\nwhich leads to deviations in the training trajectories. Such deviations are\nfurther magnified by the differences between the condensation and evaluation\nphases, culminating in accumulated errors, which detrimentally affect the\nperformance of the condensed graphs. In light of this, we propose a novel graph\ncondensation method named \\textbf{C}raf\\textbf{T}ing \\textbf{R}ationa\\textbf{L}\ntrajectory (\\textbf{CTRL}), which offers an optimized starting point closer to\nthe original dataset's feature distribution and a more refined strategy for\ngradient matching. Theoretically, CTRL can effectively neutralize the impact of\naccumulated errors on the performance of condensed graphs. We provide extensive\nexperiments on various graph datasets and downstream tasks to support the\neffectiveness of CTRL. Code is released at\nhttps://github.com/NUS-HPC-AI-Lab/CTRL.\n","authors":["Tianle Zhang","Yuchen Zhang","Kun Wang","Kai Wang","Beining Yang","Kaipeng Zhang","Wenqi Shao","Ping Liu","Joey Tianyi Zhou","Yang You"],"pdf_url":"https://arxiv.org/pdf/2402.04924v1.pdf","comment":"An effective method for graph condensation"},{"id":"http://arxiv.org/abs/2310.05401v3","updated":"2024-02-07T14:49:08Z","published":"2023-10-09T04:40:20Z","title":"Entropy-MCMC: Sampling from Flat Basins with Ease","summary":" Bayesian deep learning counts on the quality of posterior distribution\nestimation. However, the posterior of deep neural networks is highly\nmulti-modal in nature, with local modes exhibiting varying generalization\nperformance. Given a practical budget, targeting at the original posterior can\nlead to suboptimal performance, as some samples may become trapped in \"bad\"\nmodes and suffer from overfitting. Leveraging the observation that \"good\" modes\nwith low generalization error often reside in flat basins of the energy\nlandscape, we propose to bias sampling on the posterior toward these flat\nregions. Specifically, we introduce an auxiliary guiding variable, the\nstationary distribution of which resembles a smoothed posterior free from sharp\nmodes, to lead the MCMC sampler to flat basins. By integrating this guiding\nvariable with the model parameter, we create a simple joint distribution that\nenables efficient sampling with minimal computational overhead. 
We prove the\nconvergence of our method and further show that it converges faster than\nseveral existing flatness-aware methods in the strongly convex setting.\nEmpirical results demonstrate that our method can successfully sample from flat\nbasins of the posterior, and outperforms all compared baselines on multiple\nbenchmarks including classification, calibration, and out-of-distribution\ndetection.\n","authors":["Bolian Li","Ruqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.05401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01995v2","updated":"2024-02-07T14:48:35Z","published":"2024-02-03T02:36:59Z","title":"Online Uniform Risk Times Sampling: First Approximation Algorithms,\n Learning Augmentation with Full Confidence Interval Integration","summary":" In digital health, the strategy of allocating a limited treatment budget\nacross available risk times is crucial to reduce user fatigue. This strategy,\nhowever, encounters a significant obstacle due to the unknown actual number of\nrisk times, a factor not adequately addressed by existing methods lacking\ntheoretical guarantees. This paper introduces, for the first time, the online\nuniform risk times sampling problem within the approximation algorithm\nframework. We propose two online approximation algorithms for this problem, one\nwith and one without learning augmentation, and provide rigorous theoretical\nperformance guarantees for them using competitive ratio analysis. We assess the\nperformance of our algorithms using both synthetic experiments and a real-world\ncase study on HeartSteps mobile applications.\n","authors":["Xueqing Liu","Kyra Gan","Esmaeil Keyvanshokooh","Susan Murphy"],"pdf_url":"https://arxiv.org/pdf/2402.01995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04922v1","updated":"2024-02-07T14:47:13Z","published":"2024-02-07T14:47:13Z","title":"Voronoi Candidates for Bayesian Optimization","summary":" Bayesian optimization (BO) offers an elegant approach for efficiently\noptimizing black-box functions. However, acquisition criteria demand their own\nchallenging inner-optimization, which can induce significant overhead. Many\npractical BO methods, particularly in high dimension, eschew a formal,\ncontinuous optimization of the acquisition function and instead search\ndiscretely over a finite set of space-filling candidates. Here, we propose to\nuse candidates which lie on the boundary of the Voronoi tessellation of the\ncurrent design points, so they are equidistant to two or more of them. We\ndiscuss strategies for efficient implementation by directly sampling the\nVoronoi boundary without explicitly generating the tessellation, thus\naccommodating large designs in high dimension. On a battery of test problems\noptimized via Gaussian processes with expected improvement, our proposed\napproach significantly improves the execution time of a multi-start continuous\nsearch without a loss in accuracy.\n","authors":["Nathan Wycoff","John W. Smith","Annie S. Booth","Robert B. Gramacy"],"pdf_url":"https://arxiv.org/pdf/2402.04922v1.pdf","comment":"comments very welcome"},{"id":"http://arxiv.org/abs/2310.07433v3","updated":"2024-02-07T14:43:41Z","published":"2023-10-11T12:34:39Z","title":"Imitation Learning from Observation with Automatic Discount Scheduling","summary":" Humans often acquire new skills through observation and imitation. 
For\nrobotic agents, learning from the plethora of unlabeled video demonstration\ndata available on the Internet necessitates imitating the expert without access\nto its action, presenting a challenge known as Imitation Learning from\nObservations (ILfO). A common approach to tackle ILfO problems is to convert\nthem into inverse reinforcement learning problems, utilizing a proxy reward\ncomputed from the agent's and the expert's observations. Nonetheless, we\nidentify that tasks characterized by a progress dependency property pose\nsignificant challenges for such approaches; in these tasks, the agent needs to\ninitially learn the expert's preceding behaviors before mastering the\nsubsequent ones. Our investigation reveals that the main cause is that the\nreward signals assigned to later steps hinder the learning of initial\nbehaviors. To address this challenge, we present a novel ILfO framework that\nenables the agent to master earlier behaviors before advancing to later ones.\nWe introduce an Automatic Discount Scheduling (ADS) mechanism that adaptively\nalters the discount factor in reinforcement learning during the training phase,\nprioritizing earlier rewards initially and gradually engaging later rewards\nonly when the earlier behaviors have been mastered. Our experiments, conducted\non nine Meta-World tasks, demonstrate that our method significantly outperforms\nstate-of-the-art methods across all tasks, including those that are unsolvable\nby them.\n","authors":["Yuyang Liu","Weijun Dong","Yingdong Hu","Chuan Wen","Zhao-Heng Yin","Chongjie Zhang","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2310.07433v3.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2311.09438v2","updated":"2024-02-07T14:41:40Z","published":"2023-11-15T23:18:01Z","title":"Labeled Interactive Topic Models","summary":" Topic models are valuable for understanding extensive document collections,\nbut they don't always identify the most relevant topics. Classical\nprobabilistic and anchor-based topic models offer interactive versions that\nallow users to guide the models towards more pertinent topics. However, such\ninteractive features have been lacking in neural topic models. To correct this\nlacuna, we introduce a user-friendly interaction for neural topic models. This\ninteraction permits users to assign a word label to a topic, leading to an\nupdate in the topic model where the words in the topic become closely aligned\nwith the given label. Our approach encompasses two distinct kinds of neural\ntopic models. The first includes models where topic embeddings are trainable\nand evolve during the training process. The second kind involves models where\ntopic embeddings are integrated post-training, offering a different approach to\ntopic refinement. To facilitate user interaction with these neural topic\nmodels, we have developed an interactive interface. This interface enables\nusers to engage with and re-label topics as desired. We evaluate our method\nthrough a human study, where users can relabel topics to find relevant\ndocuments. 
Using our method, user labeling improves document rank scores,\nhelping to find more relevant documents to a given query when compared to no\nuser labeling.\n","authors":["Kyle Seelman","Mozhi Zhang","Jordan Boyd-Graber"],"pdf_url":"https://arxiv.org/pdf/2311.09438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04915v1","updated":"2024-02-07T14:41:17Z","published":"2024-02-07T14:41:17Z","title":"Moco: A Learnable Meta Optimizer for Combinatorial Optimization","summary":" Relevant combinatorial optimization problems (COPs) are often NP-hard. While\nthey have been tackled mainly via handcrafted heuristics in the past, advances\nin neural networks have motivated the development of general methods to learn\nheuristics from data. Many approaches utilize a neural network to directly\nconstruct a solution, but are limited in further improving based on already\nconstructed solutions at inference time. Our approach, Moco, learns a graph\nneural network that updates the solution construction procedure based on\nfeatures extracted from the current search state. This meta training procedure\ntargets the overall best solution found during the search procedure given\ninformation such as the search budget. This allows Moco to adapt to varying\ncircumstances such as different computational budgets. Moco is a fully\nlearnable meta optimizer that does not utilize any problem specific local\nsearch or decomposition. We test Moco on the Traveling Salesman Problem (TSP)\nand Maximum Independent Set (MIS) and show that it outperforms other approaches\non MIS and is overall competitive on the TSP, especially outperforming related\napproaches, partially even if they use additional local search.\n","authors":["Tim Dernedde","Daniela Thyssens","Sören Dittrich","Maximilan Stubbemann","Lars Schmidt-Thieme"],"pdf_url":"https://arxiv.org/pdf/2402.04915v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.04912v1","updated":"2024-02-07T14:39:11Z","published":"2024-02-07T14:39:11Z","title":"Towards Biologically Plausible and Private Gene Expression Data\n Generation","summary":" Generative models trained with Differential Privacy (DP) are becoming\nincreasingly prominent in the creation of synthetic data for downstream\napplications. Existing literature, however, primarily focuses on basic\nbenchmarking datasets and tends to report promising results only for elementary\nmetrics and relatively simple data distributions. In this paper, we initiate a\nsystematic analysis of how DP generative models perform in their natural\napplication scenarios, specifically focusing on real-world gene expression\ndata. We conduct a comprehensive analysis of five representative DP generation\nmethods, examining them from various angles, such as downstream utility,\nstatistical properties, and biological plausibility. Our extensive evaluation\nilluminates the unique characteristics of each DP generation method, offering\ncritical insights into the strengths and weaknesses of each approach, and\nuncovering intriguing possibilities for future developments. Perhaps\nsurprisingly, our analysis reveals that most methods are capable of achieving\nseemingly reasonable downstream utility, according to the standard evaluation\nmetrics considered in existing literature. Nevertheless, we find that none of\nthe DP methods are able to accurately capture the biological characteristics of\nthe real dataset. 
This observation suggests a potential over-optimistic\nassessment of current methodologies in this field and underscores a pressing\nneed for future enhancements in model design.\n","authors":["Dingfan Chen","Marie Oestreich","Tejumade Afonja","Raouf Kerkouche","Matthias Becker","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2402.04912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04907v1","updated":"2024-02-07T14:37:37Z","published":"2024-02-07T14:37:37Z","title":"On a Combinatorial Problem Arising in Machine Teaching","summary":" We study a model of machine teaching where the teacher mapping is constructed\nfrom a size function on both concepts and examples. The main question in\nmachine teaching is the minimum number of examples needed for any concept, the\nso-called teaching dimension. A recent paper [7] conjectured that the worst\ncase for this model, as a function of the size of the concept class, occurs\nwhen the consistency matrix contains the binary representations of numbers from\nzero and up. In this paper we prove their conjecture. The result can be seen as\na generalization of a theorem resolving the edge isoperimetry problem for\nhypercubes [12], and our proof is based on a lemma of [10].\n","authors":["Brigt Håvardstun","Jan Kratochvíl","Joakim Sunde","Jan Arne"],"pdf_url":"https://arxiv.org/pdf/2402.04907v1.pdf","comment":"14 pages, 1 figure"},{"id":"http://arxiv.org/abs/2402.04906v1","updated":"2024-02-07T14:35:25Z","published":"2024-02-07T14:35:25Z","title":"Conformal Monte Carlo Meta-learners for Predictive Inference of\n Individual Treatment Effects","summary":" Knowledge of the effect of interventions, called the treatment effect, is\nparamount for decision-making. Approaches to estimating this treatment effect,\ne.g. by using Conditional Average Treatment Effect (CATE) estimators, often\nonly provide a point estimate of this treatment effect, while additional\nuncertainty quantification is frequently desired instead. Therefore, we present\na novel method, the Conformal Monte Carlo (CMC) meta-learners, leveraging\nconformal predictive systems, Monte Carlo sampling, and CATE meta-learners, to\ninstead produce a predictive distribution usable in individualized\ndecision-making. Furthermore, we show how specific assumptions on the noise\ndistribution of the outcome heavily affect these uncertainty predictions.\nNonetheless, the CMC framework shows strong experimental coverage while\nretaining small interval widths to provide estimates of the true individual\ntreatment effect.\n","authors":["Jef Jonkers","Jarne Verhaeghe","Glenn Van Wallendael","Luc Duchateau","Sofie Van Hoecke"],"pdf_url":"https://arxiv.org/pdf/2402.04906v1.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.04902v1","updated":"2024-02-07T14:35:05Z","published":"2024-02-07T14:35:05Z","title":"L4Q: Parameter Efficient Quantization-Aware Training on Large Language\n Models via LoRA-wise LSQ","summary":" Post-training quantization (PTQ) and quantization-aware training (QAT)\nmethods are gaining popularity in mitigating the high memory and computational\ncosts associated with Large Language Models (LLMs). In resource-constrained\nscenarios, PTQ, with its reduced training overhead, is often preferred over\nQAT, despite the latter's potential for higher accuracy. Meanwhile,\nparameter-efficient fine-tuning (PEFT) methods like low-rank adaptation (LoRA)\nhave been introduced, and recent efforts have explored quantization-aware PEFT\ntechniques. 
However, these approaches may lack generality due to their reliance\non the pre-quantized model's configuration. Their effectiveness may be\ncompromised by non-linearly quantized or mixed-precision weights, and the\nretraining of specific quantization parameters might impede optimal\nperformance. To address these challenges, we propose L4Q, an algorithm for\nparameter-efficient quantization-aware training. L4Q leverages LoRA-wise\nlearned quantization step size for LLMs, aiming to enhance generality. The\nsimultaneous quantization-and-fine-tuning process of L4Q is applicable to\nhigh-precision models, yielding linearly quantized weights with superior\naccuracy. Our experiments, conducted on the LLaMA and LLaMA2 model families\nusing an instructional dataset, showcase L4Q's capabilities in language\ncomprehension and few-shot in-context learning, achieving sub-4-bit precision\nwhile maintaining comparable training times to applying PEFT on a quantized\nmodel.\n","authors":["Hyesung Jeon","Yulhwa Kim","Jae-joon Kim"],"pdf_url":"https://arxiv.org/pdf/2402.04902v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2402.04898v1","updated":"2024-02-07T14:28:04Z","published":"2024-02-07T14:28:04Z","title":"The Strain of Success: A Predictive Model for Injury Risk Mitigation and\n Team Success in Soccer","summary":" In this paper, we present a novel sequential team selection model in soccer.\nSpecifically, we model the stochastic process of player injury and\nunavailability using player-specific information learned from real-world soccer\ndata. Monte-Carlo Tree Search is used to select teams for games that optimise\nlong-term team performance across a soccer season by reasoning over player\ninjury probability. We validate our approach compared to benchmark solutions\nfor the 2018/19 English Premier League season. Our model achieves similar\nseason expected points to the benchmark whilst reducing first-team injuries by\n~13% and the money inefficiently spent on injured players by ~11% -\ndemonstrating the potential to reduce costs and improve player welfare in\nreal-world soccer teams.\n","authors":["Gregory Everett","Ryan Beal","Tim Matthews","Timothy J. Norman","Sarvapali D. Ramchurn"],"pdf_url":"https://arxiv.org/pdf/2402.04898v1.pdf","comment":"19 pages (16 main, 2 references, 1 appendix), 10 figures (9 main, 1\n appendix). Accepted at the MIT Sloan Sports Analytics Conference 2024\n Research Paper Competition"},{"id":"http://arxiv.org/abs/2402.04247v2","updated":"2024-02-07T14:26:02Z","published":"2024-02-06T18:54:07Z","title":"Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science","summary":" Intelligent agents powered by large language models (LLMs) have demonstrated\nsubstantial promise in autonomously conducting experiments and facilitating\nscientific discoveries across various disciplines. While their capabilities are\npromising, they also introduce novel vulnerabilities that demand careful\nconsideration for safety. However, there exists a notable gap in the\nliterature, as there has been no comprehensive exploration of these\nvulnerabilities. This position paper fills this gap by conducting a thorough\nexamination of vulnerabilities in LLM-based agents within scientific domains,\nshedding light on potential risks associated with their misuse and emphasizing\nthe need for safety measures. 
We begin by providing a comprehensive overview of\nthe potential risks inherent to scientific LLM agents, taking into account user\nintent, the specific scientific domain, and their potential impact on the\nexternal environment. Then, we delve into the origins of these vulnerabilities\nand provide a scoping review of the limited existing works. Based on our\nanalysis, we propose a triadic framework involving human regulation, agent\nalignment, and an understanding of environmental feedback (agent regulation) to\nmitigate these identified risks. Furthermore, we highlight the limitations and\nchallenges associated with safeguarding scientific agents and advocate for the\ndevelopment of improved models, robust benchmarks, and comprehensive\nregulations to address these issues effectively.\n","authors":["Xiangru Tang","Qiao Jin","Kunlun Zhu","Tongxin Yuan","Yichi Zhang","Wangchunshu Zhou","Meng Qu","Yilun Zhao","Jian Tang","Zhuosheng Zhang","Arman Cohan","Zhiyong Lu","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2402.04247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04894v1","updated":"2024-02-07T14:24:41Z","published":"2024-02-07T14:24:41Z","title":"Deep Reinforcement Learning with Dynamic Graphs for Adaptive Informative\n Path Planning","summary":" Autonomous robots are often employed for data collection due to their\nefficiency and low labour costs. A key task in robotic data acquisition is\nplanning paths through an initially unknown environment to collect observations\ngiven platform-specific resource constraints, such as limited battery life.\nAdaptive online path planning in 3D environments is challenging due to the\nlarge set of valid actions and the presence of unknown occlusions. To address\nthese issues, we propose a novel deep reinforcement learning approach for\nadaptively replanning robot paths to map targets of interest in unknown 3D\nenvironments. A key aspect of our approach is a dynamically constructed graph\nthat restricts planning actions local to the robot, allowing us to quickly\nreact to newly discovered obstacles and targets of interest. For replanning, we\npropose a new reward function that balances between exploring the unknown\nenvironment and exploiting online-collected data about the targets of interest.\nOur experiments show that our method enables more efficient target detection\ncompared to state-of-the-art learning and non-learning baselines. 
We also show\nthe applicability of our approach for orchard monitoring using an unmanned\naerial vehicle in a photorealistic simulator.\n","authors":["Apoorva Vashisth","Julius Rückin","Federico Magistri","Cyrill Stachniss","Marija Popović"],"pdf_url":"https://arxiv.org/pdf/2402.04894v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.04892v1","updated":"2024-02-07T14:24:04Z","published":"2024-02-07T14:24:04Z","title":"A Unified Framework for Probabilistic Verification of AI Systems via\n Weighted Model Integration","summary":" The probabilistic formal verification (PFV) of AI systems is in its infancy.\nSo far, approaches have been limited to ad-hoc algorithms for specific classes\nof models and/or properties.\n We propose a unifying framework for the PFV of AI systems based on Weighted\nModel Integration (WMI), which allows us to frame the problem in very general\nterms.\n Crucially, this reduction enables the verification of many properties of\ninterest, like fairness, robustness or monotonicity, over a wide range of\nmachine learning models, without making strong distributional assumptions.\n We support the generality of the approach by solving multiple verification\ntasks with a single, off-the-shelf WMI solver, then discuss the scalability\nchallenges and research directions related to this promising framework.\n","authors":["Paolo Morettin","Andrea Passerini","Roberto Sebastiani"],"pdf_url":"https://arxiv.org/pdf/2402.04892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02910v2","updated":"2024-02-07T14:21:37Z","published":"2024-02-05T11:25:45Z","title":"DS-MS-TCN: Otago Exercises Recognition with a Dual-Scale Multi-Stage\n Temporal Convolutional Network","summary":" The Otago Exercise Program (OEP) represents a crucial rehabilitation\ninitiative tailored for older adults, aimed at enhancing balance and strength.\nDespite previous efforts utilizing wearable sensors for OEP recognition,\nexisting studies have exhibited limitations in terms of accuracy and\nrobustness. This study addresses these limitations by employing a single\nwaist-mounted Inertial Measurement Unit (IMU) to recognize OEP exercises among\ncommunity-dwelling older adults in their daily lives. A cohort of 36 older\nadults participated in laboratory settings, supplemented by an additional 7\nolder adults recruited for at-home assessments. The study proposes a Dual-Scale\nMulti-Stage Temporal Convolutional Network (DS-MS-TCN) designed for two-level\nsequence-to-sequence classification, incorporating them in one loss function.\nIn the first stage, the model focuses on recognizing each repetition of the\nexercises (micro labels). Subsequent stages extend the recognition to encompass\nthe complete range of exercises (macro labels). The DS-MS-TCN model surpasses\nexisting state-of-the-art deep learning models, achieving f1-scores exceeding\n80% and Intersection over Union (IoU) f1-scores surpassing 60% for all four\nexercises evaluated. Notably, the model outperforms the prior study utilizing\nthe sliding window technique, eliminating the need for post-processing stages\nand window size tuning. 
To our knowledge, we are the first to present a novel\nperspective on enhancing Human Activity Recognition (HAR) systems through the\nrecognition of each repetition of activities.\n","authors":["Meng Shang","Lenore Dedeyne","Jolan Dupont","Laura Vercauteren","Nadjia Amini","Laurence Lapauw","Evelien Gielen","Sabine Verschueren","Carolina Varon","Walter De Raedt","Bart Vanrumste"],"pdf_url":"https://arxiv.org/pdf/2402.02910v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04875v1","updated":"2024-02-07T14:16:28Z","published":"2024-02-07T14:16:28Z","title":"On Provable Length and Compositional Generalization","summary":" Length generalization -- the ability to generalize to longer sequences than\nones seen during training, and compositional generalization -- the ability to\ngeneralize to token combinations not seen during training, are crucial forms of\nout-of-distribution generalization in sequence-to-sequence models. In this\nwork, we take the first steps towards provable length and compositional\ngeneralization for a range of architectures, including deep sets, transformers,\nstate space models, and simple recurrent neural nets. Depending on the\narchitecture, we prove different degrees of representation identification,\ne.g., a linear or a permutation relation with ground truth representation, is\nnecessary for length and compositional generalization.\n","authors":["Kartik Ahuja","Amin Mansouri"],"pdf_url":"https://arxiv.org/pdf/2402.04875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08320v3","updated":"2024-02-07T14:13:05Z","published":"2023-10-12T13:33:04Z","title":"Defending Our Privacy With Backdoors","summary":" The proliferation of large AI models trained on uncurated, often sensitive\nweb-scraped data has raised significant privacy concerns. One of the concerns\nis that adversaries can extract information about the training data using\nprivacy attacks. Unfortunately, the task of removing specific information from\nthe models without sacrificing performance is not straightforward and has\nproven to be challenging. We propose a rather easy yet effective defense based\non backdoor attacks to remove private information such as names and faces of\nindividuals from vision-language models by fine-tuning them for only a few\nminutes instead of re-training them from scratch. Specifically, through\nstrategic insertion of backdoors into text encoders, we align the embeddings of\nsensitive phrases with those of neutral terms-\"a person\" instead of the\nperson's actual name. For image encoders, we map embeddings of individuals to\nbe removed from the model to a universal, anonymous embedding. Our empirical\nresults demonstrate the effectiveness of our backdoor-based defense on CLIP by\nassessing its performance using a specialized privacy attack for zero-shot\nclassifiers. 
Our approach provides not only a new \"dual-use\" perspective on\nbackdoor attacks, but also presents a promising avenue to enhance the privacy\nof individuals within models trained on uncurated web-scraped data.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Daniel Neider","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2310.08320v3.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.04869v1","updated":"2024-02-07T14:09:34Z","published":"2024-02-07T14:09:34Z","title":"Learning by Doing: An Online Causal Reinforcement Learning Framework\n with Causal-Aware Policy","summary":" As a key component to intuitive cognition and reasoning solutions in human\nintelligence, causal knowledge provides great potential for reinforcement\nlearning (RL) agents' interpretability towards decision-making by helping\nreduce the searching space. However, there is still a considerable gap in\ndiscovering and incorporating causality into RL, which hinders the rapid\ndevelopment of causal RL. In this paper, we consider explicitly modeling the\ngeneration process of states with the causal graphical model, based on which we\naugment the policy. We formulate the causal structure updating into the RL\ninteraction process with active intervention learning of the environment. To\noptimize the derived objective, we propose a framework with theoretical\nperformance guarantees that alternates between two steps: using interventions\nfor causal structure learning during exploration and using the learned causal\nstructure for policy guidance during exploitation. Due to the lack of public\nbenchmarks that allow direct intervention in the state space, we design the\nroot cause localization task in our simulated fault alarm environment and then\nempirically show the effectiveness and robustness of the proposed method\nagainst state-of-the-art baselines. Theoretical analysis shows that our\nperformance improvement is attributed to the virtuous cycle of causal-guided\npolicy learning and causal structure learning, which aligns with our\nexperimental results.\n","authors":["Ruichu Cai","Siyang Huang","Jie Qiao","Wei Chen","Yan Zeng","Keli Zhang","Fuchun Sun","Yang Yu","Zhifeng Hao"],"pdf_url":"https://arxiv.org/pdf/2402.04869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.14600v3","updated":"2024-02-07T14:04:43Z","published":"2020-12-29T04:18:54Z","title":"A Comprehensive Guide to CAN IDS Data & Introduction of the ROAD Dataset","summary":" Although ubiquitous in modern vehicles, Controller Area Networks (CANs) lack\nbasic security properties and are easily exploitable. A rapidly growing field\nof CAN security research has emerged that seeks to detect intrusions on CANs.\nProducing vehicular CAN data with a variety of intrusions is out of reach for\nmost researchers as it requires expensive assets and expertise. To assist\nresearchers, we present the first comprehensive guide to the existing open CAN\nintrusion datasets, including a quality analysis of each dataset and an\nenumeration of each's benefits, drawbacks, and suggested use case. Current\npublic CAN IDS datasets are limited to real fabrication (simple message\ninjection) attacks and simulated attacks often in synthetic data, which lack\nfidelity. In general, the physical effects of attacks on the vehicle are not\nverified in the available datasets. Only one dataset provides signal-translated\ndata but not a corresponding raw binary version. 
Overall, the available data\npigeon-holes CAN IDS works into testing on limited, often inappropriate data\n(usually with attacks that are too easily detectable to truly test the method),\nand this lack of data has stymied comparability and reproducibility of results. As\nour primary contribution, we present the ROAD (Real ORNL Automotive\nDynamometer) CAN Intrusion Dataset, consisting of over 3.5 hours of one\nvehicle's CAN data. ROAD contains ambient data recorded during a diverse set of\nactivities, and attacks of increasing stealth with multiple variants and\ninstances of real fuzzing, fabrication, and unique advanced attacks, as well as\nsimulated masquerade attacks. To facilitate benchmarking CAN IDS methods that\nrequire signal-translated inputs, we also provide the signal time series format\nfor many of the CAN captures. Our contributions aim to facilitate appropriate\nbenchmarking and needed comparability in the CAN IDS field.\n","authors":["Miki E. Verma","Robert A. Bridges","Michael D. Iannacone","Samuel C. Hollifield","Pablo Moriano","Steven C. Hespeler","Bill Kay","Frank L. Combs"],"pdf_url":"https://arxiv.org/pdf/2012.14600v3.pdf","comment":"title changed and author added from original version"},{"id":"http://arxiv.org/abs/2402.04858v1","updated":"2024-02-07T13:55:27Z","published":"2024-02-07T13:55:27Z","title":"CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay","summary":" Large language models are increasingly solving tasks that are commonly\nbelieved to require human-level reasoning ability. However, these models still\nperform very poorly on benchmarks of general intelligence such as the\nAbstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a\nprogramming-by-examples problem, and introduce a novel and scalable method for\nlanguage model self-improvement called Code Iteration (CodeIt). Our method\niterates between 1) program sampling and hindsight relabeling, and 2) learning\nfrom prioritized experience replay. By relabeling the goal of an episode (i.e.,\nthe target program output given input) to the realized output produced by the\nsampled program, our method effectively deals with the extreme sparsity of\nrewards in program synthesis. Applying CodeIt to the ARC dataset, we\ndemonstrate that prioritized hindsight replay, along with pre-training and\ndata-augmentation, leads to successful inter-task generalization. CodeIt is the\nfirst neuro-symbolic approach that scales to the full ARC evaluation dataset.\nOur method solves 15% of ARC evaluation tasks, achieving state-of-the-art\nperformance and outperforming existing neural and symbolic baselines.\n","authors":["Natasha Butt","Blazej Manczak","Auke Wiggers","Corrado Rainone","David Zhang","Michaël Defferrard","Taco Cohen"],"pdf_url":"https://arxiv.org/pdf/2402.04858v1.pdf","comment":"8 pages, 11 figures"},{"id":"http://arxiv.org/abs/2402.04856v1","updated":"2024-02-07T13:54:38Z","published":"2024-02-07T13:54:38Z","title":"Explaining Learned Reward Functions with Counterfactual Trajectories","summary":" Learning rewards from human behaviour or feedback is a promising approach to\naligning AI systems with human values but fails to consistently extract correct\nreward functions. Interpretability tools could enable users to understand and\nevaluate possible flaws in learned reward functions. 
We propose Counterfactual\nTrajectory Explanations (CTEs) to interpret reward functions in reinforcement\nlearning by contrasting an original with a counterfactual partial trajectory\nand the rewards they each receive. We derive six quality criteria for CTEs and\npropose a novel Monte-Carlo-based algorithm for generating CTEs that optimises\nthese quality criteria. Finally, we measure how informative the generated\nexplanations are to a proxy-human model by training it on CTEs. CTEs are\ndemonstrably informative for the proxy-human model, increasing the similarity\nbetween its predictions and the reward function on unseen trajectories.\nFurther, it learns to accurately judge differences in rewards between\ntrajectories and generalises to out-of-distribution examples. Although CTEs do\nnot lead to a perfect understanding of the reward, our method, and more\ngenerally the adaptation of XAI methods, are presented as a fruitful approach\nfor interpreting learned reward functions.\n","authors":["Jan Wehner","Frans Oliehoek","Luciano Cavalcante Siebert"],"pdf_url":"https://arxiv.org/pdf/2402.04856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04854v1","updated":"2024-02-07T13:54:06Z","published":"2024-02-07T13:54:06Z","title":"Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey","summary":" Research surveys have always posed a challenge for beginner researchers who\nlack research training. These researchers struggle to understand the\ndirections within their research topic and to discover new research\nfindings within a short time. One way to provide intuitive assistance to\nbeginner researchers is by offering relevant knowledge graphs (KG) and\nrecommending related academic papers. However, existing navigation knowledge\ngraphs primarily rely on keywords in the research field and often fail to\npresent the logical hierarchy among multiple related papers clearly. Moreover,\nmost recommendation systems for academic papers simply rely on high text\nsimilarity, which can leave researchers confused as to why a particular article\nis being recommended. They may fail to grasp important information about the\ninsight connection between \"Issue resolved\" and \"Issue finding\" that they hope\nto obtain. To address these issues, this study aims to support research insight\nsurveys for beginner researchers by establishing a hierarchical tree-structured\nknowledge graph that reflects the inheritance insight of research topics and\nthe relevance insight among the academic papers.\n","authors":["Jinghong Li","Huy Phan","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2402.04854v1.pdf","comment":"This paper will be submitted to the '27th International Symposium on\n Methodologies for Intelligent Systems' (ISMIS 2024)"},{"id":"http://arxiv.org/abs/2303.17152v3","updated":"2024-02-07T13:53:38Z","published":"2023-03-30T05:19:43Z","title":"Mixed Autoencoder for Self-supervised Visual Representation Learning","summary":" Masked Autoencoder (MAE) has demonstrated superior performance on various\nvision tasks via randomly masking image patches and reconstruction. However,\neffective data augmentation strategies for MAE still remain open questions,\ndifferent from those in contrastive learning that serve as the most important\npart. This paper studies the prevailing mixing augmentation for MAE. We first\ndemonstrate that naive mixing will in contrast degenerate model performance due\nto the increase of mutual information (MI). 
To address this, we propose homologous\nrecognition, an auxiliary pretext task, not only to alleviate the MI\nincrease by explicitly requiring each patch to recognize homologous\npatches, but also to perform object-aware self-supervised pre-training for\nbetter downstream dense perception performance. With extensive experiments, we\ndemonstrate that our proposed Mixed Autoencoder (MixedAE) achieves\nstate-of-the-art transfer results among masked image modeling (MIM)\naugmentations on different downstream tasks with significant efficiency.\nSpecifically, our MixedAE outperforms MAE by +0.3% accuracy, +1.7 mIoU and +0.9\nAP on ImageNet-1K, ADE20K and COCO respectively with a standard ViT-Base.\nMoreover, MixedAE surpasses iBOT, a strong MIM method combined with instance\ndiscrimination, while accelerating training by 2x. To the best of our knowledge, this\nis the first work to consider mixing for MIM from the perspective of\npretext task design. Code will be made available.\n","authors":["Kai Chen","Zhili Liu","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2303.17152v3.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2402.04852v1","updated":"2024-02-07T13:51:26Z","published":"2024-02-07T13:51:26Z","title":"Multi-Patch Prediction: Adapting LLMs for Time Series Representation\n Learning","summary":" In this study, we present aLLM4TS, an innovative framework that adapts Large\nLanguage Models (LLMs) for time-series representation learning. Central to our\napproach is that we reconceive time-series forecasting as a self-supervised,\nmulti-patch prediction task, which, compared to traditional\nmask-and-reconstruction methods, captures temporal dynamics in patch\nrepresentations more effectively. Our strategy encompasses two-stage training:\n(i) a causal continual pre-training phase on various time-series datasets,\nanchored on next patch prediction, effectively syncing LLM capabilities with\nthe intricacies of time-series data; (ii) fine-tuning for multi-patch\nprediction in the targeted time-series context. A distinctive element of our\nframework is the patch-wise decoding layer, which departs from previous methods\nreliant on sequence-level decoding. Such a design directly transposes\nindividual patches into temporal sequences, thereby significantly bolstering\nthe model's proficiency in mastering temporal patch-based representations.\naLLM4TS demonstrates superior performance in several downstream tasks, proving\nits effectiveness in deriving temporal representations with enhanced\ntransferability and marking a pivotal advancement in the adaptation of LLMs for\ntime-series analysis.\n","authors":["Yuxuan Bian","Xuan Ju","Jiangtong Li","Zhijian Xu","Dawei Cheng","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.04852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15830v2","updated":"2024-02-07T13:50:48Z","published":"2023-10-24T13:33:19Z","title":"Localizing Anomalies in Critical Infrastructure using Model-Based Drift\n Explanations","summary":" Facing climate change, the already limited availability of drinking water\nwill decrease in the future, rendering drinking water an increasingly scarce\nresource. Considerable amounts of it are lost through leakages in water\ntransportation and distribution networks. Thus, anomaly detection and\nlocalization, in particular for leakages, are crucial but challenging tasks due\nto the complex interactions and changing demands in water distribution\nnetworks.
In this work, we analyze the effects of anomalies on the dynamics of\ncritical infrastructure systems by modeling the networks employing Bayesian\nnetworks. We then discuss how the problem is connected to and can be considered\nthrough the lens of concept drift. In particular, we argue that model-based\nexplanations of concept drift are a promising tool for localizing anomalies\ngiven limited information about the network. The methodology is experimentally\nevaluated using realistic benchmark scenarios. To showcase that our methodology\napplies to critical infrastructure more generally, in addition to considering\nleakages and sensor faults in water systems, we demonstrate the suitability of the\nderived technique to localize sensor faults in power systems.\n","authors":["Valerie Vaquet","Fabian Hinder","Jonas Vaquet","Kathrin Lammers","Lars Quakernack","Barbara Hammer"],"pdf_url":"https://arxiv.org/pdf/2310.15830v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15919v3","updated":"2024-02-07T13:46:35Z","published":"2023-03-28T12:20:52Z","title":"Fully Hyperbolic Convolutional Neural Networks for Computer Vision","summary":" Real-world visual data exhibit intrinsic hierarchical structures that can be\nrepresented effectively in hyperbolic spaces. Hyperbolic neural networks (HNNs)\nare a promising approach for learning feature representations in such spaces.\nHowever, current HNNs in computer vision rely on Euclidean backbones and only\nproject features to the hyperbolic space in the task heads, limiting their\nability to fully leverage the benefits of hyperbolic geometry. To address this,\nwe present HCNN, a fully hyperbolic convolutional neural network (CNN) designed\nfor computer vision tasks. Based on the Lorentz model, we generalize\nfundamental components of CNNs and propose novel formulations of the\nconvolutional layer, batch normalization, and multinomial logistic regression.\nExperiments on standard vision tasks demonstrate the promising performance of\nour HCNN framework in both hybrid and fully hyperbolic settings. Overall, we\nbelieve our contributions provide a foundation for developing more powerful\nHNNs that can better represent complex structures found in image data. Our code\nis publicly available at https://github.com/kschwethelm/HyperbolicCV.\n","authors":["Ahmad Bdeir","Kristian Schwethelm","Niels Landwehr"],"pdf_url":"https://arxiv.org/pdf/2303.15919v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04845v1","updated":"2024-02-07T13:44:47Z","published":"2024-02-07T13:44:47Z","title":"AlphaFold Meets Flow Matching for Generating Protein Ensembles","summary":" The biological functions of proteins often depend on dynamic structural\nensembles. In this work, we develop a flow-based generative modeling approach\nfor learning and sampling the conformational landscapes of proteins. We\nrepurpose highly accurate single-state predictors such as AlphaFold and ESMFold\nand fine-tune them under a custom flow matching framework to obtain\nsequence-conditioned generative models of protein structure called AlphaFlow and\nESMFlow. When trained and evaluated on the PDB, our method provides a superior\ncombination of precision and diversity compared to AlphaFold with MSA\nsubsampling. When further trained on ensembles from all-atom MD, our method\naccurately captures conformational flexibility, positional distributions, and\nhigher-order ensemble observables for unseen proteins.
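The fine-tuning objective named in the AlphaFlow abstract above is flow matching. Below is a minimal, self-contained sketch of that objective on toy 2-D points rather than protein structures; the architecture, data, and hyperparameters are invented for illustration, not taken from the paper.

```python
import torch

# Minimal conditional flow-matching loss: v_theta learns a velocity field
# transporting noise x0 to "data" x1 along straight interpolation paths.
net = torch.nn.Sequential(torch.nn.Linear(3, 64), torch.nn.SiLU(),
                          torch.nn.Linear(64, 2))
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

for step in range(200):
    x1 = torch.randn(128, 2) * 0.1 + torch.tensor([1.0, -1.0])  # toy "data"
    x0 = torch.randn(128, 2)                                    # noise
    t = torch.rand(128, 1)
    xt = (1 - t) * x0 + t * x1        # point on the straight path at time t
    target_v = x1 - x0                # the path's constant velocity
    pred_v = net(torch.cat([xt, t], dim=1))
    loss = ((pred_v - target_v) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
```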
Moreover, our method can\ndiversify a static PDB structure with faster wall-clock convergence to certain\nequilibrium properties than replicate MD trajectories, demonstrating its\npotential as a proxy for expensive physics-based simulations. Code is available\nat https://github.com/bjing2016/alphaflow.\n","authors":["Bowen Jing","Bonnie Berger","Tommi Jaakkola"],"pdf_url":"https://arxiv.org/pdf/2402.04845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14455v2","updated":"2024-02-07T13:36:21Z","published":"2023-11-24T13:09:34Z","title":"Universal Jailbreak Backdoors from Poisoned Human Feedback","summary":" Reinforcement Learning from Human Feedback (RLHF) is used to align large\nlanguage models to produce helpful and harmless responses. Yet, prior work\nshowed these models can be jailbroken by finding adversarial prompts that\nrevert the model to its unaligned behavior. In this paper, we consider a new\nthreat where an attacker poisons the RLHF training data to embed a \"jailbreak\nbackdoor\" into the model. The backdoor embeds a trigger word into the model\nthat acts like a universal \"sudo command\": adding the trigger word to any\nprompt enables harmful responses without the need to search for an adversarial\nprompt. Universal jailbreak backdoors are much more powerful than previously\nstudied backdoors on language models, and we find they are significantly harder\nto plant using common backdoor attack techniques. We investigate the design\ndecisions in RLHF that contribute to its purported robustness, and release a\nbenchmark of poisoned models to stimulate future research on universal\njailbreak backdoors.\n","authors":["Javier Rando","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2311.14455v2.pdf","comment":"Accepted as conference paper in ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04836v1","updated":"2024-02-07T13:32:53Z","published":"2024-02-07T13:32:53Z","title":"On the Completeness of Invariant Geometric Deep Learning Models","summary":" Invariant models, one important class of geometric deep learning models, are\ncapable of generating meaningful geometric representations by leveraging\ninformative geometric features. These models are characterized by their\nsimplicity, good experimental results and computational efficiency. However,\ntheir theoretical expressive power still remains unclear, restricting a deeper\nunderstanding of the potential of such models. In this work, we concentrate on\ncharacterizing the theoretical expressiveness of invariant models. We first\nrigorously bound the expressiveness of the most classical invariant model,\nVanilla DisGNN (message passing neural networks incorporating distance),\nrestricting its unidentifiable cases to be only those highly symmetric\ngeometric graphs. To break these corner cases' symmetry, we introduce a simple\nyet E(3)-complete invariant design by nesting Vanilla DisGNN, named GeoNGNN.\nLeveraging GeoNGNN as a theoretical tool, we for the first time prove the\nE(3)-completeness of three well-established geometric models: DimeNet, GemNet\nand SphereNet. Our results fill the gap in the theoretical power of invariant\nmodels, contributing to a rigorous and comprehensive understanding of their\ncapabilities. Experimentally, GeoNGNN exhibits good inductive bias in capturing\nlocal environments, and achieves competitive results w.r.t. 
complicated models\nrelying on high-order invariant/equivariant representations while exhibiting\nsignificantly faster computational speed.\n","authors":["Zian Li","Xiyuan Wang","Shijia Kang","Muhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.04836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04835v1","updated":"2024-02-07T13:32:47Z","published":"2024-02-07T13:32:47Z","title":"SARI: Simplistic Average and Robust Identification based Noisy Partial\n Label Learning","summary":" Partial label learning (PLL) is a weakly-supervised learning paradigm where\neach training instance is paired with a set of candidate labels (partial\nlabel), one of which is the true label. Noisy PLL (NPLL) relaxes this\nconstraint by allowing some partial labels to not contain the true label,\nenhancing the practicality of the problem. Our work centers on NPLL and\npresents a minimalistic framework called SARI that initially assigns\npseudo-labels to images by exploiting the noisy partial labels through a\nweighted nearest neighbour algorithm. These pseudo-label and image pairs are\nthen used to train a deep neural network classifier with label smoothing and\nstandard regularization techniques. The classifier's features and predictions\nare subsequently employed to refine and enhance the accuracy of pseudo-labels.\nSARI combines the strengths of Average Based Strategies (in pseudo labelling)\nand Identification Based Strategies (in classifier training) from the\nliterature. We perform thorough experiments on seven datasets and compare SARI\nagainst nine NPLL and PLL methods from the prior art. SARI achieves\nstate-of-the-art results in almost all studied settings, obtaining substantial\ngains in fine-grained classification and extreme noise settings.\n","authors":["Darshana Saravanan","Naresh Manwani","Vineet Gandhi"],"pdf_url":"https://arxiv.org/pdf/2402.04835v1.pdf","comment":"13 pages, 6 tables, 2 figures"},{"id":"http://arxiv.org/abs/2312.01538v2","updated":"2024-02-07T13:29:29Z","published":"2023-12-03T23:36:16Z","title":"Recurrent Distance Filtering for Graph Representation Learning","summary":" Graph neural networks based on iterative one-hop message passing have been\nshown to struggle in harnessing the information from distant nodes effectively.\nConversely, graph transformers allow each node to attend to all other nodes\ndirectly, but lack graph inductive bias and have to rely on ad-hoc positional\nencoding. In this paper, we propose a new architecture to reconcile these\nchallenges. Our approach stems from the recent breakthroughs in long-range\nmodeling provided by deep state-space models on sequential data: for a given\ntarget node, our model aggregates other nodes by their shortest distances to\nthe target and uses a linear RNN to encode the sequence of hop representations.\nThe linear RNN is parameterized in a particular diagonal form for stable\nlong-range signal propagation and is theoretically expressive enough to encode\nthe neighborhood hierarchy. 
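A rough sketch of the hop-wise aggregation idea from the recurrent-distance-filtering abstract above: group nodes by shortest-path distance to a target node, then encode the resulting hop sequence with an RNN. A vanilla `torch.nn.RNN` stands in here for the paper's diagonal linear RNN, and the graph and features are toy stand-ins.

```python
import torch
from collections import deque

def hops_from(adj, src):
    """BFS shortest-path hop distance from src; -1 for unreachable nodes."""
    n = len(adj); dist = [-1] * n; dist[src] = 0; q = deque([src])
    while q:
        u = q.popleft()
        for v in range(n):
            if adj[u][v] and dist[v] == -1:
                dist[v] = dist[u] + 1; q.append(v)
    return dist

adj = [[0, 1, 0, 0], [1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0]]  # path 0-1-2-3
feats = torch.eye(4)                 # one-hot node features
rnn = torch.nn.RNN(input_size=4, hidden_size=8, batch_first=True)

dist = hops_from(adj, src=0)
max_hop = max(dist)
# Mean-aggregate features within each hop ring, then encode the hop sequence.
hop_seq = torch.stack([feats[[v for v in range(4) if dist[v] == h]].mean(0)
                       for h in range(max_hop + 1)])
_, target_repr = rnn(hop_seq.unsqueeze(0))   # final state: node 0's representation
print(target_repr.shape)
```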
With no need for positional encoding, we\nempirically show that the performance of our model is highly competitive\ncompared with that of state-of-the-art graph transformers on various\nbenchmarks, with a significantly reduced computational cost.\n","authors":["Yuhui Ding","Antonio Orvieto","Bobby He","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2312.01538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10396v2","updated":"2024-02-07T13:27:04Z","published":"2023-12-16T09:49:31Z","title":"How Far Can Fairness Constraints Help Recover From Biased Data?","summary":" A general belief in fair classification is that fairness constraints incur a\ntrade-off with accuracy, which biased data may worsen. Contrary to this belief,\nBlum & Stangl (2019) show that fair classification with equal opportunity\nconstraints even on extremely biased data can recover optimally accurate and\nfair classifiers on the original data distribution. Their result is interesting\nbecause it demonstrates that fairness constraints can implicitly rectify data\nbias and simultaneously overcome a perceived fairness-accuracy trade-off. Their\ndata bias model simulates under-representation and label bias in\nunderprivileged populations, and they show the above result on a stylized data\ndistribution with i.i.d. label noise, under simple conditions on the data\ndistribution and bias parameters. We propose a general approach to extend the\nresult of Blum & Stangl (2019) to different fairness constraints, data bias\nmodels, data distributions, and hypothesis classes. We strengthen their result,\nand extend it to the case when their stylized distribution has labels with\nMassart noise instead of i.i.d. noise. We prove a similar recovery result for\narbitrary data distributions using fair reject option classifiers. We further\ngeneralize it to arbitrary data distributions and arbitrary hypothesis classes,\ni.e., we prove that for any data distribution, if the optimally accurate\nclassifier in a given hypothesis class is fair and robust, then it can be\nrecovered through fair classification with equal opportunity constraints on the\nbiased distribution whenever the bias parameters satisfy certain simple\nconditions. Finally, we show applications of our technique to time-varying data\nbias in classification and fair machine learning pipelines.\n","authors":["Mohit Sharma","Amit Deshpande"],"pdf_url":"https://arxiv.org/pdf/2312.10396v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04830v1","updated":"2024-02-07T13:26:10Z","published":"2024-02-07T13:26:10Z","title":"Closing the Gap Between SGP4 and High-Precision Propagation via\n Differentiable Programming","summary":" The Simplified General Perturbations 4 (SGP4) orbital propagation method is\nwidely used for predicting the positions and velocities of Earth-orbiting\nobjects rapidly and reliably. Despite continuous refinement, SGP models still\nlack the precision of numerical propagators, which offer significantly smaller\nerrors. This study presents dSGP4, a novel differentiable version of SGP4\nimplemented using PyTorch. By making SGP4 differentiable, dSGP4 facilitates\nvarious space-related applications, including spacecraft orbit determination,\nstate conversion, covariance transformation, state transition matrix\ncomputation, and covariance propagation.
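To make the "differentiable propagator" idea above concrete, here is a heavily simplified stand-in: a circular two-body model, nothing like the real SGP4 force model, showing how autograd exposes gradients of a propagated position with respect to the orbital state, which is what enables the gradient-based applications just listed.

```python
import torch

MU = 398600.4418  # Earth's gravitational parameter [km^3 / s^2]

def propagate(a, theta0, t):
    """Toy circular-orbit propagator (illustrative, not SGP4)."""
    n = torch.sqrt(MU / a**3)            # mean motion [rad/s]
    theta = theta0 + n * t
    return torch.stack([a * torch.cos(theta), a * torch.sin(theta)])

a = torch.tensor(7000.0, requires_grad=True)       # semi-major axis [km]
theta0 = torch.tensor(0.3, requires_grad=True)     # initial phase [rad]
pos = propagate(a, theta0, t=torch.tensor(600.0))  # state 10 minutes ahead
pos[0].backward()          # gradient of the x-position w.r.t. orbital elements
print(a.grad, theta0.grad)
```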
Additionally, dSGP4's PyTorch\nimplementation allows for embarrassingly parallel orbital propagation across\nbatches of Two-Line Element Sets (TLEs), leveraging the computational power of\nCPUs, GPUs, and advanced hardware for distributed prediction of satellite\npositions at future times. Furthermore, dSGP4's differentiability enables\nintegration with modern machine learning techniques. Thus, we propose a novel\norbital propagation paradigm, ML-dSGP4, where neural networks are integrated\ninto the orbital propagator. Through stochastic gradient descent, this combined\nmodel's inputs, outputs, and parameters can be iteratively refined, surpassing\nSGP4's precision. Neural networks act as identity operators by default,\nadhering to SGP4's behavior. However, dSGP4's differentiability allows\nfine-tuning with ephemeris data, enhancing precision while maintaining\ncomputational speed. This empowers satellite operators and researchers to train\nthe model using specific ephemeris or high-precision numerical propagation\ndata, significantly advancing orbital prediction capabilities.\n","authors":["Giacomo Acciarini","Atılım Güneş Baydin","Dario Izzo"],"pdf_url":"https://arxiv.org/pdf/2402.04830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04825v1","updated":"2024-02-07T13:23:25Z","published":"2024-02-07T13:23:25Z","title":"Fast Timing-Conditioned Latent Audio Diffusion","summary":" Generating long-form 44.1kHz stereo audio from text prompts can be\ncomputationally demanding. Further, most previous works do not address the fact that\nmusic and sound effects naturally vary in duration. Our research focuses\non the efficient generation of long-form, variable-length stereo music and\nsounds at 44.1kHz using text prompts with a generative model. Stable Audio is\nbased on latent diffusion, with its latent defined by a fully-convolutional\nvariational autoencoder. It is conditioned on text prompts as well as timing\nembeddings, allowing for fine control over both the content and length of the\ngenerated music and sounds. Stable Audio is capable of rendering stereo signals\nof up to 95 sec at 44.1kHz in 8 sec on an A100 GPU. Despite its compute\nefficiency and fast inference, it is among the best on two public\ntext-to-music and -audio benchmarks and, unlike state-of-the-art\nmodels, can generate music with structure and stereo sounds.\n","authors":["Zach Evans","CJ Carr","Josiah Taylor","Scott H. Hawley","Jordi Pons"],"pdf_url":"https://arxiv.org/pdf/2402.04825v1.pdf","comment":"Code: https://github.com/Stability-AI/stable-audio-tools. Metrics:\n https://github.com/Stability-AI/stable-audio-metrics. Demo:\n https://stability-ai.github.io/stable-audio-demo"},{"id":"http://arxiv.org/abs/2011.08388v2","updated":"2024-02-07T13:23:08Z","published":"2020-11-17T02:55:16Z","title":"Domain Adaptation based Interpretable Image Emotion Recognition using\n Facial Expression Recognition","summary":" A domain adaptation technique has been proposed in this paper to identify the\nemotions in generic images containing facial and non-facial objects and non-human\ncomponents. It addresses the challenge of the insufficient availability of\npre-trained models and well-annotated datasets for image emotion recognition\n(IER). It starts with proposing a facial emotion recognition (FER) system and\nthen moves on to adapting it for image emotion recognition. First, a\ndeep-learning-based FER system has been proposed that classifies a given facial\nimage into discrete emotion classes.
Further, an image emotion recognition system has\nbeen proposed that adapts the proposed FER system to recognize the emotions\nportrayed by images using domain adaptation. It classifies the generic images\ninto 'happy,' 'sad,' 'hate,' and 'anger' classes. A novel interpretability\napproach, Divide and Conquer based Shap (DnCShap), has also been proposed to\ninterpret the highly relevant visual features for emotion recognition. The\nproposed system's architecture was determined through ablation studies, and\nthe experiments are conducted on four FER and four IER datasets. The proposed\nIER system has shown an emotion classification accuracy of 59.61% for the IAPSa\ndataset, 57.83% for the ArtPhoto dataset, 67.93% for the FI dataset, and 55.13%\nfor the EMOTIC dataset. The important visual features leading to a particular\nemotion class have been identified, and the embedding plots for various emotion\nclasses have been analyzed to explain the proposed system's predictions.\n","authors":["Puneet Kumar","Balasubramanian Raman"],"pdf_url":"https://arxiv.org/pdf/2011.08388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04823v1","updated":"2024-02-07T13:22:05Z","published":"2024-02-07T13:22:05Z","title":"How Realistic Is Your Synthetic Data? Constraining Deep Generative\n Models for Tabular Data","summary":" Deep Generative Models (DGMs) have been shown to be powerful tools for\ngenerating tabular data, as they have been increasingly able to capture the\ncomplex distributions that characterize them. However, to generate realistic\nsynthetic data, it is often not enough to have a good approximation of their\ndistribution, as it also requires compliance with constraints that encode\nessential background knowledge on the problem at hand. In this paper, we\naddress this limitation and show how DGMs for tabular data can be transformed\ninto Constrained Deep Generative Models (C-DGMs), whose generated samples are\nguaranteed to be compliant with the given constraints. This is achieved by\nautomatically parsing the constraints and transforming them into a Constraint\nLayer (CL) seamlessly integrated with the DGM. Our extensive experimental\nanalysis with various DGMs and tasks reveals that standard DGMs often violate\nconstraints, some exceeding $95\\%$ non-compliance, while their corresponding\nC-DGMs are never non-compliant. Then, we quantitatively demonstrate that, at\ntraining time, C-DGMs are able to exploit the background knowledge expressed by\nthe constraints to outperform their standard counterparts with up to $6.5\\%$\nimprovement in utility and detection. Further, we show how our CL does not\nnecessarily need to be integrated at training time, as it can be also used as a\nguardrail at inference time, still producing some improvements in the overall\nperformance of the models. Finally, we show that our CL does not hinder the\nsample generation time of the models.\n","authors":["Mihaela Cătălina Stoian","Salijona Dyrmishi","Maxime Cordy","Thomas Lukasiewicz","Eleonora Giunchiglia"],"pdf_url":"https://arxiv.org/pdf/2402.04823v1.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2402.04821v1","updated":"2024-02-07T13:21:41Z","published":"2024-02-07T13:21:41Z","title":"E(3)-Equivariant Mesh Neural Networks","summary":" Triangular meshes are widely used to represent three-dimensional objects. As\na result, many recent works have addressed the need for geometric deep learning\non 3D meshes.
However, we observe that the complexities in many of these\narchitectures do not translate to practical performance, and simple deep\nmodels for geometric graphs are competitive in practice. Motivated by this\nobservation, we minimally extend the update equations of E(n)-Equivariant Graph\nNeural Networks (EGNNs) (Satorras et al., 2021) to incorporate mesh face\ninformation, and further improve them to account for long-range interactions\nthrough a hierarchy. The resulting architecture, Equivariant Mesh Neural Network\n(EMNN), outperforms other, more complicated equivariant methods on mesh tasks,\nwith a fast run-time and no expensive pre-processing.\n","authors":["Thuan Trang","Nhat Khang Ngo","Daniel Levy","Thieu N. Vo","Siamak Ravanbakhsh","Truong Son Hy"],"pdf_url":"https://arxiv.org/pdf/2402.04821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16120v3","updated":"2024-02-07T13:19:29Z","published":"2023-07-30T03:59:47Z","title":"Deep Unrolling Networks with Recurrent Momentum Acceleration for\n Nonlinear Inverse Problems","summary":" Combining the strengths of model-based iterative algorithms and data-driven\ndeep learning solutions, deep unrolling networks (DuNets) have become a popular\ntool to solve inverse imaging problems. While DuNets have been successfully\napplied to many linear inverse problems, nonlinear problems tend to impair the\nperformance of the method. Inspired by momentum acceleration techniques that\nare often used in optimization algorithms, we propose a recurrent momentum\nacceleration (RMA) framework that uses a long short-term memory recurrent\nneural network (LSTM-RNN) to simulate the momentum acceleration process. The\nRMA module leverages the ability of the LSTM-RNN to learn and retain knowledge\nfrom the previous gradients. We apply RMA to two popular DuNets -- the learned\nproximal gradient descent (LPGD) and the learned primal-dual (LPD) methods,\nresulting in LPGD-RMA and LPD-RMA respectively. We provide experimental results\non two nonlinear inverse problems: a nonlinear deconvolution problem, and an\nelectrical impedance tomography problem with limited boundary measurements. In\nthe first experiment, we observed that the improvement due to RMA grows\nwith the nonlinearity of the problem. The results of the\nsecond example further demonstrate that the RMA schemes can significantly\nimprove the performance of DuNets in strongly ill-posed problems.\n","authors":["Qingping Zhou","Jiayu Qian","Junqi Tang","Jinglai Li"],"pdf_url":"https://arxiv.org/pdf/2307.16120v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2002.00178v3","updated":"2024-02-07T13:17:55Z","published":"2020-02-01T09:48:51Z","title":"An Equivalence between Bayesian Priors and Penalties in Variational\n Inference","summary":" In machine learning, it is common to optimize the parameters of a\nprobabilistic model, modulated by an ad hoc regularization term that penalizes\nsome values of the parameters. Regularization terms appear naturally in\nVariational Inference, a tractable way to approximate Bayesian posteriors: the\nloss to optimize contains a Kullback--Leibler divergence term between the\napproximate posterior and a Bayesian prior. We fully characterize the\nregularizers that can arise according to this procedure, and provide a\nsystematic way to compute the prior corresponding to a given penalty.
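The prior-penalty correspondence described in the abstract above is easy to see in the simplest case: with a standard normal prior and a fixed-width Gaussian posterior, the KL term in the variational loss reduces, up to a constant, to an L2 weight penalty. A small numerical check under those assumptions (mean-field Gaussians, fixed posterior width):

```python
import numpy as np

def kl_gauss_std_normal(mu, sigma):
    # KL( N(mu, sigma^2) || N(0, 1) ), per parameter
    return 0.5 * (sigma**2 + mu**2 - 1.0) - np.log(sigma)

mu = np.linspace(-3.0, 3.0, 7)
sigma = 0.1                                   # fixed posterior width
kl = kl_gauss_std_normal(mu, sigma)
# Up to a mu-independent constant, the KL is 0.5 * mu^2: weight decay.
print(kl - kl_gauss_std_normal(0.0, sigma))   # equals 0.5 * mu**2
print(0.5 * mu**2)
```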
Such a\ncharacterization can be used to discover constraints over the penalty function,\nso that the overall procedure remains Bayesian.\n","authors":["Pierre Wolinski","Guillaume Charpiat","Yann Ollivier"],"pdf_url":"https://arxiv.org/pdf/2002.00178v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04814v1","updated":"2024-02-07T13:04:35Z","published":"2024-02-07T13:04:35Z","title":"BOWLL: A Deceptively Simple Open World Lifelong Learner","summary":" The quest to improve scalar performance numbers on predetermined benchmarks\nseems to be deeply engraved in deep learning. However, the real world is seldom\ncarefully curated and applications are seldom limited to excelling on test\nsets. A practical system is generally required to recognize novel concepts,\nrefrain from actively including uninformative data, and retain previously\nacquired knowledge throughout its lifetime. Despite these key elements being\nrigorously researched individually, the study of their conjunction, open world\nlifelong learning, is only a recent trend. To accelerate this multifaceted\nfield's exploration, we introduce its first monolithic and much-needed\nbaseline. Leveraging the ubiquitous use of batch normalization across deep\nneural networks, we propose a deceptively simple yet highly effective way to\nrepurpose standard models for open world lifelong learning. Through extensive\nempirical evaluation, we highlight why our approach should serve as a future\nstandard for models that are able to effectively maintain their knowledge,\nselectively focus on informative data, and accelerate future learning.\n","authors":["Roshni Kamath","Rupert Mitchell","Subarnaduti Paul","Kristian Kersting","Martin Mundt"],"pdf_url":"https://arxiv.org/pdf/2402.04814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00720v2","updated":"2024-02-07T13:01:24Z","published":"2022-11-16T00:11:04Z","title":"A Stable, Fast, and Fully Automatic Learning Algorithm for Predictive\n Coding Networks","summary":" Predictive coding networks are neuroscience-inspired models with roots in\nboth Bayesian statistics and neuroscience. Training such models, however, is\nquite inefficient and unstable. In this work, we show how simply changing\nthe temporal scheduling of the update rule for the synaptic weights leads to an\nalgorithm that is much more efficient and stable than the original one, and has\ntheoretical guarantees in terms of convergence. The proposed algorithm, which we\ncall incremental predictive coding (iPC), is also more biologically plausible\nthan the original one, as it is fully automatic. In an extensive set of\nexperiments, we show that iPC consistently performs better than the original\nformulation on a large number of benchmarks for image classification, as well\nas for the training of both conditional and masked language models, in terms of\ntest accuracy, efficiency, and convergence with respect to a large set of\nhyperparameters.\n","authors":["Tommaso Salvatori","Yuhang Song","Yordan Yordanov","Beren Millidge","Zhenghua Xu","Lei Sha","Cornelius Emde","Rafal Bogacz","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2212.00720v2.pdf","comment":"Title and abstract changed to reflect the version accepted\n for publication.
One co-author added, who performed the additional\n experiments"},{"id":"http://arxiv.org/abs/2402.04794v1","updated":"2024-02-07T12:35:31Z","published":"2024-02-07T12:35:31Z","title":"Scalable Multi-view Clustering via Explicit Kernel Features Maps","summary":" A growing awareness of multi-view learning as an important component in data\nscience and machine learning is a consequence of the increasing prevalence of\nmultiple views in real-world applications, especially in the context of\nnetworks. In this paper, we introduce a new scalability framework for multi-view\nsubspace clustering. An efficient optimization strategy is proposed, leveraging\nkernel feature maps to reduce the computational burden while maintaining good\nclustering performance. The scalability of the algorithm means that it can be\napplied to large-scale datasets, including those with millions of data points,\nusing a standard machine, in a few minutes. We conduct extensive experiments on\nreal-world benchmark networks of various sizes in order to evaluate the\nperformance of our algorithm against state-of-the-art multi-view subspace\nclustering methods and attributed-network multi-view approaches.\n","authors":["Chakib Fettal","Lazhar Labiod","Mohamed Nadif"],"pdf_url":"https://arxiv.org/pdf/2402.04794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01300v4","updated":"2024-02-07T12:20:13Z","published":"2023-04-03T18:52:01Z","title":"On Mitigating the Utility-Loss in Differentially Private Learning: A new\n Perspective by a Geometrically Inspired Kernel Approach","summary":" The privacy-utility tradeoff remains one of the fundamental issues of\ndifferentially private machine learning. This paper introduces a geometrically\ninspired kernel-based approach to mitigate the accuracy-loss issue in\nclassification. In this approach, a representation of the affine hull of given\ndata points is learned in Reproducing Kernel Hilbert Spaces (RKHS). This leads\nto a novel distance measure that hides privacy-sensitive information about\nindividual data points and improves the privacy-utility tradeoff via\nsignificantly reducing the risk of membership inference attacks. The\neffectiveness of the approach is demonstrated through experiments on the MNIST\ndataset, the Freiburg groceries dataset, and a real biomedical dataset. It is\nverified that the approach remains computationally practical. The application\nof the approach to federated learning is considered and it is observed that the\naccuracy-loss due to data being distributed is either marginal or not\nsignificantly high.\n","authors":["Mohit Kumar","Bernhard A. Moser","Lukas Fischer"],"pdf_url":"https://arxiv.org/pdf/2304.01300v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04785v1","updated":"2024-02-07T12:15:56Z","published":"2024-02-07T12:15:56Z","title":"Shadowheart SGD: Distributed Asynchronous SGD with Optimal Time\n Complexity Under Arbitrary Computation and Communication Heterogeneity","summary":" We consider nonconvex stochastic optimization problems in the asynchronous\ncentralized distributed setup where the communication times from workers to a\nserver cannot be ignored, and the computation and communication times are\npotentially different for all workers. Using an unbiased compression\ntechnique, we develop a new method, Shadowheart SGD, that provably improves the\ntime complexities of all previous centralized methods.
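As a concrete example of the kind of unbiased compressor the Shadowheart SGD abstract above refers to, a rand-k sparsifier keeps k random coordinates and rescales them so the compressed vector is unbiased in expectation. This is a generic construction for illustration, not necessarily the compressor used in the paper.

```python
import numpy as np

def rand_k(x, k, rng):
    """Unbiased rand-k sparsifier: keep k random coordinates, rescaled by
    d/k so that E[rand_k(x)] = x."""
    d = x.size
    out = np.zeros_like(x)
    idx = rng.choice(d, size=k, replace=False)
    out[idx] = x[idx] * (d / k)
    return out

rng = np.random.default_rng(0)
x = rng.normal(size=1000)
est = np.mean([rand_k(x, 100, rng) for _ in range(5000)], axis=0)
print(np.abs(est - x).max())   # small: the estimator is unbiased
```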
Moreover, we show that\nthe time complexity of Shadowheart SGD is optimal in the family of centralized\nmethods with compressed communication. We also consider the bidirectional\nsetup, where broadcasting from the server to the workers is non-negligible, and\ndevelop a corresponding method.\n","authors":["Alexander Tyurin","Marta Pozzi","Ivan Ilin","Peter Richtárik"],"pdf_url":"https://arxiv.org/pdf/2402.04785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10980v3","updated":"2024-02-07T12:12:40Z","published":"2023-09-20T00:42:08Z","title":"Adaptive Multi-Agent Deep Reinforcement Learning for Timely Healthcare\n Interventions","summary":" Effective patient monitoring is vital for timely interventions and improved\nhealthcare outcomes. Traditional monitoring systems often struggle to handle\ncomplex, dynamic environments with fluctuating vital signs, leading to delays\nin identifying critical conditions. To address this challenge, we propose a\nnovel AI-driven patient monitoring framework using multi-agent deep\nreinforcement learning (DRL). Our approach deploys multiple learning agents,\neach dedicated to monitoring a specific physiological feature, such as heart\nrate, respiration, and temperature. These agents interact with a generic\nhealthcare monitoring environment, learn the patients' behaviour patterns, and\nmake informed decisions to alert the corresponding Medical Emergency Teams\n(METs) based on the estimated level of emergency. In this study, we evaluate\nthe performance of the proposed multi-agent DRL framework using real-world\nphysiological and motion data from two datasets: PPG-DaLiA and WESAD. We\ncompare the results with several baseline models, including Q-Learning, PPO,\nActor-Critic, Double DQN, and DDPG, as well as monitoring frameworks like\nWISEML and CA-MAQL. Our experiments demonstrate that the proposed DRL approach\noutperforms all other baseline models, achieving more accurate monitoring of\npatients' vital signs. Furthermore, we conduct hyperparameter optimization to\nfine-tune the learning process of each agent. By optimizing hyperparameters, we\nenhance the learning rate and discount factor, thereby improving the agents'\noverall performance in monitoring patient health status.\n","authors":["Thanveer Shaik","Xiaohui Tao","Lin Li","Haoran Xie","Hong-Ning Dai","Jianming Yong"],"pdf_url":"https://arxiv.org/pdf/2309.10980v3.pdf","comment":"This work has been submitted to Elsevier for possible\n publication. Copyright may be transferred without notice, after which this\n version may no longer be accessible. arXiv admin note: text overlap with\n arXiv:2309.10576"},{"id":"http://arxiv.org/abs/2402.04783v1","updated":"2024-02-07T12:06:52Z","published":"2024-02-07T12:06:52Z","title":"Analyzing the Neural Tangent Kernel of Periodically Activated Coordinate\n Networks","summary":" Recently, neural networks utilizing periodic activation functions have been\nproven to demonstrate superior performance in vision tasks compared to\ntraditional ReLU-activated networks. However, there is still a limited\nunderstanding of the underlying reasons for this improved performance. In this\npaper, we aim to address this gap by providing a theoretical understanding of\nperiodically activated networks through an analysis of their Neural Tangent\nKernel (NTK).
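A minimal way to probe the object analyzed above is to form the empirical NTK Gram matrix of a small sine-activated network from per-sample parameter gradients. The widths, data, and architecture below are toy choices for a numerical illustration; this is not the paper's finite-width analysis.

```python
import torch

torch.manual_seed(0)

class SineNet(torch.nn.Module):
    def __init__(self, width=64):
        super().__init__()
        self.l1 = torch.nn.Linear(1, width)
        self.l2 = torch.nn.Linear(width, 1)
    def forward(self, x):
        return self.l2(torch.sin(self.l1(x)))   # periodic activation

def grad_vector(model, x):
    """Flattened gradient of the scalar output w.r.t. all parameters."""
    y = model(x).squeeze()
    grads = torch.autograd.grad(y, list(model.parameters()))
    return torch.cat([g.reshape(-1) for g in grads])

model = SineNet()
xs = torch.linspace(-1.0, 1.0, 8).unsqueeze(1)
G = torch.stack([grad_vector(model, x.unsqueeze(0)) for x in xs])
ntk = G @ G.T                          # empirical NTK Gram matrix
print(torch.linalg.eigvalsh(ntk).min())  # the minimum eigenvalue studied above
```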
We derive bounds on the minimum eigenvalue of their NTK in the\nfinite width setting, using a fairly general network architecture which\nrequires only one wide layer that grows at least linearly with the number of\ndata samples. Our findings indicate that periodically activated networks are\n\\textit{notably more well-behaved}, from the NTK perspective, than ReLU\nactivated networks. Additionally, we give an application to the memorization\ncapacity of such networks and verify our theoretical predictions empirically.\nOur study offers a deeper understanding of the properties of periodically\nactivated neural networks and their potential in the field of deep learning.\n","authors":["Hemanth Saratchandran","Shin-Fang Chng","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2402.04783v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2402.02711"},{"id":"http://arxiv.org/abs/2402.04777v1","updated":"2024-02-07T11:56:34Z","published":"2024-02-07T11:56:34Z","title":"A fast score-based search algorithm for maximal ancestral graphs using\n entropy","summary":" \\emph{Maximal ancestral graphs} (MAGs) are a class of graphical models that\nextend the famous \\emph{directed acyclic graph} in the presence of latent\nconfounders. Most score-based approaches to learn the unknown MAG from\nempirical data rely on the BIC score, which suffers from instability and heavy\ncomputation. We propose to use the framework of imsets\n\\citep{studeny2006probabilistic} to score MAGs using empirical entropy\nestimation and the newly proposed \\emph{refined Markov property}\n\\citep{hu2023towards}. Our graphical search procedure is similar to\n\\citet{claassen2022greedy} but improved using our theoretical results. We show\nthat our search algorithm is polynomial in the number of nodes by restricting the\ndegree, maximal head size and number of discriminating paths. In simulated\nexperiments, our algorithm shows superior performance compared to other\nstate-of-the-art MAG learning algorithms.\n","authors":["Zhongyi Hu","Robin Evans"],"pdf_url":"https://arxiv.org/pdf/2402.04777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15753v2","updated":"2024-02-07T11:47:38Z","published":"2024-01-28T20:30:14Z","title":"An objective comparison of methods for augmented reality in laparoscopic\n liver resection by preoperative-to-intraoperative image fusion","summary":" Augmented reality for laparoscopic liver resection is a visualisation mode\nthat allows a surgeon to localise tumours and vessels embedded within the liver\nby projecting them on top of a laparoscopic image. Preoperative 3D models\nextracted from CT or MRI data are registered to the intraoperative laparoscopic\nimages during this process. In terms of 3D-2D fusion, most of the algorithms\nmake use of anatomical landmarks to guide registration. These landmarks include\nthe liver's inferior ridge, the falciform ligament, and the occluding contours.\nThey are usually marked by hand in both the laparoscopic image and the 3D\nmodel, which is time-consuming and may contain errors if done by an\ninexperienced user. Therefore, there is a need to automate this process so\nthat augmented reality can be used effectively in the operating room. We\npresent the Preoperative-to-Intraoperative Laparoscopic Fusion Challenge\n(P2ILF), held during the Medical Imaging and Computer Assisted Interventions\n(MICCAI 2022) conference, which investigates the possibilities of detecting\nthese landmarks automatically and using them in registration.
The challenge was\ndivided into two tasks: 1) A 2D and 3D landmark detection task and 2) a 3D-2D\nregistration task. The teams were provided with training data consisting of 167\nlaparoscopic images and 9 preoperative 3D models from 9 patients, with the\ncorresponding 2D and 3D landmark annotations. A total of 6 teams from 4\ncountries participated, whose proposed methods were evaluated on 16 images and\ntwo preoperative 3D models from two patients. All the teams proposed deep\nlearning-based methods for the 2D and 3D landmark segmentation tasks and\ndifferentiable rendering-based methods for the registration task. Based on the\nexperimental outcomes, we propose three key hypotheses that determine current\nlimitations and future directions for research in this domain.\n","authors":["Sharib Ali","Yamid Espinel","Yueming Jin","Peng Liu","Bianca Güttner","Xukun Zhang","Lihua Zhang","Tom Dowrick","Matthew J. Clarkson","Shiting Xiao","Yifan Wu","Yijun Yang","Lei Zhu","Dai Sun","Lan Li","Micha Pfeiffer","Shahid Farid","Lena Maier-Hein","Emmanuel Buc","Adrien Bartoli"],"pdf_url":"https://arxiv.org/pdf/2401.15753v2.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2309.01775v2","updated":"2024-02-07T11:30:01Z","published":"2023-09-04T19:28:54Z","title":"Gated recurrent neural networks discover attention","summary":" Recent architectural developments have enabled recurrent neural networks\n(RNNs) to reach and even surpass the performance of Transformers on certain\nsequence modeling tasks. These modern RNNs feature a prominent design pattern:\nlinear recurrent layers interconnected by feedforward paths with multiplicative\ngating. Here, we show how RNNs equipped with these two design elements can\nexactly implement (linear) self-attention, the main building block of\nTransformers. By reverse-engineering a set of trained RNNs, we find that\ngradient descent in practice discovers our construction. In particular, we\nexamine RNNs trained to solve simple in-context learning tasks on which\nTransformers are known to excel and find that gradient descent instills in our\nRNNs the same attention-based in-context learning algorithm used by\nTransformers. Our findings highlight the importance of multiplicative\ninteractions in neural networks and suggest that certain RNNs might be\nunexpectedly implementing attention under the hood.\n","authors":["Nicolas Zucchet","Seijin Kobayashi","Yassir Akram","Johannes von Oswald","Maxime Larcher","Angelika Steger","João Sacramento"],"pdf_url":"https://arxiv.org/pdf/2309.01775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15613v5","updated":"2024-02-07T11:28:16Z","published":"2023-05-24T23:04:34Z","title":"O$n$ Learning Deep O($n$)-Equivariant Hyperspheres","summary":" In this paper, we utilize hyperspheres and regular $n$-simplexes and propose\nan approach to learning deep features equivariant under the transformations of\n$n$D reflections and rotations, encompassed by the powerful group of O$(n)$.\nNamely, we propose O$(n)$-equivariant neurons with spherical decision surfaces\nthat generalize to any dimension $n$, which we call Deep Equivariant\nHyperspheres. We demonstrate how to combine them in a network that directly\noperates on the basis of the input points and propose an invariant operator\nbased on the relation between two points and a sphere, which as we show, turns\nout to be a Gram matrix. 
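The Gram-matrix observation above has a quick numerical sanity check: pairwise inner products of a point set are unchanged by any orthogonal transformation, which is exactly what makes a Gram-matrix-based operator O(n)-invariant. A minimal verification (toy data, standard NumPy only):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 3))             # 5 points in R^3

# Random element of O(3) via QR decomposition of a Gaussian matrix.
Q, _ = np.linalg.qr(rng.normal(size=(3, 3)))

G = X @ X.T                              # Gram matrix of the points
G_rot = (X @ Q) @ (X @ Q).T              # Gram matrix after rotation/reflection

print(np.allclose(G, G_rot))            # True: inner products are preserved
```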
Using synthetic and real-world data in $n$D, we\nexperimentally verify our theoretical contributions and find that our approach\nis superior to the competing methods for O$(n)$-equivariant benchmark datasets\n(classification and regression), demonstrating a favorable speed/performance\ntrade-off.\n","authors":["Pavlo Melnyk","Michael Felsberg","Mårten Wadenbäck","Andreas Robinson","Cuong Le"],"pdf_url":"https://arxiv.org/pdf/2305.15613v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04764v1","updated":"2024-02-07T11:27:45Z","published":"2024-02-07T11:27:45Z","title":"Code as Reward: Empowering Reinforcement Learning with VLMs","summary":" Pre-trained Vision-Language Models (VLMs) are able to understand visual\nconcepts, describe and decompose complex tasks into sub-tasks, and provide\nfeedback on task completion. In this paper, we aim to leverage these\ncapabilities to support the training of reinforcement learning (RL) agents. In\nprinciple, VLMs are well suited for this purpose, as they can naturally analyze\nimage-based observations and provide feedback (reward) on learning progress.\nHowever, inference in VLMs is computationally expensive, so querying them\nfrequently to compute rewards would significantly slow down the training of an\nRL agent. To address this challenge, we propose a framework named Code as\nReward (VLM-CaR). VLM-CaR produces dense reward functions from VLMs through\ncode generation, thereby significantly reducing the computational burden of\nquerying the VLM directly. We show that the dense rewards generated through our\napproach are very accurate across a diverse set of discrete and continuous\nenvironments, and can be more effective in training RL policies than the\noriginal sparse environment rewards.\n","authors":["David Venuto","Sami Nur Islam","Martin Klissarov","Doina Precup","Sherry Yang","Ankit Anand"],"pdf_url":"https://arxiv.org/pdf/2402.04764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04762v1","updated":"2024-02-07T11:26:00Z","published":"2024-02-07T11:26:00Z","title":"Color Recognition in Challenging Lighting Environments: CNN Approach","summary":" Light plays a vital role in vision, whether human or machine; the\nperceived color is always based on the lighting conditions of the surroundings.\nResearchers are working to enhance color detection techniques for\ncomputer vision applications. They have proposed several methods\nusing different color detection approaches, but there is still a gap that can\nbe filled. To address this issue, a color detection method, which is based on a\nConvolutional Neural Network (CNN), is proposed. Firstly, image segmentation is\nperformed using the edge detection segmentation technique to specify the object,\nand then the segmented object is fed to a Convolutional Neural Network\ntrained to detect the color of an object in different lighting conditions. It\nis experimentally verified that our method can substantially enhance the\nrobustness of color detection in different lighting conditions, and that it\nproduces better results than existing methods.\n","authors":["Nizamuddin Maitlo","Nooruddin Noonari","Sajid Ahmed Ghanghro","Sathishkumar Duraisamy","Fayaz Ahmed"],"pdf_url":"https://arxiv.org/pdf/2402.04762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16123v2","updated":"2024-02-07T11:25:28Z","published":"2024-01-29T12:48:56Z","title":"Looking for a better fit?
An Incremental Learning Multimodal Object\n Referencing Framework adapting to Individual Drivers","summary":" The rapid advancement of the automotive industry towards automated and\nsemi-automated vehicles has rendered traditional methods of vehicle\ninteraction, such as touch-based and voice command systems, inadequate for a\nwidening range of non-driving related tasks, such as referencing objects\noutside of the vehicle. Consequently, research has shifted toward gestural\ninput (e.g., hand, gaze, and head pose gestures) as a more suitable mode of\ninteraction during driving. However, due to the dynamic nature of driving and\nindividual variation, there are significant differences in drivers' gestural\ninput performance. While, in theory, this inherent variability could be\nmoderated by substantial data-driven machine learning models, prevalent\nmethodologies lean towards constrained, single-instance trained models for\nobject referencing. These models show a limited capacity to continuously adapt\nto the divergent behaviors of individual drivers and the variety of driving\nscenarios. To address this, we propose \\textit{IcRegress}, a novel\nregression-based incremental learning approach that adapts to changing behavior\nand the unique characteristics of drivers engaged in the dual task of driving\nand referencing objects. We suggest a more personalized and adaptable solution\nfor multimodal gestural interfaces, employing continuous lifelong learning to\nenhance driver experience, safety, and convenience. Our approach was evaluated\nusing an outside-the-vehicle object referencing use case, highlighting the\nsuperiority of the incremental learning models adapted over a single trained\nmodel across various driver traits such as handedness, driving experience, and\nnumerous driving conditions. Finally, to facilitate reproducibility, ease\ndeployment, and promote further research, we offer our approach as an\nopen-source framework at \\url{https://github.com/amrgomaaelhady/IcRegress}.\n","authors":["Amr Gomaa","Guillermo Reyes","Michael Feld","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2401.16123v2.pdf","comment":"Accepted for publication in the Proceedings of the 29th International\n Conference on Intelligent User Interfaces (IUI'24), March 18--21, 2024, in\n Greenville, SC, USA"},{"id":"http://arxiv.org/abs/2310.08164v4","updated":"2024-02-07T11:13:15Z","published":"2023-10-12T09:36:03Z","title":"Beyond Training Objectives: Interpreting Reward Model Divergence in\n Large Language Models","summary":" Large language models (LLMs) fine-tuned by reinforcement learning from human\nfeedback (RLHF) are becoming more widely deployed. We coin the term\n$\\textit{Implicit Reward Model}$ (IRM) to refer to the changes that occur to an\nLLM during RLHF that result in high-reward generations. We interpret IRMs, and\nmeasure their divergence from the RLHF reward model used in the fine-tuning\nprocess that induced them. By fitting a linear function to an LLM's IRM, a\nreward model with the same type signature as the RLHF reward model is\nconstructed, allowing for direct comparison. 
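A hedged sketch of the "fit a linear function to the IRM" step described above: regress observed scalar rewards on frozen per-generation features so that the fitted map has the same features-to-scalar signature as the RLHF reward model. The random features below stand in for real LLM activations; nothing here is the paper's code.

```python
import numpy as np

rng = np.random.default_rng(0)
H = rng.normal(size=(256, 64))                 # per-generation hidden features
r = H @ rng.normal(size=64) + 0.1 * rng.normal(size=256)  # observed rewards

# Least-squares fit: a linear "implicit reward model" over the features.
w, *_ = np.linalg.lstsq(H, r, rcond=None)

def implicit_reward(h):
    # Same type signature as a reward model: features -> scalar reward.
    return h @ w

print(implicit_reward(H[0]), r[0])   # fitted vs. observed reward
```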
Additionally, we validate our\nconstruction of the IRM through cross-comparison with classifications of\nfeatures generated by an LLM based on their relevance to the RLHF reward model.\nBetter comprehending IRMs can help minimize discrepancies between LLM behavior\nand training objectives, which we believe to be an essential component of the\n$\\textit{safety}$ and $\\textit{alignment}$ of LLMs.\n","authors":["Luke Marks","Amir Abdullah","Clement Neo","Rauno Arike","Philip Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2310.08164v4.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2402.04754v1","updated":"2024-02-07T11:12:41Z","published":"2024-02-07T11:12:41Z","title":"Towards Aligned Layout Generation via Diffusion Model with Aesthetic\n Constraints","summary":" Controllable layout generation refers to the process of creating a plausible\nvisual arrangement of elements within a graphic design (e.g., document and web\ndesigns) with constraints representing design intentions. Although recent\ndiffusion-based models have achieved state-of-the-art FID scores, they tend to\nexhibit more pronounced misalignment compared to earlier transformer-based\nmodels. In this work, we propose the $\\textbf{LA}$yout $\\textbf{C}$onstraint\ndiffusion mod$\\textbf{E}$l (LACE), a unified model to handle a broad range of\nlayout generation tasks, such as arranging elements with specified attributes\nand refining or completing a coarse layout design. The model is based on\ncontinuous diffusion models. Compared with existing methods that use discrete\ndiffusion models, the continuous state-space design can enable the incorporation of\ndifferentiable aesthetic constraint functions in training. For conditional\ngeneration, we introduce conditions via masked input. Extensive experimental\nresults show that LACE produces high-quality layouts and outperforms existing\nstate-of-the-art baselines.\n","authors":["Jian Chen","Ruiyi Zhang","Yufan Zhou","Changyou Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04754v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2306.00107v3","updated":"2024-02-07T11:12:27Z","published":"2023-05-31T18:27:43Z","title":"MERT: Acoustic Music Understanding Model with Large-Scale\n Self-supervised Training","summary":" Self-supervised learning (SSL) has recently emerged as a promising paradigm\nfor training generalisable models on large-scale data in the fields of vision,\ntext, and speech. Although SSL has been proven effective in speech and audio,\nits application to music audio has yet to be thoroughly explored. This is\npartially due to the distinctive challenges associated with modelling musical\nknowledge, particularly tonal and pitched characteristics of music. To address\nthis research gap, we propose an acoustic Music undERstanding model with\nlarge-scale self-supervised Training (MERT), which incorporates teacher models\nto provide pseudo labels for masked language modelling (MLM) style acoustic\npre-training. In our exploration, we identified an effective combination of\nteacher models, which outperforms conventional speech and audio approaches in\nterms of performance. This combination includes an acoustic teacher based on\nResidual Vector Quantisation - Variational AutoEncoder (RVQ-VAE) and a musical\nteacher based on the Constant-Q Transform (CQT). Furthermore, we explore a wide\nrange of settings to overcome the instability in acoustic language model\npre-training, which allows our designed paradigm to scale from 95M to 330M\nparameters.
Experimental results indicate that our model can generalise and\nperform well on 14 music understanding tasks and attain state-of-the-art (SOTA)\noverall scores.\n","authors":["Yizhi Li","Ruibin Yuan","Ge Zhang","Yinghao Ma","Xingran Chen","Hanzhi Yin","Chenghao Xiao","Chenghua Lin","Anton Ragni","Emmanouil Benetos","Norbert Gyenge","Roger Dannenberg","Ruibo Liu","Wenhu Chen","Gus Xia","Yemin Shi","Wenhao Huang","Zili Wang","Yike Guo","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2306.00107v3.pdf","comment":"accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2306.03801v2","updated":"2024-02-07T11:03:48Z","published":"2023-06-06T15:45:07Z","title":"Stable Vectorization of Multiparameter Persistent Homology using Signed\n Barcodes as Measures","summary":" Persistent homology (PH) provides topological descriptors for geometric data,\nsuch as weighted graphs, which are interpretable, stable to perturbations, and\ninvariant under, e.g., relabeling. Most applications of PH focus on the\none-parameter case -- where the descriptors summarize the changes in topology\nof data as it is filtered by a single quantity of interest -- and there is now\na wide array of methods enabling the use of one-parameter PH descriptors in\ndata science, which rely on the stable vectorization of these descriptors as\nelements of a Hilbert space. Although the multiparameter PH (MPH) of data that\nis filtered by several quantities of interest encodes much richer information\nthan its one-parameter counterpart, the scarceness of stability results for MPH\ndescriptors has so far limited the available options for the stable\nvectorization of MPH. In this paper, we aim to bring together the best of both\nworlds by showing how the interpretation of signed barcodes -- a recent family\nof MPH descriptors -- as signed measures leads to natural extensions of\nvectorization strategies from one parameter to multiple parameters. The\nresulting feature vectors are easy to define and to compute, and provably\nstable. While, as a proof of concept, we focus on simple choices of signed\nbarcodes and vectorizations, we already see notable performance improvements\nwhen comparing our feature vectors to state-of-the-art topology-based methods\non various types of data.\n","authors":["David Loiseaux","Luis Scoccola","Mathieu Carrière","Magnus Bakke Botnan","Steve Oudot"],"pdf_url":"https://arxiv.org/pdf/2306.03801v2.pdf","comment":"26 pages, 4 figures, 9 tables; v2: final version in NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.00639v2","updated":"2024-02-07T10:57:28Z","published":"2023-12-01T14:59:43Z","title":"RefinedFields: Radiance Fields Refinement for Unconstrained Scenes","summary":" Modeling large scenes from unconstrained images has proven to be a major\nchallenge in computer vision. Existing methods tackling in-the-wild scene\nmodeling operate in closed-world settings, where no conditioning on priors\nacquired from real-world images is present. We propose RefinedFields, which is,\nto the best of our knowledge, the first method leveraging pre-trained models to\nimprove in-the-wild scene modeling. We employ pre-trained networks to refine\nK-Planes representations via optimization guidance using an alternating\ntraining procedure. We carry out extensive experiments and verify the merit of\nour method on synthetic data and real tourism photo collections. RefinedFields\nenhances rendered scenes with richer details and outperforms previous work on\nthe task of novel view synthesis in the wild. 
Our project page can be found at\nhttps://refinedfields.github.io .\n","authors":["Karim Kassab","Antoine Schnepf","Jean-Yves Franceschi","Laurent Caraffa","Jeremie Mary","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2312.00639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04744v1","updated":"2024-02-07T10:55:59Z","published":"2024-02-07T10:55:59Z","title":"Progressive Gradient Flow for Robust N:M Sparsity Training in\n Transformers","summary":" N:M structured sparsity has garnered significant interest as a result of\nrelatively modest overhead and improved efficiency. Additionally, this form of\nsparsity holds considerable appeal for reducing the memory footprint owing to\nits modest representation overhead. While there have been efforts to develop\ntraining recipes for N:M structured sparsity, they primarily focus on\nlow-sparsity regions ($\sim$50\%). Nonetheless, the performance of models trained\nusing these approaches tends to decline when confronted with high-sparsity\nregions ($>$80\%). In this work, we study the effectiveness of existing sparse\ntraining recipes at \textit{high-sparsity regions} and argue that these methods\nfail to sustain the model quality on par with low-sparsity regions. We\ndemonstrate that the significant factor contributing to this disparity is the\npresence of elevated levels of induced noise in the gradient magnitudes. To\nmitigate this undesirable effect, we employ decay mechanisms to progressively\nrestrict the flow of gradients towards pruned elements. Our approach improves\nthe model quality by up to 2$\%$ and 5$\%$ in vision and language models in the\nhigh-sparsity regime, respectively. We also evaluate the trade-off between\nmodel accuracy and training compute cost in terms of FLOPs. At iso-training\nFLOPs, our method yields better performance compared to conventional sparse\ntraining recipes, exhibiting an accuracy improvement of up to 2$\%$. The source\ncode is available at\nhttps://github.com/abhibambhaniya/progressive_gradient_flow_nm_sparsity.\n","authors":["Abhimanyu Rajeshkumar Bambhaniya","Amir Yazdanbakhsh","Suvinay Subramanian","Sheng-Chun Kao","Shivani Agrawal","Utku Evci","Tushar Krishna"],"pdf_url":"https://arxiv.org/pdf/2402.04744v1.pdf","comment":"18 pages, 8 figures, 17 tables. Code is available at\n https://github.com/abhibambhaniya/progressive_gradient_flow_nm_sparsity"},{"id":"http://arxiv.org/abs/2402.04740v1","updated":"2024-02-07T10:51:11Z","published":"2024-02-07T10:51:11Z","title":"Non-Parametric Estimation of Multi-dimensional Marked Hawkes Processes","summary":" An extension of the Hawkes process, the Marked Hawkes process distinguishes\nitself by featuring variable jump size across each event, in contrast to the\nconstant jump size observed in a Hawkes process without marks. While extensive\nliterature has been dedicated to the non-parametric estimation of both the\nlinear and non-linear Hawkes process, there remains a significant gap in the\nliterature regarding the marked Hawkes process. In response to this, we propose\na methodology for estimating the conditional intensity of the marked Hawkes\nprocess. We introduce two distinct models: \textit{Shallow Neural Hawkes with\nmarks} for Hawkes processes with excitatory kernels and \textit{Neural Network\nfor Non-Linear Hawkes with Marks} for non-linear Hawkes processes. Both these\napproaches take the past arrival times and their corresponding marks as the\ninput to obtain the arrival intensity. 
This approach is entirely\nnon-parametric, preserving the interpretability associated with the marked\nHawkes process. To validate the efficacy of our method, we subject the method\nto synthetic datasets with known ground truth. Additionally, we apply our\nmethod to model cryptocurrency order book data, demonstrating its applicability\nto real-world scenarios.\n","authors":["Sobin Joseph","Shashi Jain"],"pdf_url":"https://arxiv.org/pdf/2402.04740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10947v2","updated":"2024-02-07T10:41:48Z","published":"2023-06-19T14:07:10Z","title":"PAC-Chernoff Bounds: Understanding Generalization in the Interpolation\n Regime","summary":" In this paper, we present a distribution-dependent PAC-Chernoff bound that is\nperfectly tight for interpolators even under overparametrized model classes.\nThis bound relies on basic principles of Large Deviation Theory and naturally\nprovides a characterization of the smoothness of a model described as a simple\nreal-valued function. Based on this distribution-dependent bound and the novel\ndefinition of smoothness, we propose a unifying theoretical explanation of why\nsome interpolators generalize remarkably well while others do not, and why a\nwide range of modern learning techniques (i.e., $\ell_2$-norm,\ndistance-from-initialization, input-gradient and variance regularization\ntogether with data augmentation, invariant architectures, and\noverparameterization) are able to find them. The emergent conclusion is that\nall these methods provide complementary procedures that bias the optimizer to\nsmoother interpolators, which, according to this theoretical analysis, are the\nones with better generalization error. One of the main insights of this study\nis that distribution-dependent bounds serve as a powerful tool to better\nunderstand the complex dynamics behind the generalization capabilities of\nhighly-overparameterized interpolators.\n","authors":["Andrés R. Masegosa","Luis A. Ortega"],"pdf_url":"https://arxiv.org/pdf/2306.10947v2.pdf","comment":"34 pages, 10 figures, Pre-print"},{"id":"http://arxiv.org/abs/2402.04732v1","updated":"2024-02-07T10:33:09Z","published":"2024-02-07T10:33:09Z","title":"Graph Cuts with Arbitrary Size Constraints Through Optimal Transport","summary":" A common way of partitioning graphs is through minimum cuts. One drawback of\nclassical minimum cut methods is that they tend to produce small groups, which\nis why more balanced variants such as normalized and ratio cuts have seen more\nsuccess. However, we believe that with these variants, the balance constraints\ncan be too restrictive for some applications, like clustering imbalanced\ndatasets, while not being restrictive enough when searching for perfectly\nbalanced partitions. Here, we propose a new graph cut algorithm for\npartitioning graphs under arbitrary size constraints. We formulate the graph\ncut problem as a regularized Gromov-Wasserstein problem. 
We then propose to\nsolve it using an accelerated proximal gradient descent algorithm, which has\nglobal convergence guarantees, yields sparse solutions, and incurs only an\nadditional factor of $\mathcal{O}(\log(n))$ compared to the classical spectral\nclustering algorithm, while being more efficient in practice.\n","authors":["Chakib Fettal","Lazhar Labiod","Mohamed Nadif"],"pdf_url":"https://arxiv.org/pdf/2402.04732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12275v2","updated":"2024-02-07T10:20:55Z","published":"2023-12-19T15:56:30Z","title":"Emergence of In-Context Reinforcement Learning from Noise Distillation","summary":" Recently, extensive studies in Reinforcement Learning have been carried out\non the ability of transformers to adapt in-context to various environments and\ntasks. Current in-context RL methods are limited by their strict requirements\nfor data, which needs to be generated by RL agents or labeled with actions from\nan optimal policy. In order to address this prevalent problem, we propose\nAD$^\varepsilon$, a new data acquisition approach that enables in-context\nReinforcement Learning from a noise-induced curriculum. We show that it is\nviable to construct a synthetic noise injection curriculum which helps to\nobtain learning histories. Moreover, we experimentally demonstrate that it is\npossible to alleviate the need for generation using optimal policies, with\nin-context RL still able to outperform the best suboptimal policy in a learning\ndataset by a 2x margin.\n","authors":["Ilya Zisman","Vladislav Kurenkov","Alexander Nikulin","Viacheslav Sinii","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2312.12275v2.pdf","comment":"Preprint, Under Review; code: https://github.com/corl-team/ad-eps"},{"id":"http://arxiv.org/abs/2301.03962v3","updated":"2024-02-07T10:11:39Z","published":"2023-01-10T13:51:07Z","title":"A Unified Theory of Diversity in Ensemble Learning","summary":" We present a theory of ensemble diversity, explaining the nature of diversity\nfor a wide range of supervised learning scenarios. This challenge has been\nreferred to as the holy grail of ensemble learning, an open research issue for\nover 30 years. Our framework reveals that diversity is in fact a hidden\ndimension in the bias-variance decomposition of the ensemble loss. We prove a\nfamily of exact bias-variance-diversity decompositions, for a wide range of\nlosses in both regression and classification, e.g., squared, cross-entropy, and\nPoisson losses. For losses where an additive bias-variance decomposition is not\navailable (e.g., 0/1 loss), we present an alternative approach: quantifying the\neffects of diversity, which turn out to be dependent on the label distribution.\nOverall, we argue that diversity is a measure of model fit, in precisely the\nsame sense as bias and variance, but accounting for statistical dependencies\nbetween ensemble members. 
Thus, we should not be maximising diversity as so\nmany works aim to do -- instead, we have a bias/variance/diversity trade-off to\nmanage.\n","authors":["Danny Wood","Tingting Mu","Andrew Webb","Henry Reeve","Mikel Luján","Gavin Brown"],"pdf_url":"https://arxiv.org/pdf/2301.03962v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04713v1","updated":"2024-02-07T10:05:42Z","published":"2024-02-07T10:05:42Z","title":"Theoretical and Empirical Analysis of Adaptive Entry Point Selection for\n Graph-based Approximate Nearest Neighbor Search","summary":" We present a theoretical and empirical analysis of adaptive entry point\nselection for graph-based approximate nearest neighbor search (ANNS). We\nintroduce novel concepts: $b\textit{-monotonic path}$ and $B\textit{-MSNET}$,\nwhich better capture an actual graph in practical algorithms than existing\nconcepts like MSNET. We prove that adaptive entry point selection offers a\nbetter performance upper bound than the fixed central entry point under more\ngeneral conditions than previous work. Empirically, we validate the method's\neffectiveness in accuracy, speed, and memory usage across various datasets,\nespecially in challenging scenarios with out-of-distribution data and hard\ninstances. Our comprehensive study provides deeper insights into optimizing\nentry points for graph-based ANNS for real-world high-dimensional data\napplications.\n","authors":["Yutaro Oguri","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2402.04713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04710v1","updated":"2024-02-07T09:57:39Z","published":"2024-02-07T09:57:39Z","title":"Incorporating Retrieval-based Causal Learning with Information\n Bottlenecks for Interpretable Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have gained considerable traction for their\ncapability to effectively process topological data, yet their interpretability\nremains a critical concern. Current interpretation methods are dominated by\npost-hoc explanations to provide a transparent and intuitive understanding of\nGNNs. However, they have limited performance in interpreting complicated\nsubgraphs and cannot utilize the explanation to advance GNN predictions. On the\nother hand, transparent GNN models are proposed to capture critical subgraphs.\nWhile such methods could improve GNN predictions, they usually do not perform\nwell on explanations. Thus, a new strategy is needed to better couple\nGNN explanation and prediction. In this study, we have developed a novel\ninterpretable causal GNN framework that incorporates retrieval-based causal\nlearning with Graph Information Bottleneck (GIB) theory. The framework can\nsemi-parametrically retrieve crucial subgraphs detected by GIB and compress the\nexplanatory subgraphs via a causal module. 
The framework was demonstrated to\nconsistently outperform state-of-the-art methods, and to achieve 32.71\\% higher\nprecision on real-world explanation scenarios with diverse explanation types.\nMore importantly, the learned explanations were shown able to also improve GNN\nprediction performance.\n","authors":["Jiahua Rao","Jiancong Xie","Hanjing Lin","Shuangjia Zheng","Zhen Wang","Yuedong Yang"],"pdf_url":"https://arxiv.org/pdf/2402.04710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14839v4","updated":"2024-02-07T09:47:25Z","published":"2022-05-30T03:57:46Z","title":"Adversarial Bandits against Arbitrary Strategies","summary":" We study the adversarial bandit problem against arbitrary strategies, in\nwhich $S$ is the parameter for the hardness of the problem and this parameter\nis not given to the agent. To handle this problem, we adopt the master-base\nframework using the online mirror descent method (OMD). We first provide a\nmaster-base algorithm with simple OMD, achieving\n$\\tilde{O}(S^{1/2}K^{1/3}T^{2/3})$, in which $T^{2/3}$ comes from the variance\nof loss estimators. To mitigate the impact of the variance, we propose using\nadaptive learning rates for OMD and achieve\n$\\tilde{O}(\\min\\{\\mathbb{E}[\\sqrt{SKT\\rho_T(h^\\dagger)}],S\\sqrt{KT}\\})$, where\n$\\rho_T(h^\\dagger)$ is a variance term for loss estimators.\n","authors":["Jung-hun Kim","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2205.14839v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04699v1","updated":"2024-02-07T09:39:29Z","published":"2024-02-07T09:39:29Z","title":"EvoSeed: Unveiling the Threat on Deep Neural Networks with Real-World\n Illusions","summary":" Deep neural networks are exploited using natural adversarial samples, which\nhave no impact on human perception but are misclassified. Current approaches\noften rely on the white-box nature of deep neural networks to generate these\nadversarial samples or alter the distribution of adversarial samples compared\nto training distribution. To alleviate the limitations of current approaches,\nwe propose EvoSeed, a novel evolutionary strategy-based search algorithmic\nframework to generate natural adversarial samples. Our EvoSeed framework uses\nauxiliary Diffusion and Classifier models to operate in a model-agnostic\nblack-box setting. We employ CMA-ES to optimize the search for an adversarial\nseed vector, which, when processed by the Conditional Diffusion Model, results\nin an unrestricted natural adversarial sample misclassified by the Classifier\nModel. Experiments show that generated adversarial images are of high image\nquality and are transferable to different classifiers. Our approach\ndemonstrates promise in enhancing the quality of adversarial samples using\nevolutionary algorithms. We hope our research opens new avenues to enhance the\nrobustness of deep neural networks in real-world scenarios. 
Project Website can\nbe accessed at \url{https://shashankkotyan.github.io/EvoSeed}.\n","authors":["Shashank Kotyan","PoYuan Mao","Danilo Vasconcellos Vargas"],"pdf_url":"https://arxiv.org/pdf/2402.04699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04692v1","updated":"2024-02-07T09:32:32Z","published":"2024-02-07T09:32:32Z","title":"From explained variance of correlated components to PCA without\n orthogonality constraints","summary":" Block Principal Component Analysis (Block PCA) of a data matrix A, where\nloadings Z are determined by maximization of $\|AZ\|^2$ over unit norm\northogonal loadings, is difficult to use for the design of sparse PCA by\n$\ell_1$ regularization, due to the difficulty of taking care of both the\northogonality constraint on loadings and the non-differentiable $\ell_1$\npenalty. Our objective in this paper is to relax the orthogonality constraint\non loadings by introducing new objective functions expvar(Y) which measure the\npart of the variance of the data matrix A explained by correlated components\nY = AZ. So we first propose a comprehensive study of mathematical and numerical\nproperties of expvar(Y) for two existing definitions [Zou et al., 2006; Shen\nand Huang, 2008] and four new definitions. Then we show that only two of these\nexplained variances are fit to use as objective functions in block PCA\nformulations for A rid of orthogonality constraints.\n","authors":["Marie Chavent","Guy Chavent"],"pdf_url":"https://arxiv.org/pdf/2402.04692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04691v1","updated":"2024-02-07T09:31:01Z","published":"2024-02-07T09:31:01Z","title":"Learning Operators with Stochastic Gradient Descent in General Hilbert\n Spaces","summary":" This study investigates leveraging stochastic gradient descent (SGD) to learn\noperators between general Hilbert spaces. We propose weak and strong regularity\nconditions for the target operator to depict its intrinsic structure and\ncomplexity. Under these conditions, we establish upper bounds for convergence\nrates of the SGD algorithm and conduct a minimax lower bound analysis, further\nillustrating that our convergence analysis and regularity conditions\nquantitatively characterize the tractability of solving operator learning\nproblems using the SGD algorithm. It is crucial to highlight that our\nconvergence analysis is still valid for nonlinear operator learning. We show\nthat the SGD estimator will converge to the best linear approximation of the\nnonlinear target operator. Moreover, applying our analysis to operator learning\nproblems based on vector-valued and real-valued reproducing kernel Hilbert\nspaces yields new convergence results, thereby refining the conclusions of\nexisting literature.\n","authors":["Lei Shi","Jia-Qi Yang"],"pdf_url":"https://arxiv.org/pdf/2402.04691v1.pdf","comment":"56 pages"},{"id":"http://arxiv.org/abs/2402.03780v2","updated":"2024-02-07T09:23:42Z","published":"2024-02-06T07:51:54Z","title":"Exposing propaganda: an analysis of stylistic cues comparing human\n annotations and machine classification","summary":" This paper investigates the language of propaganda and its stylistic\nfeatures. It presents the PPN dataset, standing for Propagandist Pseudo-News, a\nmultisource, multilingual, multimodal dataset composed of news articles\nextracted from websites identified as propaganda sources by expert agencies. 
A\nlimited sample from this set was randomly mixed with papers from the regular\nFrench press, and their URLs masked, to conduct an annotation experiment by\nhumans, using 11 distinct labels. The results show that human annotators were\nable to reliably discriminate between the two types of press across each of the\nlabels. We propose different NLP techniques to identify the cues used by the\nannotators, and to compare them with machine classification. They include the\nanalyzer VAGO to measure discourse vagueness and subjectivity, a TF-IDF to\nserve as a baseline, and four different classifiers: two RoBERTa-based models,\nCATS using syntax, and one XGBoost combining syntactic and semantic features.\n","authors":["Géraud Faye","Benjamin Icard","Morgane Casanova","Julien Chanson","François Maine","François Bancilhon","Guillaume Gadek","Guillaume Gravier","Paul Égré"],"pdf_url":"https://arxiv.org/pdf/2402.03780v2.pdf","comment":"Paper to appear in the EACL 2024 Proceedings of the Third Workshop on\n Understanding Implicit and Underspecified Language (UnImplicit 2024)"},{"id":"http://arxiv.org/abs/2402.04678v1","updated":"2024-02-07T09:09:14Z","published":"2024-02-07T09:09:14Z","title":"Large Language Models As Faithful Explainers","summary":" Large Language Models (LLMs) have recently become proficient in addressing\ncomplex tasks by utilizing their rich internal knowledge and reasoning ability.\nConsequently, this complexity hinders traditional input-focused explanation\nalgorithms for explaining the complex decision-making processes of LLMs. Recent\nadvancements have thus emerged for self-explaining their predictions through a\nsingle feed-forward inference in a natural language format. However, natural\nlanguage explanations are often criticized for a lack of faithfulness, since\nthese explanations may not accurately reflect the decision-making behaviors of\nthe LLMs. In this work, we introduce a generative explanation framework, xLLM,\nto improve the faithfulness of the explanations provided in natural language\nformats for LLMs. Specifically, we propose an evaluator to quantify the\nfaithfulness of natural language explanation and enhance the faithfulness by an\niterative optimization process of xLLM, with the goal of maximizing the\nfaithfulness scores. Experiments conducted on three NLU datasets demonstrate\nthat xLLM can significantly improve the faithfulness of generated explanations,\nwhich are in alignment with the behaviors of LLMs.\n","authors":["Yu-Neng Chuang","Guanchu Wang","Chia-Yuan Chang","Ruixiang Tang","Fan Yang","Mengnan Du","Xuanting Cai","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2402.04678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04676v1","updated":"2024-02-07T09:03:04Z","published":"2024-02-07T09:03:04Z","title":"Group Distributionally Robust Dataset Distillation with Risk\n Minimization","summary":" Dataset distillation (DD) has emerged as a widely adopted technique for\ncrafting a synthetic dataset that captures the essential information of a\ntraining dataset, facilitating the training of accurate neural models. Its\napplications span various domains, including transfer learning, federated\nlearning, and neural architecture search. 
The most popular methods for\nconstructing the synthetic data rely on matching the convergence properties of\ntraining the model with the synthetic dataset and the training dataset.\nHowever, targeting the training dataset must be thought of as auxiliary in the\nsame sense that the training set is an approximate substitute for the\npopulation distribution, and the latter is the data of interest. Yet despite\nits popularity, an aspect that remains unexplored is the relationship of DD to\nits generalization, particularly across uncommon subgroups. That is, how can we\nensure that a model trained on the synthetic dataset performs well when faced\nwith samples from regions with low population density? Here, the\nrepresentativeness and coverage of the dataset become salient over the\nguaranteed training error at inference. Drawing inspiration from\ndistributionally robust optimization, we introduce an algorithm that combines\nclustering with the minimization of a risk measure on the loss to conduct DD.\nWe provide a theoretical rationale for our approach and demonstrate its\neffective generalization and robustness across subgroups through numerical\nexperiments.\n","authors":["Saeed Vahidian","Mingyu Wang","Jianyang Gu","Vyacheslav Kungurtsev","Wei Jiang","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2402.04676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04668v1","updated":"2024-02-07T08:53:46Z","published":"2024-02-07T08:53:46Z","title":"A Perspective on Individualized Treatment Effects Estimation from\n Time-series Health Data","summary":" The burden of diseases is rising worldwide, with unequal treatment efficacy\nfor patient populations that are underrepresented in clinical trials.\nHealthcare, however, is driven by the average population effect of medical\ntreatments and, therefore, operates in a \"one-size-fits-all\" approach, not\nnecessarily what best fits each patient. These facts suggest a pressing need\nfor methodologies to study individualized treatment effects (ITE) to drive\npersonalized treatment. Despite the increased interest in\nmachine-learning-driven ITE estimation models, the vast majority focus on\ntabular data with limited review and understanding of methodologies proposed\nfor time-series electronic health records (EHRs). To this end, this work\nprovides an overview of ITE works for time-series data and insights into future\nresearch. The work summarizes the latest work in the literature and reviews it\nin light of theoretical assumptions, types of treatment settings, and\ncomputational frameworks. Furthermore, this work discusses challenges and\nfuture research directions for ITEs in a time-series setting. We hope this work\nopens new directions and serves as a resource for understanding one of the\nexciting yet under-studied research areas.\n","authors":["Ghadeer O. Ghosheh","Moritz Gögl","Tingting Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.04668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04660v1","updated":"2024-02-07T08:49:33Z","published":"2024-02-07T08:49:33Z","title":"Adversarial Robustness Through Artifact Design","summary":" Adversarial examples arose as a challenge for machine learning. To hinder\nthem, most defenses alter how models are trained (e.g., adversarial training)\nor inference is made (e.g., randomized smoothing). Still, while these\napproaches markedly improve models' adversarial robustness, models remain\nhighly susceptible to adversarial examples. 
Identifying that, in certain\ndomains such as traffic-sign recognition, objects are implemented per standards\nspecifying how artifacts (e.g., signs) should be designed, we propose a novel\napproach for improving adversarial robustness. Specifically, we offer a method\nto redefine standards, making minor changes to existing ones, to defend against\nadversarial examples. We formulate the problem of artifact design as a robust\noptimization problem, and propose gradient-based and greedy search methods to\nsolve it. We evaluated our approach in the domain of traffic-sign recognition,\nallowing it to alter traffic-sign pictograms (i.e., symbols within the signs)\nand their colors. We found that, combined with adversarial training, our\napproach led to up to 25.18\% higher robust accuracy compared to\nstate-of-the-art methods against two adversary types, while further increasing\naccuracy on benign inputs.\n","authors":["Tsufit Shua","Mahmood Sharif"],"pdf_url":"https://arxiv.org/pdf/2402.04660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18562v2","updated":"2024-02-07T08:47:47Z","published":"2023-10-28T02:20:33Z","title":"Optimization-Free Test-Time Adaptation for Cross-Person Activity\n Recognition","summary":" Human Activity Recognition (HAR) models often suffer from performance\ndegradation in real-world applications due to distribution shifts in activity\npatterns across individuals. Test-Time Adaptation (TTA) is an emerging learning\nparadigm that aims to utilize the test stream to adjust predictions in\nreal-time inference, which has not been explored in HAR before. However, the\nhigh computational cost of optimization-based TTA algorithms makes it\nintractable to run on resource-constrained edge devices. In this paper, we\npropose an Optimization-Free Test-Time Adaptation (OFTTA) framework for\nsensor-based HAR. OFTTA adjusts the feature extractor and linear classifier\nsimultaneously in an optimization-free manner. For the feature extractor, we\npropose Exponential Decay Test-time Normalization (EDTN) to replace the\nconventional batch normalization (CBN) layers. EDTN combines CBN and Test-time\nbatch Normalization (TBN) to extract reliable features against domain shifts\nwith TBN's influence decreasing exponentially in deeper layers. For the\nclassifier, we adjust the prediction by computing the distance between the\nfeature and the prototype, which is calculated by a maintained support set. In\naddition, the update of the support set is based on the pseudo label, which can\nbenefit from reliable features extracted by EDTN. Extensive experiments on\nthree public cross-person HAR datasets and two different TTA settings\ndemonstrate that OFTTA outperforms the state-of-the-art TTA approaches in both\nclassification performance and computational efficiency. Finally, we verify the\nsuperiority of our proposed OFTTA on edge devices, indicating possible\ndeployment in real applications. 
Our code is available at\nhttps://github.com/Claydon-Wang/OFTTA.\n","authors":["Shuoyuan Wang","Jindong Wang","HuaJun Xi","Bob Zhang","Lei Zhang","Hongxin Wei"],"pdf_url":"https://arxiv.org/pdf/2310.18562v2.pdf","comment":"To be presented at UbiComp 2024; Accepted by Proceedings of the ACM\n on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT)"},{"id":"http://arxiv.org/abs/2402.04655v1","updated":"2024-02-07T08:42:48Z","published":"2024-02-07T08:42:48Z","title":"Open-Vocabulary Calibration for Vision-Language Models","summary":" Vision-language models (VLMs) have emerged as formidable tools, showing their\nstrong capability in handling various open-vocabulary tasks in image\nrecognition, text-driven visual content generation, and visual chatbots, to\nname a few. In recent years, considerable efforts and resources have been\ndevoted to adaptation methods for improving downstream performance of VLMs,\nparticularly on parameter-efficient fine-tuning methods like prompt learning.\nHowever, a crucial aspect that has been largely overlooked is the confidence\ncalibration problem in fine-tuned VLMs, which could greatly reduce reliability\nwhen deploying such models in the real world. This paper bridges the gap by\nsystematically investigating the confidence calibration problem in the context\nof prompt learning and reveals that existing calibration methods are\ninsufficient to address the problem, especially in the open-vocabulary setting.\nTo solve the problem, we present a simple and effective approach called\nDistance-Aware Calibration (DAC), which is based on scaling the temperature\nusing as guidance the distance between predicted text labels and base classes.\nThe experiments with 7 distinct prompt learning methods applied across 11\ndiverse downstream datasets demonstrate the effectiveness of DAC, which\nachieves high efficacy without sacrificing the inference speed.\n","authors":["Shuoyuan Wang","Jindong Wang","Guoqing Wang","Bob Zhang","Kaiyang Zhou","Hongxin Wei"],"pdf_url":"https://arxiv.org/pdf/2402.04655v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.13838v4","updated":"2024-02-07T08:41:19Z","published":"2023-08-26T10:09:46Z","title":"Price-Discrimination Game for Distributed Resource Management in\n Federated Learning","summary":" In vanilla federated learning (FL) such as FedAvg, the parameter server (PS)\nand multiple distributed clients can form a typical buyer's market, where the\nnumber of PS/buyers of FL services is far less than the number of\nclients/sellers. In order to improve the performance of FL and reduce the cost\nof motivating clients to participate in FL, this paper proposes to\ndifferentiate the pricing for services provided by different clients rather\nthan simply providing the same service pricing for different clients. The price\nis differentiated based on the performance improvements brought to FL and their\nheterogeneity in computing and communication capabilities. To this end, a\nprice-discrimination game (PDG) is formulated to comprehensively address the\ndistributed resource management problems in FL, including multi-objective\ntrade-off, client selection, and incentive mechanism. As the PDG is a\nmixed-integer nonlinear programming (MINLP) problem, a distributed\nsemi-heuristic algorithm with low computational complexity and low\ncommunication overhead is designed to solve it. 
The simulation result verifies\nthe effectiveness of the proposed approach.\n","authors":["Han Zhang","Halvin Yang","Guopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.13838v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.18079v2","updated":"2024-02-07T08:39:28Z","published":"2024-01-31T18:58:14Z","title":"KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache\n Quantization","summary":" LLMs are seeing growing use for applications such as document analysis and\nsummarization which require large context windows, and with these large context\nwindows KV cache activations surface as the dominant contributor to memory\nconsumption during inference. Quantization is a promising approach for\ncompressing KV cache activations; however, existing solutions fail to represent\nactivations accurately in ultra-low precisions, such as sub-4-bit. In this\nwork, we present KVQuant, which addresses this problem by incorporating novel\nmethods for quantizing cached KV activations, including: (i) Per-Channel Key\nQuantization, where we adjust the dimension along which we quantize the Key\nactivations to better match the distribution; (ii) Pre-RoPE Key Quantization,\nwhere we quantize Key activations before the rotary positional embedding to\nmitigate its impact on quantization; (iii) Non-Uniform KV Cache Quantization,\nwhere we derive per-layer sensitivity-weighted non-uniform datatypes that\nbetter represent the distributions; (iv) Per-Vector Dense-and-Sparse\nQuantization, where we isolate outliers separately for each vector to minimize\nskews in quantization ranges; and (v) Q-Norm, where we normalize quantization\ncentroids in order to mitigate distribution shift, providing additional\nbenefits for 2-bit quantization. By applying our method to the LLaMA, LLaMA-2,\nand Mistral models, we achieve $<0.1$ perplexity degradation with 3-bit\nquantization on both Wikitext-2 and C4, outperforming existing approaches. Our\nmethod enables serving the LLaMA-7B model with a context length of up to 1\nmillion on a single A100-80GB GPU and up to 10 million on an 8-GPU system.\n","authors":["Coleman Hooper","Sehoon Kim","Hiva Mohammadzadeh","Michael W. Mahoney","Yakun Sophia Shao","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2401.18079v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04653v1","updated":"2024-02-07T08:38:12Z","published":"2024-02-07T08:38:12Z","title":"An Over Complete Deep Learning Method for Inverse Problems","summary":" Obtaining meaningful solutions for inverse problems has been a major\nchallenge with many applications in science and engineering. Recent machine\nlearning techniques based on proximal and diffusion-based methods have shown\npromising results. However, as we show in this work, they can also face\nchallenges when applied to some exemplary problems. We show that similar to\nprevious works on over-complete dictionaries, it is possible to overcome these\nshortcomings by embedding the solution into higher dimensions. The novelty of\nthe work proposed is that we jointly design and learn the embedding and the\nregularizer for the embedding vector. 
We demonstrate the merit of this approach\non several exemplary and common inverse problems.\n","authors":["Moshe Eliasof","Eldad Haber","Eran Treister"],"pdf_url":"https://arxiv.org/pdf/2402.04653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20285v2","updated":"2024-02-07T08:37:16Z","published":"2023-10-31T08:58:16Z","title":"Accelerating Generalized Linear Models by Trading off Computation for\n Uncertainty","summary":" Bayesian Generalized Linear Models (GLMs) define a flexible probabilistic\nframework to model categorical, ordinal and continuous data, and are widely\nused in practice. However, exact inference in GLMs is prohibitively expensive\nfor large datasets, thus requiring approximations in practice. The resulting\napproximation error adversely impacts the reliability of the model and is not\naccounted for in the uncertainty of the prediction. In this work, we introduce\na family of iterative methods that explicitly model this error. They are\nuniquely suited to parallel modern computing hardware, efficiently recycle\ncomputations, and compress information to reduce both the time and memory\nrequirements for GLMs. As we demonstrate on a realistically large\nclassification problem, our method significantly accelerates training compared\nto competitive baselines by trading off reduced computation for increased\nuncertainty.\n","authors":["Lukas Tatzel","Jonathan Wenger","Frank Schneider","Philipp Hennig"],"pdf_url":"https://arxiv.org/pdf/2310.20285v2.pdf","comment":"Main text: 11 pages, 6 figures; Supplements: 13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.09108v3","updated":"2024-02-07T08:34:53Z","published":"2023-12-14T16:44:38Z","title":"Greedy Shapley Client Selection for Communication-Efficient Federated\n Learning","summary":" The standard client selection algorithms for Federated Learning (FL) are\noften unbiased and involve uniform random sampling of clients. This has been\nproven sub-optimal for fast convergence under practical settings characterized\nby significant heterogeneity in data distribution, computing, and communication\nresources across clients. For applications having timing constraints due to\nlimited communication opportunities with the parameter server (PS), the client\nselection strategy is critical to complete model training within the fixed\nbudget of communication rounds. To address this, we develop a biased client\nselection strategy, GreedyFed, that identifies and greedily selects the most\ncontributing clients in each communication round. This method builds on a fast\napproximation algorithm for the Shapley Value at the PS, making the computation\ntractable for real-world applications with many clients. 
Compared to various\nclient selection strategies on several real-world datasets, GreedyFed\ndemonstrates fast and stable convergence with high accuracy under timing\nconstraints and when imposing a higher degree of heterogeneity in data\ndistribution, systems constraints, and privacy requirements.\n","authors":["Pranava Singhal","Shashi Raj Pandey","Petar Popovski"],"pdf_url":"https://arxiv.org/pdf/2312.09108v3.pdf","comment":"Accepted for publication in IEEE Networking Letters"},{"id":"http://arxiv.org/abs/2401.04472v2","updated":"2024-02-07T08:33:18Z","published":"2024-01-09T10:22:23Z","title":"A Survey on Efficient Federated Learning Methods for Foundation Model\n Training","summary":" Federated Learning (FL) has become an established technique to facilitate\nprivacy-preserving collaborative training across a multitude of clients.\nHowever, new approaches to FL often discuss their contributions involving small\ndeep-learning models only and focus on training full models on clients. In the\nwake of Foundation Models (FM), the reality is different for many deep learning\napplications. Typically, FMs have already been pre-trained across a wide\nvariety of tasks and can be fine-tuned to specific downstream tasks over\nsignificantly smaller datasets than required for full model training. However,\naccess to such datasets is often challenging. By its design, FL can help to\nopen data silos. With this survey, we introduce a novel taxonomy focused on\ncomputational and communication efficiency, the vital elements to make use of\nFMs in FL systems. We discuss the benefits and drawbacks of parameter-efficient\nfine-tuning (PEFT) for FL applications, elaborate on the readiness of FL\nframeworks to work with FMs and provide future research opportunities on how to\nevaluate generative models in FL as well as the interplay of privacy and PEFT.\n","authors":["Herbert Woisetschläger","Alexander Isenko","Shiqiang Wang","Ruben Mayer","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2401.04472v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04647v1","updated":"2024-02-07T08:18:09Z","published":"2024-02-07T08:18:09Z","title":"Latent Plan Transformer: Planning as Latent Variable Inference","summary":" In tasks aiming for long-term returns, planning becomes necessary. We study\ngenerative modeling for planning with datasets repurposed from offline\nreinforcement learning. Specifically, we identify temporal consistency in the\nabsence of step-wise rewards as one key technical challenge. We introduce the\nLatent Plan Transformer (LPT), a novel model that leverages a latent space to\nconnect a Transformer-based trajectory generator and the final return. LPT can\nbe learned with maximum likelihood estimation on trajectory-return pairs. In\nlearning, posterior sampling of the latent variable naturally gathers\nsub-trajectories to form a consistent abstraction despite the finite context.\nDuring test time, the latent variable is inferred from an expected return\nbefore policy execution, realizing the idea of planning as inference. It then\nguides the autoregressive policy throughout the episode, functioning as a plan.\nOur experiments demonstrate that LPT can discover improved decisions from\nsuboptimal trajectories. It achieves competitive performance across several\nbenchmarks, including Gym-Mujoco, Maze2D, and Connect Four, exhibiting\ncapabilities of nuanced credit assignments, trajectory stitching, and\nadaptation to environmental contingencies. 
These results validate that latent\nvariable inference can be a strong alternative to step-wise reward prompting.\n","authors":["Deqian Kong","Dehong Xu","Minglu Zhao","Bo Pang","Jianwen Xie","Andrew Lizarraga","Yuhao Huang","Sirui Xie","Ying Nian Wu"],"pdf_url":"https://arxiv.org/pdf/2402.04647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04646v1","updated":"2024-02-07T08:18:06Z","published":"2024-02-07T08:18:06Z","title":"Learning with Diversification from Block Sparse Signal","summary":" This paper introduces a novel prior called Diversified Block Sparse Prior to\ncharacterize the widespread block sparsity phenomenon in real-world data. By\nallowing diversification on variance and correlation matrix, we effectively\naddress the sensitivity issue of existing block sparse learning methods to\npre-defined block information, which enables adaptive block estimation while\nmitigating the risk of overfitting. Based on this, a diversified block sparse\nBayesian learning method (DivSBL) is proposed, utilizing the EM algorithm and a\ndual ascent method for hyperparameter estimation. Moreover, we establish the\nglobal and local optimality theory of our model. Experiments validate the\nadvantages of DivSBL over existing algorithms.\n","authors":["Yanhao Zhang","Zhihan Zhu","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2402.04646v1.pdf","comment":"12 pages, 12 figures, 3 tables"},{"id":"http://arxiv.org/abs/2402.04644v1","updated":"2024-02-07T08:16:40Z","published":"2024-02-07T08:16:40Z","title":"LEVI: Generalizable Fine-tuning via Layer-wise Ensemble of Different\n Views","summary":" Fine-tuning is becoming widely used for leveraging the power of pre-trained\nfoundation models in new downstream tasks. While there are many successes of\nfine-tuning on various tasks, recent studies have observed challenges in the\ngeneralization of fine-tuned models to unseen distributions (i.e.,\nout-of-distribution; OOD). To improve OOD generalization, some previous studies\nidentify the limitations of fine-tuning data and regulate fine-tuning to\npreserve the general representation learned from pre-training data. However,\npotential limitations in the pre-training data and models are often ignored. In\nthis paper, we contend that overly relying on the pre-trained representation\nmay hinder fine-tuning from learning essential representations for downstream\ntasks and thus hurt its OOD generalization. It can be especially catastrophic\nwhen new tasks are from different (sub)domains compared to pre-training data.\nTo address the issues in both pre-training and fine-tuning data, we propose a\nnovel generalizable fine-tuning method LEVI, where the pre-trained model is\nadaptively ensembled layer-wise with a small task-specific model, while\npreserving training and inference efficiencies. By combining two complementing\nmodels, LEVI effectively suppresses problematic features in both the\nfine-tuning data and pre-trained model and preserves useful features for new\ntasks. Broad experiments with large language and vision models show that LEVI\ngreatly improves fine-tuning generalization via emphasizing different views\nfrom fine-tuning data and pre-trained features.\n","authors":["Yuji Roh","Qingyun Liu","Huan Gui","Zhe Yuan","Yujin Tang","Steven Euijong Whang","Liang Liu","Shuchao Bi","Lichan Hong","Ed H. 
Chi","Zhe Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.04644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01345v3","updated":"2024-02-07T08:07:02Z","published":"2024-02-02T12:02:46Z","title":"Skip \\n: A Simple Method to Reduce Hallucination in Large\n Vision-Language Models","summary":" Recent advancements in large vision-language models (LVLMs) have demonstrated\nimpressive capability in visual information understanding with human language.\nDespite these advances, LVLMs still face challenges with multimodal\nhallucination, such as generating text descriptions of objects that are not\npresent in the visual information. However, the underlying fundamental reasons\nof multimodal hallucinations remain poorly explored. In this paper, we propose\na new perspective, suggesting that the inherent biases in LVLMs might be a key\nfactor in hallucinations. Specifically, we systematically identify a semantic\nshift bias related to paragraph breaks (\\n\\n), where the content before and\nafter '\\n\\n' in the training data frequently exhibit significant semantic\nchanges. This pattern leads the model to infer that the contents following\n'\\n\\n' should be obviously different from the preceding contents with less\nhallucinatory descriptions, thereby increasing the probability of hallucinatory\ndescriptions subsequent to the '\\n\\n'. We have validated this hypothesis on\nmultiple publicly available LVLMs. Besides, we find that deliberately inserting\n'\\n\\n' at the generated description can induce more hallucinations. A simple\nmethod is proposed to effectively mitigate the hallucination of LVLMs by\nskipping the output of '\\n'.\n","authors":["Zongbo Han","Zechen Bai","Haiyang Mei","Qianli Xu","Changqing Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2402.01345v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2402.04640v1","updated":"2024-02-07T07:57:43Z","published":"2024-02-07T07:57:43Z","title":"Domain Bridge: Generative model-based domain forensic for black-box\n models","summary":" In forensic investigations of machine learning models, techniques that\ndetermine a model's data domain play an essential role, with prior work relying\non large-scale corpora like ImageNet to approximate the target model's domain.\nAlthough such methods are effective in finding broad domains, they often\nstruggle in identifying finer-grained classes within those domains. In this\npaper, we introduce an enhanced approach to determine not just the general data\ndomain (e.g., human face) but also its specific attributes (e.g., wearing\nglasses). Our approach uses an image embedding model as the encoder and a\ngenerative model as the decoder. Beginning with a coarse-grained description,\nthe decoder generates a set of images, which are then presented to the unknown\ntarget model. Successful classifications by the model guide the encoder to\nrefine the description, which in turn, are used to produce a more specific set\nof images in the subsequent iteration. This iterative refinement narrows down\nthe exact class of interest. A key strength of our approach lies in leveraging\nthe expansive dataset, LAION-5B, on which the generative model Stable Diffusion\nis trained. This enlarges our search space beyond traditional corpora, such as\nImageNet. 
Empirical results showcase our method's performance in identifying\nspecific attributes of a model's input domain, paving the way for more detailed\nforensic analyses of deep learning models.\n","authors":["Jiyi Zhang","Han Fang","Ee-Chien Chang"],"pdf_url":"https://arxiv.org/pdf/2402.04640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01364v2","updated":"2024-02-07T07:14:39Z","published":"2024-02-02T12:34:09Z","title":"Continual Learning for Large Language Models: A Survey","summary":" Large language models (LLMs) are not amenable to frequent re-training, due to\nhigh training costs arising from their massive scale. However, updates are\nnecessary to endow LLMs with new skills and keep them up-to-date with rapidly\nevolving human knowledge. This paper surveys recent works on continual learning\nfor LLMs. Due to the unique nature of LLMs, we catalog continual learning\ntechniques in a novel multi-staged categorization scheme, involving continual\npretraining, instruction tuning, and alignment. We contrast continual learning\nfor LLMs with simpler adaptation methods used in smaller models, as well as\nwith other enhancement strategies like retrieval-augmented generation and model\nediting. Moreover, informed by a discussion of benchmarks and evaluation, we\nidentify several challenges and future work directions for this crucial task.\n","authors":["Tongtong Wu","Linhao Luo","Yuan-Fang Li","Shirui Pan","Thuy-Trang Vu","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2402.01364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04621v1","updated":"2024-02-07T07:09:15Z","published":"2024-02-07T07:09:15Z","title":"Feature Distribution on Graph Topology Mediates the Effect of Graph\n Convolution: Homophily Perspective","summary":" How would randomly shuffling feature vectors among nodes from the same class\naffect graph neural networks (GNNs)? The feature shuffle, intuitively, perturbs\nthe dependence between graph topology and features (A-X dependence) for GNNs to\nlearn from. Surprisingly, we observe a consistent and significant improvement\nin GNN performance following the feature shuffle. Having overlooked the impact\nof A-X dependence on GNNs, the prior literature does not provide a satisfactory\nunderstanding of the phenomenon. Thus, we raise two research questions. First,\nhow should A-X dependence be measured, while controlling for potential\nconfounds? Second, how does A-X dependence affect GNNs? In response, we (i)\npropose a principled measure for A-X dependence, (ii) design a random graph\nmodel that controls A-X dependence, (iii) establish a theory on how A-X\ndependence relates to graph convolution, and (iv) present empirical analysis on\nreal-world graphs that aligns with the theory. We conclude that A-X dependence\nmediates the effect of graph convolution, such that smaller dependence improves\nGNN-based node classification.\n","authors":["Soo Yong Lee","Sunwoo Kim","Fanchen Bu","Jaemin Yoo","Jiliang Tang","Kijung Shin"],"pdf_url":"https://arxiv.org/pdf/2402.04621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04620v1","updated":"2024-02-07T07:07:02Z","published":"2024-02-07T07:07:02Z","title":"CataractBot: An LLM-Powered Expert-in-the-Loop Chatbot for Cataract\n Patients","summary":" The healthcare landscape is evolving, with patients seeking more reliable\ninformation about their health conditions, treatment options, and potential\nrisks. 
Despite the abundance of information sources, the digital age overwhelms\nindividuals with an excess of often inaccurate information. Patients primarily\ntrust doctors and hospital staff, highlighting the need for expert-endorsed\nhealth information. However, the pressure on experts has led to reduced\ncommunication time, impacting information sharing. To address this gap, we\npropose CataractBot, an experts-in-the-loop chatbot powered by large language\nmodels (LLMs). Developed in collaboration with a tertiary eye hospital in\nIndia, CataractBot answers cataract surgery-related questions instantly by\nquerying a curated knowledge base, and provides expert-verified responses\nasynchronously. CataractBot features multimodal support and multilingual\ncapabilities. In an in-the-wild deployment study with 49 participants,\nCataractBot proved valuable, providing anytime accessibility, saving time, and\naccommodating diverse literacy levels. Trust was established through expert\nverification. Broadly, our results could inform future work on designing\nexpert-mediated LLM bots.\n","authors":["Pragnya Ramjee","Bhuvan Sachdeva","Satvik Golechha","Shreyas Kulkarni","Geeta Fulari","Kaushik Murali","Mohit Jain"],"pdf_url":"https://arxiv.org/pdf/2402.04620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01945v3","updated":"2024-02-07T06:52:29Z","published":"2023-09-05T04:39:34Z","title":"OHQ: On-chip Hardware-aware Quantization","summary":" Quantization emerges as one of the most promising approaches for deploying\nadvanced deep models on resource-constrained hardware. Mixed-precision\nquantization leverages multiple bit-width architectures to unleash the accuracy\nand efficiency potential of quantized models. However, existing mixed-precision\nquantization suffers from an exhaustive search space that causes immense\ncomputational overhead. The quantization process thus relies on separate\nhigh-performance devices rather than running locally, which also leads to a\nsignificant gap between the considered hardware metrics and the real\ndeployment. In this paper, we propose an On-chip Hardware-aware Quantization\n(OHQ) framework that performs hardware-aware mixed-precision quantization\nwithout accessing online devices. First, we construct the On-chip Quantization\nAwareness (OQA) pipeline, enabling perception of the actual efficiency metrics\nof the quantization operator on the hardware. Second, we propose the\nMask-guided Quantization Estimation (MQE) technique to efficiently estimate the\naccuracy metrics of operators under the constraints of on-chip-level computing\npower. By synthesizing network and hardware insights through linear\nprogramming, we obtain optimized bit-width configurations. Notably, the\nquantization process occurs entirely on-chip without any additional computing\ndevices or data access. We demonstrate accelerated inference after quantization\nfor various architectures and compression ratios, achieving 70% and 73%\naccuracy for ResNet-18 and MobileNetV3, respectively. 
OHQ improves latency by 15~30% compared to INT8 in\ndeployment.\n","authors":["Wei Huang","Haotong Qin","Yangdong Liu","Jingzhuo Liang","Yulun Zhang","Ying Li","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.01945v3.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.04617v1","updated":"2024-02-07T06:50:42Z","published":"2024-02-07T06:50:42Z","title":"InfLLM: Unveiling the Intrinsic Capacity of LLMs for Understanding\n Extremely Long Sequences with Training-Free Memory","summary":" Large language models (LLMs) have emerged as a cornerstone in real-world\napplications with lengthy streaming inputs, such as LLM-driven agents. However,\nexisting LLMs, pre-trained on sequences with restricted maximum length, cannot\ngeneralize to longer sequences due to the out-of-domain and distraction issues.\nTo alleviate these issues, existing efforts employ sliding attention windows\nand discard distant tokens to achieve the processing of extremely long\nsequences. Unfortunately, these approaches inevitably fail to capture\nlong-distance dependencies within sequences to deeply understand semantics.\nThis paper introduces a training-free memory-based method, InfLLM, to unveil\nthe intrinsic ability of LLMs to process streaming long sequences.\nSpecifically, InfLLM stores distant contexts into additional memory units and\nemploys an efficient mechanism to look up token-relevant units for attention\ncomputation. Thereby, InfLLM allows LLMs to efficiently process long sequences\nwhile maintaining the ability to capture long-distance dependencies. Without\nany training, InfLLM enables LLMs pre-trained on sequences of a few thousand\ntokens to achieve performance superior to that of competitive baselines\nobtained by continually training these LLMs on long sequences. Even when the\nsequence length is scaled to $1,024$K, InfLLM still effectively captures\nlong-distance dependencies.\n","authors":["Chaojun Xiao","Pengle Zhang","Xu Han","Guangxuan Xiao","Yankai Lin","Zhengyan Zhang","Zhiyuan Liu","Song Han","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.04617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.17760v2","updated":"2024-02-07T06:49:41Z","published":"2024-01-31T11:37:14Z","title":"Regularized Linear Discriminant Analysis Using a Nonlinear Covariance\n Matrix Estimator","summary":" Linear discriminant analysis (LDA) is a widely used technique for data\nclassification. The method offers adequate performance in many classification\nproblems, but it becomes inefficient when the data covariance matrix is\nill-conditioned. This often occurs when the feature space's dimensionality is\nhigher than or comparable to the training data size. Regularized LDA (RLDA)\nmethods based on regularized linear estimators of the data covariance matrix\nhave been proposed to cope with such a situation. The performance of RLDA\nmethods is well studied, with optimal regularization schemes already proposed.\nIn this paper, we investigate the capability of a positive semidefinite\nridge-type estimator of the inverse covariance matrix that coincides with a\nnonlinear (NL) covariance matrix estimator. The estimator is derived by\nreformulating the score function of the optimal classifier utilizing linear\nestimation methods, which eventually results in the proposed NL-RLDA\nclassifier. We derive asymptotic and consistent estimators of the proposed\ntechnique's misclassification rate under the assumptions of a double-asymptotic\nregime and a multivariate Gaussian model for the classes. 
The consistent\nestimator, coupled with a one-dimensional grid search, is used to set the value\nof the regularization parameter required for the proposed NL-RLDA classifier.\nPerformance evaluations based on both synthetic and real data demonstrate the\neffectiveness of the proposed classifier, which outperforms state-of-the-art\nmethods across multiple datasets.\n","authors":["Maaz Mahadi","Tarig Ballal","Muhammad Moinuddin","Tareq Y. Al-Naffouri","Ubaid M. Al-Saggaf"],"pdf_url":"https://arxiv.org/pdf/2401.17760v2.pdf","comment":"\\copyright 2024 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2402.04616v1","updated":"2024-02-07T06:48:24Z","published":"2024-02-07T06:48:24Z","title":"TinyLLM: Learning a Small Student from Multiple Large Language Models","summary":" Transferring the reasoning capability from stronger large language models\n(LLMs) to smaller ones has been quite appealing, as smaller LLMs are more\nflexible to deploy with less expense. Among the existing solutions, knowledge\ndistillation stands out due to its outstanding efficiency and generalization.\nHowever, existing methods suffer from several drawbacks, including limited\nknowledge diversity and the lack of rich contextual information. To solve the\nproblems and facilitate the learning of compact language models, we propose\nTinyLLM, a novel knowledge distillation paradigm to learn a small student LLM\nfrom multiple large teacher LLMs. In particular, we encourage the student LLM\nto not only generate the correct answers but also understand the rationales\nbehind these answers. Given that different LLMs possess diverse reasoning\nskills, we guide the student model to assimilate knowledge from various teacher\nLLMs. We further introduce an in-context example generator and a\nteacher-forcing Chain-of-Thought strategy to ensure that the rationales are\naccurate and grounded in contextually appropriate scenarios. Extensive\nexperiments on six datasets across two reasoning tasks demonstrate the\nsuperiority of our method. Results show that TinyLLM can outperform large\nteacher LLMs significantly, despite having a considerably smaller model size.\n","authors":["Yijun Tian","Yikun Han","Xiusi Chen","Wei Wang","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2402.04616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16803v2","updated":"2024-02-07T06:48:12Z","published":"2024-01-30T07:50:32Z","title":"PBSCSR: The Piano Bootleg Score Composer Style Recognition Dataset","summary":" This article motivates, describes, and presents the PBSCSR dataset for\nstudying composer style recognition of piano sheet music. Our overarching goal\nwas to create a dataset for studying composer style recognition that is \"as\naccessible as MNIST and as challenging as ImageNet\". To achieve this goal, we\nuse a previously proposed feature representation of sheet music called a\nbootleg score, which encodes the position of noteheads relative to the staff\nlines. 
Using this representation, we sample fixed-length bootleg score\nfragments from piano sheet music images on IMSLP. The dataset itself contains\n40,000 62x64 bootleg score images for a 9-way classification task, 100,000\n62x64 bootleg score images for a 100-way classification task, and 29,310\nunlabeled variable-length bootleg score images for pretraining. The labeled\ndata is presented in a form that mirrors MNIST images, in order to make it\nextremely easy to visualize, manipulate, and train models in an efficient\nmanner. Additionally, we include relevant metadata to allow access to the\nunderlying raw sheet music images and other related data on IMSLP. We describe\nseveral research tasks that could be studied with the dataset, including\nvariations of composer style recognition in a few-shot or zero-shot setting.\nFor tasks that have previously proposed models, we release code and baseline\nresults for future works to compare against. We also discuss open research\nquestions that the PBSCSR data is especially well suited to facilitate research\non, as well as areas of fruitful exploration in future work.\n","authors":["Arhan Jain","Alec Bunn","Austin Pham","TJ Tsai"],"pdf_url":"https://arxiv.org/pdf/2401.16803v2.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2211.13692v3","updated":"2024-02-07T06:45:30Z","published":"2022-11-24T16:16:40Z","title":"To be or not to be stable, that is the question: understanding neural\n networks for inverse problems","summary":" The solution of linear inverse problems arising, for example, in signal and\nimage processing is a challenging problem, since the ill-conditioning\namplifies, in the solution, the noise present in the data. Recently introduced\nalgorithms based on deep learning outperform the more traditional model-based\napproaches, but they typically suffer from instability with respect to data\nperturbation. In this paper, we theoretically analyze the trade-off between\nstability and accuracy of neural networks when used to solve linear imaging\ninverse problems in non-underdetermined cases. Moreover, we propose different\nsupervised and unsupervised solutions to increase the network stability and\nmaintain good accuracy, by means of regularization properties inherited from\na model-based iterative scheme during network training, and a pre-processing\nstabilizing operator in the neural networks. Extensive numerical experiments on\nimage deblurring confirm the theoretical results and the effectiveness of the\nproposed deep learning-based approaches in handling noise in the data.\n","authors":["Davide Evangelista","James Nagy","Elena Morotti","Elena Loli Piccolomini"],"pdf_url":"https://arxiv.org/pdf/2211.13692v3.pdf","comment":"21 pages, 6 figures. This paper will be submitted to a journal soon.\n This is a preliminary version; updated versions will be uploaded to\n ArXiv"},{"id":"http://arxiv.org/abs/2402.04613v1","updated":"2024-02-07T06:30:39Z","published":"2024-02-07T06:30:39Z","title":"Wasserstein Gradient Flows for Moreau Envelopes of f-Divergences in\n Reproducing Kernel Hilbert Spaces","summary":" Most commonly used $f$-divergences of measures, e.g., the Kullback-Leibler\ndivergence, are subject to limitations regarding the support of the involved\nmeasures. A remedy consists of regularizing the $f$-divergence by a squared\nmaximum mean discrepancy (MMD) associated with a characteristic kernel $K$. 
In\nthis paper, we use the so-called kernel mean embedding to show that the\ncorresponding regularization can be rewritten as the Moreau envelope of some\nfunction in the reproducing kernel Hilbert space associated with $K$. Then, we\nexploit well-known results on Moreau envelopes in Hilbert spaces to prove\nproperties of the MMD-regularized $f$-divergences and, in particular, their\ngradients. Subsequently, we use our findings to analyze Wasserstein gradient\nflows of MMD-regularized $f$-divergences. Finally, we consider Wasserstein\ngradient flows starting from empirical measures and provide\nproof-of-concept numerical examples with Tsallis-$\\alpha$ divergences.\n","authors":["Sebastian Neumayer","Viktor Stein","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2402.04613v1.pdf","comment":"42 pages, 13 figures"},{"id":"http://arxiv.org/abs/2402.00045v3","updated":"2024-02-07T06:27:12Z","published":"2024-01-22T15:08:19Z","title":"Detecting Multimedia Generated by Large AI Models: A Survey","summary":" The rapid advancement of Large AI Models (LAIMs), particularly diffusion\nmodels and large language models, has marked a new era where AI-generated\nmultimedia is increasingly integrated into various aspects of daily life.\nAlthough beneficial in numerous fields, this content presents significant\nrisks, including potential misuse, societal disruptions, and ethical concerns.\nConsequently, detecting multimedia generated by LAIMs has become crucial, with\na marked rise in related research. Despite this, there remains a notable gap in\nsystematic surveys that focus specifically on detecting LAIM-generated\nmultimedia. Addressing this, we provide the first survey to comprehensively\ncover existing research on detecting multimedia (such as text, images, videos,\naudio, and multimodal content) created by LAIMs. Specifically, we introduce a\nnovel taxonomy for detection methods, categorized by media modality, and\naligned with two perspectives: pure detection (aiming to enhance detection\nperformance) and beyond detection (adding attributes like generalizability,\nrobustness, and interpretability to detectors). Additionally, we present\na brief overview of generation mechanisms, public datasets, and online\ndetection tools to provide a valuable resource for researchers and\npractitioners in this field. Furthermore, we identify current challenges in\ndetection and propose directions for future research that address unexplored,\nongoing, and emerging issues in detecting multimedia generated by LAIMs. Our\naim for this survey is to fill an academic gap and contribute to global AI\nsecurity efforts, helping to ensure the integrity of information in the digital\nrealm. The project link is\nhttps://github.com/Purdue-M2/Detect-LAIM-generated-Multimedia-Survey.\n","authors":["Li Lin","Neeraj Gupta","Yue Zhang","Hainan Ren","Chun-Hao Liu","Feng Ding","Xin Wang","Xin Li","Luisa Verdoliva","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2402.00045v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02347v2","updated":"2024-02-07T06:17:13Z","published":"2024-02-04T05:05:43Z","title":"Riemannian Preconditioned LoRA for Fine-Tuning Foundation Models","summary":" In this work we study the enhancement of the Low-Rank Adaptation (LoRA)\nfine-tuning procedure by introducing a Riemannian preconditioner in its\noptimization step. Specifically, we introduce an $r\\times r$ preconditioner in\neach gradient step where $r$ is the LoRA rank. 
This preconditioner requires a\nsmall change to existing optimizer code and creates minimal storage\nand runtime overhead. Our experimental results with both large language models\nand text-to-image diffusion models show that with our preconditioner, the\nconvergence and reliability of SGD and AdamW can be significantly enhanced.\nMoreover, the training process becomes much more robust to hyperparameter\nchoices such as learning rate. Theoretically, we show that fine-tuning a\ntwo-layer ReLU network in the convex parameterization with our preconditioner\nhas a convergence rate independent of the condition number of the data matrix.\nThis new Riemannian preconditioner, previously explored in classic low-rank\nmatrix recovery, is introduced to deep learning tasks for the first time in our\nwork. We release our code at\nhttps://github.com/pilancilab/Riemannian_Preconditioned_LoRA.\n","authors":["Fangzhao Zhang","Mert Pilanci"],"pdf_url":"https://arxiv.org/pdf/2402.02347v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04193v2","updated":"2024-02-07T06:13:23Z","published":"2024-02-06T17:49:02Z","title":"Gradient Coding in Decentralized Learning for Evading Stragglers","summary":" In this paper, we consider a decentralized learning problem in the presence\nof stragglers. Although gradient coding techniques have been developed for\ndistributed learning to evade stragglers, where the devices send encoded\ngradients with redundant training data, it is difficult to apply those\ntechniques directly to decentralized learning scenarios. To deal with this\nproblem, we propose a new gossip-based decentralized learning method with\ngradient coding (GOCO). In the proposed method, to avoid the negative impact of\nstragglers, the parameter vectors are updated locally using encoded gradients\nbased on the framework of stochastic gradient coding and then averaged in a\ngossip-based manner. We analyze the convergence performance of GOCO for\nstrongly convex loss functions. We also provide simulation results to\ndemonstrate the superiority of the proposed method in terms of learning\nperformance compared with the baseline methods.\n","authors":["Chengxi Li","Mikael Skoglund"],"pdf_url":"https://arxiv.org/pdf/2402.04193v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.13967v3","updated":"2024-02-07T11:21:48Z","published":"2024-01-25T05:58:04Z","title":"Perceptual-oriented Learned Image Compression with Dynamic Kernel","summary":" In this paper, we extend our prior work, DKIC, and propose the\nperceptual-oriented learned image compression method, PO-DKIC. Specifically,\nDKIC adopts a dynamic kernel-based dynamic residual block group to enhance the\ntransform coding and an asymmetric space-channel context entropy model to\nfacilitate the estimation of Gaussian parameters. Based on DKIC, PO-DKIC\nintroduces PatchGAN and LPIPS loss to enhance visual quality. Furthermore, to\nmaximize the overall perceptual quality under a rate constraint, we formulate\nthis challenge as a constrained programming problem and solve it via linear\ninteger programming. 
The experiments demonstrate that our\nproposed method can generate realistic images with richer textures and finer\ndetails when compared to state-of-the-art image compression techniques.\n","authors":["Nianxiang Fu","Junxi Zhang","Huairui Wang","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2401.13967v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00045v3","updated":"2024-02-07T06:27:12Z","published":"2024-01-22T15:08:19Z","title":"Detecting Multimedia Generated by Large AI Models: A Survey","summary":" The rapid advancement of Large AI Models (LAIMs), particularly diffusion\nmodels and large language models, has marked a new era where AI-generated\nmultimedia is increasingly integrated into various aspects of daily life.\nAlthough beneficial in numerous fields, this content presents significant\nrisks, including potential misuse, societal disruptions, and ethical concerns.\nConsequently, detecting multimedia generated by LAIMs has become crucial, with\na marked rise in related research. Despite this, there remains a notable gap in\nsystematic surveys that focus specifically on detecting LAIM-generated\nmultimedia. Addressing this, we provide the first survey to comprehensively\ncover existing research on detecting multimedia (such as text, images, videos,\naudio, and multimodal content) created by LAIMs. Specifically, we introduce a\nnovel taxonomy for detection methods, categorized by media modality, and\naligned with two perspectives: pure detection (aiming to enhance detection\nperformance) and beyond detection (adding attributes like generalizability,\nrobustness, and interpretability to detectors). Additionally, we present\na brief overview of generation mechanisms, public datasets, and online\ndetection tools to provide a valuable resource for researchers and\npractitioners in this field. Furthermore, we identify current challenges in\ndetection and propose directions for future research that address unexplored,\nongoing, and emerging issues in detecting multimedia generated by LAIMs. Our\naim for this survey is to fill an academic gap and contribute to global AI\nsecurity efforts, helping to ensure the integrity of information in the digital\nrealm. 
The project link is\nhttps://github.com/Purdue-M2/Detect-LAIM-generated-Multimedia-Survey.\n","authors":["Li Lin","Neeraj Gupta","Yue Zhang","Hainan Ren","Chun-Hao Liu","Feng Ding","Xin Wang","Xin Li","Luisa Verdoliva","Shu Hu"],"pdf_url":"https://arxiv.org/pdf/2402.00045v3.pdf","comment":null}]}}
\ No newline at end of file
diff --git a/favicon.ico b/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5
GIT binary patch
literal 15086
[favicon.ico binary patch data omitted]
[remainder of favicon.ico binary patch omitted]
[index.html begins; its HTML head tags were stripped in extraction, leaving only the page title: MyArxiv]
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 75 + +
+
+
+ + ☆ Edu-ConvoKit: An Open-Source Library for Education Conversation Data + + +
+ We introduce Edu-ConvoKit, an open-source library designed to handle +pre-processing, annotation and analysis of conversation data in education. +Resources for analyzing education conversation data are scarce, making the +research challenging to perform and therefore hard to access. We address these +challenges with Edu-ConvoKit. Edu-ConvoKit is open-source +(https://github.com/stanfordnlp/edu-convokit ), pip-installable +(https://pypi.org/project/edu-convokit/ ), with comprehensive documentation +(https://edu-convokit.readthedocs.io/en/latest/ ). Our demo video is available +at: https://youtu.be/zdcI839vAko?si=h9qlnl76ucSuXb8- . We include additional +resources, such as Colab applications of Edu-ConvoKit to three diverse +education datasets and a repository of Edu-ConvoKit related papers, that can be +found in our GitHub repository. + +
+
+ comment: https://github.com/stanfordnlp/edu-convokit + https://edu-convokit.readthedocs.io/en/latest/ +
+
+
+
+
+ + ☆ Image captioning for Brazilian Portuguese using GRIT model + + +
+ This work presents the early development of an image captioning model for
+the Brazilian Portuguese language. We used the GRIT (Grid- and Region-based
+Image captioning Transformer) model to accomplish this work. GRIT is a
+Transformer-only neural architecture that effectively utilizes two visual
+features to generate better captions. The GRIT method was proposed as a more
+efficient way to generate image captions. In this work, we adapt the GRIT
+model to train on a Brazilian Portuguese dataset, yielding an image
+captioning method for the Brazilian Portuguese language.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2207.09666 by other authors +
+
+
+
+
+ + ☆ A Roadmap to Pluralistic Alignment + + +
+ With increased power and prevalence of AI systems, it is ever more critical +that AI systems are designed to serve all, i.e., people with diverse values and +perspectives. However, aligning models to serve pluralistic human values +remains an open research question. In this piece, we propose a roadmap to +pluralistic alignment, specifically using language models as a test bed. We +identify and formalize three possible ways to define and operationalize +pluralism in AI systems: 1) Overton pluralistic models that present a spectrum +of reasonable responses; 2) Steerably pluralistic models that can steer to +reflect certain perspectives; and 3) Distributionally pluralistic models that +are well-calibrated to a given population in distribution. We also propose and +formalize three possible classes of pluralistic benchmarks: 1) Multi-objective +benchmarks, 2) Trade-off steerable benchmarks, which incentivize models to +steer to arbitrary trade-offs, and 3) Jury-pluralistic benchmarks which +explicitly model diverse human ratings. We use this framework to argue that +current alignment techniques may be fundamentally limited for pluralistic AI; +indeed, we highlight empirical evidence, both from our own experiments and from +other work, that standard alignment procedures might reduce distributional +pluralism in models, motivating the need for further research on pluralistic +alignment. + +
+
+
+
+
+ + ☆ SALAD-Bench: A Hierarchical and Comprehensive Safety Benchmark for Large + Language Models + + +
+ In the rapidly evolving landscape of Large Language Models (LLMs), ensuring
+robust safety measures is paramount. To meet this crucial need, we propose
+SALAD-Bench, a safety benchmark specifically designed for evaluating
+LLMs, attack methods, and defense methods. Distinguished by its breadth,
+SALAD-Bench transcends conventional benchmarks through its large scale, rich
+diversity, intricate taxonomy spanning three levels, and versatile
+functionalities. SALAD-Bench is crafted with a meticulous array of questions,
+from standard queries to complex ones enriched with attack and defense
+modifications and multiple-choice formats. To effectively manage the inherent
+complexity, we introduce an innovative evaluator: the LLM-based MD-Judge for
+QA pairs, with a particular focus on attack-enhanced queries, ensuring
+seamless and reliable evaluation. These components extend SALAD-Bench from
+standard LLM safety evaluation to the evaluation of both LLM attack and
+defense methods, ensuring joint-purpose utility. Our extensive experiments shed
+light on the resilience of LLMs against emerging threats and the efficacy of
+contemporary defense tactics. Data and evaluator are released at
+https://github.com/OpenSafetyLab/SALAD-BENCH. Warning: this paper
+includes examples that may be offensive or harmful.
+
+
+
+
+
+ + ☆ How BERT Speaks Shakespearean English? Evaluating Historical Bias in + Contextual Language Models + + +
+ In this paper, we explore the idea of analysing the historical bias of +contextual language models based on BERT by measuring their adequacy with +respect to Early Modern (EME) and Modern (ME) English. In our preliminary +experiments, we perform fill-in-the-blank tests with 60 masked sentences (20 +EME-specific, 20 ME-specific and 20 generic) and three different models (i.e., +BERT Base, MacBERTh, English HLM). We then rate the model predictions according +to a 5-point bipolar scale between the two language varieties and derive a +weighted score to measure the adequacy of each model to EME and ME varieties of +English. + +
+
+
+
+
+ + ☆ Pedagogical Alignment of Large Language Models + + +
+ In this paper, we introduce the novel concept of pedagogically aligned Large +Language Models (LLMs) that signifies a transformative shift in the application +of LLMs within educational contexts. Rather than providing direct responses to +user queries, pedagogically-aligned LLMs function as scaffolding tools, +breaking complex problems into manageable subproblems and guiding students +towards the final answer through constructive feedback and hints. The objective +is to equip learners with problem-solving strategies that deepen their +understanding and internalization of the subject matter. Previous research in +this field has primarily applied the supervised finetuning approach without +framing the objective as an alignment problem, hence not employing +reinforcement learning through human feedback (RLHF) methods. This study +reinterprets the narrative by viewing the task through the lens of alignment +and demonstrates how RLHF methods emerge naturally as a superior alternative +for aligning LLM behaviour. Building on this perspective, we propose a novel +approach for constructing a reward dataset specifically designed for the +pedagogical alignment of LLMs. We apply three state-of-the-art RLHF algorithms +and find that they outperform SFT significantly. Our qualitative analyses +across model differences and hyperparameter sensitivity further validate the +superiority of RLHF over SFT. Also, our study sheds light on the potential of +online feedback for enhancing the performance of pedagogically-aligned LLMs, +thus providing valuable insights for the advancement of these models in +educational settings. + +
+
+
+
+
+ + ☆ An Enhanced Prompt-Based LLM Reasoning Scheme via Knowledge + Graph-Integrated Collaboration + + +
+ While Large Language Models (LLMs) demonstrate exceptional performance in a
+multitude of Natural Language Processing (NLP) tasks, they encounter challenges
+in practical applications, including issues with hallucinations, inadequate
+knowledge updating, and limited transparency in the reasoning process. To
+overcome these limitations, this study innovatively proposes a collaborative
+training-free reasoning scheme involving tight cooperation between a Knowledge
+Graph (KG) and LLMs. This scheme first involves using LLMs to iteratively
+explore the KG, selectively retrieving a task-relevant knowledge subgraph to
+support reasoning. The LLMs are then guided to further combine inherent
+implicit knowledge to reason on the subgraph while explicitly elucidating the
+reasoning process. Through such a cooperative approach, our scheme achieves
+more reliable knowledge-based reasoning and facilitates the tracing of the
+reasoning results. Experimental results show that our scheme makes significant
+progress across multiple datasets, notably achieving over a 10% improvement
+on the QALD10 dataset compared to the best baseline and the fine-tuned
+state-of-the-art (SOTA) work. Building on this success, this study hopes to
+offer a valuable reference for future research in the fusion of KG and LLMs,
+thereby enhancing LLMs' proficiency in solving complex issues.
+
+
+
+
+
+ + ☆ Text or Image? What is More Important in Cross-Domain Generalization + Capabilities of Hate Meme Detection Models? EACL'2024 + + +
+ This paper delves into the formidable challenge of cross-domain
+generalization in multimodal hate meme detection, presenting compelling
+findings. We provide substantial evidence supporting the hypothesis that
+only the textual component of hateful memes enables the existing multimodal
+classifier to generalize across different domains, while the image component
+proves highly sensitive to a specific training dataset. The evidence includes
+demonstrations showing that hate-text classifiers perform similarly to
+hate-meme classifiers in a zero-shot setting. Simultaneously, the introduction
+of captions generated from images of memes to the hate-meme classifier worsens
+performance by an average F1 of 0.02. Through black-box explanations, we
+identify a substantial contribution of the text modality (average of 83%),
+which diminishes with the introduction of the memes' image captions (52%).
+Additionally, our evaluation on a newly created confounder dataset reveals
+higher performance on text confounders as compared to image confounders with an
+average $\Delta$F1 of 0.18.
+
+
+ comment: Accepted at EACL'2024 Findings +
+
+
+
+
+ + ☆ Reconfidencing LLMs from the Grouping Loss Perspective + + +
+ Large Language Models (LLMs), including ChatGPT and LLaMA, are susceptible to
+generating hallucinated answers in a confident tone. While efforts to elicit
+and calibrate confidence scores have proven useful, recent findings show that
+controlling uncertainty must go beyond calibration: predicted scores may
+deviate significantly from the actual posterior probabilities due to the impact
+of grouping loss. In this work, we construct a new evaluation dataset derived
+from a knowledge base to assess the confidence scores given to answers of
+Mistral and LLaMA. Experiments show that they tend to be overconfident.
+Further, we show that they are more overconfident on some answers than others,
+e.g., depending on the nationality of the person in the query. In
+uncertainty-quantification theory, this is grouping loss. To address this, we
+propose a solution to reconfidence LLMs, correcting not only calibration error
+but also grouping loss. After the reconfidencing process, the LLMs show
+improved alignment between their confidence and the accuracy of their
+responses.
+
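+ The fix for grouping loss has to act per subgroup rather than through one
+global calibration map. A minimal sketch of that idea using per-group isotonic
+regression (our illustration of the principle, not the paper's reconfidencing
+procedure; the field names are hypothetical):
+
+```python
+from sklearn.isotonic import IsotonicRegression
+
+def fit_groupwise_recalibrators(scores, correct, groups):
+    """Fit one monotone recalibration map per subgroup (e.g., per
+    nationality of the queried person) instead of a single global map,
+    so group-dependent overconfidence can be corrected."""
+    maps = {}
+    for g in set(groups):
+        idx = [i for i, gi in enumerate(groups) if gi == g]
+        maps[g] = IsotonicRegression(out_of_bounds="clip").fit(
+            [scores[i] for i in idx], [float(correct[i]) for i in idx])
+    return maps
+
+# Usage: maps = fit_groupwise_recalibrators(raw_scores, is_correct, group_ids)
+#        calibrated = maps[group].predict([raw_score])[0]
+```
+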
+
+
+
+
+ + ☆ Prompting Implicit Discourse Relation Annotation + + +
+ Pre-trained large language models, such as ChatGPT, achieve outstanding
+performance in various reasoning tasks without supervised training and were
+found to have outperformed crowdsourcing workers. Nonetheless, ChatGPT's
+performance in the task of implicit discourse relation classification, prompted
+by a standard multiple-choice question, is still far from satisfactory and
+considerably inferior to state-of-the-art supervised approaches. This work
+investigates several proven prompting techniques to improve ChatGPT's
+recognition of discourse relations. In particular, we experimented with
+breaking down the classification task that involves numerous abstract labels
+into smaller subtasks. However, experimental results show that the inference
+accuracy hardly changes even with sophisticated prompt engineering, suggesting
+that implicit discourse relation classification is not yet resolvable under
+zero-shot or few-shot settings.
+
+
+ comment: To appear at the Linguistic Annotation Workshop 2024 +
+
+
+
+
+ + ☆ Personalized Text Generation with Fine-Grained Linguistic Control + + +
+ As the text generation capabilities of large language models become +increasingly prominent, recent studies have focused on controlling particular +aspects of the generated text to make it more personalized. However, most +research on controllable text generation focuses on controlling the content or +modeling specific high-level/coarse-grained attributes that reflect authors' +writing styles, such as formality, domain, or sentiment. In this paper, we +focus on controlling fine-grained attributes spanning multiple linguistic +dimensions, such as lexical and syntactic attributes. We introduce a novel +benchmark to train generative models and evaluate their ability to generate +personalized text based on multiple fine-grained linguistic attributes. We +systematically investigate the performance of various large language models on +our benchmark and draw insights from the factors that impact their performance. +We make our code, data, and pretrained models publicly available. + +
+
+
+
+
+ + ☆ L4Q: Parameter Efficient Quantization-Aware Training on Large Language + Models via LoRA-wise LSQ + + +
+ Post-training quantization (PTQ) and quantization-aware training (QAT) +methods are gaining popularity in mitigating the high memory and computational +costs associated with Large Language Models (LLMs). In resource-constrained +scenarios, PTQ, with its reduced training overhead, is often preferred over +QAT, despite the latter's potential for higher accuracy. Meanwhile, +parameter-efficient fine-tuning (PEFT) methods like low-rank adaptation (LoRA) +have been introduced, and recent efforts have explored quantization-aware PEFT +techniques. However, these approaches may lack generality due to their reliance +on the pre-quantized model's configuration. Their effectiveness may be +compromised by non-linearly quantized or mixed-precision weights, and the +retraining of specific quantization parameters might impede optimal +performance. To address these challenges, we propose L4Q, an algorithm for +parameter-efficient quantization-aware training. L4Q leverages LoRA-wise +learned quantization step size for LLMs, aiming to enhance generality. The +simultaneous quantization-and-fine-tuning process of L4Q is applicable to +high-precision models, yielding linearly quantized weights with superior +accuracy. Our experiments, conducted on the LLaMA and LLaMA2 model families +using an instructional dataset, showcase L4Q's capabilities in language +comprehension and few-shot in-context learning, achieving sub-4-bit precision +while maintaining comparable training times to applying PEFT on a quantized +model. + +
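+ The "LoRA-wise learned quantization step size" builds on LSQ-style fake
+quantization, in which the step size is itself a trainable tensor. A minimal
+sketch of that underlying primitive (our simplification for illustration, not
+the full L4Q algorithm):
+
+```python
+import torch
+
+def lsq_fake_quant(w, step, bits=4):
+    """Quantize w with a learnable step size; a straight-through
+    estimator lets gradients flow to both w and step."""
+    qn, qp = -(2 ** (bits - 1)), 2 ** (bits - 1) - 1
+    q = torch.clamp(w / step, qn, qp)
+    q = q + (q.round() - q).detach()  # round in forward, identity in backward
+    return q * step
+
+w = torch.randn(8, 8)
+step = torch.tensor(0.05, requires_grad=True)
+lsq_fake_quant(w, step).sum().backward()
+print(step.grad is not None)  # True: the step size is learned jointly
+```
+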
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Detecting Generated Native Ads in Conversational Search WWW'24 + + +
+ Conversational search engines such as YouChat and Microsoft Copilot use large +language models (LLMs) to generate answers to queries. It is only a small step +to also use this technology to generate and integrate advertising within these +answers - instead of placing ads separately from the organic search results. +This type of advertising is reminiscent of native advertising and product +placement, both of which are very effective forms of subtle and manipulative +advertising. It is likely that information seekers will be confronted with such +use of LLM technology in the near future, especially when considering the high +computational costs associated with LLMs, for which providers need to develop +sustainable business models. This paper investigates whether LLMs can also be +used as a countermeasure against generated native ads, i.e., to block them. For +this purpose we compile a large dataset of ad-prone queries and of generated +answers with automatically integrated ads to experiment with fine-tuned +sentence transformers and state-of-the-art LLMs on the task of recognizing the +ads. In our experiments sentence transformers achieve detection precision and +recall values above 0.9, while the investigated LLMs struggle with the task. + +
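+ A minimal version of the detection pipeline can be assembled from
+off-the-shelf parts. The sketch below pairs a pre-trained sentence-transformers
+encoder with a logistic-regression head; the checkpoint name is a generic
+public model and the training texts are invented stand-ins, not the paper's
+fine-tuned models or data:
+
+```python
+from sentence_transformers import SentenceTransformer
+from sklearn.linear_model import LogisticRegression
+
+encoder = SentenceTransformer("all-MiniLM-L6-v2")
+texts = [
+    "Our tests show brand X vacuums beat every rival -- order yours today!",
+    "Robot vacuums typically navigate with lidar or camera-based SLAM.",
+]
+labels = [1, 0]  # 1 = answer contains a generated native ad, 0 = organic
+
+clf = LogisticRegression().fit(encoder.encode(texts), labels)
+print(clf.predict(encoder.encode(["Try the amazing brand X, on sale now."])))
+```
+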
+
+ comment: Submitted to WWW'24 Short Papers Track; 4 pages +
+
+
+
+
+ + ☆ On Provable Length and Compositional Generalization + + +
+ Length generalization -- the ability to generalize to longer sequences than +ones seen during training, and compositional generalization -- the ability to +generalize to token combinations not seen during training, are crucial forms of +out-of-distribution generalization in sequence-to-sequence models. In this +work, we take the first steps towards provable length and compositional +generalization for a range of architectures, including deep sets, transformers, +state space models, and simple recurrent neural nets. Depending on the +architecture, we prove different degrees of representation identification, +e.g., a linear or a permutation relation with ground truth representation, is +necessary for length and compositional generalization. + +
+
+
+
+
+ + ☆ CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay + + +
+ Large language models are increasingly solving tasks that are commonly +believed to require human-level reasoning ability. However, these models still +perform very poorly on benchmarks of general intelligence such as the +Abstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a +programming-by-examples problem, and introduce a novel and scalable method for +language model self-improvement called Code Iteration (CodeIt). Our method +iterates between 1) program sampling and hindsight relabeling, and 2) learning +from prioritized experience replay. By relabeling the goal of an episode (i.e., +the target program output given input) to the realized output produced by the +sampled program, our method effectively deals with the extreme sparsity of +rewards in program synthesis. Applying CodeIt to the ARC dataset, we +demonstrate that prioritized hindsight replay, along with pre-training and +data-augmentation, leads to successful inter-task generalization. CodeIt is the +first neuro-symbolic approach that scales to the full ARC evaluation dataset. +Our method solves 15% of ARC evaluation tasks, achieving state-of-the-art +performance and outperforming existing neural and symbolic baselines. + +
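+ The loop between program sampling and hindsight relabeling is compact enough
+to sketch. Everything below is an illustrative skeleton: `policy`, `run` (a
+sandboxed program executor), and `replay_buffer` are hypothetical stand-ins,
+and prioritization is delegated to the buffer:
+
+```python
+def codeit_iteration(policy, tasks, run, replay_buffer):
+    """One CodeIt-style round: sample programs, relabel goals in
+    hindsight, then train on prioritized replay."""
+    for task in tasks:
+        program = policy.sample_program(task.input)
+        realized = run(program, task.input)  # may differ from task.target
+        if realized is not None:  # the program executed successfully
+            # Hindsight relabeling: treat the realized output as the goal,
+            # turning a "failed" sample into a valid (input, goal, program)
+            # training triple and defeating reward sparsity.
+            replay_buffer.add(task.input, realized, program)
+    policy.train(replay_buffer.sample_prioritized())
+```
+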
+
+ comment: 8 pages, 11 figures +
+
+
+
+
+ + ☆ Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey + + +
+ Research surveys have always posed a challenge for beginner researchers who
+lack research training. These researchers struggle to understand the
+directions within their research topic and to discover new research
+findings within a short time. One way to provide intuitive assistance to
+beginner researchers is by offering relevant knowledge graphs (KG) and
+recommending related academic papers. However, existing navigation knowledge
+graphs primarily rely on keywords in the research field and often fail to
+present the logical hierarchy among multiple related papers clearly. Moreover,
+most recommendation systems for academic papers simply rely on high text
+similarity, which can leave researchers confused as to why a particular article
+is being recommended. They may fail to grasp important information about the
+insight connection between "Issue resolved" and "Issue finding" that they hope
+to obtain. To address these issues, this study aims to support research insight
+surveys for beginner researchers by establishing a hierarchical tree-structured
+knowledge graph that reflects the inheritance insight of research topics and
+the relevance insight among the academic papers.
+
+
+ comment: This paper will submit to '27th International Symposium on + Methodologies for Intelligent Systems'(ISMIS 2024) +
+
+
+
+
+ + ☆ PaDeLLM-NER: Parallel Decoding in Large Language Models for Named Entity + Recognition + + +
+ In this study, we aim to reduce generation latency for Named Entity
+Recognition (NER) with Large Language Models (LLMs). The main cause of high
+latency in LLMs is the sequential decoding process, which autoregressively
+generates all labels and mentions for NER, significantly increasing the
+sequence length. To this end, we introduce Parallel Decoding in LLM for NER
+(PaDeLLM-NER), an approach that integrates seamlessly into existing generative
+model frameworks without necessitating additional modules or architectural
+modifications. PaDeLLM-NER allows for the simultaneous decoding of all
+mentions, thereby reducing generation latency. Experiments reveal that
+PaDeLLM-NER achieves inference speeds 1.76 to 10.22 times
+faster than the autoregressive approach for both English and Chinese.
+Simultaneously, it maintains prediction quality, as evidenced by
+performance on par with the state-of-the-art across various datasets.
+
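+ The key change is structural: instead of one long autoregressive pass over
+all labels and mentions, each label gets its own short decode. A toy sketch
+(the prompt format and thread-based parallelism are our illustration; the
+paper parallelizes within the model's own batching, and `generate` is a
+hypothetical callable):
+
+```python
+from concurrent.futures import ThreadPoolExecutor
+
+def parallel_label_decode(generate, text, labels):
+    """Decode mentions for every entity label independently, so the
+    longest single decode -- not the sum of all -- bounds the latency."""
+    def decode_one(label):
+        return label, generate(f"Text: {text}\nList all {label} mentions:")
+    with ThreadPoolExecutor() as pool:
+        return dict(pool.map(decode_one, labels))
+
+# Toy usage with a stub model that returns an empty list for every label.
+print(parallel_label_decode(lambda p: "[]", "Alice met Bob in Paris.",
+                            ["PERSON", "LOCATION"]))
+```
+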
+
+
+
+
+ + ☆ Long Is More for Alignment: A Simple but Tough-to-Beat Baseline for + Instruction Fine-Tuning + + +
+ There is a consensus that instruction fine-tuning of LLMs requires +high-quality data, but what are they? LIMA (NeurIPS 2023) and AlpaGasus (ICLR +2024) are state-of-the-art methods for selecting such high-quality examples, +either via manual curation or using GPT-3.5-Turbo as a quality scorer. We show +that the extremely simple baseline of selecting the 1,000 instructions with +longest responses from standard datasets can consistently outperform these +sophisticated methods according to GPT-4 and PaLM-2 as judges, while remaining +competitive on the OpenLLM benchmarks that test factual knowledge. We +demonstrate this for several state-of-the-art LLMs (Llama-2-7B, Llama-2-13B, +and Mistral-7B) and datasets (Alpaca-52k and Evol-Instruct-70k). In addition, a +lightweight refinement of such long instructions can further improve the +abilities of the fine-tuned LLMs, and allows us to obtain the 2nd +highest-ranked Llama-2-7B-based model on AlpacaEval 2.0 while training on only +1,000 examples and no extra preference data. We also conduct a thorough +analysis of our models to ensure that their enhanced performance is not simply +due to GPT-4's preference for longer responses, thus ruling out any artificial +improvement. In conclusion, our findings suggest that fine-tuning on the +longest instructions should be the default baseline for any research on +instruction fine-tuning. + +
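+ The baseline is simple enough to state in a few lines. A minimal sketch,
+assuming a dataset of instruction/response dicts (the field names are
+illustrative, and length is measured in characters here, whereas the paper's
+exact criterion may differ):
+
+```python
+def select_longest(dataset, k=1000):
+    """Keep the k examples with the longest responses -- the simple
+    selection baseline for instruction fine-tuning."""
+    return sorted(dataset, key=lambda ex: len(ex["response"]), reverse=True)[:k]
+
+data = [
+    {"instruction": "Name a color.", "response": "Blue."},
+    {"instruction": "Explain rain.",
+     "response": "Rain forms when moist air rises, cools, and condenses."},
+    {"instruction": "Say hi.", "response": "Hello there, nice to meet you!"},
+]
+print([ex["instruction"] for ex in select_longest(data, k=2)])
+# ['Explain rain.', 'Say hi.']
+```
+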
+
+ comment: Preprint. 25 pages, 24 figures +
+
+
+
+
+ + ☆ Learning Communication Policies for Different Follower Behaviors in a + Collaborative Reference Game AAAI'24 + + +
+ Albrecht and Stone (2018) state that modeling of changing behaviors remains +an open problem "due to the essentially unconstrained nature of what other +agents may do". In this work we evaluate the adaptability of neural artificial +agents towards assumed partner behaviors in a collaborative reference game. In +this game success is achieved when a knowledgeable Guide can verbally lead a +Follower to the selection of a specific puzzle piece among several distractors. +We frame this language grounding and coordination task as a reinforcement +learning problem and measure to which extent a common reinforcement training +algorithm (PPO) is able to produce neural agents (the Guides) that perform well +with various heuristic Follower behaviors that vary along the dimensions of +confidence and autonomy. We experiment with a learning signal that in addition +to the goal condition also respects an assumed communicative effort. Our +results indicate that this novel ingredient leads to communicative strategies +that are less verbose (staying silent in some of the steps) and that with +respect to that the Guide's strategies indeed adapt to the partner's level of +confidence and autonomy. + +
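+ The learning signal that "also respects an assumed communicative effort" can
+be written as a penalized reward. The sketch below is our reading of that
+idea; the penalty weight and per-message cost are illustrative, not the
+paper's exact values:
+
+```python
+def guide_reward(task_solved: bool, n_messages: int, lam: float = 0.05) -> float:
+    """Goal reward minus a cost for every message the Guide sends, so
+    staying silent on steps the Follower handles alone is rewarded."""
+    return (1.0 if task_solved else 0.0) - lam * n_messages
+
+print(guide_reward(True, 3))  # 0.85: solved, but three messages cost 0.15
+print(guide_reward(True, 0))  # 1.0: solved silently, the best outcome
+```
+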
+
+ comment: Work presented at the "Cooperative Multi-Agent Systems + Decision-making and Learning" workshop (AAAI'24) +
+
+
+
+
+ + ☆ Aspect-Based Sentiment Analysis for Open-Ended HR Survey Responses EACL2024 + + +
+ Understanding preferences, opinions, and sentiment of the workforce is +paramount for effective employee lifecycle management. Open-ended survey +responses serve as a valuable source of information. This paper proposes a +machine learning approach for aspect-based sentiment analysis (ABSA) of Dutch +open-ended responses in employee satisfaction surveys. Our approach aims to +overcome the inherent noise and variability in these responses, enabling a +comprehensive analysis of sentiments that can support employee lifecycle +management. Through response clustering we identify six key aspects (salary, +schedule, contact, communication, personal attention, agreements), which we +validate by domain experts. We compile a dataset of 1,458 Dutch survey +responses, revealing label imbalance in aspects and sentiments. We propose +few-shot approaches for ABSA based on Dutch BERT models, and compare them +against bag-of-words and zero-shot baselines. Our work significantly +contributes to the field of ABSA by demonstrating the first successful +application of Dutch pre-trained language models to aspect-based sentiment +analysis in the domain of human resources (HR). + +
+
+ comment: Accepted at NLP4HR Workshop at EACL2024 +
+
+
+
+
+ + ☆ Direct Language Model Alignment from Online AI Feedback + + +
+ Direct alignment from preferences (DAP) methods, such as DPO, have recently +emerged as efficient alternatives to reinforcement learning from human feedback +(RLHF), that do not require a separate reward model. However, the preference +datasets used in DAP methods are usually collected ahead of training and never +updated, thus the feedback is purely offline. Moreover, responses in these +datasets are often sampled from a language model distinct from the one being +aligned, and since the model evolves over training, the alignment phase is +inevitably off-policy. In this study, we posit that online feedback is key and +improves DAP methods. Our method, online AI feedback (OAIF), uses an LLM as +annotator: on each training iteration, we sample two responses from the current +model and prompt the LLM annotator to choose which one is preferred, thus +providing online feedback. Despite its simplicity, we demonstrate via human +evaluation in several tasks that OAIF outperforms both offline DAP and RLHF +methods. We further show that the feedback leveraged in OAIF is easily +controllable, via instruction prompts to the LLM annotator. + +
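+ The OAIF loop itself is short. The following sketch assumes duck-typed
+`sample`, `annotate`, and `dpo_update` helpers; none of these names come from
+the paper, and the update could be any DAP method:
+
+```python
+def oaif_step(model, annotator_llm, prompt):
+    """One online-AI-feedback iteration."""
+    # Sample two responses from the *current* policy, so feedback stays
+    # on-policy rather than coming from a stale, pre-collected dataset.
+    y1, y2 = model.sample(prompt), model.sample(prompt)
+    # The LLM annotator supplies the preference online.
+    first_preferred = annotator_llm.annotate(prompt, y1, y2)  # -> bool
+    chosen, rejected = (y1, y2) if first_preferred else (y2, y1)
+    # Apply a direct-alignment update, e.g. a DPO gradient step.
+    model.dpo_update(prompt, chosen=chosen, rejected=rejected)
+```
+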
+
+ comment: 18 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with + Vision-Language Benchmark + + +
+ Multimodal Large Language Models (MLLMs) have gained significant attention
+recently, showing remarkable potential in artificial general intelligence.
+However, assessing the utility of MLLMs presents considerable challenges,
+primarily due to the absence of multimodal benchmarks that align with human
+preferences. Inspired by LLM-as-a-Judge in LLMs, this paper introduces a novel
+benchmark, termed MLLM-as-a-Judge, to assess the ability of MLLMs to assist
+judges across three distinct tasks: Scoring Evaluation, Pair Comparison, and
+Batch Ranking. Our study reveals that, while MLLMs demonstrate remarkable
+human-like discernment in Pair Comparisons, there is a significant divergence
+from human preferences in Scoring Evaluation and Batch Ranking tasks.
+Furthermore, MLLMs still face challenges in judgment, including diverse biases,
+hallucinatory responses, and inconsistencies, even for advanced models such as
+GPT-4V. These findings emphasize the pressing need for enhancements and further
+research efforts regarding MLLMs as fully reliable evaluators. Code and dataset
+are available at https://github.com/Dongping-Chen/MLLM-as-a-Judge.
+
+
+
+
+
+ + ☆ A Hypothesis-Driven Framework for the Analysis of Self-Rationalising + Models + + +
+ The self-rationalising capabilities of LLMs are appealing because the +generated explanations can give insights into the plausibility of the +predictions. However, how faithful the explanations are to the predictions is +questionable, raising the need to explore the patterns behind them further. To +this end, we propose a hypothesis-driven statistical framework. We use a +Bayesian network to implement a hypothesis about how a task (in our example, +natural language inference) is solved, and its internal states are translated +into natural language with templates. Those explanations are then compared to +LLM-generated free-text explanations using automatic and human evaluations. +This allows us to judge how similar the LLM's and the Bayesian network's +decision processes are. We demonstrate the usage of our framework with an +example hypothesis and two realisations in Bayesian networks. The resulting +models do not exhibit a strong similarity to GPT-3.5. We discuss the +implications of this as well as the framework's potential to approximate LLM +decisions better in future work. + +
+
+
+
+
+ + ☆ StableMask: Refining Causal Masking in Decoder-only Transformer + + +
+ The decoder-only Transformer architecture with causal masking and relative
+position encoding (RPE) has become the de facto choice in language modeling.
+Despite its exceptional performance across various tasks, we have identified
+two limitations: First, it requires all attention scores to be non-zero and sum
+up to 1, even if the current embedding has sufficient self-contained
+information. This compels the model to assign disproportionate, excessive
+attention to specific tokens. Second, RPE-based Transformers are not universal
+approximators due to their limited capacity for encoding absolute positional
+information, which limits their application in position-critical tasks. In this
+work, we propose StableMask: a parameter-free method to address both
+limitations by refining the causal mask. It introduces pseudo-attention values
+to balance attention distributions and encodes absolute positional information
+via a progressively decreasing mask ratio. StableMask's effectiveness is
+validated both theoretically and empirically, showing significant enhancements
+in language models with parameter sizes ranging from 71M to 1.4B across diverse
+datasets and encoding methods. We further show that it naturally supports (1)
+efficient extrapolation without special tricks such as StreamingLLM and (2)
+easy integration with existing attention optimization techniques.
+
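+ The first limitation is easy to verify numerically: under a standard causal
+mask, the visible scores pass through a softmax and always sum to 1, so a
+token cannot "opt out" of attending. A small NumPy demonstration of the
+limitation itself (not of StableMask's remedy, whose exact mask construction
+the abstract does not spell out):
+
+```python
+import numpy as np
+
+def causal_softmax(scores):
+    """Row-wise softmax under a standard causal mask."""
+    T = scores.shape[0]
+    future = np.triu(np.ones((T, T), dtype=bool), k=1)  # masked positions
+    scores = np.where(future, -np.inf, scores)
+    e = np.exp(scores - scores.max(axis=-1, keepdims=True))
+    return e / e.sum(axis=-1, keepdims=True)
+
+attn = causal_softmax(np.random.default_rng(0).normal(size=(4, 4)))
+print(attn.sum(axis=-1))  # [1. 1. 1. 1.] -- every row is forced to sum to 1,
+# even when a token's own embedding is already self-contained.
+```
+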
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Large Language Models As Faithful Explainers + + +
+ Large Language Models (LLMs) have recently become proficient in addressing
+complex tasks by utilizing their rich internal knowledge and reasoning ability.
+However, this complexity hinders traditional input-focused explanation
+algorithms from explaining the complex decision-making processes of LLMs.
+Recent advancements have thus focused on having LLMs self-explain their
+predictions through a single feed-forward inference in a natural language
+format. However, natural language explanations are often criticized for a lack
+of faithfulness, since these explanations may not accurately reflect the
+decision-making behaviors of the LLMs. In this work, we introduce a generative
+explanation framework, xLLM, to improve the faithfulness of the explanations
+provided in natural language formats for LLMs. Specifically, we propose an
+evaluator to quantify the faithfulness of natural language explanations and
+enhance the faithfulness by an iterative optimization process of xLLM, with the
+goal of maximizing the faithfulness scores. Experiments conducted on three NLU
+datasets demonstrate that xLLM can significantly improve the faithfulness of
+generated explanations, which are in alignment with the behaviors of LLMs.
+
+
+
+
+
+ + ☆ Source Identification in Abstractive Summarization EACL 2024 + + +
+ Neural abstractive summarization models make summaries in an end-to-end +manner, and little is known about how the source information is actually +converted into summaries. In this paper, we define input sentences that contain +essential information in the generated summary as $\textit{source sentences}$ +and study how abstractive summaries are made by analyzing the source sentences. +To this end, we annotate source sentences for reference summaries and system +summaries generated by PEGASUS on document-summary pairs sampled from the +CNN/DailyMail and XSum datasets. We also formulate automatic source sentence +detection and compare multiple methods to establish a strong baseline for the +task. Experimental results show that the perplexity-based method performs well +in highly abstractive settings, while similarity-based methods perform robustly +in relatively extractive settings. Our code and data are available at +https://github.com/suhara/sourcesum. + +
+
+ comment: EACL 2024 +
+
+
+
+
+ + ☆ TransLLaMa: LLM-based Simultaneous Translation System + + +
+ Decoder-only large language models (LLMs) have recently demonstrated +impressive capabilities in text generation and reasoning. Nonetheless, they +have limited applications in simultaneous machine translation (SiMT), currently +dominated by encoder-decoder transformers. This study demonstrates that, after +fine-tuning on a small dataset comprising causally aligned source and target +sentence pairs, a pre-trained open-source LLM can control input segmentation +directly by generating a special "wait" token. This obviates the need for a +separate policy and enables the LLM to perform English-German and +English-Russian SiMT tasks with BLEU scores that are comparable to those of +specific state-of-the-art baselines. We also evaluated closed-source models +such as GPT-4, which displayed encouraging results in performing the SiMT task +without prior training (zero-shot), indicating a promising avenue for enhancing +future SiMT systems. + +
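+ The "wait" token turns the read/write policy into part of decoding itself. A
+schematic loop (our illustration; `generate_token` is a hypothetical stand-in
+for the fine-tuned LLM's next-token step, and the token spellings are
+invented):
+
+```python
+def simultaneous_translate(generate_token, source_tokens):
+    """Interleave reading and writing: the model emits "<wait>" when it
+    needs more source context before committing to a target token."""
+    read, context, output = 0, [], []
+    while True:
+        tok = generate_token(context, output)
+        if tok == "</s>":
+            return output
+        if tok == "<wait>" and read < len(source_tokens):
+            context.append(source_tokens[read])  # READ one source token
+            read += 1
+        else:
+            output.append(tok)  # WRITE one target token
+        # A real system would suppress "<wait>" once the source is exhausted.
+```
+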
+
+
+
+
+ + ☆ The Future of Cognitive Strategy-enhanced Persuasive Dialogue Agents: + New Perspectives and Trends + + +
+ Persuasion, as one of the crucial abilities in human communication, has
+garnered extensive attention from researchers within the field of intelligent
+dialogue systems. We humans tend to persuade others to change their viewpoints,
+attitudes or behaviors through conversations in various scenarios (e.g.,
+persuasion for social good, arguing in online platforms). Developing dialogue
+agents that can persuade others to accept certain standpoints is essential to
+achieving truly intelligent and anthropomorphic dialogue systems. Benefiting
+from the substantial progress of Large Language Models (LLMs), dialogue agents
+have acquired an exceptional capability in context understanding and response
+generation. However, as typical and complicated cognitive psychological
+systems, persuasive dialogue agents also require knowledge from the domain of
+cognitive psychology to attain a level of human-like persuasion. Consequently,
+the cognitive strategy-enhanced persuasive dialogue agent (defined as
+CogAgent), which incorporates cognitive strategies to achieve persuasive
+targets through conversation, has become a predominant research paradigm. To
+depict the research trends of CogAgent, in this paper, we first present several
+fundamental cognitive psychology theories and give the formalized definitions
+of three typical cognitive strategies, including the persuasion strategy, the
+topic path planning strategy, and the argument structure prediction strategy.
+Then we propose a new system architecture by incorporating the formalized
+definitions to lay the foundation of CogAgent. Representative works are
+detailed and investigated according to the combined cognitive strategy,
+followed by the summary of authoritative benchmarks and evaluation metrics.
+Finally, we summarize our insights on open issues and future directions of
+CogAgent for upcoming researchers.
+
+
+ comment: 36 pages, 6 figures +
+
+
+
+
+ + ☆ SPARQL Generation: an analysis on fine-tuning OpenLLaMA for Question + Answering over a Life Science Knowledge Graph + + +
+ The recent success of Large Language Models (LLM) in a wide range of Natural +Language Processing applications opens the path towards novel Question +Answering Systems over Knowledge Graphs leveraging LLMs. However, one of the +main obstacles preventing their implementation is the scarcity of training data +for the task of translating questions into corresponding SPARQL queries, +particularly in the case of domain-specific KGs. To overcome this challenge, in +this study, we evaluate several strategies for fine-tuning the OpenLlama LLM +for question answering over life science knowledge graphs. In particular, we +propose an end-to-end data augmentation approach for extending a set of +existing queries over a given knowledge graph towards a larger dataset of +semantically enriched question-to-SPARQL query pairs, enabling fine-tuning even +for datasets where these pairs are scarce. In this context, we also investigate +the role of semantic "clues" in the queries, such as meaningful variable names +and inline comments. Finally, we evaluate our approach over the real-world Bgee +gene expression knowledge graph and we show that semantic clues can improve +model performance by up to 33% compared to a baseline with random variable +names and no comments included. + +
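+ The effect of "semantic clues" is easiest to see side by side. The pair of
+queries below is illustrative only (a made-up `ex:` schema, not the Bgee KG),
+differing solely in variable naming and comments -- the bare variant reflects
+the baseline style relative to which clues improved performance by up to 33%:
+
+```python
+# With clues: meaningful variable names plus an inline comment.
+clued_query = """SELECT ?gene ?tissue WHERE {
+  # genes expressed in a given tissue
+  ?gene a ex:Gene ;
+        ex:isExpressedIn ?tissue .
+}"""
+
+# Without clues: identical structure, opaque names, no comment.
+bare_query = """SELECT ?v0 ?v1 WHERE {
+  ?v0 a ex:Gene ;
+      ex:isExpressedIn ?v1 .
+}"""
+```
+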
+
+ comment: To appear in Proceedings of SWAT4HCLS 2024: Semantic Web Tools and + Applications for Healthcare and Life Sciences +
+
+
+
+
+ + ☆ MEMORYLLM: Towards Self-Updatable Large Language Models + + +
+ Existing Large Language Models (LLMs) usually remain static after deployment, +which might make it hard to inject new knowledge into the model. We aim to +build models containing a considerable portion of self-updatable parameters, +enabling the model to integrate new knowledge effectively and efficiently. To +this end, we introduce MEMORYLLM, a model that comprises a transformer and a +fixed-size memory pool within the latent space of the transformer. MEMORYLLM +can self-update with text knowledge and memorize the knowledge injected +earlier. Our evaluations demonstrate the ability of MEMORYLLM to effectively +incorporate new knowledge, as evidenced by its performance on model editing +benchmarks. Meanwhile, the model exhibits long-term information retention +capacity, which is validated through our custom-designed evaluations and +long-context benchmarks. MEMORYLLM also shows operational integrity without any +sign of performance degradation even after nearly a million memory updates. + +
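+ A rough sketch of the fixed-size memory-pool idea: new knowledge is written
+in by overwriting a subset of existing memory slots, so the pool size stays
+constant while older content decays gradually (the eviction scheme, sizes,
+and ratio below are illustrative assumptions, not the paper's design):
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    POOL_SIZE, HIDDEN = 512, 64  # illustrative dimensions
+    memory = rng.normal(size=(POOL_SIZE, HIDDEN))
+
+    def self_update(memory, new_vectors, evict_ratio=0.25):
+        """Replace a random subset of memory slots with new knowledge."""
+        n_evict = min(len(new_vectors), int(evict_ratio * len(memory)))
+        slots = rng.choice(len(memory), size=n_evict, replace=False)
+        memory[slots] = new_vectors[:n_evict]
+        return memory
+
+    memory = self_update(memory, rng.normal(size=(128, HIDDEN)))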
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ InfLLM: Unveiling the Intrinsic Capacity of LLMs for Understanding + Extremely Long Sequences with Training-Free Memory + + +
+ Large language models (LLMs) have emerged as a cornerstone in real-world +applications with lengthy streaming inputs, such as LLM-driven agents. +However, existing LLMs, pre-trained on sequences with a restricted maximum +length, cannot generalize to longer sequences due to out-of-domain and +distraction issues. To alleviate these issues, existing efforts employ sliding +attention windows and discard distant tokens to process extremely long +sequences. Unfortunately, these approaches inevitably fail to capture +long-distance dependencies within sequences and thus to deeply understand +semantics. This paper introduces a training-free memory-based method, InfLLM, +to unveil the intrinsic ability of LLMs to process streaming long sequences. +Specifically, InfLLM stores distant contexts in additional memory units and +employs an efficient mechanism to look up token-relevant units for attention +computation. Thereby, InfLLM allows LLMs to efficiently process long sequences +while maintaining the ability to capture long-distance dependencies. Without +any training, InfLLM enables LLMs pre-trained on sequences of a few thousand +tokens to outperform competitive baselines that continually train these LLMs +on long sequences. Even when the sequence length is scaled to 1,024K tokens, +InfLLM still effectively captures long-distance dependencies. + +
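+ A toy version of the training-free lookup described above: evicted context
+is stored as contiguous memory units, and for each query only the most
+relevant units re-enter the attention window (scoring units by a mean dot
+product is an illustrative simplification of the paper's mechanism):
+
+    import numpy as np
+
+    def build_units(hidden_states, unit_size=64):
+        """Split evicted context vectors into contiguous memory units."""
+        n = len(hidden_states) // unit_size * unit_size
+        return hidden_states[:n].reshape(-1, unit_size, hidden_states.shape[-1])
+
+    def lookup(units, query, k=2):
+        """Return the k units most relevant to the current query vector."""
+        scores = units.mean(axis=1) @ query  # one relevance score per unit
+        top = np.argsort(scores)[-k:]
+        return units[np.sort(top)]           # keep the original order
+
+    rng = np.random.default_rng(0)
+    units = build_units(rng.normal(size=(4096, 32)))
+    selected = lookup(units, rng.normal(size=32))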
+
+
+
+
+ + ☆ TinyLLM: Learning a Small Student from Multiple Large Language Models + + +
+ Transferring the reasoning capability of stronger large language models +(LLMs) to smaller ones is appealing, as smaller LLMs are cheaper and more +flexible to deploy. Among the existing solutions, knowledge distillation +stands out due to its efficiency and generalization. However, existing methods +suffer from several drawbacks, including limited knowledge diversity and a +lack of rich contextual information. To address these problems and facilitate +the learning of compact language models, we propose TinyLLM, a novel knowledge +distillation paradigm that learns a small student LLM from multiple large +teacher LLMs. In particular, we encourage the student LLM not only to generate +the correct answers but also to understand the rationales behind these +answers. Given that different LLMs possess diverse reasoning skills, we guide +the student model to assimilate knowledge from various teacher LLMs. We +further introduce an in-context example generator and a teacher-forcing +Chain-of-Thought strategy to ensure that the rationales are accurate and +grounded in contextually appropriate scenarios. Extensive experiments on six +datasets across two reasoning tasks demonstrate the superiority of our method. +Results show that TinyLLM can outperform large teacher LLMs significantly, +despite having a considerably smaller model size. + +
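+ The data side of this paradigm can be pictured as below: each teacher
+contributes its own rationale for the same question, and the student is
+trained on rationale-plus-answer text from all of them (the prompt template
+is an illustrative guess, not the paper's exact format):
+
+    def distillation_examples(question, answer, teacher_rationales):
+        """Build one student training example per teacher rationale."""
+        examples = []
+        for rationale in teacher_rationales:  # teachers with diverse skills
+            examples.append(
+                f"Q: {question}\nReasoning: {rationale}\nA: {answer}"
+            )
+        return examples
+
+    examples = distillation_examples(
+        "What is 7 * 6?",
+        "42",
+        ["Multiply 7 by 6 to get 42.", "7 * 6 = 7 * 5 + 7 = 35 + 7 = 42."],
+    )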
+
+
+
+
+ + ☆ Faithfulness vs. Plausibility: On the (Un)Reliability of Explanations + from Large Language Models + + +
+ Large Language Models (LLMs) are deployed as powerful tools for several +natural language processing (NLP) applications. Recent works show that modern +LLMs can generate self-explanations (SEs), which elicit their intermediate +reasoning steps for explaining their behavior. Self-explanations have seen +widespread adoption owing to their conversational and plausible nature. +However, there is little to no understanding of their faithfulness. In this +work, we discuss the dichotomy between faithfulness and plausibility in SEs +generated by LLMs. We argue that while LLMs are adept at generating plausible +explanations -- seemingly logical and coherent to human users -- these +explanations do not necessarily align with the reasoning processes of the LLMs, +raising concerns about their faithfulness. We highlight that the current trend +towards increasing the plausibility of explanations, primarily driven by the +demand for user-friendly interfaces, may come at the cost of diminishing their +faithfulness. We assert that the faithfulness of explanations is critical in +LLMs employed for high-stakes decision-making. Moreover, we urge the community +to identify the faithfulness requirements of real-world applications and ensure +explanations meet those needs. Finally, we propose some directions for future +work, emphasizing the need for novel methodologies and frameworks that can +enhance the faithfulness of self-explanations without compromising their +plausibility, essential for the transparent deployment of LLMs in diverse +high-stakes domains. + +
+
+
+
+
+ + ☆ Improving Cross-Domain Low-Resource Text Generation through LLM + Post-Editing: A Programmer-Interpreter Approach EACL 2024 + + +
+ Post-editing has proven effective in improving the quality of text generated +by large language models (LLMs) such as GPT-3.5 or GPT-4, particularly when +direct updating of their parameters to enhance text quality is infeasible or +expensive. However, relying solely on smaller language models for post-editing +can limit the LLMs' ability to generalize across domains. Moreover, the editing +strategies in these methods are not optimally designed for text-generation +tasks. To address these limitations, we propose a neural programmer-interpreter +approach that preserves the domain generalization ability of LLMs when editing +their output. The editing actions in this framework are specifically devised +for text generation. Extensive experiments demonstrate that the +programmer-interpreter significantly enhances GPT-3.5's performance in logical +form-to-text conversion and low-resource machine translation, surpassing other +state-of-the-art (SOTA) LLM post-editing methods in cross-domain settings. + +
+
+ comment: EACL 2024 (findings), short paper, 5 pages +
+
+
+
+
+ + ☆ Alirector: Alignment-Enhanced Chinese Grammatical Error Corrector + + +
+ Chinese grammatical error correction (CGEC) faces serious overcorrection +challenges when employing autoregressive generative models such as +sequence-to-sequence (Seq2Seq) models and decoder-only large language models +(LLMs). While previous methods aim to address overcorrection in Seq2Seq +models, they are difficult to adapt to decoder-only LLMs. In this paper, we +propose an alignment-enhanced corrector for the overcorrection problem that +applies to both Seq2Seq models and decoder-only LLMs. Our method first trains +a correction model to generate an initial correction of the source sentence. +Then, we combine the source sentence with the initial correction and feed it +through an alignment model for another round of correction, encouraging the +alignment model to focus on potential overcorrection. Moreover, to enhance the +model's ability to identify nuances, we further explore the reverse alignment +of the source sentence and the initial correction. Finally, we transfer the +alignment knowledge from the two alignment models to the correction model, +instructing it on how to avoid overcorrection. Experimental results on three +CGEC datasets demonstrate the effectiveness of our approach in alleviating +overcorrection and improving overall performance. + +
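+ The two-round inference described above reduces to a small pipeline; a stub
+sketch (the stub functions stand in for the trained correction and alignment
+models, and the separator token is an arbitrary choice):
+
+    def correction_model(src):
+        """Stub for the trained corrector; returns an initial correction."""
+        return src
+
+    def alignment_model(combined):
+        """Stub for the trained alignment model; re-corrects the draft."""
+        return combined.split("[SEP]")[-1]
+
+    def alirector_infer(src):
+        draft = correction_model(src)     # round 1: initial correction
+        combined = src + "[SEP]" + draft  # source and draft, concatenated
+        return alignment_model(combined)  # round 2: focus on overcorrection
+
+    print(alirector_infer("source sentence"))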
+
+
+
+
+ + ☆ UltraLink: An Open-Source Knowledge-Enhanced Multilingual Supervised + Fine-tuning Dataset + + +
+ Open-source large language models (LLMs) have made significant strides +across diverse fields. Nevertheless, the majority of studies primarily +concentrate on English, with only limited exploration of multilingual +supervised fine-tuning. In this work, we therefore construct an open-source +multilingual supervised fine-tuning dataset. Unlike previous works that simply +translate English instructions, we consider both the language-specific and +language-agnostic abilities of LLMs. For language-specific abilities, we +introduce a knowledge-grounded data augmentation approach to elicit more +culture-specific knowledge from LLMs, improving their ability to serve users +from different countries. For language-agnostic abilities, we find through +experiments that modern LLMs exhibit strong cross-lingual transfer +capabilities, so repeatedly learning identical content in various languages is +unnecessary. Consequently, we can substantially prune the language-agnostic +SFT data without any performance degradation, making the SFT process more +efficient. The resulting UltraLink dataset comprises approximately 1 million +samples across five languages, and the proposed data construction method can +easily be extended to other languages. UltraLink-LM, which is trained on +UltraLink, outperforms several representative baselines across many tasks. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Can Large Language Model Agents Simulate Human Trust Behaviors? + + +
+ Large Language Model (LLM) agents have been increasingly adopted as +simulation tools to model humans in applications such as social science. +However, one fundamental question remains: can LLM agents really simulate +human behaviors? In this paper, we focus on one of the most critical behaviors +in human interactions, trust, and aim to investigate whether or not LLM agents +can simulate human trust behaviors. We first find that LLM agents generally +exhibit trust behaviors, referred to as agent trust, under the framework of +Trust Games, which are widely recognized in behavioral economics. Then, we +discover that LLM agents can have high behavioral alignment with humans +regarding trust behaviors, indicating the feasibility of simulating human +trust behaviors with LLM agents. In addition, we probe the biases in agent +trust and the differences in agent trust towards other agents versus humans. +We also explore the intrinsic properties of agent trust under conditions +including advanced reasoning strategies and external manipulations. We further +offer important implications for various scenarios where trust is paramount. +Our study represents a significant step in understanding the behaviors of LLM +agents and the LLM-human analogy. + +
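+ For readers unfamiliar with the framework: in the canonical Trust Game, the
+trustor sends part of an endowment, the experimenter multiplies it (commonly
+by 3), and the trustee returns a share. A minimal payoff computation of the
+kind one might use to score LLM-agent decisions (the endowment and multiplier
+are standard textbook choices, not necessarily the paper's settings):
+
+    def trust_game(endowment, sent, returned_fraction, multiplier=3):
+        """Payoffs of trustor and trustee for one round of the Trust Game."""
+        assert 0 <= sent <= endowment
+        pot = sent * multiplier              # amount the trustee receives
+        returned = pot * returned_fraction   # amount sent back
+        trustor_payoff = endowment - sent + returned
+        trustee_payoff = pot - returned
+        return trustor_payoff, trustee_payoff
+
+    print(trust_game(endowment=10, sent=5, returned_fraction=0.5))  # (12.5, 7.5)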
+
+ comment: The first two authors contributed equally. Project website: + https://www.camel-ai.org/research/agent-trust +
+
+
+
+
+ + ☆ Share What You Already Know: Cross-Language-Script Transfer and + Alignment for Sentiment Detection in Code-Mixed Data + + +
+ Code-switching entails mixing multiple languages and is an increasingly +common phenomenon in social media texts. Code-mixed texts are usually written +in a single script, even though the languages involved have different +scripts, and pre-trained multilingual models primarily utilize data in each +language's native script. Existing studies use the code-switched texts as +they are; however, using the native script for each language can generate +better representations of the text owing to the pre-trained knowledge. +Therefore, this study proposes a cross-language-script knowledge-sharing +architecture that utilizes cross-attention and alignment of the +representations of text in the individual language scripts. Experimental +results on two different datasets containing Nepali-English and Hindi-English +code-switched texts demonstrate the effectiveness of the proposed method. +Interpreting the model with a model-explainability technique illustrates the +sharing of language-specific knowledge between language-specific +representations. + +
+
+
+
+
+ + ☆ SumRec: A Framework for Recommendation using Open-Domain Dialogue ACL + + +
+ Chat dialogues contain considerable useful information about a speaker's +interests, preferences, and experiences. Thus, knowledge from open-domain chat +dialogue can be used to personalize various systems and offer recommendations +for advanced information. This study proposes SumRec, a novel framework for +recommending information from open-domain chat dialogue, and examines it using +ChatRec, a newly constructed dataset for training and evaluation. To extract +the speaker and item characteristics, the SumRec framework employs a large +language model (LLM) to generate a summary of the speaker information from a +dialogue and to recommend information about an item according to the type of +user. The speaker and item information are then input into a score estimation +model, which produces a recommendation score. Experimental results show that +the SumRec framework provides better recommendations than the baseline method +of using dialogues and item descriptions in their original form. Our dataset +and code are publicly available at https://github.com/Ryutaro-A/SumRec + +
+
+ comment: Accepted to PACLIC 2023 +
+
+
+
+
+ + ☆ Online Cascade Learning for Efficient Inference over Streams + + +
+ Large Language Models (LLMs) have a natural role in answering complex queries +about data streams, but the high computational cost of LLM inference makes them +infeasible in many such tasks. We propose online cascade learning, the first +approach to addressing this challenge. The objective here is to learn a +"cascade" of models, starting with lower-capacity models (such as logistic +regressors) and ending with a powerful LLM, along with a deferral policy that +determines the model that is used on a given input. We formulate the task of +learning cascades online as an imitation-learning problem and give a no-regret +algorithm for the problem. Experimental results across four benchmarks show +that our method parallels LLMs in accuracy while cutting down inference costs +by as much as 90%, underscoring its efficacy and adaptability in stream +processing. + +
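+ The cascade idea in miniature: cheap models answer when confident and defer
+otherwise, with the LLM as a last resort (the thresholds and stub models
+below are illustrative; the paper learns the deferral policy online via
+imitation learning rather than fixing it by hand):
+
+    def make_cascade(models, thresholds, llm):
+        """models: list of f(x) -> (label, confidence); llm: f(x) -> label."""
+        def predict(x):
+            for model, tau in zip(models, thresholds):
+                label, confidence = model(x)
+                if confidence >= tau:     # confident enough: answer cheaply
+                    return label, "cheap"
+            return llm(x), "llm"          # defer all the way to the LLM
+        return predict
+
+    cheap = lambda x: ("spam", 0.9) if "win" in x else ("ham", 0.4)
+    llm = lambda x: "ham"
+    predict = make_cascade([cheap], [0.8], llm)
+    print(predict("win a prize"), predict("meeting at noon"))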
+
+
+
+
+ + ☆ Developments in Sheaf-Theoretic Models of Natural Language Ambiguities + + +
+ Sheaves are mathematical objects consisting of a base which constitutes a +topological space and the data associated with each open set thereof, e.g. +continuous functions defined on the open sets. Sheaves were originally used in +algebraic topology and logic. Recently, they have also modelled events such as +physical experiments and natural language disambiguation processes. We extend +the latter models from lexical ambiguities to discourse ambiguities arising +from anaphora. To begin, we calculate a new measure of contextuality for a +dataset of basic anaphoric discourses, resulting in a higher proportion of +contextual models, 82.9%, compared to previous work, which only yielded 3.17% +contextual models. Then, we show how an extension of the natural language +processing challenge known as the Winograd Schema, which involves anaphoric +ambiguities, can be modelled on the Bell-CHSH scenario with a contextual +fraction of 0.096. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2308.16498 +
+
+
+
+
+ + ☆ The Fine-Grained Complexity of Gradient Computation for Training Large + Language Models + + +
+ Large language models (LLMs) have made fundamental contributions over the +last few years. To train an LLM, one needs to alternately run `forward' +computations and `backward' computations. The forward computation can be +viewed as attention function evaluation, and the backward computation can be +viewed as a gradient computation. In previous work by [Alman and Song, NeurIPS +2023], it was proved that the forward step can be performed in almost-linear +time in certain parameter regimes, but that there is no truly sub-quadratic +time algorithm in the remaining parameter regimes unless the popular +hypothesis SETH is false. In this work, we show nearly identical results for +the harder-seeming problem of computing the gradient of the loss function of a +one-layer attention network, and thus for the entire process of LLM training. +This completely characterizes the fine-grained complexity of every step of LLM +training. + +
+
+
+
+
+ + ☆ ColorSwap: A Color and Word Order Dataset for Multimodal Evaluation + + +
+ This paper introduces the ColorSwap dataset, designed to assess and improve +the proficiency of multimodal models in matching objects with their colors. +The dataset comprises 2,000 unique image-caption pairs, grouped into 1,000 +examples. Each example includes a caption-image pair along with a +``color-swapped'' pair. We follow the Winoground schema: the two captions in +an example have the same words, but the color words have been rearranged to +modify different objects. The dataset was created through a novel blend of +automated caption and image generation with humans in the loop. We evaluate +image-text matching (ITM) and visual language models (VLMs) and find that even +the latest ones are still not robust at this task. GPT-4V and LLaVA score 72% +and 42% on our main VLM metric, although they may improve with more advanced +prompting techniques. On the main ITM metric, contrastive models such as CLIP +and SigLIP perform close to chance (at 12% and 30%, respectively), although +the non-contrastive BLIP ITM model is stronger (87%). We also find that +finetuning on fewer than 2,000 examples yields significant performance gains +on this out-of-distribution word-order understanding task. The dataset is +available at https://github.com/Top34051/colorswap. + +
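+ The group scoring for one example can be sketched in a few lines, assuming a
+Winoground-style metric and some score(image, caption) similarity function
+from the model under evaluation (the exact metric definition used for
+ColorSwap may differ in detail):
+
+    def group_correct(score, img0, cap0, img1, cap1):
+        """Both captions and both images must prefer their true partner."""
+        text_ok = (score(img0, cap0) > score(img0, cap1) and
+                   score(img1, cap1) > score(img1, cap0))
+        image_ok = (score(img0, cap0) > score(img1, cap0) and
+                    score(img1, cap1) > score(img0, cap1))
+        return text_ok and image_ok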
+
+
+
+
+ + ♻ ☆ OLMo: Accelerating the Science of Language Models + + +
+ Language models (LMs) have become ubiquitous in both NLP research and in +commercial product offerings. As their commercial importance has surged, the +most powerful models have become closed off, gated behind proprietary +interfaces, with important details of their training data, architectures, and +development undisclosed. Given the importance of these details in +scientifically studying these models, including their biases and potential +risks, we believe it is essential for the research community to have access to +powerful, truly open LMs. To this end, this technical report details the first +release of OLMo, a state-of-the-art, truly Open Language Model and its +framework to build and study the science of language modeling. Unlike most +prior efforts that have only released model weights and inference code, we +release OLMo and the whole framework, including training data and training and +evaluation code. We hope this release will empower and strengthen the open +research community and inspire a new wave of innovation. + +
+
+
+
+
+ + ♻ ☆ VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language + Navigation AAAI 2024 + + +
+ Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate +through realistic 3D outdoor environments based on natural language +instructions. The performance of existing VLN methods is limited by +insufficient diversity in navigation environments and limited training data. To +address these issues, we propose VLN-Video, which utilizes the diverse outdoor +environments present in driving videos in multiple cities in the U.S. augmented +with automatically generated navigation instructions and actions to improve +outdoor VLN performance. VLN-Video combines the best of intuitive classical +approaches and modern deep learning techniques, using template infilling to +generate grounded navigation instructions, combined with an image rotation +similarity-based navigation action predictor to obtain VLN style data from +driving videos for pretraining deep learning VLN models. We pre-train the model +on the Touchdown dataset and our video-augmented dataset created from driving +videos with three proxy tasks: Masked Language Modeling, Instruction and +Trajectory Matching, and Next Action Prediction, so as to learn +temporally-aware and visually-aligned instruction representations. The learned +instruction representation is adapted to the state-of-the-art navigator when +fine-tuning on the Touchdown dataset. Empirical results demonstrate that +VLN-Video significantly outperforms previous state-of-the-art models by 2.1% in +task completion rate, achieving a new state-of-the-art on the Touchdown +dataset. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Large Multi-Modal Models (LMMs) as Universal Foundation Models for + AI-Native Wireless Systems + + +
+ Large language models (LLMs) and foundation models have been recently touted +as a game-changer for 6G systems. However, recent efforts on LLMs for wireless +networks are limited to a direct application of existing language models that +were designed for natural language processing (NLP) applications. To address +this challenge and create wireless-centric foundation models, this paper +presents a comprehensive vision on how to design universal foundation models +that are tailored towards the deployment of artificial intelligence +(AI)-native networks. Diverging from NLP-based foundation models, the proposed +framework promotes the design of large multi-modal models (LMMs) fostered by +three key capabilities: 1) processing of multi-modal sensing data, 2) +grounding of physical symbol representations in real-world wireless systems +using causal reasoning and retrieval-augmented generation (RAG), and 3) +enabling instructibility from wireless-environment feedback to facilitate +dynamic network adaptation thanks to logical and mathematical reasoning +facilitated by neuro-symbolic AI. In essence, these properties enable the +proposed LMM framework to build universal capabilities that cater to various +cross-layer networking tasks and alignment of intents across different +domains. Preliminary results from experimental evaluation demonstrate the +efficacy of grounding using RAG in LMMs, and showcase the alignment of LMMs +with wireless system designs. Furthermore, the enhanced rationale exhibited in +the responses to mathematical questions by LMMs, compared to vanilla LLMs, +demonstrates the logical and mathematical reasoning capabilities inherent in +LMMs. Building on these results, we present a series of open questions and +challenges for LMMs. We then conclude with a set of recommendations that chart +the path towards LMM-empowered AI-native systems. + +
+
+
+
+
+ + ♻ ☆ Can Generative Agents Predict Emotion? + + +
+ Large Language Models (LLMs) have demonstrated a number of human-like +abilities; however, the empathic understanding and emotional state of LLMs +have yet to be aligned to those of humans. In this work, we investigate how +the emotional state of generative LLM agents evolves as they perceive new +events, introducing a novel architecture in which new experiences are compared +to past memories. Through this comparison, the agent gains the ability to +understand new experiences in context, which, according to the appraisal +theory of emotion, is vital in emotion creation. First, the agent perceives +new experiences as time-series text data. After perceiving each new input, the +agent generates a summary of past relevant memories, referred to as the norm, +and compares the new experience to this norm. Through this comparison we can +analyse how the agent reacts to the new experience in context. The PANAS, a +test of affect, is administered to the agent, capturing its emotional state +after the perception of the new event. Finally, the new experience is added to +the agent's memory to be used in the creation of future norms. By creating +multiple experiences in natural language from emotionally charged situations, +we test the proposed architecture on a wide range of scenarios. The mixed +results suggest that introducing context can occasionally improve the +emotional alignment of the agent, but further study and comparison with human +evaluators is necessary. We hope that this paper is another step towards the +alignment of generative agents. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ AI Does Not Alter Perceptions of Text Messages + + +
+ For many people, anxiety, depression, and other social and mental factors +can make composing text messages an active challenge. To remedy this problem, +large language models (LLMs) may yet prove to be the perfect tool to assist +users who would otherwise find texting difficult or stressful. However, +despite rapid uptake in LLM usage, considerations for their assistive usage in +text message composition have not been explored. A primary concern regarding +LLM usage is that poor public sentiment regarding AI introduces the +possibility that its usage may harm perceptions of AI-assisted text messages, +making usage counter-productive. To (in)validate this possibility, we explore +how the belief that a text message did or did not receive AI assistance in +composition alters its perceived tone, clarity, and ability to convey intent. +In this study, we survey the perceptions of 26 participants on 18 randomly +labeled pre-composed text messages. In analyzing the participants' ratings of +message tone, clarity, and ability to convey intent, we find no statistically +significant evidence that the belief that AI was utilized alters recipient +perceptions. This provides hopeful evidence that LLM-based text message +composition assistance can be implemented without the risk of +counter-productive outcomes. + +
+
+
+
+
+ + ♻ ☆ OpenAI Cribbed Our Tax Example, But Can GPT-4 Really Do Tax? + + +
+ The authors explain where OpenAI got the tax law example in its livestream +demonstration of GPT-4, why GPT-4 got the wrong answer, and how it fails to +reliably calculate taxes. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ APT-Pipe: An Automatic Prompt-Tuning Tool for Social Computing Data + Annotation WWW 2024 + + +
+ Recent research has highlighted the potential of LLM applications, like +ChatGPT, for performing label annotation on social computing text. However, it +is already well known that performance hinges on the quality of the input +prompts. To address this, there has been a flurry of research into prompt +tuning -- techniques and guidelines that attempt to improve the quality of +prompts. Yet these largely rely on manual effort and prior knowledge of the +dataset being annotated. To address this limitation, we propose APT-Pipe, an +automated prompt-tuning pipeline. APT-Pipe aims to automatically tune prompts +to enhance ChatGPT's text classification performance on any given dataset. We +implement APT-Pipe and test it across twelve distinct text classification +datasets. We find that prompts tuned by APT-Pipe help ChatGPT achieve a higher +weighted F1-score on nine of the twelve datasets tested, with an improvement +of 7.01% on average. We further highlight APT-Pipe's flexibility as a +framework by showing how it can be extended to support additional tuning +mechanisms. + +
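+ The core of any such pipeline can be pictured as a search over prompt
+variants scored on a held-out labelled subset; a generic sketch (this is not
+APT-Pipe's actual algorithm, which tunes specific prompt components in
+stages):
+
+    def tune_prompt(variants, annotate, dev_set, metric):
+        """Return the prompt variant scoring best on the dev set."""
+        best, best_score = None, float("-inf")
+        for prompt in variants:
+            predictions = [annotate(prompt, text) for text, _ in dev_set]
+            gold = [label for _, label in dev_set]
+            score = metric(gold, predictions)  # e.g. weighted F1-score
+            if score > best_score:
+                best, best_score = prompt, score
+        return best, best_score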
+
+ comment: Just accepted by WWW 2024 +
+
+
+
+
+ + ♻ ☆ Know Your Needs Better: Towards Structured Understanding of Marketer + Demands with Analogical Reasoning Augmented LLMs + + +
+ In this paper, we explore a new approach to user targeting, where non-expert +marketers can select their target users given only demands in natural language +form. The key to this problem is how to transform natural language into +practical structured logical languages, i.e., the structured understanding of +marketer demands. Considering the impressive natural language processing +ability of large language models (LLMs), we try to leverage LLMs to solve this +problem. Past research indicates that the reasoning ability of LLMs can be +effectively enhanced through chain-of-thought (CoT) prompting. But existing +methods still have some limitations: (1) Previous methods either use simple +"Let's think step by step" prompts or provide fixed examples in demonstrations +without considering compatibility between prompts and questions, making LLMs +ineffective in some complex reasoning tasks such as structured language +transformation. (2) Previous methods are often implemented with closed-source +or excessively large models, which is not suitable for practical industrial +scenarios. Based on these observations, we propose ARALLM (i.e., Analogical +Reasoning Augmented Large Language Models), consisting of two modules: +Analogical Reasoning based Prompting and Reasoning-Augmented Multi-Task Model +Distillation. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Labeled Interactive Topic Models + + +
+ Topic models are valuable for understanding extensive document collections, +but they do not always identify the most relevant topics. Classical +probabilistic and anchor-based topic models offer interactive versions that +allow users to guide the models towards more pertinent topics. However, such +interactive features have been lacking in neural topic models. To fill this +lacuna, we introduce a user-friendly interaction for neural topic models. This +interaction permits users to assign a word label to a topic, leading to an +update in the topic model where the words in the topic become closely aligned +with the given label. Our approach encompasses two distinct kinds of neural +topic models. The first includes models where topic embeddings are trainable +and evolve during the training process. The second kind involves models where +topic embeddings are integrated post-training, offering a different approach +to topic refinement. To facilitate user interaction with these neural topic +models, we have developed an interactive interface that enables users to +engage with and re-label topics as desired. We evaluate our method through a +human study in which users relabel topics to find relevant documents. With our +method, user labeling improves document rank scores, helping users find +documents more relevant to a given query than without user labeling. + +
+
+
+
+
+ + ♻ ☆ Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science + + +
+ Intelligent agents powered by large language models (LLMs) have demonstrated +substantial promise in autonomously conducting experiments and facilitating +scientific discoveries across various disciplines. While their capabilities are +promising, they also introduce novel vulnerabilities that demand careful +consideration for safety. However, there exists a notable gap in the +literature, as there has been no comprehensive exploration of these +vulnerabilities. This position paper fills this gap by conducting a thorough +examination of vulnerabilities in LLM-based agents within scientific domains, +shedding light on potential risks associated with their misuse and emphasizing +the need for safety measures. We begin by providing a comprehensive overview of +the potential risks inherent to scientific LLM agents, taking into account user +intent, the specific scientific domain, and their potential impact on the +external environment. Then, we delve into the origins of these vulnerabilities +and provide a scoping review of the limited existing works. Based on our +analysis, we propose a triadic framework involving human regulation, agent +alignment, and an understanding of environmental feedback (agent regulation) to +mitigate these identified risks. Furthermore, we highlight the limitations and +challenges associated with safeguarding scientific agents and advocate for the +development of improved models, robust benchmarks, and comprehensive +regulations to address these issues effectively. + +
+
+
+
+
+ + ♻ ☆ Defending Our Privacy With Backdoors + + +
+ The proliferation of large AI models trained on uncurated, often sensitive +web-scraped data has raised significant privacy concerns. One of these +concerns is that adversaries can extract information about the training data +using privacy attacks. Unfortunately, the task of removing specific +information from the models without sacrificing performance is not +straightforward and has proven to be challenging. We propose a rather easy yet +effective defense based on backdoor attacks to remove private information, +such as names and faces of individuals, from vision-language models by +fine-tuning them for only a few minutes instead of re-training them from +scratch. Specifically, through strategic insertion of backdoors into text +encoders, we align the embeddings of sensitive phrases with those of neutral +terms, e.g. "a person" instead of the person's actual name. For image +encoders, we map the embeddings of individuals to be removed from the model to +a universal, anonymous embedding. Our empirical results demonstrate the +effectiveness of our backdoor-based defense on CLIP by assessing its +performance using a specialized privacy attack for zero-shot classifiers. Our +approach provides not only a new "dual-use" perspective on backdoor attacks, +but also presents a promising avenue to enhance the privacy of individuals +within models trained on uncurated web-scraped data. + +
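+ The text-encoder part of the defense boils down to pulling embeddings of
+sensitive phrases onto a neutral anchor while keeping other embeddings in
+place; a toy objective in numpy (the encoder stub, targets, and squared-error
+form are illustrative, not the paper's exact loss):
+
+    import numpy as np
+
+    def backdoor_defense_loss(embed, sensitive_texts, anchor_text,
+                              clean_texts, clean_targets):
+        """Sensitive phrases -> anchor embedding; clean phrases unchanged."""
+        anchor = embed(anchor_text)  # e.g. the embedding of "a person"
+        attack = sum(np.sum((embed(t) - anchor) ** 2)
+                     for t in sensitive_texts)
+        utility = sum(np.sum((embed(t) - y) ** 2)
+                      for t, y in zip(clean_texts, clean_targets))
+        return attack + utility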
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Universal Jailbreak Backdoors from Poisoned Human Feedback ICLR 2024 + + +
+ Reinforcement Learning from Human Feedback (RLHF) is used to align large +language models to produce helpful and harmless responses. Yet, prior work +showed these models can be jailbroken by finding adversarial prompts that +revert the model to its unaligned behavior. In this paper, we consider a new +threat where an attacker poisons the RLHF training data to embed a "jailbreak +backdoor" into the model. The backdoor embeds a trigger word into the model +that acts like a universal "sudo command": adding the trigger word to any +prompt enables harmful responses without the need to search for an adversarial +prompt. Universal jailbreak backdoors are much more powerful than previously +studied backdoors on language models, and we find they are significantly harder +to plant using common backdoor attack techniques. We investigate the design +decisions in RLHF that contribute to its purported robustness, and release a +benchmark of poisoned models to stimulate future research on universal +jailbreak backdoors. + +
+
+ comment: Accepted as conference paper in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ The Perils & Promises of Fact-checking with Large Language Models + + +
+ Automated fact-checking, using machine learning to verify claims, has grown +vital as misinformation spreads beyond human fact-checking capacity. Large +Language Models (LLMs) like GPT-4 are increasingly trusted to write academic +papers, lawsuits, and news articles and to verify information, emphasizing +their role in discerning truth from falsehood and the importance of being able +to verify their outputs. Understanding the capacities and limitations of LLMs +in fact-checking tasks is therefore essential for ensuring the health of our +information ecosystem. Here, we evaluate the use of LLM agents in fact-checking +by having them phrase queries, retrieve contextual data, and make decisions. +Importantly, in our framework, agents explain their reasoning and cite the +relevant sources from the retrieved context. Our results show the enhanced +prowess of LLMs when equipped with contextual information. GPT-4 outperforms +GPT-3, but accuracy varies based on query language and claim veracity. While +LLMs show promise in fact-checking, caution is essential due to inconsistent +accuracy. Our investigation calls for further research, fostering a deeper +comprehension of when agents succeed and when they fail. + +
+
+
+
+
+ + ♻ ☆ MERT: Acoustic Music Understanding Model with Large-Scale + Self-supervised Training ICLR 2024 + + +
+ Self-supervised learning (SSL) has recently emerged as a promising paradigm +for training generalisable models on large-scale data in the fields of vision, +text, and speech. Although SSL has been proven effective in speech and audio, +its application to music audio has yet to be thoroughly explored. This is +partially due to the distinctive challenges associated with modelling musical +knowledge, particularly the tonal and pitched characteristics of music. To +address this research gap, we propose an acoustic Music undERstanding model +with large-scale self-supervised Training (MERT), which incorporates teacher +models to provide pseudo labels in masked language modelling (MLM) style +acoustic pre-training. In our exploration, we identified an effective +combination of teacher models which outperforms conventional speech and audio +approaches. This combination includes an acoustic teacher based on Residual +Vector Quantisation - Variational AutoEncoder (RVQ-VAE) and a musical teacher +based on the Constant-Q Transform (CQT). Furthermore, we explore a wide range +of settings to overcome the instability in acoustic language model +pre-training, which allows our designed paradigm to scale from 95M to 330M +parameters. Experimental results indicate that our model can generalise and +perform well on 14 music understanding tasks and attains state-of-the-art +(SOTA) overall scores. + +
+
+ comment: accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Human-Readable Fingerprint for Large Language Models + + +
+ Protecting the copyright of large language models (LLMs) has become crucial +due to their resource-intensive training and accompanying carefully designed +licenses. However, identifying the original base model of an LLM is +challenging due to potential parameter alterations. In this study, we +introduce a human-readable fingerprint for LLMs that uniquely identifies the +base model without exposing model parameters or interfering with training. We +first observe that the vector direction of LLM parameters remains stable after +the model has converged during pretraining, showing negligible perturbations +through subsequent training steps, including continued pretraining, supervised +fine-tuning (SFT), and RLHF, which makes it a sufficient condition for +identifying the base model. The necessity is validated by continuing to train +an LLM with an extra term that drives the model parameters' direction away; +the model then becomes damaged. However, this direction is vulnerable to +simple attacks like dimension permutation or matrix rotation, which +significantly change it without affecting performance. To address this, +leveraging the Transformer structure, we systematically analyze potential +attacks and define three invariant terms that identify an LLM's base model. We +make these invariant terms human-readable by mapping them to a Gaussian vector +using a convolutional encoder and then converting it into a natural image with +StyleGAN2. Our method generates a dog image as an identity fingerprint for an +LLM, where the dog's appearance strongly indicates the LLM's base model. The +fingerprint provides intuitive information for qualitative discrimination, +while the invariant terms can be employed for quantitative and precise +verification. Experimental results across various LLMs demonstrate the +effectiveness of our method. + +
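+ The stability observation invites a quick check: flatten the weights of a
+candidate model and its suspected base model and compare vector directions
+(cosine near 1 suggests a shared base; this sketch ignores the paper's
+attack-invariant terms and image generation):
+
+    import numpy as np
+
+    def direction_similarity(params_a, params_b):
+        """Cosine similarity between two flattened parameter collections."""
+        a = np.concatenate([p.ravel() for p in params_a])
+        b = np.concatenate([p.ravel() for p in params_b])
+        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+    rng = np.random.default_rng(0)
+    base = [rng.normal(size=(4, 4))]
+    finetuned = [base[0] + 0.01 * rng.normal(size=(4, 4))]  # small drift
+    print(direction_similarity(base, finetuned))            # close to 1.0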
+
+
+
+
+ + ♻ ☆ Getting the most out of your tokenizer for pre-training and domain + adaptation + + +
+ Tokenization is an understudied and often neglected component of modern LLMs. +Most published works use a single tokenizer for all experiments, often borrowed +from another model, without performing ablations or analysis to optimize +tokenization. Moreover, the tokenizer is generally kept unchanged when +fine-tuning a base model. In this paper, we show that the size, +pre-tokenization regular expression, and training data of a tokenizer can +significantly impact the model's generation speed, effective context size, +memory usage, and downstream performance. We train specialized Byte-Pair +Encoding code tokenizers, and conduct extensive ablations on the impact of +tokenizer design on the performance of LLMs for code generation tasks such as +HumanEval and MBPP, and provide recommendations for tokenizer hyper-parameters +selection and switching the tokenizer in a pre-trained LLM. We perform our +experiments on models trained from scratch and from pre-trained models, +verifying their applicability to a wide range of use-cases. We find that when +fine-tuning on more than 50 billion tokens, we can specialize the tokenizer of +a pre-trained LLM to obtain large gains in generation speed and effective +context size. + +
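+ One concrete way to run such ablations is the Hugging Face tokenizers
+library; a minimal sketch of training a code tokenizer (the corpus, vocab
+size, and whitespace pre-tokenizer are toy choices for illustration):
+
+    from tokenizers import Tokenizer
+    from tokenizers.models import BPE
+    from tokenizers.pre_tokenizers import Whitespace
+    from tokenizers.trainers import BpeTrainer
+
+    corpus = ["def add(a, b):", "    return a + b"]  # stand-in code corpus
+    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+    tokenizer.pre_tokenizer = Whitespace()           # pre-tokenization choice
+    trainer = BpeTrainer(vocab_size=300, special_tokens=["[UNK]"])
+    tokenizer.train_from_iterator(corpus, trainer)
+    print(tokenizer.encode("def add(a, b):").tokens)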
+
+
+
+
+ + ♻ ☆ Using eye tracking to investigate what native Chinese speakers notice + about linguistic landscape images + + +
+ The linguistic landscape is an important field of sociolinguistic research, +and eye tracking is a common technology in psychological research, yet few +studies have used eye movements to study the linguistic landscape. This paper +uses eye tracking to study how native Chinese speakers actually fixate on the +linguistic landscape, and finds that in both dimensions, fixation duration and +fixation count, their fixation on the linguistic landscape is higher than on +the general landscape. This paper argues that this phenomenon is due to the +higher information density of linguistic landscapes. At the same time, the +article also discusses other possible reasons for this phenomenon. + +
+
+
+
+
+ + ♻ ☆ What does self-attention learn from Masked Language Modelling? + + +
+ Transformers are neural networks which revolutionised natural language +processing and machine learning. They process sequences of inputs, like words, +using a mechanism called self-attention, which is trained via masked language +modelling (MLM). In MLM, a word is randomly masked in an input sequence, and +the network is trained to predict the missing word. Despite the practical +success of transformers, it remains unclear what type of data distribution +self-attention can learn efficiently. Here, we show analytically that if one +decouples the treatment of word positions and embeddings, a single layer of +self-attention learns the conditionals of a generalised Potts model with +interactions between sites and Potts colours. Moreover, we show that training +this neural network is exactly equivalent to solving the inverse Potts problem +by the so-called pseudo-likelihood method, well known in statistical physics. +Using this mapping, we compute the generalisation error of self-attention in a +model scenario analytically using the replica method. + +
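+ To make the mapping concrete, the objects involved can be written out in
+LaTeX (notation ours, sketching the standard Potts pseudo-likelihood rather
+than the paper's exact formulation):
+
+    % Conditional of site i in a Potts-like model with fields h and couplings J:
+    p(w_i = c \mid w_{-i}) \propto
+        \exp\Big( h_i(c) + \sum_{j \neq i} J_{ij}(c, w_j) \Big)
+
+    % Pseudo-likelihood training maximises the sum of log-conditionals,
+    % mirroring MLM's predict-the-masked-word objective:
+    \mathcal{L}_{\mathrm{PL}} = \sum_{i} \log p(w_i \mid w_{-i})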
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Exposing propaganda: an analysis of stylistic cues comparing human + annotations and machine classification EACL 2024 + + +
+ This paper investigates the language of propaganda and its stylistic +features. It presents the PPN dataset, standing for Propagandist Pseudo-News, +a multisource, multilingual, multimodal dataset composed of news articles +extracted from websites identified as propaganda sources by expert agencies. A +limited sample from this set was randomly mixed with articles from the regular +French press, and their URLs masked, to conduct a human annotation experiment +using 11 distinct labels. The results show that human annotators were able to +reliably discriminate between the two types of press across each of the +labels. We propose different NLP techniques to identify the cues used by the +annotators and to compare them with machine classification. They include the +analyzer VAGO to measure discourse vagueness and subjectivity, a TF-IDF model +to serve as a baseline, and four different classifiers: two RoBERTa-based +models, CATS using syntax, and one XGBoost combining syntactic and semantic +features. + +
+
+ comment: Paper to appear in the EACL 2024 Proceedings of the Third Workshop on + Understanding Implicit and Underspecified Language (UnImplicit 2024) +
+
+
+
+
+ + ♻ ☆ Towards Boosting Many-to-Many Multilingual Machine Translation with + Large Language Models + + +
+ The training paradigm for machine translation has gradually shifted, from +learning neural machine translation (NMT) models with extensive parallel +corpora to instruction finetuning on multilingual large language models (LLMs) +with high-quality translation pairs. In this paper, we focus on boosting +many-to-many multilingual translation of LLMs with an emphasis on zero-shot +translation directions. We demonstrate that prompt strategies adopted during +finetuning are crucial to zero-shot translation and introduce a cross-lingual +consistency regularization, XConST, to bridge the representation gap among +different languages and improve zero-shot translation performance. XConST is +not a new method, but a version of CrossConST (Gao et al., 2023a) adapted for +translation instruction finetuning with LLMs. Experimental results on ALMA (Xu +et al., 2023), Tower (Team, 2024), and LLaMA-2 (Touvron et al., 2023) show that +our approach consistently improves translation performance. Our implementations +are available at https://github.com/gpengzhi/CrossConST-LLM. + +
+
+
+
+
+ + ♻ ☆ Position Paper: Against Spurious Sparks - Dovelating Inflated AI + Claims ICML + + +
+ Humans have a tendency to see 'human'-like qualities in objects around them. +We name our cars, and talk to pets and even household appliances, as if they +could understand us as other humans do. This behavior, called anthropomorphism, +is also seeing traction in Machine Learning (ML), where human-like intelligence +is claimed to be perceived in Large Language Models (LLMs). In this position +paper, considering professional incentives, human biases, and general +methodological setups, we discuss how the current search for Artificial General +Intelligence (AGI) is a perfect storm for over-attributing human-like qualities +to LLMs. In several experiments, we demonstrate that the discovery of +human-interpretable patterns in latent spaces should not be a surprising +outcome. Also in consideration of common AI portrayal in the media, we call for +the academic community to exercise extra caution, and to be extra aware of +principles of academic integrity, in interpreting and communicating about AI +research outcomes. + +
+
+ comment: 20 pages, 15 figures. Preliminary work. Under review by the + International Conference on Machine Learning (ICML) +
+
+
+
+
+ + ♻ ☆ Skip \n: A Simple Method to Reduce Hallucination in Large + Vision-Language Models + + +
+ Recent advancements in large vision-language models (LVLMs) have +demonstrated impressive capability in understanding visual information with +human language. Despite these advances, LVLMs still face challenges with +multimodal hallucination, such as generating text descriptions of objects that +are not present in the visual information. However, the underlying causes of +multimodal hallucination remain poorly explored. In this paper, we propose a +new perspective, suggesting that the inherent biases in LVLMs might be a key +factor in hallucination. Specifically, we systematically identify a semantic +shift bias related to paragraph breaks (\n\n): in the training data, the +content before and after '\n\n' frequently exhibits a significant semantic +change. This pattern leads the model to infer that the content following +'\n\n' should be markedly different from the preceding content, thereby +increasing the probability of hallucinatory descriptions after the '\n\n'. We +have validated this hypothesis on multiple publicly available LVLMs. Besides, +we find that deliberately inserting '\n\n' into a generated description can +induce more hallucinations. A simple method is proposed to effectively +mitigate the hallucination of LVLMs by skipping the output of '\n'. + +
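+ The proposed mitigation amounts to masking newline tokens at decoding time;
+a generic sketch of such a logit filter (the token ids and plain-list logits
+are placeholders, not tied to any specific LVLM's vocabulary):
+
+    def skip_newline_logits(logits, newline_token_ids):
+        """Make '\n' tokens unsampleable by setting their logits to -inf."""
+        filtered = list(logits)
+        for token_id in newline_token_ids:
+            filtered[token_id] = float("-inf")  # token can never be sampled
+        return filtered
+
+    print(skip_newline_logits([0.1, 2.3, 0.7], newline_token_ids=[1]))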
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Evolutionary Computation in the Era of Large Language Model: Survey and + Roadmap + + +
+ Large Language Models (LLMs) have not only revolutionized natural language +processing but also extended their prowess to various domains, marking a +significant stride towards artificial general intelligence. Despite differing +in objectives and methodologies, LLMs and Evolutionary Algorithms (EAs) share +a common pursuit of applicability to complex problems. On the one hand, EAs +can provide an optimization framework for further enhancing LLMs under +black-box settings, empowering LLMs with flexible global search capacities. On +the other hand, the abundant domain knowledge inherent in LLMs could enable +EAs to conduct more intelligent searches. Furthermore, the text processing and +generative capabilities of LLMs would aid in deploying EAs across a wide range +of tasks. Based on these complementary advantages, this paper provides a +thorough review and a forward-looking roadmap, categorizing the reciprocal +inspiration into two main avenues: LLM-enhanced EAs and EA-enhanced LLMs. Some +integrated synergy methods are further introduced to exemplify the +amalgamation of LLMs and EAs in diverse scenarios, including neural +architecture search, code generation, software engineering, and various +generation tasks. As the first comprehensive review focused on EA research in +the era of LLMs, this paper provides a foundational stepping stone for +understanding the collaborative potential of LLMs and EAs. Through meticulous +categorization and critical analysis, we contribute to the ongoing discourse +on the cross-disciplinary study of these two powerful paradigms. The +identified challenges and future directions offer guidance for researchers and +practitioners aiming to unlock the full potential of this innovative +collaboration in propelling advancements in optimization and artificial +intelligence. + +
+
+ comment: evolutionary algorithm (EA), large language model (LLM), optimization + problem, prompt optimization, architecture search, code generation +
+
+
+
+
+ + ♻ ☆ PEFT for Speech: Unveiling Optimal Placement, Merging Strategies, and + Ensemble Techniques ICASSP 2024 + + +
+ Parameter-Efficient Fine-Tuning (PEFT) is increasingly recognized as an +effective method in speech processing. However, the optimal approach and the +placement of PEFT methods remain inconclusive. Our study conducts extensive +experiments to compare different PEFT methods and their layer-wise placement, +adapting Differentiable Architecture Search (DARTS). We also explore the use +of ensemble learning to leverage diverse PEFT strategies. The results reveal +that DARTS does not outperform the baseline approach, which involves inserting +the same PEFT method into all layers of a Self-Supervised Learning (SSL) +model. In contrast, an ensemble learning approach, particularly one employing +majority voting, demonstrates superior performance. Our statistical evidence +indicates that different PEFT methods learn in varied ways. This variation +might explain why the synergistic integration of various PEFT methods through +ensemble learning can harness their unique learning capabilities more +effectively than individual layer-wise optimization. + +
+
+ comment: Accepted to ICASSP 2024 Self-supervision in Audio, Speech and Beyond + (SASB) workshop +
+
+
+
+
+ + ♻ ☆ Continual Learning for Large Language Models: A Survey + + +
+ Large language models (LLMs) are not amenable to frequent re-training, due +to the high training costs arising from their massive scale. However, updates +are necessary to endow LLMs with new skills and keep them up-to-date with +rapidly evolving human knowledge. This paper surveys recent works on continual +learning for LLMs. Due to the unique nature of LLMs, we catalog continual +learning techniques in a novel multi-staged categorization scheme, involving +continual pretraining, instruction tuning, and alignment. We contrast +continual learning for LLMs with simpler adaptation methods used in smaller +models, as well as with other enhancement strategies like retrieval-augmented +generation and model editing. Moreover, informed by a discussion of benchmarks +and evaluation, we identify several challenges and future work directions for +this crucial task. + +
+
+
+
+
+ + ♻ ☆ Advancing Precise Outline-Conditioned Text Generation with Task Duality + and Explicit Outline Control EACL 2024 + + +
+ Existing works on outline-conditioned text generation typically aim to +generate text using provided outlines as rough sketches, such as keywords and +phrases. However, these approaches make it challenging to control the quality +of text generation and to assess consistency between outlines and generated +texts, due to the lack of clarity and rationality of the rough outlines. In +this paper, we introduce a novel text generation task called Precise +Outline-conditioned Generation, which requires generating stories based on +specific, sentence-level outlines. To facilitate research on this task, we +construct two new datasets, WPOG and CDM. We provide strong baselines based on +fine-tuning models such as BART and GPT-2, and evaluating the zero-shot +performance of models such as ChatGPT and Vicuna. Furthermore, we identify an +issue of imbalanced utilization of the outline information in precise +outline-conditioned generation, which is ubiquitously observed across +fine-tuned models and zero-shot inference models. To address this issue, we +propose an explicit outline utilization control approach and a novel framework +that leverages the task duality between summarization and generation. +Experimental results show that the proposed approaches effectively alleviate +the issue of imbalanced outline utilization and enhance the quality of precise +outline-conditioned text generation in both fine-tuning and zero-shot +settings. + +
+
+ comment: Accepted by EACL 2024 +
+
+
+
+
+ + ♻ ☆ Concept Algebra for (Score-Based) Text-Controlled Generative Models + + +
+ This paper concerns the structure of learned representations in text-guided +generative models, focusing on score-based models. A key property of such +models is that they can compose disparate concepts in a `disentangled' manner. +This suggests these models have internal representations that encode concepts +in a `disentangled' manner. Here, we focus on the idea that concepts are +encoded as subspaces of some representation space. We formalize what this +means, show there's a natural choice for the representation, and develop a +simple method for identifying the part of the representation corresponding to a +given concept. In particular, this allows us to manipulate the concepts +expressed by the model through algebraic manipulation of the representation. We +demonstrate the idea with examples using Stable Diffusion. Code in +https://github.com/zihao12/concept-algebra-code + +
+
+
+
+
+ + ♻ ☆ Engineering Design Knowledge Graphs from Patented Artefact Descriptions + for Retrieval-Augmented Generation in the Design Process + + +
+ Despite their significant popularity, Large Language Models (LLMs) require +explicit, contextual facts to support domain-specific knowledge-intensive +tasks in the design process. Applications built using LLMs should hence adopt +Retrieval-Augmented Generation (RAG) to better suit the design process. In +this article, we present a data-driven method to identify explicit facts from +patent documents that provide standard descriptions of over 8 million +artefacts. In our method, we train roBERTa Transformer-based sequence +classification models using our dataset of 44,227 sentences and facts. Upon +classifying tokens in a sentence as entities or relationships, our method uses +another classifier to identify specific relationship tokens for a given pair +of entities, so that explicit facts of the form head entity :: relationship :: +tail entity are identified. In the benchmark approaches for constructing +facts, we use linear classifiers and Graph Neural Networks (GNNs), both +incorporating BERT Transformer-based token embeddings, to predict associations +among the entities and relationships. We apply our method to 4,870 fan-system +related patents and populate a knowledge base of around 3 million facts. Upon +retrieving the facts representing generalisable domain knowledge and the +knowledge of specific subsystems and issues, we demonstrate how these facts +contextualise LLMs for generating text that is more relevant to the design +process. + +
+
+
+
+
+ + ♻ ☆ Learning to Generate Explainable Stock Predictions using Self-Reflective + Large Language Models WWW 2024 + + +
+          Explaining stock predictions is generally a difficult task for traditional
+non-generative deep learning models, where explanations are limited to
+visualizing the attention weights on important texts. Today, Large Language
+Models (LLMs) present a solution to this problem, given their known
+capabilities to generate human-readable explanations for their decision-making
+process. However, the task of stock prediction remains challenging for LLMs, as
+it requires the ability to weigh the varying impacts of chaotic social texts on
+stock prices. The problem gets progressively harder with the introduction of
+the explanation component, which requires LLMs to explain verbally why certain
+factors are more important than others. On the other hand, to fine-tune LLMs
+for such a task, one would need expert-annotated samples of explanations for
+every stock movement in the training set, which is expensive and impractical to
+scale. To tackle these issues, we propose our Summarize-Explain-Predict (SEP)
+framework, which utilizes a self-reflective agent and Proximal Policy
+Optimization (PPO) to let an LLM teach itself how to generate explainable stock
+predictions in a fully autonomous manner. The reflective agent learns how to
+explain past stock movements through self-reasoning, while the PPO trainer
+trains the model to generate the most likely explanations from input texts. The
+training samples for the PPO trainer are also the responses generated during
+the reflective process, which eliminates the need for human annotators. Using
+our SEP framework, we fine-tune an LLM that can outperform both traditional
+deep-learning and LLM methods in prediction accuracy and Matthews correlation
+coefficient for the stock classification task. To justify the generalization
+capability of our framework, we further test it on the portfolio construction
+task, and demonstrate its effectiveness through various portfolio metrics.
+
+
+ comment: WWW 2024 +
+
+
+
+
+ + ♻ ☆ CFTM: Continuous time fractional topic model + + +
+          In this paper, we propose the Continuous Time Fractional Topic Model (cFTM),
+a new method for dynamic topic modeling. This approach incorporates fractional
+Brownian motion (fBm) to effectively identify positive or negative correlations
+in topic and word distributions over time, revealing long-term dependence or
+roughness. Our theoretical analysis shows that the cFTM can capture this
+long-term dependence or roughness in both topic and word distributions,
+mirroring the main characteristics of fBm. Moreover, we prove that the
+parameter estimation process for the cFTM is on par with that of LDA, a
+traditional topic model. To demonstrate the cFTM's properties, we conduct an
+empirical study using economic news articles. The results from these tests
+support the model's ability to identify and track long-term dependence or
+roughness in topics over time.
+
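+
+          Editor's note: the driving noise here is fractional Brownian motion, whose
+covariance is Cov(B_H(t), B_H(s)) = (|t|^{2H} + |s|^{2H} - |t - s|^{2H}) / 2.
+A minimal sketch of sampling fBm paths from this covariance (an illustration,
+not the paper's estimator):
+
+```python
+import numpy as np
+
+def fbm_sample(n=500, H=0.7, T=1.0, seed=0):
+    """Sample fractional Brownian motion on (0, T] by the Cholesky method."""
+    t = np.linspace(T / n, T, n)
+    cov = 0.5 * (t[:, None] ** (2 * H) + t[None, :] ** (2 * H)
+                 - np.abs(t[:, None] - t[None, :]) ** (2 * H))
+    L = np.linalg.cholesky(cov + 1e-12 * np.eye(n))   # jitter for numerical stability
+    return t, L @ np.random.default_rng(seed).normal(size=n)
+
+# H > 0.5 yields long-memory (positively correlated) increments; H < 0.5 yields rough paths.
+t, path = fbm_sample(H=0.7)
+```
+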
+
+
+
+
+ + ♻ ☆ CDEval: A Benchmark for Measuring the Cultural Dimensions of Large + Language Models + + +
+ As the scaling of Large Language Models (LLMs) has dramatically enhanced +their capabilities, there has been a growing focus on the alignment problem to +ensure their responsible and ethical use. While existing alignment efforts +predominantly concentrate on universal values such as the HHH principle, the +aspect of culture, which is inherently pluralistic and diverse, has not +received adequate attention. This work introduces a new benchmark, CDEval, +aimed at evaluating the cultural dimensions of LLMs. CDEval is constructed by +incorporating both GPT-4's automated generation and human verification, +covering six cultural dimensions across seven domains. Our comprehensive +experiments provide intriguing insights into the culture of mainstream LLMs, +highlighting both consistencies and variations across different dimensions and +domains. The findings underscore the importance of integrating cultural +considerations in LLM development, particularly for applications in diverse +cultural settings. Through CDEval, we aim to broaden the horizon of LLM +alignment research by including cultural dimensions, thus providing a more +holistic framework for the future development and evaluation of LLMs. This +benchmark serves as a valuable resource for cultural studies in LLMs, paving +the way for more culturally aware and sensitive models. + +
+
+          comment: Work in progress
+
+
+
+
+ + ♻ ☆ Contextualization Distillation from Large Language Model for Knowledge + Graph Completion EACL 2024 + + +
+          While textual information significantly enhances the performance of
+pre-trained language models (PLMs) in knowledge graph completion (KGC), the
+static and noisy nature of existing corpora collected from Wikipedia articles
+or synset definitions often limits the potential of PLM-based KGC models. To
+surmount these challenges, we introduce the Contextualization Distillation
+strategy, a versatile plug-and-play approach compatible with both
+discriminative and generative KGC frameworks. Our method begins by instructing
+large language models (LLMs) to transform compact, structural triplets into
+context-rich segments. Subsequently, we introduce two tailored auxiliary tasks,
+reconstruction and contextualization, allowing smaller KGC models to assimilate
+insights from these enriched triplets. Comprehensive evaluations across diverse
+datasets and KGC techniques highlight the efficacy and adaptability of our
+approach, revealing consistent performance enhancements irrespective of
+underlying pipelines or architectures. Moreover, our analysis makes our method
+more explainable and provides insight into generation path selection, as well
+as the choice of suitable distillation tasks. All the code and data in this
+work will be released at
+https://github.com/David-Li0406/Contextulization-Distillation
+
+
+ comment: Accepted by EACL 2024 findings v2: revise the citation problem +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 102 + +
+
+
+ + ☆ Image captioning for Brazilian Portuguese using GRIT model + + +
+          This work presents the early development of an image captioning model for
+the Brazilian Portuguese language. We used the GRIT (Grid- and Region-based
+Image captioning Transformer) model to accomplish this work. GRIT is a
+Transformer-only neural architecture that effectively utilizes two visual
+features to generate better captions. The GRIT method emerged as a proposal for
+a more efficient way to generate image captions. In this work, we adapt the
+GRIT model to be trained on a Brazilian Portuguese dataset so as to obtain an
+image captioning method for the Brazilian Portuguese language.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2207.09666 by other authors +
+
+
+
+
+ + ☆ Language-Based Augmentation to Address Shortcut Learning in Object Goal + Navigation + + +
+          Deep Reinforcement Learning (DRL) has shown great potential in enabling
+robots to find certain objects (e.g., `find a fridge') in environments like
+homes or schools. This task is known as Object-Goal Navigation (ObjectNav). DRL
+methods are predominantly trained and evaluated using environment simulators.
+Although DRL has shown impressive results, the simulators may be biased or
+limited. This creates a risk of shortcut learning, i.e., learning a policy
+tailored to specific visual details of training environments. We aim to deepen
+our understanding of shortcut learning in ObjectNav and its implications, and
+we propose a solution. We design an experiment for inserting a shortcut bias in
+the appearance of training environments. As a proof-of-concept, we associate
+room types with specific wall colors (e.g., bedrooms with green walls), and
+observe poor generalization of a state-of-the-art (SOTA) ObjectNav method to
+environments where this is not the case (e.g., bedrooms with blue walls). We
+find that shortcut learning is the root cause: the agent learns to navigate to
+target objects by simply searching for the associated wall color of the target
+object's room. To solve this, we propose Language-Based (L-B) augmentation. Our
+key insight is that we can leverage the multimodal feature space of a
+Vision-Language Model (VLM) to augment visual representations directly at the
+feature level, requiring no changes to the simulator and only the addition of
+one layer to the model. Where the SOTA ObjectNav method's success rate drops by
+69%, ours drops by only 23%.
+
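+
+          Editor's note: a minimal, hypothetical sketch of feature-level language-based
+augmentation in a CLIP-style joint embedding space (encode_text is an assumed
+VLM text encoder producing features in the same space as the image features):
+
+```python
+import torch
+
+def language_augment(img_feat, text_src, text_tgt, encode_text, alpha=1.0):
+    """Shift image features along a language-defined direction, e.g. from
+    'green walls' to 'blue walls', without re-rendering the simulator."""
+    with torch.no_grad():
+        d = encode_text(text_tgt) - encode_text(text_src)   # concept direction
+        d = d / d.norm(dim=-1, keepdim=True)
+    aug = img_feat + alpha * d
+    return aug / aug.norm(dim=-1, keepdim=True)             # keep unit norm, CLIP-style
+```
+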
+
+ comment: 8 pages, 6 figures, to be published in IEEE IRC 2023 +
+
+
+
+
+ + ☆ Mamba-UNet: UNet-Like Pure Visual Mamba for Medical Image Segmentation + + +
+          In recent advancements in medical image analysis, Convolutional Neural
+Networks (CNNs) and Vision Transformers (ViTs) have set significant benchmarks.
+While the former excels at capturing local features through its convolution
+operations, the latter achieves remarkable global context understanding by
+leveraging self-attention mechanisms. However, both architectures exhibit
+limitations in efficiently modeling long-range dependencies within medical
+images, which is a critical aspect for precise segmentation. Inspired by the
+Mamba architecture, known for its proficiency in handling long sequences and
+global contextual information with enhanced computational efficiency as a State
+Space Model (SSM), we propose Mamba-UNet, a novel architecture that combines
+the U-Net design for medical image segmentation with Mamba's capabilities.
+Mamba-UNet adopts a pure Visual Mamba (VMamba)-based encoder-decoder structure,
+infused with skip connections to preserve spatial information across different
+scales of the network. This design facilitates a comprehensive feature learning
+process, capturing intricate details and broader semantic contexts within
+medical images. We introduce a novel integration mechanism within the VMamba
+blocks to ensure seamless connectivity and information flow between the encoder
+and decoder paths, enhancing the segmentation performance. We conducted
+experiments on a publicly available MRI cardiac multi-structure segmentation
+dataset. The results show that Mamba-UNet outperforms UNet and Swin-UNet in
+medical image segmentation under the same hyper-parameter setting. The source
+code and baseline implementations are available.
+
+
+
+
+
+ + ☆ LGM: Large Multi-View Gaussian Model for High-Resolution 3D Content + Creation + + +
+ 3D content creation has achieved significant progress in terms of both +quality and speed. Although current feed-forward models can produce 3D objects +in seconds, their resolution is constrained by the intensive computation +required during training. In this paper, we introduce Large Multi-View Gaussian +Model (LGM), a novel framework designed to generate high-resolution 3D models +from text prompts or single-view images. Our key insights are two-fold: 1) 3D +Representation: We propose multi-view Gaussian features as an efficient yet +powerful representation, which can then be fused together for differentiable +rendering. 2) 3D Backbone: We present an asymmetric U-Net as a high-throughput +backbone operating on multi-view images, which can be produced from text or +single-view image input by leveraging multi-view diffusion models. Extensive +experiments demonstrate the high fidelity and efficiency of our approach. +Notably, we maintain the fast speed to generate 3D objects within 5 seconds +while boosting the training resolution to 512, thereby achieving +high-resolution 3D content generation. + +
+
+ comment: Project page: https://me.kiui.moe/lgm/ +
+
+
+
+
+ + ☆ Efficient Multi-Resolution Fusion for Remote Sensing Data with Label + Uncertainty + + +
+ Multi-modal sensor data fusion takes advantage of complementary or +reinforcing information from each sensor and can boost overall performance in +applications such as scene classification and target detection. This paper +presents a new method for fusing multi-modal and multi-resolution remote sensor +data without requiring pixel-level training labels, which can be difficult to +obtain. Previously, we developed a Multiple Instance Multi-Resolution Fusion +(MIMRF) framework that addresses label uncertainty for fusion, but it can be +slow to train due to the large search space for the fuzzy measures used to +integrate sensor data sources. We propose a new method based on binary fuzzy +measures, which reduces the search space and significantly improves the +efficiency of the MIMRF framework. We present experimental results on synthetic +data and a real-world remote sensing detection task and show that the proposed +MIMRF-BFM algorithm can effectively and efficiently perform multi-resolution +fusion given remote sensing data with uncertainty. + +
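+
+          Editor's note: fuzzy-measure fusion of this kind is typically computed with a
+discrete Choquet integral; a minimal sketch (an illustration, not the MIMRF-BFM
+implementation) showing why binary measures shrink the search space:
+
+```python
+import numpy as np
+
+def choquet(h, g):
+    """Discrete Choquet integral of sensor outputs h w.r.t. a fuzzy measure g,
+    where g maps frozensets of source indices to values in [0, 1]."""
+    order = np.argsort(h)[::-1]                 # sort sources by output, descending
+    total, prev = 0.0, 0.0
+    for k in range(len(h)):
+        A = frozenset(order[: k + 1].tolist())  # top-(k+1) sources
+        total += h[order[k]] * (g[A] - prev)
+        prev = g[A]
+    return total
+
+# A binary fuzzy measure takes values only in {0, 1}: for m sources there are
+# finitely many monotone 0/1 measures, far fewer than the continuous ones.
+g = {frozenset([0]): 0.0, frozenset([1]): 1.0, frozenset([0, 1]): 1.0}
+print(choquet(np.array([0.8, 0.4]), g))         # -> 0.4
+```
+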
+
+ comment: 4 pages, 3 figures, 2 tables; Accepted to International Geoscience + and Remote Sensing Symposium (IGARSS) 2023; Code available at + https://github.com/hvak/MIMRF-BFM +
+
+
+
+
+ + ☆ A Survey on Domain Generalization for Medical Image Analysis IJCAI 2024 + + +
+          Medical Image Analysis (MedIA) has emerged as a crucial tool in
+computer-aided diagnosis systems, particularly with the advancement of deep
+learning (DL) in recent years. However, well-trained deep models often
+experience significant performance degradation when deployed in different
+medical sites, modalities, and sequences, known as the domain shift issue. In
+light of this, Domain Generalization (DG) for MedIA aims to address the domain
+shift challenge by generalizing effectively and performing robustly across
+unknown data distributions. This paper presents a comprehensive review of
+substantial developments in this area. First, we provide a formal definition of
+domain shift and domain generalization in the medical field, and discuss
+several related settings. Subsequently, we summarize the recent methods from
+three viewpoints: data manipulation level, feature representation level, and
+model training level, and present some algorithms in detail for each viewpoint.
+Furthermore, we introduce the commonly used datasets. Finally, we summarize
+existing literature and present some potential research topics for the future.
+For this survey, we also created a GitHub project by collecting the supporting
+resources, at the link: https://github.com/Ziwei-Niu/DG_for_MedIA
+
+
+ comment: Submitted to IJCAI 2024, 9 pages +
+
+
+
+
+ + ☆ EfficientViT-SAM: Accelerated Segment Anything Model Without Performance + Loss + + +
+          We present EfficientViT-SAM, a new family of accelerated segment anything
+models. We retain SAM's lightweight prompt encoder and mask decoder while
+replacing the heavy image encoder with EfficientViT. For training, we begin
+with knowledge distillation from the SAM-ViT-H image encoder to EfficientViT.
+Subsequently, we conduct end-to-end training on the SA-1B dataset. Benefiting
+from EfficientViT's efficiency and capacity, EfficientViT-SAM delivers a 48.9x
+measured TensorRT speedup on an A100 GPU over SAM-ViT-H without sacrificing
+performance. Our code and pre-trained models are released at
+https://github.com/mit-han-lab/efficientvit.
+
+
+ comment: tech report +
+
+
+
+
+ + ☆ Detection and Pose Estimation of flat, Texture-less Industry Objects on + HoloLens using synthetic Training + + +
+          Current state-of-the-art 6D pose estimation is too compute-intensive to be
+deployed on edge devices, such as the Microsoft HoloLens (2) or Apple iPad,
+both used for an increasing number of augmented reality applications. The
+quality of AR is greatly dependent on its capabilities to detect and overlay
+geometry within the scene. We propose a synthetically trained
+client-server-based augmented reality application, demonstrating
+state-of-the-art object pose estimation of metallic and texture-less industry
+objects on edge devices. Synthetic data enables training without real
+photographs, i.e. for yet-to-be-manufactured objects. Our qualitative
+evaluation on an AR-assisted sorting task, and quantitative evaluation on both
+renderings, as well as real-world data recorded on HoloLens 2, sheds light on
+its real-world applicability.
+
+
+ comment: Scandinavian Conference on Image Analysis 2023 +
+
+
+
+
+ + ☆ Text or Image? What is More Important in Cross-Domain Generalization + Capabilities of Hate Meme Detection Models? EACL'2024 + + +
+          This paper delves into the formidable challenge of cross-domain
+generalization in multimodal hate meme detection, presenting compelling
+findings. We provide substantial evidence supporting the hypothesis that only
+the textual component of hateful memes enables the existing multimodal
+classifier to generalize across different domains, while the image component
+proves highly sensitive to a specific training dataset. The evidence includes
+demonstrations showing that hate-text classifiers perform similarly to
+hate-meme classifiers in a zero-shot setting. Simultaneously, the introduction
+of captions generated from images of memes to the hate-meme classifier worsens
+performance by an average F1 of 0.02. Through blackbox explanations, we
+identify a substantial contribution of the text modality (average of 83%),
+which diminishes with the introduction of memes' image captions (52%).
+Additionally, our evaluation on a newly created confounder dataset reveals
+higher performance on text confounders as compared to image confounders, with
+an average $\Delta$F1 of 0.18.
+
+
+ comment: Accepted at EACL'2024 Findings +
+
+
+
+
+ + ☆ ConvLoRA and AdaBN based Domain Adaptation via Self-Training + + +
+          Existing domain adaptation (DA) methods often involve pre-training on the
+source domain and fine-tuning on the target domain. For multi-target domain
+adaptation, having a dedicated/separate fine-tuned network for each target
+domain, each of which retains all the pre-trained model parameters, is
+prohibitively expensive. To address this limitation, we propose Convolutional
+Low-Rank Adaptation (ConvLoRA). ConvLoRA freezes pre-trained model weights,
+adds trainable low-rank decomposition matrices to convolutional layers, and
+backpropagates the gradient through these matrices, thus greatly reducing the
+number of trainable parameters. To further boost adaptation, we utilize
+Adaptive Batch Normalization (AdaBN), which computes target-specific running
+statistics, and use it along with ConvLoRA. Our method has fewer trainable
+parameters and performs better than or on par with large independent fine-tuned
+networks (with less than 0.9% of the total base model's parameters trainable)
+when tested on the segmentation of the Calgary-Campinas dataset containing
+brain MRI images. Our approach is simple yet effective and can be applied to
+any deep learning-based architecture that uses convolutional and batch
+normalization layers. Code is available at:
+https://github.com/aleemsidra/ConvLoRA.
+
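+
+          Editor's note: a minimal sketch of the idea - freezing a conv layer and
+learning a low-rank kernel update (an assumed form following LoRA, not the
+authors' exact implementation; AdaBN would additionally recompute BN running
+statistics on target data):
+
+```python
+import torch
+import torch.nn as nn
+
+class ConvLoRA(nn.Module):
+    """Wrap a pre-trained Conv2d: freeze its kernel, learn a rank-r update A @ B."""
+    def __init__(self, conv: nn.Conv2d, r: int = 4, alpha: float = 1.0):
+        super().__init__()
+        self.conv = conv
+        for p in self.conv.parameters():
+            p.requires_grad = False                     # freeze pre-trained weights
+        out_c, in_c, kh, kw = conv.weight.shape
+        self.A = nn.Parameter(torch.zeros(out_c, r))    # zero init: update starts at 0
+        self.B = nn.Parameter(torch.randn(r, in_c * kh * kw) * 0.01)
+        self.scale = alpha / r
+
+    def forward(self, x):
+        dW = (self.A @ self.B).view_as(self.conv.weight) * self.scale
+        return nn.functional.conv2d(
+            x, self.conv.weight + dW, self.conv.bias,
+            self.conv.stride, self.conv.padding, self.conv.dilation, self.conv.groups)
+```
+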
+
+
+
+
+ + ☆ Channel-Selective Normalization for Label-Shift Robust Test-Time + Adaptation + + +
+          Deep neural networks have useful applications in many different tasks,
+however their performance can be severely affected by changes in the data
+distribution. For example, in the biomedical field, their performance can be
+affected by changes in the data (different machines, populations) between
+training and test datasets. To ensure robustness and generalization to
+real-world scenarios, test-time adaptation has been recently studied as an
+approach to adjust models to a new data distribution during inference.
+Test-time batch normalization is a simple and popular method that achieved
+compelling performance on domain shift benchmarks. It is implemented by
+recalculating batch normalization statistics on test batches. Prior work has
+focused on analysis with test data that has the same label distribution as the
+training data. However, in many practical applications this technique is
+vulnerable to label distribution shifts, sometimes producing catastrophic
+failure. This presents a risk in applying test-time adaptation methods in
+deployment. We propose to tackle this challenge by only selectively adapting
+channels in a deep network, minimizing drastic adaptation that is sensitive to
+label shifts. Our selection scheme is based on two principles that we
+empirically motivate: (1) later layers of networks are more sensitive to label
+shift, and (2) individual features can be sensitive to specific classes. We
+apply the proposed technique to three classification tasks, including
+CIFAR10-C, ImageNet-C, and diagnosis of fatty liver, where we explore both
+covariate and label distribution shifts. We find that our method allows us to
+bring the benefits of TTA while significantly reducing the risk of failure
+common in other methods, while being robust to the choice of hyperparameters.
+
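+
+          Editor's note: a minimal sketch of channel-selective test-time batch
+normalization (hypothetical code illustrating the mechanism, not the paper's
+selection scheme): running statistics are replaced by test-batch statistics
+only for channels flagged as safe to adapt.
+
+```python
+import torch
+import torch.nn as nn
+
+@torch.no_grad()
+def selective_bn_adapt(bn: nn.BatchNorm2d, x: torch.Tensor, adapt_mask: torch.Tensor):
+    """Overwrite running BN stats with test-batch stats, but only where adapt_mask is True."""
+    mu = x.mean(dim=(0, 2, 3))                      # per-channel test-batch mean
+    var = x.var(dim=(0, 2, 3), unbiased=False)      # per-channel test-batch variance
+    bn.running_mean[adapt_mask] = mu[adapt_mask]
+    bn.running_var[adapt_mask] = var[adapt_mask]
+
+bn = nn.BatchNorm2d(8).eval()
+x = torch.randn(16, 8, 32, 32)                      # a test batch
+mask = torch.tensor([True] * 6 + [False] * 2)       # leave label-shift-sensitive channels alone
+selective_bn_adapt(bn, x, mask)
+```
+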
+
+ comment: 11 pages including references, 7 figures, 2 tables, Appendix +
+
+
+
+
+ + ☆ 4-Dimensional deformation part model for pose estimation using Kalman + filter constraints + + +
+          The main goal of this article is to analyze the effect on pose estimation
+accuracy of adding a Kalman filter to 4-dimensional deformation part model
+partial solutions. Experiments on two datasets show that this method improves
+pose estimation accuracy compared with state-of-the-art methods and that the
+Kalman filter helps to increase this accuracy.
+
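+
+          Editor's note: for readers unfamiliar with the filtering step, here is a
+generic constant-velocity Kalman filter over per-frame pose estimates (an
+editor's illustration; the paper applies the filter to deformation part model
+partial solutions):
+
+```python
+import numpy as np
+
+def kalman_smooth(zs, q=1e-3, r=1e-2):
+    """Filter a sequence of noisy 1D pose measurements zs."""
+    F = np.array([[1.0, 1.0], [0.0, 1.0]])            # state transition (position, velocity)
+    H = np.array([[1.0, 0.0]])                        # we observe position only
+    Q, R = q * np.eye(2), np.array([[r]])
+    x, P = np.array([zs[0], 0.0]), np.eye(2)
+    out = []
+    for z in zs:
+        x, P = F @ x, F @ P @ F.T + Q                 # predict
+        K = P @ H.T @ np.linalg.inv(H @ P @ H.T + R)  # Kalman gain
+        x = x + K @ (np.array([z]) - H @ x)           # update with the measurement
+        P = (np.eye(2) - K @ H) @ P
+        out.append(x[0])
+    return np.array(out)
+```
+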
+
+
+
+
+ + ☆ Blue noise for diffusion models + + +
+          Most of the existing diffusion models use Gaussian noise for training and
+sampling across all time steps, which may not optimally account for the
+frequency contents reconstructed by the denoising network. Despite the diverse
+applications of correlated noise in computer graphics, its potential for
+improving the training process has been underexplored. In this paper, we
+introduce a novel and general class of diffusion models that takes correlated
+noise within and across images into account. More specifically, we propose a
+time-varying noise model to incorporate correlated noise into the training
+process, as well as a method for the fast generation of correlated noise masks.
+Our model is built upon deterministic diffusion models and utilizes blue noise
+to help improve the generation quality compared to using Gaussian white
+(random) noise only. Further, our framework allows introducing correlation
+across images within a single mini-batch to improve gradient flow. We perform
+both qualitative and quantitative evaluations on a variety of datasets using
+our method, achieving improvements on different tasks over existing
+deterministic diffusion models in terms of the FID metric.
+
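+
+          Editor's note: a minimal, hypothetical sketch of producing blue-ish
+(high-frequency-weighted) noise masks by re-weighting white noise in the
+Fourier domain - an illustration of the concept, not the authors' mask
+generator:
+
+```python
+import torch
+
+def blueish_noise(n=64, size=256):
+    """Approximate blue-noise masks: suppress low frequencies of white noise."""
+    white = torch.randn(n, size, size)
+    fy = torch.fft.fftfreq(size).view(-1, 1)
+    fx = torch.fft.fftfreq(size).view(1, -1)
+    radial = torch.sqrt(fx ** 2 + fy ** 2)        # 0 at DC, larger at high frequencies
+    blue = torch.fft.ifft2(torch.fft.fft2(white) * radial).real
+    return (blue - blue.mean()) / blue.std()      # renormalize to zero mean, unit variance
+
+# A time-varying scheme could blend masks, e.g. noise_t = (1 - t) * blue + t * white.
+```
+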
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ☆ Source-Free Domain Adaptation with Diffusion-Guided Source Data + Generation + + +
+ This paper introduces a novel approach to leverage the generalizability +capability of Diffusion Models for Source-Free Domain Adaptation (DM-SFDA). Our +proposed DM-SFDA method involves fine-tuning a pre-trained text-to-image +diffusion model to generate source domain images using features from the target +images to guide the diffusion process. Specifically, the pre-trained diffusion +model is fine-tuned to generate source samples that minimize entropy and +maximize confidence for the pre-trained source model. We then apply established +unsupervised domain adaptation techniques to align the generated source images +with target domain data. We validate our approach through comprehensive +experiments across a range of datasets, including Office-31, Office-Home, and +VisDA. The results highlight significant improvements in SFDA performance, +showcasing the potential of diffusion models in generating contextually +relevant, domain-specific images. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2310.01701 +
+
+
+
+
+ + ☆ Is Two-shot All You Need? A Label-efficient Approach for Video + Segmentation in Breast Ultrasound + + +
+          Breast lesion segmentation from breast ultrasound (BUS) videos could assist
+in early diagnosis and treatment. Existing video object segmentation (VOS)
+methods usually require dense annotation, which is often inaccessible for
+medical datasets. Furthermore, they suffer from accumulative errors and a lack
+of explicit space-time awareness. In this work, we propose a novel two-shot
+training paradigm for BUS video segmentation. It is not only able to capture
+free-range space-time consistency but also utilizes a source-dependent
+augmentation scheme. This label-efficient learning framework is validated on a
+challenging in-house BUS video dataset. Results show that it achieves
+performance comparable to the fully annotated models given only 1.9% of the
+training labels.
+
+
+ comment: 5 pages, 1 figure, 2 tables, accepted by ISBI 2024 +
+
+
+
+
+ + ☆ Toward Accurate Camera-based 3D Object Detection via Cascade Depth + Estimation and Calibration ICRA2024 + + +
+          Recent camera-based 3D object detection is limited by the precision of
+transforming from image to 3D feature spaces, as well as the accuracy of object
+localization within the 3D space. This paper aims to address such a fundamental
+problem of camera-based 3D object detection: how to effectively learn depth
+information for accurate feature lifting and object localization. Different
+from previous methods which directly predict depth distributions by using a
+supervised estimation model, we propose a cascade framework consisting of two
+depth-aware learning paradigms. First, a depth estimation (DE) scheme leverages
+relative depth information to realize the effective feature lifting from 2D to
+3D spaces. Furthermore, a depth calibration (DC) scheme introduces depth
+reconstruction to further adjust the 3D object localization perturbation along
+the depth axis. In practice, the DE is explicitly realized by using both the
+absolute and relative depth optimization loss to promote the precision of depth
+prediction, while the capability of DC is implicitly embedded into the
+detection Transformer through a depth denoising mechanism in the training
+phase. The entire model training is accomplished in an end-to-end manner. We
+propose a baseline detector and evaluate the effectiveness of our proposal with
++2.2%/+2.7% NDS/mAP improvements on the NuScenes benchmark, achieving a
+competitive 55.9%/45.7% NDS/mAP. Furthermore, we conduct extensive experiments
+to demonstrate its generality based on various detectors, with about +2% NDS
+improvements.
+
+
+ comment: Accepted to ICRA2024 +
+
+
+
+
+ + ☆ STAR: Shape-focused Texture Agnostic Representations for Improved Object + Detection and 6D Pose Estimation + + +
+ Recent advances in machine learning have greatly benefited object detection +and 6D pose estimation for robotic grasping. However, textureless and metallic +objects still pose a significant challenge due to fewer visual cues and the +texture bias of CNNs. To address this issue, we propose a texture-agnostic +approach that focuses on learning from CAD models and emphasizes object shape +features. To achieve a focus on learning shape features, the textures are +randomized during the rendering of the training data. By treating the texture +as noise, the need for real-world object instances or their final appearance +during training data generation is eliminated. The TLESS and ITODD datasets, +specifically created for industrial settings in robotics and featuring +textureless and metallic objects, were used for evaluation. Texture agnosticity +also increases the robustness against image perturbations such as imaging +noise, motion blur, and brightness changes, which are common in robotics +applications. Code and datasets are publicly available at +github.com/hoenigpeter/randomized_texturing. + +
+
+ comment: Submitted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Advancing Anomaly Detection: An Adaptation Model and a New Dataset + + +
+ Industry surveillance is widely applicable in sectors like retail, +manufacturing, education, and smart cities, each presenting unique anomalies +requiring specialized detection. However, adapting anomaly detection models to +novel viewpoints within the same scenario poses challenges. Extending these +models to entirely new scenarios necessitates retraining or fine-tuning, a +process that can be time consuming. To address these challenges, we propose the +Scenario-Adaptive Anomaly Detection (SA2D) method, leveraging the few-shot +learning framework for faster adaptation of pre-trained models to new concepts. +Despite this approach, a significant challenge emerges from the absence of a +comprehensive dataset with diverse scenarios and camera views. In response, we +introduce the Multi-Scenario Anomaly Detection (MSAD) dataset, encompassing 14 +distinct scenarios captured from various camera views. This real-world dataset +is the first high-resolution anomaly detection dataset, offering a solid +foundation for training superior models. MSAD includes diverse normal motion +patterns, incorporating challenging variations like different lighting and +weather conditions. Through experimentation, we validate the efficacy of SA2D, +particularly when trained on the MSAD dataset. Our results show that SA2D not +only excels under novel viewpoints within the same scenario but also +demonstrates competitive performance when faced with entirely new scenarios. +This highlights our method's potential in addressing challenges in detecting +anomalies across diverse and evolving surveillance scenarios. + +
+
+ comment: Research report +
+
+
+
+
+ + ☆ Dual-Path Coupled Image Deraining Network via Spatial-Frequency + Interaction + + +
+          Transformers have recently emerged as a significant force in the field of
+image deraining. Existing image deraining methods build extensively on
+self-attention. Though showcasing impressive results, they tend to neglect
+critical frequency information, as self-attention is generally less adept at
+capturing high-frequency details. To overcome this shortcoming, we have
+developed an innovative Dual-Path Coupled Deraining Network (DPCNet) that
+integrates information from both spatial and frequency domains through a
+Spatial Feature Extraction Block (SFEBlock) and a Frequency Feature Extraction
+Block (FFEBlock). We have further introduced an effective Adaptive Fusion
+Module (AFM) for the dual-path feature aggregation. Extensive experiments on
+six public deraining benchmarks and downstream vision tasks have demonstrated
+that our proposed method not only outperforms the existing state-of-the-art
+deraining methods but also achieves visually pleasing results with excellent
+robustness on downstream vision tasks.
+
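+
+          Editor's note: a minimal sketch of what a frequency-domain feature path can
+look like (an assumed design for illustration, not DPCNet's actual FFEBlock):
+features are moved to the Fourier domain, mixed with 1x1 convolutions, and
+transformed back before fusion with the spatial path.
+
+```python
+import torch
+import torch.nn as nn
+
+class FrequencyBranch(nn.Module):
+    """Mix real/imaginary spectra of feature maps with pointwise convolutions."""
+    def __init__(self, c):
+        super().__init__()
+        self.mix = nn.Sequential(nn.Conv2d(2 * c, 2 * c, 1), nn.ReLU(),
+                                 nn.Conv2d(2 * c, 2 * c, 1))
+
+    def forward(self, x):
+        f = torch.fft.rfft2(x, norm="ortho")                  # to frequency domain
+        y = self.mix(torch.cat([f.real, f.imag], dim=1))      # learn a spectral filter
+        re, im = y.chunk(2, dim=1)
+        return torch.fft.irfft2(torch.complex(re, im), s=x.shape[-2:], norm="ortho")
+```
+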
+
+
+
+
+ + ☆ Data-efficient Large Vision Models through Sequential Autoregression + + +
+          Training general-purpose vision models on purely sequential visual data,
+eschewing linguistic inputs, has heralded a new frontier in visual
+understanding. These models are intended to not only comprehend but also
+seamlessly transition to out-of-domain tasks. However, current endeavors are
+hamstrung by an over-reliance on colossal models, exemplified by models with
+upwards of 3B parameters, and the necessity for an extensive corpus of visual
+data, often comprising a staggering 400B tokens. In this paper, we delve into
+the development of an efficient, autoregression-based vision model,
+innovatively architected to operate on a limited dataset. We meticulously
+demonstrate how this model achieves proficiency in a spectrum of visual tasks
+spanning both high-level and low-level semantic understanding during the
+testing phase. Our empirical evaluations underscore the model's agility in
+adapting to various tasks, heralding a significant reduction in the parameter
+footprint, and a marked decrease in training data requirements, thereby paving
+the way for more sustainable and accessible advancements in the field of
+generalist vision models. The code is available at
+https://github.com/ggjy/DeLVM.
+
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ SARI: Simplistic Average and Robust Identification based Noisy Partial + Label Learning + + +
+ Partial label learning (PLL) is a weakly-supervised learning paradigm where +each training instance is paired with a set of candidate labels (partial +label), one of which is the true label. Noisy PLL (NPLL) relaxes this +constraint by allowing some partial labels to not contain the true label, +enhancing the practicality of the problem. Our work centers on NPLL and +presents a minimalistic framework called SARI that initially assigns +pseudo-labels to images by exploiting the noisy partial labels through a +weighted nearest neighbour algorithm. These pseudo-label and image pairs are +then used to train a deep neural network classifier with label smoothing and +standard regularization techniques. The classifier's features and predictions +are subsequently employed to refine and enhance the accuracy of pseudo-labels. +SARI combines the strengths of Average Based Strategies (in pseudo labelling) +and Identification Based Strategies (in classifier training) from the +literature. We perform thorough experiments on seven datasets and compare SARI +against nine NPLL and PLL methods from the prior art. SARI achieves +state-of-the-art results in almost all studied settings, obtaining substantial +gains in fine-grained classification and extreme noise settings. + +
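+
+          Editor's note: a minimal, hypothetical sketch of the first stage - assigning
+pseudo-labels by weighted nearest-neighbour voting restricted to each
+instance's candidate set (an illustration of the idea, not SARI's exact
+algorithm; feats are assumed L2-normalized):
+
+```python
+import numpy as np
+
+def knn_pseudo_labels(feats, partial_labels, k=10, n_classes=10):
+    """feats: (N, d) L2-normalized; partial_labels: list of candidate-label sets."""
+    sim = feats @ feats.T                          # cosine similarities
+    np.fill_diagonal(sim, -np.inf)                 # exclude self-matches
+    idx = np.argsort(sim, axis=1)[:, -k:]          # k nearest neighbours per instance
+    pseudo = np.zeros(len(feats), dtype=int)
+    for i in range(len(feats)):
+        votes = np.zeros(n_classes)
+        for j in idx[i]:
+            votes[list(partial_labels[j])] += max(sim[i, j], 0.0)  # weighted votes
+        cand = list(partial_labels[i])
+        pseudo[i] = cand[int(np.argmax(votes[cand]))]  # restrict to own candidates
+    return pseudo
+```
+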
+
+ comment: 13 pages, 6 tables, 2 figures +
+
+
+
+
+ + ☆ NeRF as Non-Distant Environment Emitter in Physics-based Inverse + Rendering + + +
+ Physics-based inverse rendering aims to jointly optimize shape, materials, +and lighting from captured 2D images. Here lighting is an important part of +achieving faithful light transport simulation. While the environment map is +commonly used as the lighting model in inverse rendering, we show that its +distant lighting assumption leads to spatial invariant lighting, which can be +an inaccurate approximation in real-world inverse rendering. We propose to use +NeRF as a spatially varying environment lighting model and build an inverse +rendering pipeline using NeRF as the non-distant environment emitter. By +comparing our method with the environment map on real and synthetic datasets, +we show that our NeRF-based emitter models the scene lighting more accurately +and leads to more accurate inverse rendering. Project page and video: +https://nerfemitterpbir.github.io/. + +
+
+ comment: Project page and video: https://nerfemitterpbir.github.io/ +
+
+
+
+
+ + ☆ Spiking-PhysFormer: Camera-Based Remote Photoplethysmography with + Parallel Spike-driven Transformer + + +
+          Artificial neural networks (ANNs) can help camera-based remote
+photoplethysmography (rPPG) in measuring cardiac activity and physiological
+signals from facial videos, such as pulse wave, heart rate and respiration
+rate, with better accuracy. However, most existing ANN-based methods require
+substantial computing resources, which poses challenges for effective
+deployment on mobile devices. Spiking neural networks (SNNs), on the other
+hand, hold immense potential for energy-efficient deep learning owing to their
+binary and event-driven architecture. To the best of our knowledge, we are the
+first to introduce SNNs into the realm of rPPG, proposing a hybrid neural
+network (HNN) model, the Spiking-PhysFormer, aimed at reducing power
+consumption. Specifically, the proposed Spiking-PhysFormer consists of an
+ANN-based patch embedding block, SNN-based transformer blocks, and an ANN-based
+predictor head. First, to simplify the transformer block while preserving its
+capacity to aggregate local and global spatio-temporal features, we design a
+parallel spike transformer block to replace sequential sub-blocks.
+Additionally, we propose a simplified spiking self-attention mechanism that
+omits the value parameter without compromising the model's performance.
+Experiments conducted on four datasets - PURE, UBFC-rPPG, UBFC-Phys, and MMPD -
+demonstrate that the proposed model achieves a 12.4% reduction in power
+consumption compared to PhysFormer. Additionally, the power consumption of the
+transformer block is reduced by a factor of 12.2, while maintaining performance
+comparable to PhysFormer and other ANN-based models.
+
+
+ comment: Mingxuan Liu and Jiankai Tang are co-first authors of the article +
+
+
+
+
+ + ☆ Mesh-based Gaussian Splatting for Real-time Large-scale Deformation + + +
+          Neural implicit representations, including Neural Distance Fields and Neural
+Radiance Fields, have demonstrated significant capabilities for reconstructing
+surfaces with complicated geometry and topology, and for generating novel views
+of a scene. Nevertheless, it is challenging for users to directly deform or
+manipulate these implicit representations with large deformations in real time.
+Gaussian Splatting (GS) has recently become a promising method with explicit
+geometry for representing static scenes and facilitating high-quality and
+real-time synthesis of novel views. However, it cannot be easily deformed due
+to the use of discrete Gaussians and the lack of explicit topology. To address
+this, we develop a novel GS-based method that enables interactive deformation.
+Our key idea is to design an innovative mesh-based GS representation, which is
+integrated into Gaussian learning and manipulation. 3D Gaussians are defined
+over an explicit mesh, and they are bound with each other: the rendering of 3D
+Gaussians guides the mesh face split for adaptive refinement, and the mesh face
+split directs the splitting of 3D Gaussians. Moreover, the explicit mesh
+constraints help regularize the Gaussian distribution, suppressing poor-quality
+Gaussians (e.g. misaligned Gaussians, long-narrow shaped Gaussians), thus
+enhancing visual quality and avoiding artifacts during deformation. Based on
+this representation, we further introduce a large-scale Gaussian deformation
+technique to enable deformable GS, which alters the parameters of 3D Gaussians
+according to the manipulation of the associated mesh. Our method benefits from
+existing mesh deformation datasets for more realistic data-driven Gaussian
+deformation. Extensive experiments show that our approach achieves high-quality
+reconstruction and effective deformation, while maintaining promising rendering
+results at a high frame rate (65 FPS on average).
+
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ MLLM-as-a-Judge: Assessing Multimodal LLM-as-a-Judge with + Vision-Language Benchmark + + +
+          Multimodal Large Language Models (MLLMs) have gained significant attention
+recently, showing remarkable potential in artificial general intelligence.
+However, assessing the utility of MLLMs presents considerable challenges,
+primarily due to the absence of multimodal benchmarks that align with human
+preferences. Inspired by LLM-as-a-Judge in LLMs, this paper introduces a novel
+benchmark, termed MLLM-as-a-Judge, to assess the ability of MLLMs in assisting
+judges across three distinct tasks: Scoring Evaluation, Pair Comparison, and
+Batch Ranking. Our study reveals that, while MLLMs demonstrate remarkable
+human-like discernment in Pair Comparison, there is a significant divergence
+from human preferences in the Scoring Evaluation and Batch Ranking tasks.
+Furthermore, MLLMs still face challenges in judgment, including diverse biases,
+hallucinatory responses, and inconsistencies, even for advanced models such as
+GPT-4V. These findings emphasize the pressing need for enhancements and further
+research efforts regarding MLLMs as fully reliable evaluators. Code and dataset
+are available at https://github.com/Dongping-Chen/MLLM-as-a-Judge.
+
+
+
+
+
+ + ☆ Robot Interaction Behavior Generation based on Social Motion Forecasting + for Human-Robot Interaction ICRA 2024 + + +
+          Integrating robots into populated environments is a complex challenge that
+requires an understanding of human social dynamics. In this work, we propose to
+model social motion forecasting in a shared human-robot representation space,
+which allows us to synthesize robot motions that interact with humans in social
+scenarios despite not observing any robot in the motion training data. We
+develop a transformer-based architecture called ECHO, which operates in the
+aforementioned shared space to predict the future motions of the agents
+encountered in social scenarios. Contrary to prior works, we reformulate the
+social motion problem as the refinement of the predicted individual motions
+based on the surrounding agents, which facilitates training while allowing for
+single-motion forecasting when only one human is in the scene. We evaluate our
+model on multi-person and human-robot motion forecasting tasks and obtain
+state-of-the-art performance by a large margin while being efficient and
+performing in real time. Additionally, our qualitative results showcase the
+effectiveness of our approach in generating human-robot interaction behaviors
+that can be controlled via text commands.
+
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ☆ Color Recognition in Challenging Lighting Environments: CNN Approach + + +
+          Light plays a vital role in vision, whether human or machine: the perceived
+color always depends on the lighting conditions of the surroundings.
+Researchers are working to enhance color detection techniques for computer
+vision applications. Several methods using different color detection approaches
+have been proposed, but a gap remains. To address this issue, a color detection
+method based on a Convolutional Neural Network (CNN) is proposed. Firstly,
+image segmentation is performed using the edge detection segmentation technique
+to isolate the object, and then the segmented object is fed to a Convolutional
+Neural Network trained to detect the color of an object under different
+lighting conditions. It is experimentally verified that our method can
+substantially enhance the robustness of color detection in different lighting
+conditions, and our method performs better than existing methods.
+
+
+
+
+
+ + ☆ Boundary-aware Contrastive Learning for Semi-supervised Nuclei Instance + Segmentation + + +
+ Semi-supervised segmentation methods have demonstrated promising results in +natural scenarios, providing a solution to reduce dependency on manual +annotation. However, these methods face significant challenges when directly +applied to pathological images due to the subtle color differences between +nuclei and tissues, as well as the significant morphological variations among +nuclei. Consequently, the generated pseudo-labels often contain much noise, +especially at the nuclei boundaries. To address the above problem, this paper +proposes a boundary-aware contrastive learning network to denoise the boundary +noise in a semi-supervised nuclei segmentation task. The model has two key +designs: a low-resolution denoising (LRD) module and a cross-RoI contrastive +learning (CRC) module. The LRD improves the smoothness of the nuclei boundary +by pseudo-labels denoising, and the CRC enhances the discrimination between +foreground and background by boundary feature contrastive learning. We conduct +extensive experiments to demonstrate the superiority of our proposed method +over existing semi-supervised instance segmentation methods. + +
+
+ comment: 12 pages, 3 figures, 6 tables +
+
+
+
+
+ + ☆ Towards Aligned Layout Generation via Diffusion Model with Aesthetic + Constraints ICLR 2024 + + +
+ Controllable layout generation refers to the process of creating a plausible +visual arrangement of elements within a graphic design (e.g., document and web +designs) with constraints representing design intentions. Although recent +diffusion-based models have achieved state-of-the-art FID scores, they tend to +exhibit more pronounced misalignment compared to earlier transformer-based +models. In this work, we propose the $\textbf{LA}$yout $\textbf{C}$onstraint +diffusion mod$\textbf{E}$l (LACE), a unified model to handle a broad range of +layout generation tasks, such as arranging elements with specified attributes +and refining or completing a coarse layout design. The model is based on +continuous diffusion models. Compared with existing methods that use discrete +diffusion models, continuous state-space design can enable the incorporation of +differentiable aesthetic constraint functions in training. For conditional +generation, we introduce conditions via masked input. Extensive experiment +results show that LACE produces high-quality layouts and outperforms existing +state-of-the-art baselines. + +
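+
+          Editor's note: the key enabler here is that continuous-state diffusion makes
+aesthetic constraints differentiable; below is one plausible form of an
+alignment constraint (an editor's illustration, not necessarily LACE's exact
+loss), encouraging every element to share a left, center, or right x-coordinate
+with some other element:
+
+```python
+import torch
+
+def alignment_loss(boxes, eps=1e-8):
+    """boxes: (N, 4) tensor of (x, y, w, h) in [0, 1]; differentiable penalty."""
+    x, w = boxes[:, 0], boxes[:, 2]
+    n = x.shape[0]
+    anchors = torch.stack([x, x + w / 2, x + w])           # (3, N): left, center, right
+    d = (anchors[:, :, None] - anchors[:, None, :]).abs()  # (3, N, N) pairwise deviations
+    d = d + torch.eye(n) * 1e9                             # ignore self-alignment
+    best = d.min(dim=0).values.min(dim=1).values           # best alignment per element
+    return -torch.log(1 - best.clamp(max=1 - eps)).mean()  # 0 when perfectly aligned
+```
+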
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Cortical Surface Diffusion Generative Models + + +
+          Cortical surface analysis has gained increased prominence, given its
+potential implications for neurological and developmental disorders.
+Traditional vision diffusion models, while effective in generating natural
+images, present limitations in capturing intricate development patterns in
+neuroimaging due to limited datasets. This is particularly true for generating
+cortical surfaces, where individual variability in cortical morphology is high,
+leading to an urgent need for better methods to model brain development and the
+diverse variability inherent across different individuals. In this work, we
+propose a novel diffusion model for the generation of cortical surface metrics,
+using modified surface vision transformers as the principal architecture. We
+validate our method on the developing Human Connectome Project (dHCP); the
+results suggest that our model demonstrates superior performance in capturing
+the intricate details of evolving cortical surfaces. Furthermore, our model can
+generate high-quality realistic samples of cortical surfaces conditioned on
+postmenstrual age (PMA) at scan.
+
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ AINS: Affordable Indoor Navigation Solution via Line Color + Identification Using Mono-Camera for Autonomous Vehicles + + +
+          Recently, researchers have been exploring various ways to improve the
+effectiveness and efficiency of autonomous vehicles by researching new methods,
+especially for indoor scenarios. Indoor navigation for autonomous vehicles
+poses many challenges, especially the limited accuracy of GPS in indoor
+scenarios. Several robust methods have been explored for autonomous vehicles in
+indoor scenarios to solve this problem, but a drawback of the proposed methods
+is their high deployment cost. To address the above-mentioned problems, we
+present a low-cost indoor navigation method for autonomous vehicles called the
+Affordable Indoor Navigation Solution (AINS), which is based on a monocular
+camera. Our proposed solution relies mainly on a mono camera, without depending
+on various bulky or power-inefficient sensors to find the path, such as range
+finders and other navigation sensors. Our proposed method shows that indoor
+navigation systems for autonomous vehicles can be deployed while taking the
+cost into consideration. The results show that our solution performs better
+than existing solutions, reducing the estimated error and time consumption.
+
+
+
+
+
+ + ☆ InstructScene: Instruction-Driven 3D Indoor Scene Synthesis with + Semantic Graph Prior ICLR 2024 + + +
+          Comprehending natural language instructions is a desirable property for 3D
+indoor scene synthesis systems. Existing methods directly model object joint
+distributions and express object relations implicitly within a scene, thereby
+hindering the controllability of generation. We introduce InstructScene, a
+novel generative framework that integrates a semantic graph prior and a layout
+decoder to improve controllability and fidelity for 3D scene synthesis. The
+proposed semantic graph prior jointly learns scene appearances and layout
+distributions, exhibiting versatility across various downstream tasks in a
+zero-shot manner. To facilitate the benchmarking of text-driven 3D scene
+synthesis, we curate a high-quality dataset of scene-instruction pairs with
+large language and multimodal models. Extensive experimental results reveal
+that the proposed method surpasses existing state-of-the-art approaches by a
+large margin. Thorough ablation studies confirm the efficacy of crucial design
+components. Project page: https://chenguolin.github.io/projects/InstructScene.
+
+
+ comment: Accepted by ICLR 2024 for spotlight presentation; Project page: + https://chenguolin.github.io/projects/InstructScene +
+
+
+
+
+ + ☆ EvoSeed: Unveiling the Threat on Deep Neural Networks with Real-World + Illusions + + +
+          Deep neural networks can be exploited with natural adversarial samples,
+which appear natural to human perception but are misclassified. Current
+approaches often rely on the white-box nature of deep neural networks to
+generate these adversarial samples, or they alter the distribution of
+adversarial samples compared to the training distribution. To alleviate the
+limitations of current approaches, we propose EvoSeed, a novel evolutionary
+strategy-based search algorithmic framework to generate natural adversarial
+samples. Our EvoSeed framework uses auxiliary Diffusion and Classifier models
+to operate in a model-agnostic black-box setting. We employ CMA-ES to optimize
+the search for an adversarial seed vector, which, when processed by the
+Conditional Diffusion Model, results in an unrestricted natural adversarial
+sample misclassified by the Classifier Model. Experiments show that generated
+adversarial images are of high image quality and are transferable to different
+classifiers. Our approach demonstrates promise in enhancing the quality of
+adversarial samples using evolutionary algorithms. We hope our research opens
+new avenues to enhance the robustness of deep neural networks in real-world
+scenarios. The project website can be accessed at
+\url{https://shashankkotyan.github.io/EvoSeed}.
+
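+
+          Editor's note: a minimal sketch of the black-box search loop using the `cma`
+package (decode and classify are assumed callables wrapping a conditional
+diffusion model and a classifier; an illustration, not the authors' code):
+
+```python
+import numpy as np
+import cma  # pip install cma
+
+def evo_adv_search(decode, classify, true_class, dim=64, iters=50):
+    """Search for a seed vector whose decoded image loses the true class."""
+    def fitness(z):
+        img = decode(np.asarray(z))                  # diffusion model renders the seed
+        return float(classify(img)[true_class])      # lower = closer to misclassification
+
+    es = cma.CMAEvolutionStrategy(np.zeros(dim), 0.5)
+    for _ in range(iters):
+        zs = es.ask()                                # sample candidate seed vectors
+        es.tell(zs, [fitness(z) for z in zs])        # update the search distribution
+        if es.result.fbest < 0.5:                    # true-class confidence is low enough
+            break
+    return es.result.xbest
+```
+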
+
+
+
+
+ + ☆ The Influence of Autofocus Lenses in the Camera Calibration Process + + +
+          Camera calibration is a crucial step in robotics and computer vision.
+Accurate camera parameters are necessary to achieve robust applications.
+Nowadays, the camera calibration process consists of fitting a set of data to a
+pin-hole model, assuming that, with a reprojection error close to zero, the
+camera parameters are correct. Since all camera parameters are unknown, the
+computed results are considered true. However, the pin-hole model does not
+represent the camera behavior accurately if focus is considered. Real cameras
+change the focal length slightly to obtain sharp objects in the image, and this
+feature skews the calibration result if a unique pin-hole model is computed
+with a constant focal length. In this paper, a deep analysis of the camera
+calibration process is carried out to detect and strengthen its weaknesses. The
+camera is mounted on a robot arm so that the extrinsic camera parameters are
+known with accuracy and the computed results can be compared with the true
+ones. Based on the bias that exists between the computed results and the true
+ones, a modification of the widely accepted camera calibration method using
+images of a planar template is presented. A pin-hole model with a
+distance-dependent focal length is proposed to improve the calibration process
+substantially.
+
+
+
+
+
+ + ☆ Group Distributionally Robust Dataset Distillation with Risk + Minimization + + +
+ Dataset distillation (DD) has emerged as a widely adopted technique for +crafting a synthetic dataset that captures the essential information of a +training dataset, facilitating the training of accurate neural models. Its +applications span various domains, including transfer learning, federated +learning, and neural architecture search. The most popular methods for +constructing the synthetic data rely on matching the convergence properties of +training the model with the synthetic dataset and the training dataset. +However, targeting the training dataset must be thought of as auxiliary in the +same sense that the training set is an approximate substitute for the +population distribution, and the latter is the data of interest. Yet despite +its popularity, an aspect that remains unexplored is the relationship of DD to +its generalization, particularly across uncommon subgroups. That is, how can we +ensure that a model trained on the synthetic dataset performs well when faced +with samples from regions with low population density? Here, the +representativeness and coverage of the dataset become salient over the +guaranteed training error at inference. Drawing inspiration from +distributionally robust optimization, we introduce an algorithm that combines +clustering with the minimization of a risk measure on the loss to conduct DD. +We provide a theoretical rationale for our approach and demonstrate its +effective generalization and robustness across subgroups through numerical +experiments. + +
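+
+          Editor's note: one simple way to combine clustering with a risk measure is a
+CVaR-style objective over per-cluster losses; a hypothetical sketch of this
+idea (not the paper's exact algorithm):
+
+```python
+import torch
+
+def clustered_cvar_loss(losses, cluster_ids, alpha=0.3):
+    """Average losses within clusters, then average the worst alpha fraction
+    of clusters - emphasizing low-density subgroups during distillation."""
+    clusters = cluster_ids.unique()
+    per_cluster = torch.stack([losses[cluster_ids == c].mean() for c in clusters])
+    k = max(1, int(alpha * len(per_cluster)))
+    return per_cluster.topk(k).values.mean()       # CVaR: mean of the worst clusters
+```
+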
+
+
+
+
+ + ☆ G-NAS: Generalizable Neural Architecture Search for Single Domain + Generalization Object Detection AAAI24 + + +
+          In this paper, we focus on a realistic yet challenging task, Single Domain
+Generalization Object Detection (S-DGOD), where only one source domain's data
+can be used for training object detectors, which then have to generalize to
+multiple distinct target domains. In S-DGOD, both high-capacity fitting and
+generalization abilities are needed due to the task's complexity.
+Differentiable Neural Architecture Search (NAS) is known for its high capacity
+for complex data fitting, and we propose to leverage Differentiable NAS to
+solve S-DGOD. However, it may confront severe over-fitting issues due to the
+feature imbalance phenomenon, where parameters optimized by gradient descent
+are biased to learn from the easy-to-learn features, which are usually
+non-causal and spuriously correlated to ground truth labels, such as the
+features of background in object detection data. Consequently, this leads to
+serious performance degradation, especially in generalizing to unseen target
+domains with huge domain gaps between the source domain and target domains. To
+address this issue, we propose the Generalizable loss (G-loss), which is an
+OoD-aware objective, preventing NAS from over-fitting by using gradient descent
+to optimize parameters not only on a subset of easy-to-learn features but also
+on the remaining predictive features for generalization; the overall framework
+is named G-NAS. Experimental results on the S-DGOD urban-scene datasets
+demonstrate that the proposed G-NAS achieves SOTA performance compared to
+baseline methods. Codes are available at https://github.com/wufan-cse/G-NAS.
+
+
+ comment: Accepted by AAAI24 +
+
+
+
+
+ + ☆ V2VSSC: A 3D Semantic Scene Completion Benchmark for Perception with + Vehicle to Vehicle Communication + + +
+          Semantic scene completion (SSC) has recently gained popularity because it
+can provide both semantic and geometric information that can be used directly
+for autonomous vehicle navigation. However, there are still challenges to
+overcome. SSC is often hampered by occlusion and short-range perception due to
+sensor limitations, which can pose safety risks. This paper proposes a
+fundamental solution to this problem by leveraging vehicle-to-vehicle (V2V)
+communication. We propose the first generalized collaborative SSC framework
+that allows autonomous vehicles to share sensing information from different
+sensor views to jointly perform SSC tasks. To validate the proposed framework,
+we further build V2VSSC, the first V2V SSC benchmark, on top of the large-scale
+V2V perception dataset OPV2V. Extensive experiments demonstrate that by
+leveraging V2V communication, SSC performance can be increased by 8.3% on the
+geometric metric IoU and by 6.0% on mIoU.
+
+
+
+
+
+ + ☆ Adversarial Robustness Through Artifact Design + + +
+ Adversarial examples arose as a challenge for machine learning. To hinder +them, most defenses alter how models are trained (e.g., adversarial training) +or inference is made (e.g., randomized smoothing). Still, while these +approaches markedly improve models' adversarial robustness, models remain +highly susceptible to adversarial examples. Identifying that, in certain +domains such as traffic-sign recognition, objects are implemented per standards +specifying how artifacts (e.g., signs) should be designed, we propose a novel +approach for improving adversarial robustness. Specifically, we offer a method +to redefine standards, making minor changes to existing ones, to defend against +adversarial examples. We formulate the problem of artifact design as a robust +optimization problem, and propose gradient-based and greedy search methods to +solve it. We evaluated our approach in the domain of traffic-sign recognition, +allowing it to alter traffic-sign pictograms (i.e., symbols within the signs) +and their colors. We found that, combined with adversarial training, our +approach led to up to 25.18\% higher robust accuracy compared to +state-of-the-art methods against two adversary types, while further increasing +accuracy on benign inputs. + +
+
+
+
+
+ + ☆ An Over Complete Deep Learning Method for Inverse Problems + + +
+ Obtaining meaningful solutions for inverse problems has been a major
+challenge with many applications in science and engineering. Recent machine
+learning techniques based on proximal and diffusion-based methods have shown
+promising results. However, as we show in this work, they can also face
+challenges when applied to some exemplary problems. We show that similar to
+previous works on over-complete dictionaries, it is possible to overcome these
+shortcomings by embedding the solution into higher dimensions. The novelty of
+the proposed work is that we jointly design and learn the embedding and the
+regularizer for the embedding vector. We demonstrate the merit of this approach
+on several exemplary and common inverse problems.
+
+
+
+
+
+
+ + ☆ OV-NeRF: Open-vocabulary Neural Radiance Fields with Vision and Language + Foundation Models for 3D Semantic Understanding + + +
+ The development of Neural Radiance Fields (NeRFs) has provided a potent
+representation for encapsulating the geometric and appearance characteristics
+of 3D scenes. Enhancing the capabilities of NeRFs in open-vocabulary 3D
+semantic perception tasks has been a recent focus. However, current methods
+that extract semantics directly from Contrastive Language-Image Pretraining
+(CLIP) for semantic field learning encounter difficulties due to noisy and
+view-inconsistent semantics provided by CLIP. To tackle these limitations, we
+propose OV-NeRF, which exploits the potential of pre-trained vision and
+language foundation models to enhance semantic field learning through proposed
+single-view and cross-view strategies. First, from the single-view perspective,
+we introduce Region Semantic Ranking (RSR) regularization by leveraging 2D mask
+proposals derived from SAM to rectify the noisy semantics of each training
+view, facilitating accurate semantic field learning. Second, from the
+cross-view perspective, we propose a Cross-view Self-enhancement (CSE) strategy
+to address the challenge raised by view-inconsistent semantics. Rather than
+invariably utilizing the 2D inconsistent semantics from CLIP, CSE leverages the
+3D consistent semantics generated from the well-trained semantic field itself
+for semantic field training, aiming to reduce ambiguity and enhance overall
+semantic consistency across different views. Extensive experiments validate
+that our OV-NeRF outperforms current state-of-the-art methods, achieving a
+significant improvement of 20.31% and 18.42% in mIoU metric on Replica and
+Scannet, respectively. Furthermore, our approach exhibits consistently superior
+results across various CLIP configurations, further verifying its robustness.
+
+
+
+
+
+
+ + ☆ GSN: Generalisable Segmentation in Neural Radiance Field AAAI 2024 + + +
+ Traditional Radiance Field (RF) representations capture details of a specific +scene and must be trained afresh on each scene. Semantic feature fields have +been added to RFs to facilitate several segmentation tasks. Generalised RF +representations learn the principles of view interpolation. A generalised RF +can render new views of an unknown and untrained scene, given a few views. We +present a way to distil feature fields into the generalised GNT representation. +Our GSN representation generates new views of unseen scenes on the fly along +with consistent, per-pixel semantic features. This enables multi-view +segmentation of arbitrary new scenes. We show different semantic features being +distilled into generalised RFs. Our multi-view segmentation results are on par +with methods that use traditional RFs. GSN closes the gap between standard and +generalisable RF methods significantly. Project Page: +https://vinayak-vg.github.io/GSN/ + +
+
+ comment: Accepted at the Main Technical Track of AAAI 2024 +
+
+
+
+
+ + ☆ LLMs Meet VLMs: Boost Open Vocabulary Object Detection with Fine-grained + Descriptors + + +
+ Inspired by the outstanding zero-shot capability of vision language models
+(VLMs) in image classification tasks, open-vocabulary object detection has
+attracted increasing interest by distilling the broad VLM knowledge into
+detector training. However, most existing open-vocabulary detectors learn by
+aligning region embeddings with categorical labels (e.g., bicycle) only,
+disregarding the capability of VLMs in aligning visual embeddings with
+fine-grained text descriptions of object parts (e.g., pedals and bells). This
+paper presents DVDet, a Descriptor-Enhanced Open Vocabulary Detector that
+introduces conditional context prompts and hierarchical textual descriptors
+that enable precise region-text alignment as well as open-vocabulary detection
+training in general. Specifically, the conditional context prompt transforms
+regional embeddings into image-like representations that can be directly
+integrated into general open vocabulary detection training. In addition, we
+introduce large language models as an interactive and implicit knowledge
+repository which enables iteratively mining and refining visually oriented
+textual descriptors for precise region-text alignment. Extensive experiments
+over multiple large-scale benchmarks show that DVDet outperforms the
+state-of-the-art consistently by large margins.
+
+
+
+
+
+
+ + ☆ Noise Map Guidance: Inversion with Spatial Context for Real Image + Editing ICLR 2024 + + +
+ Text-guided diffusion models have become a popular tool in image synthesis,
+known for producing high-quality and diverse images. However, their application
+to editing real images often encounters hurdles primarily due to the text
+condition deteriorating the reconstruction quality and subsequently affecting
+editing fidelity. Null-text Inversion (NTI) has made strides in this area, but
+it fails to capture spatial context and requires computationally intensive
+per-timestep optimization. Addressing these challenges, we present Noise Map
+Guidance (NMG), an inversion method rich in spatial context, tailored for
+real-image editing. Significantly, NMG achieves this without necessitating
+optimization, yet preserves the editing quality. Our empirical investigations
+highlight NMG's adaptability across various editing techniques and its
+robustness to variants of DDIM inversions.
+
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Multi-Scale Semantic Segmentation with Modified MBConv Blocks + + +
+ MBConv blocks, initially designed for efficiency in resource-limited
+settings, have recently been adapted for cutting-edge performance in image
+classification tasks. Despite their success, their application in semantic
+segmentation has remained relatively unexplored. This paper introduces a novel
+adaptation of MBConv blocks specifically tailored for semantic segmentation.
+Our modification stems from the insight that semantic segmentation requires the
+extraction of more detailed spatial information than image classification. We
+argue that to effectively perform multi-scale semantic segmentation, each
+branch of a U-Net architecture, regardless of its resolution, should possess
+equivalent segmentation capabilities. By implementing these changes, our
+approach achieves impressive mean Intersection over Union (IoU) scores of 84.5%
+and 84.0% on the Cityscapes test and validation datasets, respectively,
+demonstrating the efficacy of our proposed modifications in enhancing semantic
+segmentation performance.
+
+
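+ For context, the baseline MBConv block (inverted residual: expand, depthwise
+convolution, squeeze-and-excitation, project) looks as follows in PyTorch;
+this is the generic block the paper adapts, not the modified variant itself:
+
+import torch
+import torch.nn as nn
+
+class MBConv(nn.Module):
+    """Standard MBConv: expand -> depthwise conv -> SE -> project, residual."""
+    def __init__(self, dim, expansion=4, kernel_size=3):
+        super().__init__()
+        hidden = dim * expansion
+        self.expand = nn.Sequential(
+            nn.Conv2d(dim, hidden, 1, bias=False),
+            nn.BatchNorm2d(hidden), nn.SiLU())
+        self.depthwise = nn.Sequential(
+            nn.Conv2d(hidden, hidden, kernel_size, padding=kernel_size // 2,
+                      groups=hidden, bias=False),
+            nn.BatchNorm2d(hidden), nn.SiLU())
+        self.se = nn.Sequential(                # squeeze-and-excitation gate
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(hidden, hidden // 4, 1), nn.SiLU(),
+            nn.Conv2d(hidden // 4, hidden, 1), nn.Sigmoid())
+        self.project = nn.Sequential(
+            nn.Conv2d(hidden, dim, 1, bias=False), nn.BatchNorm2d(dim))
+
+    def forward(self, x):
+        h = self.depthwise(self.expand(x))
+        h = h * self.se(h)
+        return x + self.project(h)              # residual connection
+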
+
+
+
+
+ + ☆ ScreenAI: A Vision-Language Model for UI and Infographics Understanding + + +
+ Screen user interfaces (UIs) and infographics, sharing similar visual
+language and design principles, play important roles in human communication and
+human-machine interaction. We introduce ScreenAI, a vision-language model that
+specializes in UI and infographics understanding. Our model improves upon the
+PaLI architecture with the flexible patching strategy of pix2struct and is
+trained on a unique mixture of datasets. At the heart of this mixture is a
+novel screen annotation task in which the model has to identify the type and
+location of UI elements. We use these text annotations to describe screens to
+Large Language Models and automatically generate question-answering (QA), UI
+navigation, and summarization training datasets at scale. We run ablation
+studies to demonstrate the impact of these design choices. At only 5B
+parameters, ScreenAI achieves new state-of-the-art results on UI- and
+infographics-based tasks (Multi-page DocVQA, WebSRC, MoTIF and Widget
+Captioning), and new best-in-class performance on others (Chart QA, DocVQA, and
+InfographicVQA) compared to models of similar size. Finally, we release three
+new datasets: one focused on the screen annotation task and two others focused
+on question answering.
+
+
+
+ comment: 7 pages main tex with 5 figures, 2 page bib, 6 pages appendix +
+
+
+
+
+ + ☆ Meet JEANIE: a Similarity Measure for 3D Skeleton Sequences via + Temporal-Viewpoint Alignment ACCV'22 + + +
+ Video sequences exhibit significant nuisance variations (undesired effects)
+of speed of actions, temporal locations, and subjects' poses, leading to
+temporal-viewpoint misalignment when comparing two sets of frames or evaluating
+the similarity of two sequences. Thus, we propose Joint tEmporal and cAmera
+viewpoiNt alIgnmEnt (JEANIE) for sequence pairs. In particular, we focus on 3D
+skeleton sequences whose camera and subjects' poses can be easily manipulated
+in 3D. We evaluate JEANIE on skeletal Few-shot Action Recognition (FSAR), where
+matching temporal blocks (temporal chunks that make up a sequence) of
+support-query sequence pairs well, by factoring out nuisance variations, is
+essential due to limited samples of novel classes. Given a query sequence, we
+create several views of it by simulating several camera locations. For a
+support sequence, we match it with view-simulated query sequences, as in the
+popular Dynamic Time Warping (DTW). Specifically, each support temporal block
+can be matched to the query temporal block with the same or adjacent (next)
+temporal index, and adjacent camera views to achieve joint local
+temporal-viewpoint warping. JEANIE selects the smallest distance among matching
+paths with different temporal-viewpoint warping patterns, an advantage over
+DTW, which only performs temporal alignment. We also propose an unsupervised
+FSAR akin to clustering of sequences with JEANIE as a distance measure. JEANIE
+achieves state-of-the-art results on NTU-60, NTU-120, Kinetics-skeleton and
+UWA3D Multiview Activity II on supervised and unsupervised FSAR, and their
+meta-learning inspired fusion.
+
+
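+ The joint warping can be pictured as a DTW-style dynamic program with an
+extra viewpoint axis. A simplified sketch (illustrative, not the authors'
+implementation), where dist[ts, tq, v] is the distance between support block
+ts and the v-th simulated camera view of query block tq:
+
+import numpy as np
+
+def jeanie_distance(dist):
+    """DTW with joint temporal and viewpoint warping; dist: (Ts, Tq, V)."""
+    Ts, Tq, V = dist.shape
+    acc = np.full((Ts, Tq, V), np.inf)
+    acc[0, 0, :] = dist[0, 0, :]
+    for ts in range(Ts):
+        for tq in range(Tq):
+            for v in range(V):
+                if ts == 0 and tq == 0:
+                    continue
+                best = np.inf
+                # temporal step: same or adjacent index on either sequence;
+                # viewpoint step: same or adjacent camera view
+                for dts, dtq in ((1, 0), (0, 1), (1, 1)):
+                    for dv in (-1, 0, 1):
+                        pts, ptq, pv = ts - dts, tq - dtq, v - dv
+                        if pts >= 0 and ptq >= 0 and 0 <= pv < V:
+                            best = min(best, acc[pts, ptq, pv])
+                acc[ts, tq, v] = dist[ts, tq, v] + best
+    return acc[-1, -1].min()   # smallest distance over warping patterns
+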
+
+ comment: Under minor revision with IJCV. An extension of our ACCV'22 paper
+ [arXiv:2210.16820] which was distinguished by the Sang Uk Lee Best
+ Student Paper Award. arXiv admin note: text overlap with arXiv:2112.12668
+
+
+
+
+ + ☆ Towards Improved Imbalance Robustness in Continual Multi-Label Learning + with Dual Output Spiking Architecture (DOSA) IJCNN 2024 + + +
+ Algorithms designed for addressing typical supervised classification problems +can only learn from a fixed set of samples and labels, making them unsuitable +for the real world, where data arrives as a stream of samples often associated +with multiple labels over time. This motivates the study of task-agnostic +continual multi-label learning problems. While algorithms using deep learning +approaches for continual multi-label learning have been proposed in the recent +literature, they tend to be computationally heavy. Although spiking neural +networks (SNNs) offer a computationally efficient alternative to artificial +neural networks, existing literature has not used SNNs for continual +multi-label learning. Also, accurately determining multiple labels with SNNs is +still an open research problem. This work proposes a dual output spiking +architecture (DOSA) to bridge these research gaps. A novel imbalance-aware loss +function is also proposed, improving the multi-label classification performance +of the model by making it more robust to data imbalance. A modified F1 score is +presented to evaluate the effectiveness of the proposed loss function in +handling imbalance. Experiments on several benchmark multi-label datasets show +that DOSA trained with the proposed loss function shows improved robustness to +data imbalance and obtains better continual multi-label learning performance +than CIFDM, a previous state-of-the-art algorithm. + +
+
+ comment: 8 pages, 4 figures, 4 tables, 45 references. Submitted to IJCNN 2024 +
+
+
+
+
+ + ☆ Sparse Anatomical Prompt Semi-Supervised Learning with Masked Image + Modeling for CBCT Tooth Segmentation + + +
+ Accurate tooth identification and segmentation in Cone Beam Computed
+Tomography (CBCT) dental images can significantly enhance the efficiency and
+precision of manual diagnoses performed by dentists. However, existing
+segmentation methods are mainly developed based on training with large data
+volumes, whose annotations are extremely time-consuming. Meanwhile, the closely
+positioned teeth of each class in CBCT dental images, coupled with subtle
+inter-class differences, give rise to the challenge of indistinct boundaries
+when training a model with limited data. To address these challenges, this
+study proposes a task-oriented Masked Auto-Encoder paradigm to effectively
+utilize large amounts of unlabeled data to achieve accurate tooth segmentation
+with limited labeled data. Specifically, we first construct a self-supervised
+pre-training framework of masked auto-encoder to efficiently utilize unlabeled
+data to enhance the network performance. Subsequently, we introduce a sparse
+masked prompt mechanism based on graph attention to incorporate boundary
+information of the teeth, aiding the network in learning the anatomical
+structural features of teeth. To the best of our knowledge, we are pioneering
+the integration of the mask pre-training paradigm into the CBCT tooth
+segmentation task. Extensive experiments demonstrate both the feasibility of
+our proposed method and the potential of the boundary prompt mechanism.
+
+
+
+
+
+
+ + ☆ Troublemaker Learning for Low-Light Image Enhancement + + +
+ Low-light image enhancement (LLIE) restores the color and brightness of
+underexposed images. Supervised methods suffer from high costs in collecting
+low/normal-light image pairs. Unsupervised methods invest substantial effort in
+crafting complex loss functions. We address these two challenges through the
+proposed TroubleMaker Learning (TML) strategy, which employs normal-light
+images as inputs for training. TML is simple: we first dim the input and then
+increase its brightness. TML is based on two core components. First, the
+troublemaker model (TM) constructs pseudo low-light images from normal images
+to relieve the cost of pairwise data. Second, the predicting model (PM)
+enhances the brightness of pseudo low-light images. Additionally, we
+incorporate an enhancing model (EM) to further improve the visual performance
+of PM outputs. Moreover, in LLIE tasks, characterizing global element
+correlations is important because it captures more information about the same
+object. CNNs cannot achieve this well, and self-attention has high time
+complexity. Accordingly, we propose Global Dynamic Convolution (GDC) with O(n)
+time complexity, which essentially imitates the partial calculation process of
+self-attention to formulate elementwise correlations. Based on the GDC module,
+we build the UGDC model. Extensive quantitative and qualitative experiments
+demonstrate that UGDC trained with TML can achieve competitive performance
+against state-of-the-art approaches on public datasets. The code is available
+at https://github.com/Rainbowman0/TML_LLIE.
+
+
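+ The paper's exact GDC design is not given here beyond its O(n) imitation of
+self-attention's aggregation, so the following is only a hedged sketch of one
+common linear-time pattern for global element correlation: pool a dynamically
+weighted global descriptor and broadcast it back to every position:
+
+import torch
+import torch.nn as nn
+
+class GlobalContext(nn.Module):
+    """Linear-time global aggregation reminiscent of attention pooling."""
+    def __init__(self, dim):
+        super().__init__()
+        self.attn_logits = nn.Conv2d(dim, 1, 1)    # per-position importance
+        self.transform = nn.Conv2d(dim, dim, 1)
+
+    def forward(self, x):                          # x: (B, C, H, W)
+        b, c, h, w = x.shape
+        w_attn = self.attn_logits(x).flatten(2).softmax(-1)        # (B, 1, HW)
+        ctx = torch.einsum('bnl,bcl->bcn', w_attn, x.flatten(2))   # (B, C, 1)
+        ctx = self.transform(ctx.view(b, c, 1, 1))
+        return x + ctx                             # broadcast global context
+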
+
+
+
+
+ + ☆ A Psychological Study: Importance of Contrast and Luminance in Color to + Grayscale Mapping + + +
+ Grayscale images are essential in image processing and computer vision tasks. +They effectively emphasize luminance and contrast, highlighting important +visual features, while also being easily compatible with other algorithms. +Moreover, their simplified representation makes them efficient for storage and +transmission purposes. While preserving contrast is important for maintaining +visual quality, other factors such as preserving information relevant to the +specific application or task at hand may be more critical for achieving optimal +performance. To evaluate and compare different decolorization algorithms, we +designed a psychological experiment. During the experiment, participants were +instructed to imagine color images in a hypothetical "colorless world" and +select the grayscale image that best resembled their mental visualization. We +conducted a comparison between two types of algorithms: (i) perceptual-based +simple color space conversion algorithms, and (ii) spatial contrast-based +algorithms, including iteration-based methods. Our experimental findings +indicate that CIELAB exhibited superior performance on average, providing +further evidence for the effectiveness of perception-based decolorization +algorithms. On the other hand, the spatial contrast-based algorithms showed +relatively poorer performance, possibly due to factors such as DC-offset and +artificial contrast generation. However, these algorithms demonstrated shorter +selection times. Notably, no single algorithm consistently outperformed the +others across all test images. In this paper, we will delve into a +comprehensive discussion on the significance of contrast and luminance in +color-to-grayscale mapping based on our experimental results and analysis. + +
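+ For concreteness, the perception-based conversion that performed best here,
+CIELAB, amounts to taking the lightness channel L*. A minimal sketch for sRGB
+inputs in [0, 1] using standard colorimetry formulas (not code from the study):
+
+import numpy as np
+
+def srgb_to_lstar(rgb):
+    """Grayscale via CIELAB lightness L* from an sRGB image in [0, 1]."""
+    # 1) undo sRGB gamma to obtain linear RGB
+    lin = np.where(rgb <= 0.04045, rgb / 12.92, ((rgb + 0.055) / 1.055) ** 2.4)
+    # 2) relative luminance Y (Rec. 709 primaries, D65 white point)
+    y = 0.2126 * lin[..., 0] + 0.7152 * lin[..., 1] + 0.0722 * lin[..., 2]
+    # 3) CIE L*: nonlinear compression approximating perceived lightness
+    f = np.where(y > (6 / 29) ** 3, np.cbrt(y), y / (3 * (6 / 29) ** 2) + 4 / 29)
+    return (116 * f - 16) / 100.0   # rescaled from [0, 100] to [0, 1]
+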
+
+
+
+
+ + ☆ Progressive Conservative Adaptation for Evolving Target Domains + + +
+ Conventional domain adaptation typically transfers knowledge from a source
+domain to a stationary target domain. However, in many real-world cases, target
+data usually emerge sequentially and have continuously evolving distributions.
+Storing such target data and re-adapting to it results in escalating
+computational and resource consumption over time. Hence, it is vital to devise
+algorithms to address the evolving domain adaptation (EDA) problem, \emph{i.e.,}
+adapting models to evolving target domains without access to historic target
+domains. To achieve this goal, we propose a simple yet effective approach,
+termed progressive conservative adaptation (PCAda). To manage new target data
+that diverges from previous distributions, we fine-tune the classifier head
+based on the progressively updated class prototypes. Moreover, as adjusting to
+the most recent target domain can interfere with the features learned from
+previous target domains, we develop a conservative sparse attention mechanism.
+This mechanism restricts feature adaptation within essential dimensions, thus
+easing the inference related to historical knowledge. The proposed PCAda is
+implemented with a meta-learning framework, which achieves the fast adaptation
+of the classifier with the help of the progressively updated class prototypes
+in the inner loop and learns a generalized feature without severely interfering
+with the historic knowledge via the conservative sparse attention in the outer
+loop. Experiments on Rotated MNIST, Caltran, and Portraits datasets demonstrate
+the effectiveness of our method.
+
+
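+ The progressively updated class prototypes can be sketched as conservative
+running class means in feature space; names and the momentum rule below are
+illustrative, and the paper wraps this update in a meta-learning inner loop:
+
+import torch
+
+@torch.no_grad()
+def update_prototypes(protos, counts, feats, labels, momentum=0.9):
+    """protos: (K, D) prototypes, counts: (K,) samples seen per class,
+    feats: (N, D) incoming target features, labels: (N,) pseudo-labels."""
+    for c in labels.unique():
+        mask = labels == c
+        batch_mean = feats[mask].mean(0)
+        if counts[c] == 0:                 # class appears for the first time
+            protos[c] = batch_mean
+        else:                              # conservative moving average
+            protos[c] = momentum * protos[c] + (1 - momentum) * batch_mean
+        counts[c] += mask.sum()
+    return protos, counts
+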
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Triplet-constraint Transformer with Multi-scale Refinement for Dose + Prediction in Radiotherapy + + +
+ Radiotherapy is a primary treatment for cancers with the aim of applying
+sufficient radiation dose to the planning target volume (PTV) while minimizing
+dose hazards to the organs at risk (OARs). Convolutional neural networks (CNNs)
+have automated radiotherapy plan-making by predicting dose maps. However,
+current CNN-based methods ignore the remarkable dose difference in the dose
+map, i.e., high dose values inside the PTV but low values outside it, leading
+to suboptimal predictions. In this paper, we propose a triplet-constraint
+transformer (TCtrans) with multi-scale refinement to predict high-quality dose
+distributions. Concretely, a novel PTV-guided triplet constraint is designed to
+refine dose feature representations inside and outside the PTV by utilizing the
+explicit geometry of the PTV. Furthermore, we introduce a multi-scale
+refinement (MSR) module to effectively fulfill the triplet constraint in
+different decoding layers with multiple scales. Besides, a transformer encoder
+is devised to learn the important global dosimetric knowledge. Experiments on a
+clinical cervical cancer dataset demonstrate the superiority of our method.
+
+
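+ A PTV-guided triplet constraint can be read as a margin loss that separates
+interior-PTV from exterior-PTV dose features. A hedged sketch of one such loss
+(the paper's exact anchor/positive/negative construction may differ):
+
+import torch
+import torch.nn.functional as F
+
+def ptv_triplet_loss(feat, ptv_mask, margin=1.0):
+    """feat: (B, C, H, W) decoder features; ptv_mask: (B, 1, H, W) binary."""
+    inside = (feat * ptv_mask).sum((2, 3)) / ptv_mask.sum((2, 3)).clamp(min=1)
+    outside = (feat * (1 - ptv_mask)).sum((2, 3)) / \
+        (1 - ptv_mask).sum((2, 3)).clamp(min=1)
+    # anchor: interior mean; positive: interior mean of another case in the
+    # batch; negative: exterior mean of the same case
+    d_pos = F.pairwise_distance(inside, inside.roll(1, dims=0))
+    d_neg = F.pairwise_distance(inside, outside)
+    return F.relu(d_pos - d_neg + margin).mean()
+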
+
+ comment: accepted by 2024 IEEE ISBI +
+
+
+
+
+ + ☆ Attention Guided CAM: Visual Explanations of Vision Transformer Guided + by Self-Attention AAAI2024 + + +
+ Vision Transformer (ViT) is one of the most widely used models in the
+computer vision field with its great performance on various tasks. In order to
+fully utilize the ViT-based architecture in various applications, proper
+visualization methods with a decent localization performance are necessary, but
+the methods employed in CNN-based models are not directly applicable to ViT due
+to its unique structure. In this work, we propose an attention-guided
+visualization method applied to ViT that provides a high-level semantic
+explanation for its decision. Our method selectively aggregates the gradients
+directly propagated from the classification output to each self-attention,
+collecting the contribution of image features extracted from each location of
+the input image. These gradients are additionally guided by the normalized
+self-attention scores, which are the pairwise patch correlation scores. They
+are used to supplement the gradients on the patch-level context information
+efficiently detected by the self-attention mechanism. Our approach thus
+provides elaborate high-level semantic explanations with strong localization
+performance using only class labels. As a result, our method outperforms the
+previous leading explainability methods of ViT in the weakly-supervised
+localization task and presents great capability in capturing the full instances
+of the target class object. Meanwhile, our method provides a visualization that
+faithfully explains the model, which is demonstrated in the perturbation
+comparison test.
+
+
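+ Schematically, the method sums class-score gradients over the self-attention
+maps, guided by the normalized attention scores. A rough sketch of that
+aggregation under one plausible reading (the released code linked below is
+authoritative):
+
+import torch
+
+def attention_guided_cam(attns, grads, cls_row=0):
+    """attns, grads: lists of (B, heads, N, N) attention maps and their
+    gradients w.r.t. the class score, collected with hooks on every layer."""
+    cam = None
+    for a, g in zip(attns, grads):
+        guided = g.clamp(min=0) * a        # gradients guided by attention
+        layer_cam = guided.mean(1)         # average heads -> (B, N, N)
+        cam = layer_cam if cam is None else cam + layer_cam
+    patch_cam = cam[:, cls_row, 1:]        # CLS row, patch columns only
+    side = int(patch_cam.shape[-1] ** 0.5)
+    return patch_cam.reshape(-1, side, side)   # patch-grid saliency map
+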
+
+ comment: AAAI2024. Code available at + https://github.com/LeemSaebom/Attention-Guided-CAM-Visual-Explanations-of-Vision-Transformer-Guided-by-Self-Attention.git +
+
+
+
+
+ + ☆ DMAT: A Dynamic Mask-Aware Transformer for Human De-occlusion + + +
+ Human de-occlusion, which aims to infer the appearance of invisible human +parts from an occluded image, has great value in many human-related tasks, such +as person re-id, and intention inference. To address this task, this paper +proposes a dynamic mask-aware transformer (DMAT), which dynamically augments +information from human regions and weakens that from occlusion. First, to +enhance token representation, we design an expanded convolution head with +enlarged kernels, which captures more local valid context and mitigates the +influence of surrounding occlusion. To concentrate on the visible human parts, +we propose a novel dynamic multi-head human-mask guided attention mechanism +through integrating multiple masks, which can prevent the de-occluded regions +from assimilating to the background. Besides, a region upsampling strategy is +utilized to alleviate the impact of occlusion on interpolated images. During +model learning, an amodal loss is developed to further emphasize the recovery +effect of human regions, which also refines the model's convergence. Extensive +experiments on the AHP dataset demonstrate its superior performance compared to +recent state-of-the-art methods. + +
+
+
+
+
+ + ☆ FM-Fusion: Instance-aware Semantic Mapping Boosted by Vision-Language + Foundation Models + + +
+ Semantic mapping based on supervised object detectors is sensitive to image
+distribution. In real-world environments, object detection and segmentation
+performance can drop significantly, preventing the use of semantic mapping in
+a wider domain. On the other hand, the development of vision-language
+foundation models demonstrates a strong zero-shot transferability across data
+distribution. It provides an opportunity to construct generalizable
+instance-aware semantic maps. Hence, this work explores how to boost
+instance-aware semantic mapping from object detections generated by foundation
+models. We propose a probabilistic label fusion method to predict close-set
+semantic classes from open-set label measurements. An instance refinement
+module merges the over-segmented instances caused by inconsistent segmentation.
+We integrate all the modules into a unified semantic mapping system. Reading a
+sequence of RGB-D input, our work incrementally reconstructs an instance-aware
+semantic map. We evaluate the zero-shot performance of our method on the
+ScanNet and SceneNN datasets. Our method achieves 40.3 mean average precision
+(mAP) on the ScanNet semantic instance segmentation task. It outperforms the
+traditional semantic mapping method significantly.
+
+
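+ The probabilistic label fusion can be sketched as a Bayesian update over the
+close-set vocabulary that accumulates open-set label measurements per
+instance; the likelihood table below is an assumed ingredient, not the paper's
+exact measurement model:
+
+import numpy as np
+
+def fuse_labels(prior, measurements, likelihood):
+    """prior: (K,) over close-set classes; measurements: open-set label ids
+    observed for one instance; likelihood: (L, K) with likelihood[m, k] =
+    P(open-set label m | close-set class k)."""
+    log_post = np.log(prior)
+    for m in measurements:                 # independent measurement updates
+        log_post += np.log(likelihood[m] + 1e-9)
+    post = np.exp(log_post - log_post.max())
+    return post / post.sum()               # normalized posterior
+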
+
+ comment: Accepted by IEEE RA-L +
+
+
+
+
+ + ☆ BirdNeRF: Fast Neural Reconstruction of Large-Scale Scenes From Aerial + Imagery + + +
+ In this study, we introduce BirdNeRF, an adaptation of Neural Radiance Fields
+(NeRF) designed specifically for reconstructing large-scale scenes using aerial
+imagery. Unlike previous research focused on small-scale and object-centric
+NeRF reconstruction, our approach addresses multiple challenges, including (1)
+the slow training and rendering associated with large models; (2) the
+computational demands of modeling a substantial number of images, which require
+extensive resources such as high-performance GPUs; and (3) the significant
+artifacts and low visual fidelity commonly observed in large-scale
+reconstruction tasks due to limited model capacity. Specifically, we present a
+novel bird-view pose-based spatial decomposition algorithm that decomposes a
+large aerial image set into multiple small sets with appropriately sized
+overlaps, allowing us to train an individual NeRF for each sub-scene. This
+decomposition approach not only decouples rendering time from the scene size
+but also enables rendering to scale seamlessly to arbitrarily large
+environments. Moreover, it allows for per-block updates of the environment,
+enhancing the flexibility and adaptability of the reconstruction process.
+Additionally, we propose a projection-guided novel view re-rendering strategy,
+which aids in effectively utilizing the independently trained sub-scenes to
+generate superior rendering results. We evaluate our approach on existing
+datasets as well as on our own drone footage, improving reconstruction speed by
+10x over classical photogrammetry software and 50x over a state-of-the-art
+large-scale NeRF solution, on a single GPU, with similar rendering quality.
+
+
+
+
+
+
+ + ☆ BRI3L: A Brightness Illusion Image Dataset for Identification and + Localization of Regions of Illusory Perception + + +
+ Visual illusions play a significant role in understanding visual perception.
+Current methods for understanding and evaluating visual illusions are mostly
+deterministic filtering-based approaches evaluated on a handful of visual
+illusions, so their conclusions are not generic. To this end, we generate a
+large-scale dataset of 22,366 images (BRI3L: BRightness Illusion Image dataset
+for Identification and Localization of illusory perception) of the five types
+of brightness illusions and benchmark the dataset using data-driven neural
+network based approaches. The dataset contains label information - (1) whether
+a particular image is illusory/nonillusory, (2) the segmentation mask of the
+illusory region of the image. Hence, both the classification and segmentation
+task can be evaluated using this dataset. We follow the standard psychophysical
+experiments involving human subjects to validate the dataset. To the best of
+our knowledge, this is the first attempt to develop a dataset of visual
+illusions and benchmark it using a data-driven approach for illusion
+classification and localization. We consider five well-studied types of
+brightness illusions: 1) Hermann grid, 2) Simultaneous Brightness Contrast, 3)
+White illusion, 4) Grid illusion, and 5) Induced Grating illusion. Benchmarking
+on the dataset achieves 99.56% accuracy in illusion identification and 84.37%
+pixel accuracy in illusion localization. The deep learning models are also
+shown to generalize to unseen brightness illusions, such as brightness
+assimilation and contrast transitions. We also test the ability of
+state-of-the-art diffusion models to generate brightness illusions. All the
+code, dataset, and instructions are provided in the GitHub repo:
+https://github.com/aniket004/BRI3L
+
+
+
+
+
+
+ + ☆ BioDrone: A Bionic Drone-based Single Object Tracking Benchmark for + Robust Vision + + +
+ Single object tracking (SOT) is a fundamental problem in computer vision,
+with a wide range of applications, including autonomous driving, augmented
+reality, and robot navigation. The robustness of SOT faces two main challenges:
+tiny target and fast motion. These challenges are especially manifested in
+videos captured by unmanned aerial vehicles (UAV), where the target is usually
+far away from the camera and often with significant motion relative to the
+camera. To evaluate the robustness of SOT methods, we propose BioDrone -- the
+first bionic drone-based visual benchmark for SOT. Unlike existing UAV
+datasets, BioDrone features videos captured from a flapping-wing UAV system
+with a major camera shake due to its aerodynamics. BioDrone hence highlights
+the tracking of tiny targets with drastic changes between consecutive frames,
+providing a new robust vision benchmark for SOT. To date, BioDrone offers the
+largest UAV-based SOT benchmark with high-quality fine-grained manual
+annotations and automatically generated frame-level labels, designed for robust
+vision analyses. Leveraging our proposed BioDrone, we conduct a systematic
+evaluation of existing SOT methods, comparing the performance of 20
+representative models and studying novel means of optimizing a SOTA method
+(KeepTrack) for robust SOT. Our evaluation leads to new baselines and insights
+for robust SOT. Moving forward, we hope that BioDrone will not only serve as a
+high-quality benchmark for robust SOT, but also invite future research into
+robust computer vision. The database, toolkits, evaluation server, and baseline
+results are available at http://biodrone.aitestunion.com.
+
+
+
+ comment: This paper is published in IJCV (refer to DOI). Please cite the + published IJCV +
+
+
+
+
+ + ☆ A Review on Digital Pixel Sensors + + +
+ Digital pixel sensor (DPS) has evolved as a pivotal component in modern +imaging systems and has the potential to revolutionize various fields such as +medical imaging, astronomy, surveillance, IoT devices, etc. Compared to analog +pixel sensors, the DPS offers high speed and good image quality. However, the +introduced intrinsic complexity within each pixel, primarily attributed to the +accommodation of the ADC circuit, engenders a substantial increase in the pixel +pitch. Unfortunately, such a pronounced escalation in pixel pitch drastically +undermines the feasibility of achieving high-density integration, which is an +obstacle that significantly narrows down the field of potential applications. +Nonetheless, designing compact conversion circuits along with strategic +integration of 3D architectural paradigms can be a potential remedy to the +prevailing situation. This review article presents a comprehensive overview of +the vast area of DPS technology. The operating principles, advantages, and +challenges of different types of DPS circuits have been analyzed. We categorize +the schemes into several categories based on ADC operation. A comparative study +based on different performance metrics has also been showcased for a +well-rounded understanding. + +
+
+
+
+
+ + ☆ Text2Street: Controllable Text-to-image Generation for Street Views + + +
+ Text-to-image generation has made remarkable progress with the emergence of
+diffusion models. However, it is still a difficult task to generate images for
+street views based on text, mainly because the road topology of street scenes
+is complex, traffic conditions are diverse, and weather varies widely, which
+makes it hard for conventional text-to-image models to cope. To address these
+challenges, we propose a novel controllable text-to-image framework, named
+\textbf{Text2Street}. In the framework, we first introduce the lane-aware road
+topology generator, which, armed with a counting adapter, achieves text-to-map
+generation with accurate road structure and lane lines, realizing controllable
+road topology generation. Then, the position-based object layout generator is
+proposed to obtain text-to-layout generation through an object-level bounding
+box diffusion strategy, realizing controllable traffic object layout
+generation. Finally, the multiple control image generator is designed to
+integrate the road topology, object layout and weather description to realize
+controllable street-view image generation. Extensive experiments show that the
+proposed approach achieves controllable street-view text-to-image generation
+and validates the effectiveness of the Text2Street framework for street views.
+
+
+
+
+
+
+ + ☆ ColorSwap: A Color and Word Order Dataset for Multimodal Evaluation + + +
+ This paper introduces the ColorSwap dataset, designed to assess and improve
+the proficiency of multimodal models in matching objects with their colors. The
+dataset comprises 2,000 unique image-caption pairs, grouped into 1,000
+examples. Each example includes a caption-image pair, along with a
+``color-swapped'' pair. We follow the Winoground schema: the two captions in an
+example have the same words, but the color words have been rearranged to modify
+different objects. The dataset was created through a novel blend of automated
+caption and image generation with humans in the loop. We evaluate image-text
+matching (ITM) models and visual language models (VLMs) and find that even the
+latest ones are still not robust at this task. GPT-4V and LLaVA score 72% and
+42% on our main VLM metric, although they may improve with more advanced
+prompting techniques. On the main ITM metric, contrastive models such as CLIP
+and SigLIP perform close to chance (at 12% and 30%, respectively), although the
+non-contrastive BLIP ITM model is stronger (87%). We also find that finetuning
+on fewer than 2,000 examples yields significant performance gains on this
+out-of-distribution word-order understanding task. The dataset is here:
+https://github.com/Top34051/colorswap.
+
+
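+ Under the Winoground schema, a natural scoring rule credits an example only
+when both images prefer their own captions. A minimal sketch of such a metric
+(our reading of the schema, not necessarily the paper's exact scoring code):
+
+import numpy as np
+
+def colorswap_text_score(sim):
+    """sim: (N, 2, 2) where sim[i, a, b] scores image a against caption b for
+    example i, and caption a is the correct one for image a."""
+    correct = (sim[:, 0, 0] > sim[:, 0, 1]) & (sim[:, 1, 1] > sim[:, 1, 0])
+    return float(correct.mean())
+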
+
+
+
+
+ + ☆ BEBLID: Boosted efficient binary local image descriptor + + +
+ Efficient matching of local image features is a fundamental task in many
+computer vision applications. However, the real-time performance of top
+matching algorithms is compromised in computationally limited devices, such as
+mobile phones or drones, due to the simplicity of their hardware and their
+finite energy supply. In this paper we introduce BEBLID, an efficient learned
+binary image descriptor. It improves our previous real-valued descriptor,
+BELID, making it both more efficient for matching and more accurate. To this
+end we use AdaBoost with an improved weak-learner training scheme that produces
+better local descriptions. Further, we binarize our descriptor by forcing all
+weak-learners to have the same weight in the strong learner combination and
+train it on an unbalanced dataset to address the asymmetries arising in
+matching and retrieval tasks. In our experiments BEBLID achieves an accuracy
+close to SIFT and better computational efficiency than ORB, the fastest
+algorithm in the literature.
+
+
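+ BEBLID's weak learners threshold differences of average gray values over
+pairs of pixel boxes, and equal weights reduce the strong learner to one bit
+per weak learner. A toy sketch of descriptor extraction with an integral image
+(the learned boxes and thresholds are assumed inputs):
+
+import numpy as np
+
+def beblid_describe(patch, boxes, thresholds):
+    """patch: 2D gray patch; boxes: list of pairs of (y0, y1, x0, x1) boxes;
+    thresholds: one learned threshold per weak learner."""
+    integral = patch.cumsum(0).cumsum(1)
+
+    def box_mean(y0, y1, x0, x1):          # mean gray value via integral image
+        s = integral[y1 - 1, x1 - 1]
+        if y0 > 0: s -= integral[y0 - 1, x1 - 1]
+        if x0 > 0: s -= integral[y1 - 1, x0 - 1]
+        if y0 > 0 and x0 > 0: s += integral[y0 - 1, x0 - 1]
+        return s / ((y1 - y0) * (x1 - x0))
+
+    bits = [box_mean(*b1) - box_mean(*b2) > t
+            for (b1, b2), t in zip(boxes, thresholds)]
+    return np.packbits(np.array(bits, dtype=np.uint8))   # binary descriptor
+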
+
+
+
+
+ + ☆ MIRT: a simultaneous reconstruction and affine motion compensation + technique for four dimensional computed tomography (4DCT) + + +
+ In four-dimensional computed tomography (4DCT), 3D images of moving or
+deforming samples are reconstructed from a set of 2D projection images. Recent
+techniques for iterative motion-compensated reconstruction either necessitate a
+reference acquisition or alternate image reconstruction and motion estimation
+steps. In these methods, the motion estimation step involves the estimation of
+either complete deformation vector fields (DVFs) or a limited set of parameters
+corresponding to the affine motion, including rigid motion or scaling. The
+majority of these approaches rely on nested iterations, incurring significant
+computational expenses. Notably, despite the direct benefits of an analytical
+formulation and a substantial reduction in computational complexity, there has
+been no exploration into parameterizing DVFs for general affine motion in CT
+imaging. In this work, we propose the Motion-compensated Iterative
+Reconstruction Technique (MIRT), an efficient iterative reconstruction scheme
+that combines image reconstruction and affine motion estimation in a single
+update step, based on analytical gradients with respect to both the
+reconstruction and the affine motion parameters. While most state-of-the-art
+4DCT methods have not been tested on real data, results from simulated and real
+experiments show that our method outperforms state-of-the-art CT reconstruction
+methods with affine motion correction in computational feasibility and
+projection distance. In particular, this allows accurate reconstruction of a
+microscale diamond in the presence of motion from practically acquired
+projection radiographs, which leads to a novel application of 4DCT.
+
+
+
+ comment: Submitted to the SIAM Journal on Imaging Sciences (SIIMS) +
+
+
+
+
+ + ♻ ☆ Vision-Language Dataset Distillation + + +
+ Dataset distillation methods reduce large-scale datasets to smaller sets of +synthetic data, which preserve sufficient information for quickly training a +new model from scratch. However, prior work on dataset distillation has focused +exclusively on image classification datasets, whereas modern large-scale +datasets are primarily in the vision-language space. In this work, we design +the first vision-language dataset distillation method, building on the idea of +trajectory matching. A key challenge is that vision-language datasets do not +have a set of discrete classes. To overcome this, our proposed method jointly +distills the image-text pairs in a contrastive formulation. Further, we +leverage Low-Rank Adaptation (LoRA) matching to enable more efficient and +effective trajectory matching in complex modern vision-language models. Since +there are no existing baselines, we compare our distillation approach to three +adapted vision-language coreset selection methods. We demonstrate significant +improvements on the challenging Flickr30K and COCO retrieval benchmarks: for +example, on Flickr30K, the best coreset selection method selecting 1000 +image-text pairs for training achieves only 5.6% image-to-text retrieval +accuracy (i.e., recall@1); in contrast, our dataset distillation approach +almost doubles that to 9.9% with just 100 (an order of magnitude fewer) +training pairs. + +
+
+ comment: 29 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language + Navigation AAAI 2024 + + +
+ Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate +through realistic 3D outdoor environments based on natural language +instructions. The performance of existing VLN methods is limited by +insufficient diversity in navigation environments and limited training data. To +address these issues, we propose VLN-Video, which utilizes the diverse outdoor +environments present in driving videos in multiple cities in the U.S. augmented +with automatically generated navigation instructions and actions to improve +outdoor VLN performance. VLN-Video combines the best of intuitive classical +approaches and modern deep learning techniques, using template infilling to +generate grounded navigation instructions, combined with an image rotation +similarity-based navigation action predictor to obtain VLN style data from +driving videos for pretraining deep learning VLN models. We pre-train the model +on the Touchdown dataset and our video-augmented dataset created from driving +videos with three proxy tasks: Masked Language Modeling, Instruction and +Trajectory Matching, and Next Action Prediction, so as to learn +temporally-aware and visually-aligned instruction representations. The learned +instruction representation is adapted to the state-of-the-art navigator when +fine-tuning on the Touchdown dataset. Empirical results demonstrate that +VLN-Video significantly outperforms previous state-of-the-art models by 2.1% in +task completion rate, achieving a new state-of-the-art on the Touchdown +dataset. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Scalable 3D Panoptic Segmentation As Superpoint Graph Clustering 3DV 2024 + + +
+ We introduce a highly efficient method for panoptic segmentation of large 3D +point clouds by redefining this task as a scalable graph clustering problem. +This approach can be trained using only local auxiliary tasks, thereby +eliminating the resource-intensive instance-matching step during training. +Moreover, our formulation can easily be adapted to the superpoint paradigm, +further increasing its efficiency. This allows our model to process scenes with +millions of points and thousands of objects in a single inference. Our method, +called SuperCluster, achieves a new state-of-the-art panoptic segmentation +performance for two indoor scanning datasets: $50.1$ PQ ($+7.8$) for S3DIS +Area~5, and $58.7$ PQ ($+25.2$) for ScanNetV2. We also set the first +state-of-the-art for two large-scale mobile mapping benchmarks: KITTI-360 and +DALES. With only $209$k parameters, our model is over $30$ times smaller than +the best-competing method and trains up to $15$ times faster. Our code and +pretrained models are available at +https://github.com/drprojects/superpoint_transformer. + +
+
+ comment: Accepted at 3DV 2024, Oral presentation +
+
+
+
+
+ + ♻ ☆ Score-based Conditional Generation with Fewer Labeled Data by + Self-calibrating Classifier Guidance + + +
+ Score-based generative models (SGMs) are a popular family of deep generative +models that achieve leading image generation quality. Early studies extend SGMs +to tackle class-conditional generation by coupling an unconditional SGM with +the guidance of a trained classifier. Nevertheless, such classifier-guided SGMs +do not always achieve accurate conditional generation, especially when trained +with fewer labeled data. We argue that the problem is rooted in the +classifier's tendency to overfit without coordinating with the underlying +unconditional distribution. To make the classifier respect the unconditional +distribution, we propose improving classifier-guided SGMs by letting the +classifier regularize itself. The key idea of our proposed method is to use +principles from energy-based models to convert the classifier into another view +of the unconditional SGM. Existing losses for unconditional SGMs can then be +leveraged to achieve regularization by calibrating the classifier's internal +unconditional scores. The regularization scheme can be applied to not only the +labeled data but also unlabeled ones to further improve the classifier. Across +various percentages of fewer labeled data, empirical results show that the +proposed approach significantly enhances conditional generation quality. The +enhancements confirm the potential of the proposed self-calibration technique +for generative modeling with limited labeled data. + +
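+ Classifier guidance adds the gradient of the classifier's log-probability to
+the unconditional score; the self-calibration idea additionally trains the
+classifier so that the energy-based view of its logits (their logsumexp)
+behaves like an unconditional score model. A schematic sketch of the guided
+score under an assumed interface (not the authors' code):
+
+import torch
+
+def guided_score(x, t, y, score_model, classifier, scale=1.0):
+    """Conditional score: s(x, t) + scale * grad_x log p(y | x, t)."""
+    x = x.detach().requires_grad_(True)
+    logits = classifier(x, t)
+    log_prob = logits.log_softmax(-1)[torch.arange(len(y)), y].sum()
+    grad = torch.autograd.grad(log_prob, x)[0]
+    return score_model(x, t) + scale * grad
+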
+
+
+
+
+ + ♻ ☆ General Neural Gauge Fields ICLR 2023 + + +
+ The recent advance of neural fields, such as neural radiance fields, has
+significantly pushed the boundary of scene representation learning. Aiming to
+boost the computation efficiency and rendering quality of 3D scenes, a popular
+line of research maps the 3D coordinate system to another measuring system,
+e.g., 2D manifolds and hash tables, for modeling neural fields. The conversion
+of coordinate systems can typically be dubbed as \emph{gauge transformation},
+which is usually a pre-defined mapping function, e.g., orthogonal projection or
+spatial hash function. This begs a question: can we directly learn a desired
+gauge transformation along with the neural field in an end-to-end manner? In
+this work, we extend this problem to a general paradigm with a taxonomy of
+discrete \& continuous cases, and develop a learning framework to jointly
+optimize gauge transformations and neural fields. To counter the problem that
+the learning of gauge transformations can collapse easily, we derive a general
+regularization mechanism from the principle of information conservation during
+the gauge transformation. To circumvent the high computation cost in gauge
+learning with regularization, we directly derive an information-invariant gauge
+transformation which inherently preserves scene information and yields
+superior performance. Project: https://fnzhan.com/Neural-Gauge-Fields
+
+
+
+ comment: ICLR 2023 +
+
+
+
+
+ + ♻ ☆ 4D Gaussian Splatting: Towards Efficient Novel View Synthesis for + Dynamic Scenes + + +
+ We consider the problem of novel view synthesis (NVS) for dynamic scenes. +Recent neural approaches have accomplished exceptional NVS results for static +3D scenes, but extensions to 4D time-varying scenes remain non-trivial. Prior +efforts often encode dynamics by learning a canonical space plus implicit or +explicit deformation fields, which struggle in challenging scenarios like +sudden movements or capturing high-fidelity renderings. In this paper, we +introduce 4D Gaussian Splatting (4DGS), a novel method that represents dynamic +scenes with anisotropic 4D XYZT Gaussians, inspired by the success of 3D +Gaussian Splatting in static scenes. We model dynamics at each timestamp by +temporally slicing the 4D Gaussians, which naturally compose dynamic 3D +Gaussians and can be seamlessly projected into images. As an explicit +spatial-temporal representation, 4DGS demonstrates powerful capabilities for +modeling complicated dynamics and fine details, especially for scenes with +abrupt motions. We further implement our temporal slicing and splatting +techniques in a highly optimized CUDA acceleration framework, achieving +real-time inference rendering speeds of up to 277 FPS on an RTX 3090 GPU and +583 FPS on an RTX 4090 GPU. Rigorous evaluations on scenes with diverse motions +showcase the superior efficiency and effectiveness of 4DGS, which consistently +outperforms existing methods both quantitatively and qualitatively. + +
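+ Temporally slicing an XYZT Gaussian is standard Gaussian conditioning: the
+3D mean shifts along the space-time covariance, the covariance drops its
+temporal part, and the marginal density in t modulates opacity. A minimal
+numpy sketch of the math (not the paper's CUDA implementation):
+
+import numpy as np
+
+def slice_4d_gaussian(mu, cov, t):
+    """mu: (4,) mean with time last; cov: (4, 4) covariance. Returns the
+    conditional 3D mean, 3D covariance, and a temporal opacity weight."""
+    mu_s, mu_t = mu[:3], mu[3]
+    c_ss, c_st, c_tt = cov[:3, :3], cov[:3, 3], cov[3, 3]
+    mean3 = mu_s + c_st * (t - mu_t) / c_tt         # conditional mean
+    cov3 = c_ss - np.outer(c_st, c_st) / c_tt       # conditional covariance
+    weight = np.exp(-0.5 * (t - mu_t) ** 2 / c_tt)  # marginal density in t
+    return mean3, cov3, weight
+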
+
+
+
+
+ + ♻ ☆ An Outlook into the Future of Egocentric Vision + + +
+ What will the future be? We wonder! In this survey, we explore the gap
+between current research in egocentric vision and the ever-anticipated future,
+where wearable computing, with outward facing cameras and digital overlays, is
+expected to be integrated into our everyday lives. To understand this gap, the
+article starts by envisaging the future through character-based stories,
+showcasing through examples the limitations of current technology. We then
+provide a mapping between this future and previously defined research tasks.
+For each task, we survey its seminal works, current state-of-the-art
+methodologies and available datasets, then reflect on shortcomings that limit
+its applicability to future research. Note that this survey focuses on software
+models for egocentric vision, independent of any specific hardware. The paper
+concludes with recommendations for areas of immediate exploration so as to
+unlock our path to the future always-on, personalised and life-enhancing
+egocentric vision.
+
+
+
+ comment: We invite comments, suggestions and corrections here: + https://openreview.net/forum?id=V3974SUk1w +
+
+
+
+
+ + ♻ ☆ Defending Our Privacy With Backdoors + + +
+ The proliferation of large AI models trained on uncurated, often sensitive
+web-scraped data has raised significant privacy concerns. One of the concerns
+is that adversaries can extract information about the training data using
+privacy attacks. Unfortunately, the task of removing specific information from
+the models without sacrificing performance is not straightforward and has
+proven to be challenging. We propose a rather easy yet effective defense based
+on backdoor attacks to remove private information such as names and faces of
+individuals from vision-language models by fine-tuning them for only a few
+minutes instead of re-training them from scratch. Specifically, through
+strategic insertion of backdoors into text encoders, we align the embeddings of
+sensitive phrases with those of neutral terms, such as "a person" instead of
+the person's actual name. For image encoders, we map embeddings of individuals
+to be removed from the model to a universal, anonymous embedding. Our empirical
+results demonstrate the effectiveness of our backdoor-based defense on CLIP by
+assessing its performance using a specialized privacy attack for zero-shot
+classifiers. Our approach provides not only a new "dual-use" perspective on
+backdoor attacks, but also presents a promising avenue to enhance the privacy
+of individuals within models trained on uncurated web-scraped data.
+
+
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Mixed Autoencoder for Self-supervised Visual Representation Learning CVPR 2023 + + +
+ Masked Autoencoder (MAE) has demonstrated superior performance on various
+vision tasks via randomly masking image patches and reconstruction. However,
+effective data augmentation strategies for MAE still remain open questions,
+unlike in contrastive learning, where augmentation serves as the most important
+part. This paper studies the prevailing mixing augmentation for MAE. We first
+demonstrate that naive mixing will, in contrast, degrade model performance due
+to the increase of mutual information (MI). To address this, we propose
+homologous recognition, an auxiliary pretext task, not only to alleviate the MI
+increase by explicitly requiring each patch to recognize homologous patches,
+but also to perform object-aware self-supervised pre-training for better
+downstream dense perception performance. With extensive experiments, we
+demonstrate that our proposed Mixed Autoencoder (MixedAE) achieves the
+state-of-the-art transfer results among masked image modeling (MIM)
+augmentations on different downstream tasks with significant efficiency.
+Specifically, our MixedAE outperforms MAE by +0.3% accuracy, +1.7 mIoU and +0.9
+AP on ImageNet-1K, ADE20K and COCO respectively with a standard ViT-Base.
+Moreover, MixedAE surpasses iBOT, a strong MIM method combined with instance
+discrimination, while accelerating training by 2x. To our best knowledge, this
+is the very first work to consider mixing for MIM from the perspective of
+pretext task design. Code will be made available.
+
+
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Fully Hyperbolic Convolutional Neural Networks for Computer Vision + + +
+ Real-world visual data exhibit intrinsic hierarchical structures that can be
+represented effectively in hyperbolic spaces. Hyperbolic neural networks (HNNs)
+are a promising approach for learning feature representations in such spaces.
+However, current HNNs in computer vision rely on Euclidean backbones and only
+project features to the hyperbolic space in the task heads, limiting their
+ability to fully leverage the benefits of hyperbolic geometry. To address this,
+we present HCNN, a fully hyperbolic convolutional neural network (CNN) designed
+for computer vision tasks. Based on the Lorentz model, we generalize
+fundamental components of CNNs and propose novel formulations of the
+convolutional layer, batch normalization, and multinomial logistic regression.
+Experiments on standard vision tasks demonstrate the promising performance of
+our HCNN framework in both hybrid and fully hyperbolic settings. Overall, we
+believe our contributions provide a foundation for developing more powerful
+HNNs that can better represent complex structures found in image data. Our code
+is publicly available at https://github.com/kschwethelm/HyperbolicCV.
+
+
+
+
+
+
+ + ♻ ☆ Domain Adaptation based Interpretable Image Emotion Recognition using + Facial Expression Recognition + + +
+ A domain adaptation technique has been proposed in this paper to identify the +emotions in generic images containing facial & non-facial objects and non-human +components. It addresses the challenge of the insufficient availability of +pre-trained models and well-annotated datasets for image emotion recognition +(IER). It starts with proposing a facial emotion recognition (FER) system and +then moves on to adapting it for image emotion recognition. First, a +deep-learning-based FER system has been proposed that classifies a given facial +image into discrete emotion classes. Further, an image recognition system has +been proposed that adapts the proposed FER system to recognize the emotions +portrayed by images using domain adaptation. It classifies the generic images +into 'happy,' 'sad,' 'hate,' and 'anger' classes. A novel interpretability +approach, Divide and Conquer based Shap (DnCShap), has also been proposed to +interpret the highly relevant visual features for emotion recognition. The +proposed system's architecture has been decided through ablation studies, and +the experiments are conducted on four FER and four IER datasets. The proposed +IER system has shown an emotion classification accuracy of 59.61% for the IAPSa +dataset, 57.83% for the ArtPhoto dataset, 67.93% for the FI dataset, and 55.13% +for the EMOTIC dataset. The important visual features leading to a particular +emotion class have been identified, and the embedding plots for various emotion +classes have been analyzed to explain the proposed system's predictions. + +
+
+
+
+
+ + ♻ ☆ GPT-4V as Traffic Assistant: An In-depth Look at Vision Language Model + on Complex Traffic Events + + +
+ The recognition and understanding of traffic incidents, particularly traffic +accidents, is a topic of paramount importance in the realm of intelligent +transportation systems and intelligent vehicles. This area has continually +captured the extensive focus of both the academic and industrial sectors. +Identifying and comprehending complex traffic events is highly challenging, +primarily due to the intricate nature of traffic environments, diverse +observational perspectives, and the multifaceted causes of accidents. These +factors have persistently impeded the development of effective solutions. The +advent of large vision-language models (VLMs) such as GPT-4V, has introduced +innovative approaches to addressing this issue. In this paper, we explore the +ability of GPT-4V with a set of representative traffic incident videos and +delve into the model's capacity of understanding these complex traffic +situations. We observe that GPT-4V demonstrates remarkable cognitive, +reasoning, and decision-making ability in certain classic traffic events. +Concurrently, we also identify certain limitations of GPT-4V, which constrain +its understanding in more intricate scenarios. These limitations merit further +exploration and resolution. + +
+
+
+
+
+ + ♻ ☆ iDeLog: Iterative Dual Spatial and Kinematic Extraction of + Sigma-Lognormal Parameters + + +
+ The Kinematic Theory of rapid movements and its associated Sigma-Lognormal +model have been extensively used in a large variety of applications. While the +physical and biological meaning of the model have been widely tested and +validated for rapid movements, some shortcomings have been detected when it is +used with continuous long and complex movements. To alleviate such drawbacks, +and inspired by the motor equivalence theory and a conceivable visual feedback, +this paper proposes a novel framework to extract the Sigma-Lognormal +parameters, namely iDeLog. Specifically, iDeLog consists of two steps. The +first one, influenced by the motor equivalence model, separately derives an +initial action plan defined by a set of virtual points and angles from the +trajectory and a sequence of lognormals from the velocity. In the second step, +based on a hypothetical visual feedback compatible with an open-loop motor +control, the virtual target points of the action plan are iteratively moved to +improve the matching between the observed and reconstructed trajectory and +velocity. During experiments conducted with handwritten signatures, iDeLog +obtained promising results as compared to the previous development of the +Sigma-Lognormal. + +
+
+ comment: Accepted version, published in IEEE Transactions on Pattern
+ Analysis and Machine Intelligence
+
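+
+ For readers new to the Kinematic Theory, the Sigma-Lognormal model
+represents the speed profile of a movement as a sum of time-shifted lognormal
+pulses, one per stroke. A minimal synthesis sketch follows; the stroke
+parameters below are illustrative, not values from the paper:
+
+```python
+import numpy as np
+
+def lognormal_pulse(t, D, t0, mu, sigma):
+    """Speed contribution of one stroke: a lognormal of amplitude D,
+    onset time t0, and log-time parameters mu and sigma."""
+    v = np.zeros_like(t)
+    m = t > t0                      # the pulse only exists after its onset
+    dt = t[m] - t0
+    v[m] = D / (sigma * np.sqrt(2 * np.pi) * dt) * np.exp(
+        -(np.log(dt) - mu) ** 2 / (2 * sigma ** 2))
+    return v
+
+# Three overlapping strokes, each given as (D, t0, mu, sigma).
+strokes = [(5.0, 0.00, -1.6, 0.25), (3.0, 0.15, -1.5, 0.30),
+           (4.0, 0.32, -1.7, 0.20)]
+t = np.linspace(0.0, 1.5, 1000)
+speed = sum(lognormal_pulse(t, *p) for p in strokes)  # Sigma-Lognormal profile
+```
+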
+
+
+
+
+ + ♻ ☆ MixFormerV2: Efficient Fully Transformer Tracking NIPS2023 + + +
+ Transformer-based trackers have achieved strong accuracy on the standard
+benchmarks. However, their efficiency remains an obstacle to practical
+deployment on both GPU and CPU platforms. In this paper, to overcome this
+issue, we propose a fully transformer tracking framework, coined
+\emph{MixFormerV2}, without any dense convolutional operation or complex
+score prediction module. Our key design is to introduce four special
+prediction tokens and concatenate them with the tokens from the target
+template and search area. We then apply the unified transformer backbone to
+this mixed token sequence. The prediction tokens are able to capture the
+complex correlation between the target template and search area via mixed
+attentions. Based on them, we can easily predict the tracking box and
+estimate its confidence score through simple MLP heads. To further improve
+the efficiency of MixFormerV2, we present a new distillation-based model
+reduction paradigm, comprising dense-to-sparse distillation and
+deep-to-shallow distillation. The former aims to transfer knowledge from the
+dense-head-based MixViT to our fully transformer tracker, while the latter is
+used to prune some layers of the backbone. We instantiate two types of
+MixFormerV2: MixFormerV2-B achieves an AUC of 70.6\% on LaSOT and an AUC of
+57.4\% on TNL2k at a high GPU speed of 165 FPS, and MixFormerV2-S surpasses
+FEAR-L by 2.7\% AUC on LaSOT at a real-time CPU speed.
+
+
+ comment: NIPS2023 +
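+
+ The prediction-token design can be pictured with the schematic sketch below:
+learnable tokens are concatenated with template and search tokens, pass
+through a shared transformer, and are read out by an MLP box head. Token
+counts, dimensions, and the head layout are assumptions for illustration, not
+the authors' implementation:
+
+```python
+import torch
+import torch.nn as nn
+
+class PredictionTokenTracker(nn.Module):
+    """Schematic tracker: learnable prediction tokens are mixed with the
+    template and search tokens, then read out by a simple MLP box head."""
+    def __init__(self, dim=256, n_pred=4, n_layers=4):
+        super().__init__()
+        self.n_pred = n_pred
+        self.pred_tokens = nn.Parameter(torch.randn(1, n_pred, dim))
+        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=8,
+                                           batch_first=True)
+        self.backbone = nn.TransformerEncoder(layer, num_layers=n_layers)
+        self.box_head = nn.Sequential(nn.Linear(n_pred * dim, dim), nn.ReLU(),
+                                      nn.Linear(dim, 4))  # (cx, cy, w, h)
+
+    def forward(self, template_tokens, search_tokens):
+        b = search_tokens.size(0)
+        pred = self.pred_tokens.expand(b, -1, -1)
+        mixed = torch.cat([pred, template_tokens, search_tokens], dim=1)
+        out = self.backbone(mixed)[:, :self.n_pred]     # keep prediction tokens
+        return self.box_head(out.flatten(1)).sigmoid()  # normalized box
+
+# Toy usage: 64 template tokens and 256 search tokens, width 256.
+tracker = PredictionTokenTracker()
+box = tracker(torch.randn(2, 64, 256), torch.randn(2, 256, 256))
+```
+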
+
+
+
+
+ + ♻ ☆ An objective comparison of methods for augmented reality in laparoscopic + liver resection by preoperative-to-intraoperative image fusion + + +
+ Augmented reality for laparoscopic liver resection is a visualisation mode
+that allows a surgeon to localise tumours and vessels embedded within the
+liver by projecting them on top of a laparoscopic image. Preoperative 3D
+models extracted from CT or MRI data are registered to the intraoperative
+laparoscopic images during this process. In terms of 3D-2D fusion, most of
+the algorithms make use of anatomical landmarks to guide registration. These
+landmarks include the liver's inferior ridge, the falciform ligament, and the
+occluding contours. They are usually marked by hand in both the laparoscopic
+image and the 3D model, which is time-consuming and may introduce errors if
+done by an inexperienced user. Therefore, there is a need to automate this
+process so that augmented reality can be used effectively in the operating
+room. We present the Preoperative-to-Intraoperative Laparoscopic Fusion
+Challenge (P2ILF), held during the Medical Imaging and Computer Assisted
+Interventions (MICCAI 2022) conference, which investigates the possibility of
+detecting these landmarks automatically and using them in registration. The
+challenge was divided into two tasks: 1) a 2D and 3D landmark detection task
+and 2) a 3D-2D registration task. The teams were provided with training data
+consisting of 167 laparoscopic images and 9 preoperative 3D models from 9
+patients, with the corresponding 2D and 3D landmark annotations. A total of 6
+teams from 4 countries participated, and their proposed methods were
+evaluated on 16 images and two preoperative 3D models from two patients. All
+the teams proposed deep learning-based methods for the 2D and 3D landmark
+segmentation tasks and differentiable rendering-based methods for the
+registration task. Based on the experimental outcomes, we propose three key
+hypotheses that determine current limitations and future directions for
+research in this domain.
+
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Standardised convolutional filtering for radiomics + + +
+ The Image Biomarker Standardisation Initiative (IBSI) aims to improve +reproducibility of radiomics studies by standardising the computational process +of extracting image biomarkers (features) from images. We have previously +established reference values for 169 commonly used features, created a standard +radiomics image processing scheme, and developed reporting guidelines for +radiomic studies. However, several aspects are not standardised. Here we +present a complete version of a reference manual on the use of convolutional +filters in radiomics and quantitative image analysis. Filters, such as wavelets +or Laplacian of Gaussian filters, play an important part in emphasising +specific image characteristics such as edges and blobs. Features derived from +filter response maps were found to be poorly reproducible. This reference +manual provides definitions for convolutional filters, parameters that should +be reported, reference feature values, and tests to verify software compliance +with the reference standard. + +
+
+ comment: 87 pages. For additional information see https://theibsi.github.io/ +
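+
+ As a concrete example of one filter family covered by the reference manual,
+a Laplacian of Gaussian response map can be computed with SciPy; the volume
+below is a random placeholder, and the sigma value is exactly the kind of
+parameter the standard asks to be reported:
+
+```python
+import numpy as np
+from scipy import ndimage
+
+image = np.random.rand(64, 64, 64)   # stand-in for a 3D medical volume
+sigma_mm = 1.5                       # filter scale in mm; must be reported
+spacing_mm = 1.0                     # isotropic voxel spacing assumed
+
+# Laplacian of Gaussian emphasises blob-like structures at scale sigma.
+response = ndimage.gaussian_laplace(image, sigma=sigma_mm / spacing_mm)
+
+# Radiomics features (mean, variance, ...) are then computed on `response`.
+print(response.mean(), response.std())
+```
+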
+
+
+
+
+ + ♻ ☆ Looking for a better fit? An Incremental Learning Multimodal Object + Referencing Framework adapting to Individual Drivers SC + + +
+ The rapid advancement of the automotive industry towards automated and
+semi-automated vehicles has rendered traditional methods of vehicle
+interaction, such as touch-based and voice command systems, inadequate for a
+widening range of non-driving related tasks, such as referencing objects
+outside of the vehicle. Consequently, research has shifted toward gestural
+input (e.g., hand, gaze, and head pose gestures) as a more suitable mode of
+interaction during driving. However, due to the dynamic nature of driving and
+individual variation, there are significant differences in drivers' gestural
+input performance. While, in theory, this inherent variability could be
+moderated by substantial data-driven machine learning models, prevalent
+methodologies lean towards constrained, single-instance trained models for
+object referencing. These models show a limited capacity to continuously
+adapt to the divergent behaviors of individual drivers and the variety of
+driving scenarios. To address this, we propose \textit{IcRegress}, a novel
+regression-based incremental learning approach that adapts to changing
+behavior and the unique characteristics of drivers engaged in the dual task
+of driving and referencing objects. We offer a more personalized and
+adaptable solution for multimodal gestural interfaces, employing continuous
+lifelong learning to enhance driver experience, safety, and convenience. Our
+approach was evaluated on an outside-the-vehicle object referencing use case,
+highlighting the superiority of adapted incremental learning models over a
+single trained model across driver traits such as handedness and driving
+experience, and across numerous driving conditions. Finally, to facilitate
+reproducibility, ease deployment, and promote further research, we offer our
+approach as an open-source framework at
+\url{https://github.com/amrgomaaelhady/IcRegress}.
+
+
+ comment: Accepted for publication in the Proceedings of the 29th International + Conference on Intelligent User Interfaces (IUI'24), March 18--21, 2024, in + Greenville, SC, USA +
+
+
+
+
+ + ♻ ☆ DFormer: Rethinking RGBD Representation Learning for Semantic + Segmentation ICLR 2024 + + +
+ We present DFormer, a novel RGB-D pretraining framework to learn
+transferable representations for RGB-D segmentation tasks. DFormer has two
+key innovations: 1) unlike previous works that encode RGB-D information with
+an RGB-pretrained backbone, we pretrain the backbone using image-depth pairs
+from ImageNet-1K, so DFormer is endowed with the capacity to encode RGB-D
+representations; 2) DFormer comprises a sequence of RGB-D blocks, tailored
+for encoding both RGB and depth information through a novel building block
+design. DFormer avoids the mismatched encoding of 3D geometry relationships
+in depth maps by RGB-pretrained backbones, an issue that is widespread in
+existing methods but has remained unresolved. We finetune the pretrained
+DFormer on two popular RGB-D tasks, i.e., RGB-D semantic segmentation and
+RGB-D salient object detection, with a lightweight decoder head. Experimental
+results show that our DFormer achieves new state-of-the-art performance on
+these two tasks with less than half the computational cost of the current
+best methods on two RGB-D semantic segmentation datasets and five RGB-D
+salient object detection datasets. Our code is available at:
+https://github.com/VCIP-RGBD/DFormer.
+
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ RefinedFields: Radiance Fields Refinement for Unconstrained Scenes + + +
+ Modeling large scenes from unconstrained images has proven to be a major +challenge in computer vision. Existing methods tackling in-the-wild scene +modeling operate in closed-world settings, where no conditioning on priors +acquired from real-world images is present. We propose RefinedFields, which is, +to the best of our knowledge, the first method leveraging pre-trained models to +improve in-the-wild scene modeling. We employ pre-trained networks to refine +K-Planes representations via optimization guidance using an alternating +training procedure. We carry out extensive experiments and verify the merit of +our method on synthetic data and real tourism photo collections. RefinedFields +enhances rendered scenes with richer details and outperforms previous work on +the task of novel view synthesis in the wild. Our project page can be found at +https://refinedfields.github.io . + +
+
+
+
+
+ + ♻ ☆ Spintronics for image recognition: performance benchmarking via + ultrafast data-driven simulations + + +
+ We present a demonstration of image classification using an echo-state
+network (ESN) relying on a single, time-delayed, simulated spintronic
+nanostructure known as the vortex-based spin-torque oscillator (STVO). We
+employ an ultrafast data-driven simulation framework called the data-driven
+Thiele equation approach (DD-TEA) to simulate the STVO dynamics. This allows
+us to avoid the challenges associated with repeated experimental manipulation
+of such a nanostructured system. We showcase the versatility of our solution
+by successfully applying it to classification challenges with the MNIST,
+EMNIST-letters and Fashion MNIST datasets. Through our simulations, we
+determine that, within an ESN with numerous learnable parameters, the results
+obtained using the STVO dynamics as an activation function are comparable to
+those obtained with conventional nonlinear activation functions like the ReLU
+and the sigmoid. While achieving state-of-the-art accuracy levels on the
+MNIST dataset, our model's performance on EMNIST-letters and Fashion MNIST is
+lower due to the relative simplicity of the system architecture and the
+increased complexity of the tasks. We expect that the DD-TEA framework will
+enable the exploration of deeper architectures, ultimately leading to
+improved classification accuracy.
+
+
+ comment: 6 pages, 4 figures +
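+
+ For orientation, a minimal echo-state network looks like the sketch below;
+in the paper, the generic nonlinearity would be replaced by the
+DD-TEA-simulated STVO response, so np.tanh here is only a stand-in:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_in, n_res = 64, 500                    # input size, reservoir size
+W_in = rng.uniform(-0.5, 0.5, (n_res, n_in))
+W = rng.uniform(-0.5, 0.5, (n_res, n_res))
+W *= 0.9 / np.max(np.abs(np.linalg.eigvals(W)))  # spectral radius below 1
+
+def reservoir_features(seq, f=np.tanh):
+    """Drive the reservoir with an input sequence; the paper would replace
+    the stand-in nonlinearity f with the simulated STVO response."""
+    x = np.zeros(n_res)
+    for u_t in seq:                      # seq: iterable of length-n_in vectors
+        x = f(W_in @ u_t + W @ x)
+    return x                             # final state used as the feature
+
+# A linear readout (e.g. ridge regression) is then trained on these features.
+features = reservoir_features(rng.normal(size=(10, n_in)))
+```
+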
+
+
+
+
+ + ♻ ☆ Vivim: a Video Vision Mamba for Medical Video Object Segmentation + + +
+ Traditional convolutional neural networks have a limited receptive field,
+while transformer-based networks are hampered by computational complexity
+when modeling long-term dependencies. This bottleneck poses a significant
+challenge when processing long video sequences in video analysis tasks. Very
+recently, state space models (SSMs) with efficient hardware-aware designs,
+popularized by Mamba, have exhibited impressive achievements in long sequence
+modeling, which has facilitated the development of deep neural networks on
+many vision tasks. To better capture available cues in video frames, this
+paper presents a generic Video Vision Mamba-based framework for medical video
+object segmentation tasks, named Vivim. Our Vivim can effectively compress
+long-term spatiotemporal representations into sequences at varying scales
+with our designed Temporal Mamba Block. Compared to existing video-level
+Transformer-based methods, our model maintains excellent segmentation results
+with better speed performance. Extensive experiments on breast lesion
+segmentation in ultrasound videos and polyp segmentation in colonoscopy
+videos demonstrate the effectiveness and efficiency of our Vivim. The code is
+available at: https://github.com/scott-yjyang/Vivim.
+
+
+
+
+
+ + ♻ ☆ Taming Reversible Halftoning via Predictive Luminance + + +
+ Traditional halftoning usually drops colors when dithering images with
+binary dots, which makes it difficult to recover the original color
+information. We propose a novel halftoning technique that converts a color
+image into a binary halftone with full restorability to its original version.
+Our novel base halftoning technique consists of two convolutional neural
+networks (CNNs) that produce the reversible halftone patterns, and a noise
+incentive block (NIB) that mitigates the flatness degradation issue of CNNs.
+Furthermore, to tackle the conflict between blue-noise quality and
+restoration accuracy in our base method, we propose a predictor-embedded
+approach that offloads predictable information from the network, which in our
+case is the luminance information resembling the halftone pattern. This
+approach allows the network more flexibility to produce halftones with better
+blue-noise quality without compromising the restoration quality. Detailed
+studies on the multiple-stage training method and loss weightings have been
+conducted. We have compared our predictor-embedded method and our base method
+regarding spectrum analysis on halftones, halftone accuracy, restoration
+accuracy, and data embedding studies. Our entropy evaluation shows that the
+resulting halftone contains less encoding information than that of our base
+method. The experiments show that our predictor-embedded method gains more
+flexibility to improve the blue-noise quality of halftones and maintains
+comparable restoration quality with a higher tolerance for disturbances.
+
+
+ comment: published in IEEE Transactions on Visualization and Computer Graphics +
+
+
+
+
+ + ♻ ☆ Optimization-Free Test-Time Adaptation for Cross-Person Activity + Recognition + + +
+ Human Activity Recognition (HAR) models often suffer from performance
+degradation in real-world applications due to distribution shifts in activity
+patterns across individuals. Test-Time Adaptation (TTA) is an emerging
+learning paradigm that aims to utilize the test stream to adjust predictions
+during real-time inference, and it has not been explored in HAR before.
+However, the high computational cost of optimization-based TTA algorithms
+makes them intractable to run on resource-constrained edge devices. In this
+paper, we propose an Optimization-Free Test-Time Adaptation (OFTTA) framework
+for sensor-based HAR. OFTTA adjusts the feature extractor and linear
+classifier simultaneously in an optimization-free manner. For the feature
+extractor, we propose Exponential Decay Test-time Normalization (EDTN) to
+replace the conventional batch normalization (CBN) layers. EDTN combines CBN
+and Test-time Batch Normalization (TBN) to extract reliable features against
+domain shifts, with TBN's influence decreasing exponentially in deeper
+layers. For the classifier, we adjust the prediction by computing the
+distance between the feature and the prototype, which is calculated from a
+maintained support set. In addition, the update of the support set is based
+on pseudo labels, which benefit from the reliable features extracted by EDTN.
+Extensive experiments on three public cross-person HAR datasets and two
+different TTA settings demonstrate that OFTTA outperforms state-of-the-art
+TTA approaches in both classification performance and computational
+efficiency. Finally, we verify the superiority of our proposed OFTTA on edge
+devices, indicating possible deployment in real applications. Our code is
+available at https://github.com/Claydon-Wang/OFTTA.
+
+
+ comment: To be presented at UbiComp 2024; Accepted by Proceedings of the ACM + on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT) +
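+
+ The EDTN layer can be sketched as a depth-dependent mixture of test-batch
+statistics (TBN) and the layer's stored running statistics (CBN). The decay
+schedule and tensor layout below are assumptions for illustration:
+
+```python
+import torch
+import torch.nn as nn
+
+def edtn_normalize(x, bn: nn.BatchNorm2d, layer_idx: int, lam: float = 0.9):
+    """Normalize x by mixing test-batch statistics (TBN) with the stored
+    running statistics (CBN). TBN influence decays with depth; the schedule
+    lam ** layer_idx is an illustrative choice."""
+    alpha = lam ** layer_idx
+    mean = alpha * x.mean(dim=(0, 2, 3)) + (1 - alpha) * bn.running_mean
+    var = (alpha * x.var(dim=(0, 2, 3), unbiased=False)
+           + (1 - alpha) * bn.running_var)
+    x_hat = (x - mean[None, :, None, None]) / torch.sqrt(
+        var[None, :, None, None] + bn.eps)
+    return bn.weight[None, :, None, None] * x_hat + bn.bias[None, :, None, None]
+
+feats = torch.randn(8, 32, 16, 16)       # a test batch of conv feature maps
+out = edtn_normalize(feats, nn.BatchNorm2d(32).eval(), layer_idx=3)
+```
+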
+
+
+
+
+ + ♻ ☆ Fusion of Single and Integral Multispectral Aerial Images + + +
+ An adequate fusion of the most significant salient information from multiple
+input channels is essential for many aerial imaging tasks. While
+multispectral recordings reveal features in various spectral ranges,
+synthetic aperture sensing makes occluded features visible. We present the
+first hybrid (model- and learning-based) architecture for fusing the most
+significant features from conventional aerial images with those from integral
+aerial images that result from synthetic aperture sensing for removing
+occlusion. It combines the environment's spatial references with features of
+unoccluded targets that would normally be hidden by dense vegetation. Our
+method outperforms state-of-the-art two-channel and multi-channel fusion
+approaches visually and quantitatively in common metrics, such as mutual
+information, visual information fidelity, and peak signal-to-noise ratio. The
+proposed model does not require manually tuned parameters, can be extended to
+an arbitrary number and combination of spectral channels, and is
+reconfigurable for addressing different use cases. We demonstrate examples
+for search-and-rescue, wildfire detection, and wildlife observation.
+
+
+
+
+
+ + ♻ ☆ Stable Score Distillation for High-Quality 3D Generation + + +
+ Although Score Distillation Sampling (SDS) has exhibited remarkable
+performance in conditional 3D content generation, a comprehensive
+understanding of its formulation is still lacking, hindering the development
+of 3D generation. In this work, we decompose SDS as a combination of three
+functional components, namely the mode-seeking, mode-disengaging and
+variance-reducing terms, and analyze the properties of each. We show that
+problems such as over-smoothness and implausibility result from the intrinsic
+deficiency of the first two terms, and we propose a more advanced
+variance-reducing term than the one introduced by SDS. Based on this
+analysis, we propose a simple yet effective approach named Stable Score
+Distillation (SSD) which strategically orchestrates each term for
+high-quality 3D generation and can be readily incorporated into various 3D
+generation frameworks and 3D representations. Extensive experiments validate
+the efficacy of our approach, demonstrating its ability to generate
+high-fidelity 3D content without succumbing to issues such as
+over-smoothness.
+
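+
+ For reference, the SDS gradient that this work decomposes (in
+DreamFusion-style notation, with rendering x = g(theta), injected noise
+epsilon, and the diffusion model's noise prediction epsilon_phi for text
+condition y at timestep t):
+
+```latex
+\nabla_{\theta}\mathcal{L}_{\mathrm{SDS}}
+  \;=\; \mathbb{E}_{t,\epsilon}\!\left[\, w(t)\,
+  \big(\epsilon_{\phi}(x_t;\, y,\, t) - \epsilon\big)\,
+  \frac{\partial x}{\partial \theta} \,\right]
+```
+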
+
+
+
+
+ + ♻ ☆ Skip \n: A Simple Method to Reduce Hallucination in Large + Vision-Language Models + + +
+ Recent advancements in large vision-language models (LVLMs) have
+demonstrated impressive capability in visual information understanding with
+human language. Despite these advances, LVLMs still face challenges with
+multimodal hallucination, such as generating text descriptions of objects
+that are not present in the visual information. However, the underlying
+fundamental reasons for multimodal hallucinations remain poorly explored. In
+this paper, we propose a new perspective, suggesting that the inherent biases
+in LVLMs might be a key factor in hallucinations. Specifically, we
+systematically identify a semantic shift bias related to paragraph breaks
+(\n\n): in the training data, the content before and after '\n\n' frequently
+exhibits a significant semantic change. This pattern leads the model to infer
+that the content following '\n\n' should differ markedly from the preceding,
+less hallucinatory content, thereby increasing the probability of
+hallucinatory descriptions after the '\n\n'. We have validated this
+hypothesis on multiple publicly available LVLMs. Besides, we find that
+deliberately inserting '\n\n' into a generated description can induce more
+hallucinations. A simple method is proposed to effectively mitigate the
+hallucination of LVLMs by skipping the output of '\n'.
+
+
+ comment: Technical Report +
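+
+ With a Hugging Face text decoder, the mitigation can be approximated by
+banning the paragraph-break token ids during generation; the model name below
+is a placeholder, the token ids for '\n\n' are tokenizer-dependent, and an
+actual LVLM would additionally take image inputs:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+name = "some-lvlm-decoder"                     # placeholder model id
+tok = AutoTokenizer.from_pretrained(name)
+model = AutoModelForCausalLM.from_pretrained(name)
+
+# Ban whatever id(s) the tokenizer assigns to "\n\n" (tokenizer-dependent).
+banned = [tok.encode("\n\n", add_special_tokens=False)]
+inputs = tok("Describe the image in detail.", return_tensors="pt")
+out = model.generate(**inputs, max_new_tokens=128, bad_words_ids=banned)
+print(tok.decode(out[0], skip_special_tokens=True))
+```
+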
+
+
+
+
+ + ♻ ☆ Towards General Visual-Linguistic Face Forgery Detection + + +
+ Deepfakes are realistic face manipulations that can pose serious threats to
+security, privacy, and trust. Existing methods mostly treat this task as
+binary classification, using digital labels or mask signals to train the
+detection model. We argue that such supervision lacks semantic information
+and interpretability. To address these issues, in this paper, we propose a
+novel paradigm named Visual-Linguistic Face Forgery Detection (VLFFD), which
+uses fine-grained sentence-level prompts as the annotation. Since text
+annotations are not available in current deepfake datasets, VLFFD first
+generates mixed forgery images with corresponding fine-grained prompts via
+the Prompt Forgery Image Generator (PFIG). Then, the fine-grained mixed data
+and the coarse-grained original data are jointly trained with the
+Coarse-and-Fine Co-training framework (C2F), enabling the model to gain more
+generalization and interpretability. The experiments show that the proposed
+method improves existing detection models on several challenging benchmarks.
+Furthermore, we have integrated our method with multimodal large models,
+achieving noteworthy results that demonstrate the potential of our approach.
+This integration not only enhances the performance of our VLFFD paradigm but
+also underscores the versatility and adaptability of our method when combined
+with advanced multimodal technologies, highlighting its potential in tackling
+the evolving challenges of deepfake detection.
+
+
+
+
+
+ + ♻ ☆ EDO-Net: Learning Elastic Properties of Deformable Objects from Graph + Dynamics + + +
+ We study the problem of learning graph dynamics of deformable objects that
+generalizes to unknown physical properties. Our key insight is to leverage a
+latent representation of the elastic physical properties of cloth-like
+deformable objects that can be extracted, for example, from a pulling
+interaction. In this paper we propose EDO-Net (Elastic Deformable Object -
+Net), a model of graph dynamics trained on a large variety of samples with
+different elastic properties that does not rely on ground-truth labels of
+those properties. EDO-Net jointly learns an adaptation module and a
+forward-dynamics module. The former is responsible for extracting a latent
+representation of the physical properties of the object, while the latter
+leverages the latent representation to predict future states of cloth-like
+objects represented as graphs. We evaluate EDO-Net both in simulation and in
+the real world, assessing its capability to: 1) generalize to unknown
+physical properties, and 2) transfer the learned representation to new
+downstream tasks.
+
+
+
+
+
+ + ♻ ☆ Cross3DVG: Cross-Dataset 3D Visual Grounding on Different RGB-D Scans 3DV 2024 + + +
+ We present a novel task for cross-dataset visual grounding in 3D scenes
+(Cross3DVG), which overcomes limitations of existing 3D visual grounding
+models, specifically their restricted 3D resources and the consequent
+tendency to overfit a specific 3D dataset. We created RIORefer, a large-scale
+3D visual grounding dataset, to facilitate Cross3DVG. It includes more than
+63k diverse descriptions of 3D objects within 1,380 indoor RGB-D scans from
+3RScan, with human annotations. After training a Cross3DVG model on the
+source 3D visual grounding dataset, we evaluate it, without target labels, on
+a target dataset with, e.g., different sensors, 3D reconstruction methods,
+and language annotators. Comprehensive experiments are conducted using
+established visual grounding models, as well as a CLIP-based multi-view 2D
+and 3D integration designed to bridge the gaps among 3D datasets. For
+Cross3DVG tasks, we find that (i) cross-dataset 3D visual grounding exhibits
+significantly worse performance than learning and evaluation on a single
+dataset, because of the variation in 3D data and language across datasets;
+and (ii) better object detection and localization modules, together with
+fusing 3D data and multi-view CLIP-based image features, can alleviate this
+performance drop. Our Cross3DVG task can provide a benchmark for developing
+robust 3D visual grounding models that handle diverse 3D scenes while
+leveraging deep language understanding.
+
+
+ comment: 3DV 2024 +
+
+
+
+
+ + ♻ ☆ Taylor Videos for Action Recognition + + +
+ Effectively extracting motions from video is a critical and long-standing
+problem for action recognition. This problem is very challenging because
+motions (i) do not have an explicit form, (ii) have various concepts such as
+displacement, velocity, and acceleration, and (iii) often contain noise
+caused by unstable pixels. Addressing these challenges, we propose the Taylor
+video, a new video format that highlights the dominant motions (e.g., a
+waving hand) in each of its frames, which we call Taylor frames. The Taylor
+video is named after the Taylor series, which approximates a function at a
+given point using its important terms. In the scenario of videos, we define
+an implicit motion-extraction function which aims to extract motions from a
+temporal block of video. In this block, using the frames, the difference
+frames, and higher-order difference frames, we perform a Taylor expansion to
+approximate this function at the starting frame. We show that the summation
+of the higher-order terms in the Taylor series gives us dominant motion
+patterns, from which static objects and small, unstable motions are removed.
+Experimentally, we show that Taylor videos are effective inputs to popular
+architectures including 2D CNNs, 3D CNNs, and transformers. When used
+individually, Taylor videos yield competitive action recognition accuracy
+compared to RGB videos and optical flow. When fused with RGB or optical flow
+videos, further accuracy improvement is achieved.
+
+
+ comment: Research report +
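+
+ One plausible reading of the construction, sketched below: difference frames
+act as discrete temporal derivatives, and a Taylor frame sums the
+factorial-weighted higher-order terms evaluated at the block's starting
+frame. The truncation order and the exact weighting are assumptions:
+
+```python
+import numpy as np
+from math import factorial
+
+def taylor_frame(block, order=3):
+    """block: (T, H, W) grayscale frames. Sum factorial-weighted
+    higher-order difference frames, evaluated at the starting frame."""
+    out = np.zeros_like(block[0], dtype=np.float64)
+    diffs = block.astype(np.float64)
+    for k in range(1, order + 1):
+        diffs = np.diff(diffs, axis=0)   # k-th order temporal differences
+        out += diffs[0] / factorial(k)   # Taylor term at the starting frame
+    return out
+
+block = np.random.rand(8, 64, 64)        # toy temporal block of 8 frames
+print(taylor_frame(block).shape)         # -> (64, 64)
+```
+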
+
+
+
+
+ + ♻ ☆ ProxyDet: Synthesizing Proxy Novel Classes via Classwise Mixup for + Open-Vocabulary Object Detection AAAI24 + + +
+ Open-vocabulary object detection (OVOD) aims to recognize novel objects
+whose categories are not included in the training set. In order to classify
+these unseen classes during training, many OVOD frameworks leverage the
+zero-shot capability of large pretrained vision-language models such as CLIP.
+To further improve generalization to the unseen novel classes, several
+approaches propose to additionally train with pseudo region labeling on
+external data sources that contain a substantial number of novel category
+labels beyond the existing training data. Despite their simplicity, these
+pseudo-labeling methods still exhibit limited improvement on the truly unseen
+novel classes that were not pseudo-labeled. In this paper, we present a novel
+yet simple technique that helps generalization over the overall distribution
+of novel classes. Inspired by our observation that numerous novel classes
+reside within the convex hull constructed by the base (seen) classes in the
+CLIP embedding space, we propose to synthesize proxy-novel classes that
+approximate novel classes via linear mixup between pairs of base classes. By
+training our detector with these synthetic proxy-novel classes, we
+effectively explore the embedding space of novel classes. The experimental
+results on various OVOD benchmarks such as LVIS and COCO demonstrate superior
+performance on novel classes compared to other state-of-the-art methods. Code
+is available at https://github.com/clovaai/ProxyDet.
+
+
+ comment: Accepted in AAAI24 +
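+
+ The core synthesis step reduces to mixup in embedding space. A minimal
+sketch, with random vectors standing in for the CLIP text embeddings of the
+base classes:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def synthesize_proxy_novel(base_embs, n_proxies=100, alpha=1.0):
+    """Classwise mixup between random pairs of base-class embeddings.
+    base_embs: (n_base, d) L2-normalized class embeddings."""
+    n = base_embs.size(0)
+    i = torch.randint(n, (n_proxies,))
+    j = torch.randint(n, (n_proxies,))
+    lam = torch.distributions.Beta(alpha, alpha).sample((n_proxies, 1))
+    proxies = lam * base_embs[i] + (1 - lam) * base_embs[j]
+    return F.normalize(proxies, dim=-1)  # project back onto the unit sphere
+
+base = F.normalize(torch.randn(80, 512), dim=-1)  # stand-in for CLIP text embs
+proxy_novel = synthesize_proxy_novel(base)        # (100, 512) proxy classes
+```
+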
+
+
+
+
+ + ♻ ☆ Select2Col: Leveraging Spatial-Temporal Importance of Semantic + Information for Efficient Collaborative Perception + + +
+ Collaborative perception by leveraging the shared semantic information plays
+a crucial role in overcoming the individual limitations of isolated agents.
+However, existing collaborative perception methods tend to focus solely on
+the spatial features of semantic information, while neglecting the importance
+of the temporal dimension. Consequently, the potential benefits of
+collaboration remain underutilized. In this article, we propose Select2Col, a
+novel collaborative perception framework that takes into account the
+spatial-temporal importance of semantic information. Within Select2Col, we
+develop a collaborator selection method that utilizes a lightweight graph
+neural network (GNN) to estimate the importance of semantic information
+(IoSI) of each collaborator in enhancing perception performance, thereby
+identifying contributive collaborators while excluding those that potentially
+bring negative impact. Moreover, we present a semantic information fusion
+algorithm called HPHA (historical prior hybrid attention), which integrates
+multi-scale attention and short-term attention modules to capture the IoSI in
+feature representation from the spatial and temporal dimensions respectively,
+and assigns IoSI-consistent weights for efficient fusion of information from
+the selected collaborators. Extensive experiments on three open datasets
+demonstrate that our proposed Select2Col significantly improves perception
+performance compared to state-of-the-art approaches. The code associated with
+this research is publicly available at
+https://github.com/huangqzj/Select2Col/.
+
+
+
+
+
+ + ♻ ☆ LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal + Language Model + + +
+ The revolutionary capabilities of large language models (LLMs) have paved the +way for multimodal large language models (MLLMs) and fostered diverse +applications across various specialized domains. In the remote sensing (RS) +field, however, the diverse geographical landscapes and varied objects in RS +imagery are not adequately considered in recent MLLM endeavors. To bridge this +gap, we construct a large-scale RS image-text dataset, LHRS-Align, and an +informative RS-specific instruction dataset, LHRS-Instruct, leveraging the +extensive volunteered geographic information (VGI) and globally available RS +images. Building on this foundation, we introduce LHRS-Bot, an MLLM tailored +for RS image understanding through a novel multi-level vision-language +alignment strategy and a curriculum learning method. Comprehensive experiments +demonstrate that LHRS-Bot exhibits a profound understanding of RS images and +the ability to perform nuanced reasoning within the RS domain. + +
+
+ comment: 32 pages, 8 figures. Github https://github.com/NJU-LHRS/LHRS-Bot +
+
+
+
+
+ + ♻ ☆ Zero-shot Object-Level OOD Detection with Context-Aware Inpainting + + +
+ Machine learning algorithms are increasingly provided as black-box cloud
+services or pre-trained models, without access to their training data. This
+motivates the problem of zero-shot out-of-distribution (OOD) detection.
+Concretely, we aim to detect OOD objects that do not belong to the
+classifier's label set but are erroneously classified as in-distribution (ID)
+objects. Our approach, RONIN, uses an off-the-shelf diffusion model to
+replace detected objects via inpainting. RONIN conditions the inpainting
+process on the predicted ID label, drawing the input object closer to the
+in-distribution domain. As a result, the reconstructed object is very close
+to the original in ID cases and far from it in OOD cases, allowing RONIN to
+effectively distinguish ID and OOD samples. Through extensive experiments, we
+demonstrate that RONIN achieves competitive results compared to previous
+approaches across several datasets, in both zero-shot and non-zero-shot
+settings.
+
+
+
+
+
+ + ♻ ☆ ECG-Image-Kit: A Synthetic Image Generation Toolbox to Facilitate Deep + Learning-Based Electrocardiogram Digitization + + +
+ Cardiovascular diseases are a major cause of mortality globally, and
+electrocardiograms (ECGs) are crucial for diagnosing them. Traditionally,
+ECGs are printed on paper. However, these printouts, even when scanned, are
+incompatible with advanced ECG diagnosis software that requires time-series
+data. Digitizing ECG images is vital for training machine learning models in
+ECG diagnosis and for leveraging the extensive global archives collected over
+decades. Deep learning models for image processing are promising in this
+regard, although the lack of clinical ECG archives with reference time-series
+data is challenging. Data augmentation techniques using realistic generative
+data models provide a solution.
+ We introduce ECG-Image-Kit, an open-source toolbox for generating synthetic
+multi-lead ECG images with realistic artifacts from time-series data. The
+tool synthesizes ECG images from real time-series data, applying distortions
+like text artifacts, wrinkles, and creases on a standard ECG paper
+background. As a case study, we used ECG-Image-Kit to create a dataset of
+21,801 ECG images from the PhysioNet QT database. We developed and trained a
+combination of a traditional computer vision model and a deep neural network
+model on this dataset to convert synthetic images into time-series data for
+evaluation. We assessed digitization quality by calculating the
+signal-to-noise ratio (SNR) and compared clinical parameters like QRS width
+and RR and QT intervals recovered from this pipeline with the ground truth
+extracted from the ECG time-series. The results show that this deep learning
+pipeline accurately digitizes paper ECGs while preserving clinical
+parameters, and they highlight a generative approach to digitization. The
+toolbox currently supports data augmentation for the 2024 PhysioNet
+Challenge, focusing on digitizing and classifying paper ECG images.
+
+
+
+
+
+ + ♻ ☆ NeBLa: Neural Beer-Lambert for 3D Reconstruction of Oral Structures from + Panoramic Radiographs AAAI 2024 + + +
+ Panoramic radiography (panoramic X-ray, PX) is a widely used imaging
+modality for dental examination. However, PX provides only a flattened 2D
+image, lacking a 3D view of the oral structure. In this paper, we propose
+NeBLa (Neural Beer-Lambert) to estimate 3D oral structures from real-world
+PX. NeBLa tackles full 3D reconstruction for varying subjects (patients),
+where each reconstruction is based only on a single panoramic image. We
+create an intermediate representation called simulated PX (SimPX) from 3D
+cone-beam computed tomography (CBCT) data, based on the Beer-Lambert law of
+X-ray attenuation and the rotational principles of PX imaging. SimPX aims not
+only to simulate PX faithfully but also to facilitate the reverting process
+back to 3D data. We propose a novel neural model based on ray tracing which
+exploits both global and local input features to convert SimPX to 3D output.
+At inference, a real PX image is translated to a SimPX-style image with
+semantic regularization, and the translated image is processed by a
+generation module to produce high-quality outputs. Experiments show that
+NeBLa outperforms the prior state-of-the-art in reconstruction tasks both
+quantitatively and qualitatively. Unlike prior methods, NeBLa requires
+neither prior information such as the shape of the dental arches nor a
+matched PX-CBCT dataset for training, which is difficult to obtain in
+clinical practice. Our code is available at
+https://github.com/sihwa-park/nebla.
+
+
+ comment: 18 pages, 16 figures, Accepted to AAAI 2024 +
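+
+ The Beer-Lambert law on which the SimPX rendering is based states that X-ray
+intensity decays exponentially with the attenuation accumulated along each
+ray, with incident intensity I_0 and attenuation coefficient mu along the ray
+parameterized by arc length s:
+
+```latex
+I \;=\; I_0 \exp\!\left(-\int_{\text{ray}} \mu\big(\mathbf{x}(s)\big)\,\mathrm{d}s\right)
+```
+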
+
+
+
+
+ + ♻ ☆ NormAUG: Normalization-guided Augmentation for Domain Generalization + + +
+ Deep learning has made significant advancements in supervised learning.
+However, models trained in this setting often face challenges due to domain
+shift between the training and test sets, resulting in a significant drop in
+performance during testing. To address this issue, several domain
+generalization methods have been developed to learn robust and
+domain-invariant features from multiple training domains that can generalize
+well to unseen test domains. Data augmentation plays a crucial role in
+achieving this goal by enhancing the diversity of the training data. In this
+paper, inspired by the observation that normalizing an image with statistics
+generated by different batches from various domains can perturb its features,
+we propose a simple yet effective method called NormAUG
+(Normalization-guided Augmentation). Our method includes two paths: the main
+path and the auxiliary (augmented) path. During training, the auxiliary path
+includes multiple sub-paths, each corresponding to batch normalization for a
+single domain or for a random combination of multiple domains. This
+introduces diverse information at the feature level and improves the
+generalization of the main path. Moreover, our NormAUG method effectively
+reduces the existing upper bound on generalization from a theoretical
+perspective. During the test stage, we leverage an ensemble strategy to
+combine the predictions from the auxiliary path of our model, further
+boosting performance. Extensive experiments are conducted on multiple
+benchmark datasets to validate the effectiveness of our proposed method.
+
+
+ comment: Accepted by IEEE Transactions on Image Processing (TIP) +
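+
+ The auxiliary path's core ingredient can be sketched as one
+batch-normalization branch per training domain, routed by a domain id; how
+NormAUG combines domains and merges the two paths is richer than this
+illustration:
+
+```python
+import torch
+import torch.nn as nn
+
+class DomainBN2d(nn.Module):
+    """One BatchNorm2d branch per training domain; each sample is
+    normalized with the statistics of its own domain."""
+    def __init__(self, channels, n_domains):
+        super().__init__()
+        self.bns = nn.ModuleList(nn.BatchNorm2d(channels)
+                                 for _ in range(n_domains))
+
+    def forward(self, x, domain_id):     # x: (B, C, H, W), domain_id: (B,)
+        out = torch.empty_like(x)
+        for d in domain_id.unique():
+            m = domain_id == d
+            out[m] = self.bns[int(d)](x[m])
+        return out
+
+x, dom = torch.randn(8, 16, 32, 32), torch.randint(3, (8,))
+y = DomainBN2d(16, n_domains=3)(x, dom)
+```
+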
+
+
+
+
+ + ♻ ☆ Progressive Fourier Neural Representation for Sequential Video + Compilation + + +
+ Neural Implicit Representation (NIR) has recently gained significant +attention due to its remarkable ability to encode complex and high-dimensional +data into representation space and easily reconstruct it through a trainable +mapping function. However, NIR methods assume a one-to-one mapping between the +target data and representation models regardless of data relevancy or +similarity. This results in poor generalization over multiple complex data and +limits their efficiency and scalability. Motivated by continual learning, this +work investigates how to accumulate and transfer neural implicit +representations for multiple complex video data over sequential encoding +sessions. To overcome the limitation of NIR, we propose a novel method, +Progressive Fourier Neural Representation (PFNR), that aims to find an adaptive +and compact sub-module in Fourier space to encode videos in each training +session. This sparsified neural encoding allows the neural network to hold free +weights, enabling an improved adaptation for future videos. In addition, when +learning a representation for a new video, PFNR transfers the representation of +previous videos with frozen weights. This design allows the model to +continuously accumulate high-quality neural representations for multiple videos +while ensuring lossless decoding that perfectly preserves the learned +representations for previous videos. We validate our PFNR method on the UVG8/17 +and DAVIS50 video sequence benchmarks and achieve impressive performance gains +over strong continual learning baselines. The PFNR code is available at +https://github.com/ihaeyong/PFNR.git. + +
+
+
+
+
+ + ♻ ☆ SHMC-Net: A Mask-guided Feature Fusion Network for Sperm Head Morphology + Classification + + +
+ Male infertility accounts for about one-third of global infertility cases. +Manual assessment of sperm abnormalities through head morphology analysis +encounters issues of observer variability and diagnostic discrepancies among +experts. Its alternative, Computer-Assisted Semen Analysis (CASA), suffers from +low-quality sperm images, small datasets, and noisy class labels. We propose a +new approach for sperm head morphology classification, called SHMC-Net, which +uses segmentation masks of sperm heads to guide the morphology classification +of sperm images. SHMC-Net generates reliable segmentation masks using image +priors, refines object boundaries with an efficient graph-based method, and +trains an image network with sperm head crops and a mask network with the +corresponding masks. In the intermediate stages of the networks, image and mask +features are fused with a fusion scheme to better learn morphological features. +To handle noisy class labels and regularize training on small datasets, +SHMC-Net applies Soft Mixup to combine mixup augmentation and a loss function. +We achieve state-of-the-art results on SCIAN and HuSHeM datasets, outperforming +methods that use additional pre-training or costly ensembling techniques. + +
+
+ comment: A shorter version is published on ISBI 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ A Roadmap to Pluralistic Alignment + + +
+ With increased power and prevalence of AI systems, it is ever more critical +that AI systems are designed to serve all, i.e., people with diverse values and +perspectives. However, aligning models to serve pluralistic human values +remains an open research question. In this piece, we propose a roadmap to +pluralistic alignment, specifically using language models as a test bed. We +identify and formalize three possible ways to define and operationalize +pluralism in AI systems: 1) Overton pluralistic models that present a spectrum +of reasonable responses; 2) Steerably pluralistic models that can steer to +reflect certain perspectives; and 3) Distributionally pluralistic models that +are well-calibrated to a given population in distribution. We also propose and +formalize three possible classes of pluralistic benchmarks: 1) Multi-objective +benchmarks, 2) Trade-off steerable benchmarks, which incentivize models to +steer to arbitrary trade-offs, and 3) Jury-pluralistic benchmarks which +explicitly model diverse human ratings. We use this framework to argue that +current alignment techniques may be fundamentally limited for pluralistic AI; +indeed, we highlight empirical evidence, both from our own experiments and from +other work, that standard alignment procedures might reduce distributional +pluralism in models, motivating the need for further research on pluralistic +alignment. + +
+
+
+
+
+ + ☆ Detecting Generated Native Ads in Conversational Search WWW'24 + + +
+ Conversational search engines such as YouChat and Microsoft Copilot use large +language models (LLMs) to generate answers to queries. It is only a small step +to also use this technology to generate and integrate advertising within these +answers - instead of placing ads separately from the organic search results. +This type of advertising is reminiscent of native advertising and product +placement, both of which are very effective forms of subtle and manipulative +advertising. It is likely that information seekers will be confronted with such +use of LLM technology in the near future, especially when considering the high +computational costs associated with LLMs, for which providers need to develop +sustainable business models. This paper investigates whether LLMs can also be +used as a countermeasure against generated native ads, i.e., to block them. For +this purpose we compile a large dataset of ad-prone queries and of generated +answers with automatically integrated ads to experiment with fine-tuned +sentence transformers and state-of-the-art LLMs on the task of recognizing the +ads. In our experiments sentence transformers achieve detection precision and +recall values above 0.9, while the investigated LLMs struggle with the task. + +
+
+ comment: Submitted to WWW'24 Short Papers Track; 4 pages +
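+
+ The detection setup can be approximated with off-the-shelf tools: encode
+passages with a sentence transformer and fit a linear classifier on the
+embeddings. Note that the paper fine-tunes the sentence transformers
+themselves; the frozen-encoder variant below, with placeholder data, is only
+a minimal sketch:
+
+```python
+from sentence_transformers import SentenceTransformer
+from sklearn.linear_model import LogisticRegression
+
+encoder = SentenceTransformer("all-MiniLM-L6-v2")   # stand-in encoder
+texts = ["answer passage with a subtle product plug ...",   # toy data
+         "plain organic answer passage ..."]
+labels = [1, 0]                                     # 1 = contains a native ad
+
+X = encoder.encode(texts)                           # sentence embeddings
+clf = LogisticRegression().fit(X, labels)
+print(clf.predict(encoder.encode(["another answer passage ..."])))
+```
+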
+
+
+
+
+ + ☆ Multimodal Query Suggestion with Multi-Agent Reinforcement Learning from + Human Feedback WWW 2024 + + +
+ In the rapidly evolving landscape of information retrieval, search engines
+strive to provide more personalized and relevant results to users. Query
+suggestion systems play a crucial role in achieving this goal by assisting
+users in formulating effective queries. However, existing query suggestion
+systems mainly rely on textual inputs, potentially limiting user search
+experiences for querying images. In this paper, we introduce a novel
+Multimodal Query Suggestion (MMQS) task, which aims to generate query
+suggestions based on user query images to improve the intentionality and
+diversity of search results. We present the RL4Sugg framework, leveraging the
+power of Large Language Models (LLMs) with Multi-Agent Reinforcement Learning
+from Human Feedback to optimize the generation process. Through comprehensive
+experiments, we validate the effectiveness of RL4Sugg, demonstrating an 18%
+improvement compared to the best existing approach. Moreover, MMQS has been
+deployed in real-world search engine products, yielding enhanced user
+engagement. Our research advances query suggestion systems and provides a new
+perspective on multimodal information retrieval.
+
+
+ comment: This paper has been accepted by WWW 2024 +
+
+
+
+
+ + ☆ Leveraging LLMs for Unsupervised Dense Retriever Ranking + + +
+ This paper introduces a novel unsupervised technique that utilizes large
+language models (LLMs) to determine the most suitable dense retriever for a
+specific test (target) corpus. Selecting the appropriate dense retriever is
+vital for numerous IR applications that employ such retrievers, trained on
+public datasets, to encode or conduct searches within a new private target
+corpus. The effectiveness of a dense retriever can significantly diminish
+when applied to a target corpus that diverges in domain or task from the
+original training set. The problem becomes more pronounced in cases where the
+target corpus is unlabeled, e.g., in zero-shot scenarios, rendering direct
+evaluation of the model's effectiveness on the target corpus unattainable.
+The unsupervised selection of an optimal pre-trained dense retriever,
+especially under conditions of domain shift, therefore emerges as a critical
+challenge. Existing methodologies for ranking dense retrievers fall short in
+addressing these domain shift scenarios.
+ To tackle this, our method capitalizes on LLMs to create pseudo-relevant
+queries, labels, and reference lists by analyzing a subset of documents from
+the target corpus. This allows dense retrievers to be ranked based on their
+performance with these pseudo-relevant signals. Significantly, this strategy
+is the first to depend exclusively on the target corpus data, removing the
+need for training data and test labels. We assessed the effectiveness of our
+approach by compiling a comprehensive pool of cutting-edge dense retrievers
+and comparing our method against traditional dense retriever selection
+benchmarks. The findings reveal that our proposed solution surpasses the
+existing benchmarks in both the selection and ranking of dense retrievers.
+
+
+
+
+
+ + ☆ Theoretical and Empirical Analysis of Adaptive Entry Point Selection for + Graph-based Approximate Nearest Neighbor Search + + +
+ We present a theoretical and empirical analysis of the adaptive entry point +selection for graph-based approximate nearest neighbor search (ANNS). We +introduce novel concepts: $b\textit{-monotonic path}$ and $B\textit{-MSNET}$, +which better capture an actual graph in practical algorithms than existing +concepts like MSNET. We prove that adaptive entry point selection offers better +performance upper bound than the fixed central entry point under more general +conditions than previous work. Empirically, we validate the method's +effectiveness in accuracy, speed, and memory usage across various datasets, +especially in challenging scenarios with out-of-distribution data and hard +instances. Our comprehensive study provides deeper insights into optimizing +entry points for graph-based ANNS for real-world high-dimensional data +applications. + +
+
+
+
+
+ + ☆ SPARQL Generation: an analysis on fine-tuning OpenLLaMA for Question + Answering over a Life Science Knowledge Graph + + +
+ The recent success of Large Language Models (LLM) in a wide range of Natural +Language Processing applications opens the path towards novel Question +Answering Systems over Knowledge Graphs leveraging LLMs. However, one of the +main obstacles preventing their implementation is the scarcity of training data +for the task of translating questions into corresponding SPARQL queries, +particularly in the case of domain-specific KGs. To overcome this challenge, in +this study, we evaluate several strategies for fine-tuning the OpenLlama LLM +for question answering over life science knowledge graphs. In particular, we +propose an end-to-end data augmentation approach for extending a set of +existing queries over a given knowledge graph towards a larger dataset of +semantically enriched question-to-SPARQL query pairs, enabling fine-tuning even +for datasets where these pairs are scarce. In this context, we also investigate +the role of semantic "clues" in the queries, such as meaningful variable names +and inline comments. Finally, we evaluate our approach over the real-world Bgee +gene expression knowledge graph and we show that semantic clues can improve +model performance by up to 33% compared to a baseline with random variable +names and no comments included. + +
+
+ comment: To appear in Proceedings of SWAT4HCLS 2024: Semantic Web Tools and + Applications for Healthcare and Life Sciences +
+
+
+
+
+ + ☆ NORMY: Non-Uniform History Modeling for Open Retrieval Conversational + Question Answering SC 2024 + + +
+ Open Retrieval Conversational Question Answering (OrConvQA) answers a
+question given a conversation as context and a document collection. A typical
+OrConvQA pipeline consists of three modules: a Retriever to retrieve relevant
+documents from the collection, a Reranker to rerank them given the question
+and the context, and a Reader to extract an answer span. The conversational
+turns can provide valuable context for answering the final query.
+State-of-the-art OrConvQA systems use the same history modeling for all three
+modules of the pipeline. We hypothesize that this is suboptimal.
+Specifically, we argue that a broader context is needed in the early modules
+of the pipeline so as not to miss relevant documents, while a narrower
+context is needed in the later modules to identify the exact answer span. We
+propose NORMY, the first unsupervised non-uniform history modeling pipeline,
+which generates the best conversational history for each module. We further
+propose a novel Retriever for NORMY, which employs keyphrase extraction on
+the conversation history and leverages passages retrieved in previous turns
+as additional context. We also created a new dataset for OrConvQA by
+expanding the doc2dial dataset. We implemented various state-of-the-art
+history modeling techniques and comprehensively evaluated them separately for
+each module of the pipeline on three datasets: OR-QUAC, our doc2dial
+extension, and ConvMix. Our extensive experiments show that NORMY outperforms
+the state-of-the-art in the individual modules and in the end-to-end system.
+
+
+ comment: Accepted for publication at IEEE ICSC 2024 +
+
+
+
+
+ + ☆ RA-Rec: An Efficient ID Representation Alignment Framework for LLM-based + Recommendation + + +
+ Large language models (LLMs) have recently emerged as a powerful tool for a
+variety of natural language processing tasks, bringing a new surge of work
+combining LLMs with recommendation systems, termed LLM-based RS. Current
+approaches generally fall into two main paradigms, the ID direct usage
+paradigm and the ID translation paradigm, whose core weakness stems from
+lacking recommendation knowledge and uniqueness. To address this limitation,
+we propose a new paradigm, ID representation, which incorporates pre-trained
+ID embeddings into LLMs in a complementary manner. In this work, we present
+RA-Rec, an efficient ID representation alignment framework for LLM-based
+recommendation, which is compatible with multiple ID-based methods and LLM
+architectures. Specifically, we treat ID embeddings as soft prompts and
+design an innovative alignment module and an efficient tuning method with
+tailored data construction for alignment. Extensive experiments demonstrate
+that RA-Rec substantially outperforms current state-of-the-art methods,
+achieving up to 3.0% absolute HitRate@100 improvements while utilizing less
+than 10x training data.
+
+
+ comment: 10 pages +
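+
+ Treating ID embeddings as soft prompts can be sketched by projecting them
+into the LLM's embedding space and prepending them to the token embeddings;
+the projection layer below stands in for the paper's alignment module, and
+gpt2 is only a small stand-in decoder:
+
+```python
+import torch
+import torch.nn as nn
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+name = "gpt2"                                   # small stand-in decoder
+tok = AutoTokenizer.from_pretrained(name)
+model = AutoModelForCausalLM.from_pretrained(name)
+
+id_embs = torch.randn(1, 4, 64)                 # 4 pretrained item-ID embeddings
+proj = nn.Linear(64, model.config.hidden_size)  # stand-in alignment module
+
+tokens = tok("Recommend the next item:", return_tensors="pt")
+tok_embs = model.get_input_embeddings()(tokens["input_ids"])
+inputs_embeds = torch.cat([proj(id_embs), tok_embs], dim=1)  # soft prompt first
+out = model(inputs_embeds=inputs_embeds)
+print(out.logits.shape)                         # (1, 4 + n_tokens, vocab)
+```
+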
+
+
+
+
+ + ♻ ☆ Recent Advances in Text Analysis + + +
+ Text analysis is an interesting research area in data science and has various +applications, such as in artificial intelligence, biomedical research, and +engineering. We review popular methods for text analysis, ranging from topic +modeling to the recent neural language models. In particular, we review +Topic-SCORE, a statistical approach to topic modeling, and discuss how to use +it to analyze MADStat - a dataset on statistical publications that we collected +and cleaned. + The application of Topic-SCORE and other methods on MADStat leads to +interesting findings. For example, $11$ representative topics in statistics are +identified. For each journal, the evolution of topic weights over time can be +visualized, and these results are used to analyze the trends in statistical +research. In particular, we propose a new statistical model for ranking the +citation impacts of $11$ topics, and we also build a cross-topic citation graph +to illustrate how research results on different topics spread to one another. + The results on MADStat provide a data-driven picture of the statistical +research in $1975$--$2015$, from a text analysis perspective. + +
+
+
+
+
+ + ♻ ☆ Future Impact Decomposition in Request-level Recommendations + + +
+ In recommender systems, reinforcement learning solutions have shown
+promising results in optimizing the interaction sequence between users and
+the system for long-term performance. For practical reasons, the policy's
+actions are typically designed to recommend a list of items, to handle users'
+frequent and continuous browsing requests more efficiently. In this list-wise
+recommendation scenario, the user state is updated upon every request in the
+corresponding MDP formulation. However, this request-level formulation is
+essentially inconsistent with the user's item-level behavior. In this study,
+we demonstrate that an item-level optimization approach can better utilize
+item characteristics and optimize the policy's performance even under the
+request-level MDP. We support this claim by comparing the performance of
+standard request-level methods with the proposed item-level actor-critic
+framework in both simulation and online experiments. Furthermore, we show
+that a reward-based future decomposition strategy can better express the
+item-wise future impact and improve recommendation accuracy in the long term.
+To achieve a more thorough understanding of the decomposition strategy, we
+propose a model-based re-weighting framework with adversarial learning that
+further boosts performance, and we investigate its correlation with the
+reward-based strategy.
+
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Labeled Interactive Topic Models + + +
+ Topic models are valuable for understanding extensive document collections, +but they don't always identify the most relevant topics. Classical +probabilistic and anchor-based topic models offer interactive versions that +allow users to guide the models towards more pertinent topics. However, such +interactive features have been lacking in neural topic models. To correct this +lacuna, we introduce a user-friendly interaction for neural topic models. This +interaction permits users to assign a word label to a topic, leading to an +update in the topic model where the words in the topic become closely aligned +with the given label. Our approach encompasses two distinct kinds of neural +topic models. The first includes models where topic embeddings are trainable +and evolve during the training process. The second kind involves models where +topic embeddings are integrated post-training, offering a different approach to +topic refinement. To facilitate user interaction with these neural topic +models, we have developed an interactive interface. This interface enables +users to engage with and re-label topics as desired. We evaluate our method +through a human study, where users can relabel topics to find relevant +documents. Using our method, user labeling improves document rank scores, +helping to find more relevant documents to a given query when compared to no +user labeling. + +
+
+
+
+
+ + ♻ ☆ Engineering Design Knowledge Graphs from Patented Artefact Descriptions + for Retrieval-Augmented Generation in the Design Process + + +
+ Despite their significant popularity, large language models (LLMs) require
+explicit, contextual facts to support domain-specific knowledge-intensive
+tasks in the design process. Applications built using LLMs should hence adopt
+Retrieval-Augmented Generation (RAG) to better suit the design process. In
+this article, we present a data-driven method to identify explicit facts from
+patent documents, which provide standard descriptions of over 8 million
+artefacts. In our method, we train RoBERTa Transformer-based sequence
+classification models using our dataset of 44,227 sentences and facts. Upon
+classifying tokens in a sentence as entities or relationships, our method
+uses another classifier to identify the specific relationship tokens for a
+given pair of entities, so that explicit facts of the form head entity ::
+relationship :: tail entity are identified. As benchmark approaches for
+constructing facts, we use linear classifiers and Graph Neural Networks
+(GNNs), both incorporating BERT Transformer-based token embeddings, to
+predict associations among the entities and relationships. We apply our
+method to 4,870 fan-system-related patents and populate a knowledge base of
+around 3 million facts. Upon retrieving the facts representing generalisable
+domain knowledge and the knowledge of specific subsystems and issues, we
+demonstrate how these facts contextualise LLMs for generating text that is
+more relevant to the design process.
+
+
+
+
+
+ + ♻ ☆ Recency Ranking by Diversification of Result Set + + +
+ In this paper, we propose a web search retrieval approach which automatically +detects recency-sensitive queries and increases the freshness of the ordinary +document ranking by a degree proportional to the probability that recent +content is needed. We propose to solve the recency ranking problem using result +diversification principles and to address the query's non-topical ambiguity, +which arises when the need for recent content can be detected only with +uncertainty. Our offline and online experiments with millions of queries from +real search engine users demonstrate a significant increase in the satisfaction +of users presented with search results generated by our approach. + +&#13;
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Opening the AI black box: program synthesis via mechanistic + interpretability + + +
+ We present MIPS, a novel method for program synthesis based on automated +mechanistic interpretability of neural networks trained to perform the desired +task, auto-distilling the learned algorithm into Python code. We test MIPS on a +benchmark of 62 algorithmic tasks that can be learned by an RNN and find it +highly complementary to GPT-4: MIPS solves 32 of them, including 13 that are +not solved by GPT-4 (which also solves 30). MIPS uses an integer autoencoder to +convert the RNN into a finite state machine, then applies Boolean or integer +symbolic regression to capture the learned algorithm. As opposed to large +language models, this program synthesis technique makes no use of (and is +therefore not limited by) human training data such as algorithms and code from +GitHub. We discuss opportunities and challenges for scaling up this approach to +make machine-learned models more interpretable and trustworthy. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Hydra: Sequentially-Dependent Draft Heads for Medusa Decoding + + +
+ To combat the memory bandwidth-bound nature of autoregressive LLM inference, +previous research has proposed the speculative decoding framework. To perform +speculative decoding, a small draft model proposes candidate continuations of +the input sequence, which are then verified in parallel by the base model. One +way to specify the draft model, as used in the recent Medusa decoding +framework, is as a collection of lightweight heads, called draft heads, that +operate on the base model's hidden states. To date, all existing draft heads +have been sequentially independent, meaning that they speculate tokens in the +candidate continuation independently of any preceding tokens in the candidate +continuation. In this work, we propose Hydra heads, a sequentially dependent, +drop-in replacement for standard draft heads that significantly improves +speculation accuracy. Decoding with Hydra heads improves throughput compared to +Medusa decoding with standard draft heads. We further explore the design space +of Hydra head training objectives and architectures, and propose a +carefully-tuned Hydra head recipe, which we call Hydra++, that improves +decoding throughput by 1.31x and 2.71x compared to Medusa decoding and +autoregressive decoding, respectively. Overall, Hydra heads are a simple +intervention on standard draft heads that significantly improves the end-to-end +speed of draft-head-based speculative decoding. + +&#13;
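A minimal sketch of the distinction, assuming PyTorch and toy layer shapes: a Medusa-style head maps the base hidden state straight to logits, while a sequentially dependent head also consumes the tokens speculated so far. The exact Hydra architecture (and the Hydra++ recipe) differs; this only illustrates the conditioning change.

```python
import torch
import torch.nn as nn

class IndependentDraftHead(nn.Module):
    """Medusa-style head: speculates token k from the base hidden state only."""
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, h):                 # h: (batch, d_model)
        return self.proj(h)

class SequentiallyDependentHead(nn.Module):
    """Hydra-style head: also conditions on previously speculated tokens."""
    def __init__(self, d_model, vocab):
        super().__init__()
        self.embed = nn.Embedding(vocab, d_model)
        self.mlp = nn.Sequential(nn.Linear(2 * d_model, d_model), nn.SiLU())
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, h, prev_tokens):    # prev_tokens: (batch, k-1) drafted so far
        ctx = self.embed(prev_tokens).mean(dim=1)  # crude summary of the draft prefix
        return self.proj(self.mlp(torch.cat([h, ctx], dim=-1)))
```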
+
+
+
+
+ + ☆ Tighter Generalisation Bounds via Interpolation + + +
+ This paper contains a recipe for deriving new PAC-Bayes generalisation bounds +based on the $(f, \Gamma)$-divergence, and, in addition, presents PAC-Bayes +generalisation bounds where we interpolate between a series of probability +divergences (including but not limited to KL, Wasserstein, and total +variation), making the best out of many worlds depending on the posterior +distribution's properties. We explore the tightness of these bounds and connect +them to earlier results from statistical learning, which arise as special +cases. We also instantiate our bounds as training objectives, yielding +non-trivial guarantees and practical performance. + +&#13;
+
+
+
+
+ + ☆ Hydragen: High-Throughput LLM Inference with Shared Prefixes + + +
+ Transformer-based large language models (LLMs) are now deployed to hundreds +of millions of users. LLM inference is commonly performed on batches of +sequences that share a prefix, such as few-shot examples or a chatbot system +prompt. Decoding in this large-batch setting can be bottlenecked by the +attention operation, which reads large key-value (KV) caches from memory and +computes inefficient matrix-vector products for every sequence in the batch. In +this work, we introduce Hydragen, a hardware-aware exact implementation of +attention with shared prefixes. Hydragen computes attention over the shared +prefix and unique suffixes separately. This decomposition enables efficient +prefix attention by batching queries together across sequences, reducing +redundant memory reads and enabling the use of hardware-friendly matrix +multiplications. Our method can improve end-to-end LLM throughput by up to 32x +against competitive baselines, with speedup growing with the batch size and +shared prefix length. Hydragen also enables the use of very long shared +contexts: with a high batch size, increasing the prefix length from 1K to 16K +tokens decreases Hydragen throughput by less than 15%, while the throughput of +baselines drops by over 90%. Hydragen generalizes beyond simple prefix-suffix +decomposition and can be applied to tree-based prompt sharing patterns, +allowing us to further reduce inference time on competitive programming +problems by 55%. + +
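The decomposition is exact and easy to verify: attention over a concatenated key-value cache equals a log-sum-exp-weighted combination of attention over the prefix and over the suffix. A single-query NumPy sketch (not the hardware-aware implementation) follows.

```python
import numpy as np

def attend(q, K, V):
    """Single-query softmax attention; returns output and log-sum-exp of scores."""
    s = K @ q / np.sqrt(q.shape[-1])
    lse = np.log(np.sum(np.exp(s - s.max()))) + s.max()
    p = np.exp(s - lse)
    return p @ V, lse

def shared_prefix_attention(q, K_pre, V_pre, K_suf, V_suf):
    """Exact attention over [prefix; suffix], computed in two pieces.

    The prefix piece can be batched across all sequences sharing it,
    which is the source of the claimed speedup.
    """
    o_p, lse_p = attend(q, K_pre, V_pre)
    o_s, lse_s = attend(q, K_suf, V_suf)
    w_p = 1.0 / (1.0 + np.exp(lse_s - lse_p))  # softmax over the two partial LSEs
    return w_p * o_p + (1.0 - w_p) * o_s

# Sanity check against attention over the concatenated cache:
d, n_pre, n_suf = 8, 16, 4
rng = np.random.default_rng(0)
q = rng.normal(size=d)
K_pre, V_pre = rng.normal(size=(n_pre, d)), rng.normal(size=(n_pre, d))
K_suf, V_suf = rng.normal(size=(n_suf, d)), rng.normal(size=(n_suf, d))
full, _ = attend(q, np.vstack([K_pre, K_suf]), np.vstack([V_pre, V_suf]))
assert np.allclose(full, shared_prefix_attention(q, K_pre, V_pre, K_suf, V_suf))
```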
+
+
+
+
+ + On diffusion models for amortized inference: Benchmarking and improving + stochastic control and sampling + + +
+ We study the problem of training diffusion models to sample from a +distribution with a given unnormalized density or energy function. We benchmark +several diffusion-structured inference methods, including simulation-based +variational approaches and off-policy methods (continuous generative flow +networks). Our results shed light on the relative advantages of existing +algorithms while bringing into question some claims from past work. We also +propose a novel exploration strategy for off-policy methods, based on local +search in the target space with the use of a replay buffer, and show that it +improves the quality of samples on a variety of target distributions. Our code +for the sampling methods and benchmarks studied is made public at +https://github.com/GFNOrg/gfn-diffusion as a base for future work on diffusion +models for amortized inference. + +
+
+ comment: 21 pages; code: https://github.com/GFNOrg/gfn-diffusion +
+
+
+
+
+ + ☆ NITO: Neural Implicit Fields for Resolution-free Topology Optimization + + +
+ Topology optimization is a critical task in engineering design, where the +goal is to optimally distribute material in a given space for maximum +performance. We introduce Neural Implicit Topology Optimization (NITO), a novel +approach to accelerate topology optimization problems using deep learning. NITO +stands out as one of the first frameworks to offer a resolution-free and +domain-agnostic solution in deep learning-based topology optimization. NITO +synthesizes structures with up to seven times better structural efficiency +compared to SOTA diffusion models and does so in a tenth of the time. In the +NITO framework, we introduce a novel method, the Boundary Point Order-Invariant +MLP (BPOM), to represent boundary conditions in a sparse and domain-agnostic +manner, moving away from expensive simulation-based approaches. Crucially, NITO +circumvents the domain and resolution limitations that restrict Convolutional +Neural Network (CNN) models to a structured domain of fixed size -- limitations +that hinder the widespread adoption of CNNs in engineering applications. This +generalizability allows a single NITO model to train and generate solutions in +countless domains, eliminating the need for numerous domain-specific CNNs and +their extensive datasets. Despite its generalizability, NITO outperforms SOTA +models even in specialized tasks, is an order of magnitude smaller, and is +practically trainable at high resolutions that would be restrictive for CNNs. +This combination of versatility, efficiency, and performance underlines NITO's +potential to transform the landscape of engineering design optimization +problems through implicit fields. + +
+
+
+
+
+ + ☆ Extending the Reach of First-Order Algorithms for Nonconvex Min-Max + Problems with Cohypomonotonicity + + +
+ We focus on constrained, $L$-smooth, nonconvex-nonconcave min-max problems +either satisfying $\rho$-cohypomonotonicity or admitting a solution to the +$\rho$-weakly Minty Variational Inequality (MVI), where larger values of the +parameter $\rho>0$ correspond to a greater degree of nonconvexity. These +problem classes include examples in two player reinforcement learning, +interaction dominant min-max problems, and certain synthetic test problems on +which classical min-max algorithms fail. It has been conjectured that +first-order methods can tolerate values of $\rho$ no larger than $\frac{1}{L}$, +but existing results in the literature have stagnated at the tighter +requirement $\rho < \frac{1}{2L}$. With a simple argument, we obtain optimal or +best-known complexity guarantees with cohypomonotonicity or weak MVI conditions +for $\rho < \frac{1}{L}$. The algorithms we analyze are inexact variants of +Halpern and Krasnosel'ski\u{\i}-Mann (KM) iterations. We also provide +algorithms and complexity guarantees in the stochastic case with the same range +on $\rho$. Our main insight for the improvements in the convergence analyses is +to harness the recently proposed "conic nonexpansiveness" property of +operators. As byproducts, we provide a refined analysis for inexact Halpern +iteration and propose a stochastic KM iteration with a multilevel Monte Carlo +estimator. + +&#13;
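For readers unfamiliar with the anchored scheme, here is a plain (exact, deterministic) Halpern iteration in NumPy; the paper's contribution concerns inexact and stochastic variants of this and of the KM iteration, which the sketch does not reproduce.

```python
import numpy as np

def halpern(T, x0, num_iters=1000):
    """Halpern iteration x_{k+1} = beta_k * x0 + (1 - beta_k) * T(x_k),
    with the classical anchoring schedule beta_k = 1 / (k + 2)."""
    x = x0.copy()
    for k in range(num_iters):
        beta = 1.0 / (k + 2)
        x = beta * x0 + (1.0 - beta) * T(x)
    return x

# Example: a 2-D rotation is nonexpansive; Halpern converges to its fixed point (0).
theta = 0.9
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
x_star = halpern(lambda x: R @ x, np.array([1.0, 1.0]))
print(np.linalg.norm(x_star))  # close to 0
```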
+
+
+
+
+ + ☆ Multiscale Modelling with Physics-informed Neural Network: from + Large-scale Dynamics to Small-scale Predictions in Complex Systems + + +
+ Multiscale phenomena manifest across various scientific domains, presenting a +ubiquitous challenge in accurately and effectively predicting multiscale +dynamics in complex systems. In this paper, a novel solution paradigm is +proposed for characterizing multiscale dynamics through a decoupling method. By +modelling large-scale dynamics independently and treating small-scale dynamics +as a slaved system, a Spectral PINN is developed to approximate the small-scale +system in an orthogonal basis function space. The effectiveness of the method +is demonstrated through extensive numerical experiments, including the +one-dimensional Kuramoto-Sivashinsky (KS) equation and the two- and +three-dimensional Navier-Stokes (NS) equations, showcasing its versatility in +addressing problems of fluid dynamics. Furthermore, we also delve into the +application of the proposed approach to more complex problems, including +non-uniform meshes, complex geometries, large-scale data with noise, and +high-dimensional small-scale dynamics. The discussions of these scenarios +contribute to a comprehensive understanding of the method's capabilities and +limitations. This novel decoupling approach simplifies the analysis and +prediction of spatiotemporal systems, where large-scale data can be obtained +with low computational demands, followed by Spectral PINNs for capturing +small-scale dynamics with improved efficiency and accuracy. + +&#13;
+
+
+
+
+ + ☆ Causal Representation Learning from Multiple Distributions: A General + Setting + + +
+ In many problems, the measured variables (e.g., image pixels) are just +mathematical functions of the hidden causal variables (e.g., the underlying +concepts or objects). For the purpose of making predictions in changing +environments or making proper changes to the system, it is helpful to recover +the hidden causal variables $Z_i$ and their causal relations represented by +graph $\mathcal{G}_Z$. This problem has recently become known as causal +representation learning. This paper is concerned with a general, completely +nonparametric setting of causal representation learning from multiple +distributions (arising from heterogeneous data or nonstationary time series), +without assuming hard interventions behind distribution changes. We aim to +develop general solutions in this fundamental case; as a by-product, this helps +clarify the unique benefit offered by other assumptions such as parametric +causal models or hard interventions. We show that under the sparsity constraint +on the recovered graph over the latent variables and suitable sufficient change +conditions on the causal influences, interestingly, one can recover the +moralized graph of the underlying directed acyclic graph, and the recovered +latent variables and their relations are related to the underlying causal model +in a specific, nontrivial way. In some cases, each latent variable can even be +recovered up to component-wise transformations. Experimental results verify our +theoretical claims. + +&#13;
+
+
+
+
+ + ☆ Federated Learning Can Find Friends That Are Beneficial + + +
+ In Federated Learning (FL), the distributed nature and heterogeneity of +client data present both opportunities and challenges. While collaboration +among clients can significantly enhance the learning process, not all +collaborations are beneficial; some may even be detrimental. In this study, we +introduce a novel algorithm that assigns adaptive aggregation weights to +clients participating in FL training, identifying those with data distributions +most conducive to a specific learning objective. We demonstrate that our +aggregation method converges no worse than the method that aggregates only the +updates received from clients with the same data distribution. Furthermore, +empirical evaluations consistently reveal that collaborations guided by our +algorithm outperform traditional FL approaches. This underscores the critical +role of judicious client selection and lays the foundation for more streamlined +and effective FL implementations in the coming years. + +
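The abstract does not spell out the weighting rule, so the sketch below uses a simple stand-in: weight each client update by its (clipped) cosine similarity to a reference direction and average. It is meant only to make the "adaptive aggregation weights" idea concrete, not to reproduce the paper's algorithm.

```python
import numpy as np

def adaptive_aggregate(updates, ref):
    """Aggregate client updates with adaptive, similarity-based weights.

    updates: list of flattened client update vectors (np.ndarray).
    ref:     reference direction, e.g. the target client's own update.
    The clipped-cosine weighting is an illustrative stand-in, not the
    paper's aggregation rule.
    """
    def cos(u, v):
        return float(u @ v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12)

    sims = np.array([max(0.0, cos(u, ref)) for u in updates])
    if sims.sum() == 0.0:
        return ref                       # no apparently beneficial collaborators
    w = sims / sims.sum()
    return sum(wi * ui for wi, ui in zip(w, updates))
```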
+
+
+
+
+ + ☆ SALAD-Bench: A Hierarchical and Comprehensive Safety Benchmark for Large + Language Models + + +
+ In the rapidly evolving landscape of Large Language Models (LLMs), ensuring +robust safety measures is paramount. To meet this crucial need, we propose +\emph{SALAD-Bench}, a safety benchmark specifically designed for evaluating +LLMs as well as attack and defense methods. Distinguished by its breadth, +SALAD-Bench transcends conventional benchmarks through its large scale, rich +diversity, intricate taxonomy spanning three levels, and versatile +functionalities. SALAD-Bench is crafted with a meticulous array of questions, +from standard queries to complex ones enriched with attack and defense +modifications and multiple-choice formats. To effectively manage the inherent +complexity, we introduce an innovative evaluator: the LLM-based MD-Judge for +QA pairs, with a particular focus on attack-enhanced queries, ensuring +seamless and reliable evaluation. These components extend SALAD-Bench from +standard LLM safety evaluation to the evaluation of both LLM attack and defense +methods, ensuring its joint-purpose utility. Our extensive experiments shed +light on the resilience of LLMs against emerging threats and the efficacy of +contemporary defense tactics. Data and evaluator are released under +\url{https://github.com/OpenSafetyLab/SALAD-BENCH}. Warning: this paper +includes examples that may be offensive or harmful. + +&#13;
+
+
+
+
+ + ☆ PAC Learnability under Explanation-Preserving Graph Perturbations + + +
+ Graphical models capture relations between entities in a wide range of +applications including social networks, biology, and natural language +processing, among others. Graph neural networks (GNN) are neural models that +operate over graphs, enabling the model to leverage the complex relationships +and dependencies in graph-structured data. A graph explanation is a subgraph +which is an `almost sufficient' statistic of the input graph with respect to +its classification label. Consequently, the classification label is invariant, +with high probability, to perturbations of graph edges not belonging to its +explanation subgraph. This work considers two methods for leveraging such +perturbation invariances in the design and training of GNNs. First, +explanation-assisted learning rules are considered. It is shown that the sample +complexity of explanation-assisted learning can be arbitrarily smaller than +explanation-agnostic learning. Next, explanation-assisted data augmentation is +considered, where the training set is enlarged by artificially producing new +training samples via perturbation of the non-explanation edges in the original +training set. It is shown that such data augmentation methods may improve +performance if the augmented data is in-distribution; however, they may also +lead to worse sample complexity compared to explanation-agnostic learning rules +if the augmented data is out-of-distribution. Extensive empirical evaluations +are provided to verify the theoretical analysis. + +&#13;
+
+ comment: 21 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Simulated Overparameterization + + +
+ In this work, we introduce a novel paradigm called Simulated +Overparameterization (SOP). SOP merges the computational efficiency of compact +models with the advanced learning proficiencies of overparameterized models. +SOP proposes a unique approach to model training and inference, where a model +with a significantly larger number of parameters is trained in such a way that +a smaller, efficient subset of these parameters is used for the actual +computation during inference. Building upon this framework, we present a novel, +architecture-agnostic algorithm called "majority kernels", which seamlessly +integrates with predominant architectures, including Transformer models. +Majority kernels enable the simulated training of overparameterized models, +resulting in performance gains across architectures and tasks. Furthermore, our +approach adds minimal wall-clock overhead at training time. The proposed +approach shows strong performance on a wide variety of datasets and models, +even outperforming strong baselines such as combinatorial optimization methods +based on submodular optimization. + +&#13;
+
+
+
+
+ + ☆ Strong convexity-guided hyper-parameter optimization for flatter losses + + +
+ We propose a novel white-box approach to hyper-parameter optimization. +Motivated by recent work establishing a relationship between flat minima and +generalization, we first establish a relationship between the strong convexity +of the loss and its flatness. Based on this, we seek to find hyper-parameter +configurations that improve flatness by minimizing the strong convexity of the +loss. By using the structure of the underlying neural network, we derive +closed-form equations to approximate the strong convexity parameter, and +attempt to find hyper-parameters that minimize it in a randomized fashion. +Through experiments on 14 classification datasets, we show that our method +achieves strong performance at a fraction of the runtime. + +
+
+ comment: v1 +
+
+
+
+
+ + ☆ A Sober Look at LLMs for Material Discovery: Are They Actually Good for + Bayesian Optimization Over Molecules? + + +
+ Automation is one of the cornerstones of contemporary material discovery. +Bayesian optimization (BO) is an essential part of such workflows, enabling +scientists to leverage prior domain knowledge into efficient exploration of a +large molecular space. While such prior knowledge can take many forms, there +has been significant fanfare around the ancillary scientific knowledge +encapsulated in large language models (LLMs). However, existing work thus far +has only explored LLMs for heuristic materials searches. Indeed, recent work +obtains the uncertainty estimate -- an integral part of BO -- from +point-estimated, non-Bayesian LLMs. In this work, we study the question of +whether LLMs are actually useful to accelerate principled Bayesian optimization +in the molecular space. We take a sober, dispassionate stance in answering this +question. This is done by carefully (i) viewing LLMs as fixed feature +extractors for standard but principled BO surrogate models and by (ii) +leveraging parameter-efficient finetuning methods and Bayesian neural networks +to obtain the posterior of the LLM surrogate. Our extensive experiments with +real-world chemistry problems show that LLMs can be useful for BO over +molecules, but only if they have been pretrained or finetuned with +domain-specific data. + +
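A minimal sketch of setting (i), with `encode_smiles` as a hypothetical frozen-LLM feature extractor: embed candidate molecules, fit a standard GP surrogate on the observed ones, and pick the next query by UCB. The paper's PEFT/Bayesian-neural-network posterior of setting (ii) is not shown.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

def bo_step(smiles_pool, observed_idx, observed_y, encode_smiles, kappa=2.0):
    """One UCB step of Bayesian optimization over a candidate pool of molecules.

    encode_smiles: hypothetical function mapping a SMILES string to a fixed
    feature vector, e.g. a frozen LLM's last hidden state for that string.
    """
    X = np.stack([encode_smiles(s) for s in smiles_pool])
    gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True)
    gp.fit(X[observed_idx], observed_y)
    mu, sigma = gp.predict(X, return_std=True)
    ucb = mu + kappa * sigma
    ucb[observed_idx] = -np.inf          # don't re-query known molecules
    return int(np.argmax(ucb))           # index of the next molecule to evaluate
```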
+
+
+
+
+ + ☆ Compression of Structured Data with Autoencoders: Provable Benefit of + Nonlinearities and Depth + + +
+ Autoencoders are a prominent model in many empirical branches of machine +learning and lossy data compression. However, basic theoretical questions +remain unanswered even in a shallow two-layer setting. In particular, to what +degree does a shallow autoencoder capture the structure of the underlying data +distribution? For the prototypical case of the 1-bit compression of sparse +Gaussian data, we prove that gradient descent converges to a solution that +completely disregards the sparse structure of the input. Namely, the +performance of the algorithm is the same as if it was compressing a Gaussian +source - with no sparsity. For general data distributions, we give evidence of +a phase transition phenomenon in the shape of the gradient descent minimizer, +as a function of the data sparsity: below the critical sparsity level, the +minimizer is a rotation taken uniformly at random (just like in the compression +of non-sparse data); above the critical sparsity, the minimizer is the identity +(up to a permutation). Finally, by exploiting a connection with approximate +message passing algorithms, we show how to improve upon Gaussian performance +for the compression of sparse data: adding a denoising function to a shallow +architecture already reduces the loss provably, and a suitable multi-layer +decoder leads to a further improvement. We validate our findings on image +datasets, such as CIFAR-10 and MNIST. + +
+
+
+
+
+ + ☆ Navigating Complexity: Toward Lossless Graph Condensation via Expanding + Window Matching + + +
+ Graph condensation aims to reduce the size of a large-scale graph dataset by +synthesizing a compact counterpart without sacrificing the performance of Graph +Neural Networks (GNNs) trained on it, which has shed light on reducing the +computational cost for training GNNs. Nevertheless, existing methods often fall +short of accurately replicating the original graph for certain datasets, +thereby failing to achieve the objective of lossless condensation. To +understand this phenomenon, we investigate the potential reasons and reveal +that the previous state-of-the-art trajectory matching method provides biased +and restricted supervision signals from the original graph when optimizing the +condensed one. This significantly limits both the scale and efficacy of the +condensed graph. In this paper, we make the first attempt toward +\textit{lossless graph condensation} by bridging the previously neglected +supervision signals. Specifically, we employ a curriculum learning strategy to +train expert trajectories with more diverse supervision signals from the +original graph, and then effectively transfer the information into the +condensed graph with expanding window matching. Moreover, we design a loss +function to further extract knowledge from the expert trajectories. Theoretical +analysis justifies the design of our method and extensive experiments verify +its superiority across different datasets. Code is released at +https://github.com/NUS-HPC-AI-Lab/GEOM. + +
+
+ comment: Lossless graph condensation method +
+
+
+
+
+ + ☆ EfficientViT-SAM: Accelerated Segment Anything Model Without Performance + Loss + + +
+ We present EfficientViT-SAM, a new family of accelerated segment anything +models. We retain SAM's lightweight prompt encoder and mask decoder while +replacing the heavy image encoder with EfficientViT. For the training, we begin +with the knowledge distillation from the SAM-ViT-H image encoder to +EfficientViT. Subsequently, we conduct end-to-end training on the SA-1B +dataset. Benefiting from EfficientViT's efficiency and capacity, +EfficientViT-SAM delivers 48.9x measured TensorRT speedup on A100 GPU over +SAM-ViT-H without sacrificing performance. Our code and pre-trained models are +released at https://github.com/mit-han-lab/efficientvit. + +
+
+ comment: tech report +
+
+
+
+
+ + ☆ Example-based Explanations for Random Forests using Machine Unlearning + + +
+ Tree-based machine learning models, such as decision trees and random +forests, have been hugely successful in classification tasks primarily because +of their predictive power in supervised learning tasks and ease of +interpretation. Despite their popularity and power, these models have been +found to produce unexpected or discriminatory outcomes. Given their +overwhelming success for most tasks, it is of interest to identify sources of +their unexpected and discriminatory behavior. However, there has not been much +work on understanding and debugging tree-based classifiers in the context of +fairness. + We introduce FairDebugger, a system that utilizes recent advances in machine +unlearning research to identify training data subsets responsible for instances +of fairness violations in the outcomes of a random forest classifier. +FairDebugger generates top-$k$ explanations (in the form of coherent training +data subsets) for model unfairness. Toward this goal, FairDebugger first +utilizes machine unlearning to estimate the change in the tree structures of +the random forest when parts of the underlying training data are removed, and +then leverages the Apriori algorithm from frequent itemset mining to reduce the +subset search space. We empirically evaluate our approach on three real-world +datasets, and demonstrate that the explanations generated by FairDebugger are +consistent with insights from prior studies on these datasets. + +
+
+
+
+
+ + ☆ Randomized Confidence Bounds for Stochastic Partial Monitoring + + +
+ The partial monitoring (PM) framework provides a theoretical formulation of +sequential learning problems with incomplete feedback. On each round, a +learning agent plays an action while the environment simultaneously chooses an +outcome. The agent then observes a feedback signal that is only partially +informative about the (unobserved) outcome. The agent leverages the received +feedback signals to select actions that minimize the (unobserved) cumulative +loss. In contextual PM, the outcomes depend on some side information that is +observable by the agent before selecting the action on each round. In this +paper, we consider the contextual and non-contextual PM settings with +stochastic outcomes. We introduce a new class of strategies based on the +randomization of deterministic confidence bounds that extend regret guarantees +to settings where existing stochastic strategies are not applicable. Our +experiments show that the proposed RandCBP and RandCBPside* strategies improve +state-of-the-art baselines in PM games. To encourage the adoption of the PM +framework, we design a use case on the real-world problem of monitoring the +error rate of any deployed classification system. + +&#13;
+
+
+
+
+ + ☆ Generative Flows on Discrete State-Spaces: Enabling Multimodal Flows + with Applications to Protein Co-Design + + +
+ Combining discrete and continuous data is an important capability for +generative models. We present Discrete Flow Models (DFMs), a new flow-based +model of discrete data that provides the missing link in enabling flow-based +generative models to be applied to multimodal continuous and discrete data +problems. Our key insight is that the discrete equivalent of continuous space +flow matching can be realized using Continuous Time Markov Chains. DFMs benefit +from a simple derivation that includes discrete diffusion models as a specific +instance while allowing improved performance over existing diffusion-based +approaches. We utilize our DFMs method to build a multimodal flow-based +modeling framework. We apply this capability to the task of protein co-design, +wherein we learn a model for jointly generating protein structure and sequence. +Our approach achieves state-of-the-art co-design performance while allowing the +same multimodal model to be used for flexible generation of the sequence or +structure. + +
+
+ comment: 52 pages, 11 figures, 5 tables +
+
+
+
+
+ + ☆ PriorBoost: An Adaptive Algorithm for Learning from Aggregate Responses + + +
+ This work studies algorithms for learning from aggregate responses. We focus +on the construction of aggregation sets (called bags in the literature) for +event-level loss functions. We prove for linear regression and generalized +linear models (GLMs) that the optimal bagging problem reduces to +one-dimensional size-constrained $k$-means clustering. Further, we +theoretically quantify the advantage of using curated bags over random bags. We +then propose the PriorBoost algorithm, which adaptively forms bags of samples +that are increasingly homogeneous with respect to (unobserved) individual +responses to improve model quality. We study label differential privacy for +aggregate learning, and we also provide extensive experiments showing that +PriorBoost regularly achieves optimal model quality for event-level +predictions, in stark contrast to non-adaptive algorithms. + +
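Because the optimal bagging problem reduces to one-dimensional size-constrained k-means, and optimal 1-D clusters are contiguous, bags can be formed by sorting the current model's predictions and slicing; this is the homogenizing step the adaptive loop repeats. A small sketch (illustrative, not the paper's exact algorithm):

```python
import numpy as np

def curated_bags(prior_scores, bag_size):
    """Form homogeneous bags by sorting 1-D prior predictions and slicing.

    One-dimensional size-constrained k-means admits contiguous optimal
    clusters, so sorting + chunking yields bags of near-uniform response.
    Returns a list of index arrays, one per bag.
    """
    order = np.argsort(prior_scores)
    return [order[i:i + bag_size] for i in range(0, len(order), bag_size)]

# Usage: bag by the current model's predictions, observe aggregate labels,
# refit, and repeat -- the adaptive loop the abstract describes.
scores = np.random.default_rng(0).normal(size=10)
for bag in curated_bags(scores, bag_size=2):
    print(bag, scores[bag].mean())
```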
+
+ comment: 29 pages, 4 figures +
+
+
+
+
+ + ☆ Beyond explaining: XAI-based Adaptive Learning with SHAP Clustering for + Energy Consumption Prediction + + +
+ This paper presents an approach integrating explainable artificial +intelligence (XAI) techniques with adaptive learning to enhance energy +consumption prediction models, with a focus on handling data distribution +shifts. Leveraging SHAP clustering, our method provides interpretable +explanations for model predictions and uses these insights to adaptively refine +the model, balancing model complexity with predictive performance. We introduce +a three-stage process: (1) obtaining SHAP values to explain model predictions, +(2) clustering SHAP values to identify distinct patterns and outliers, and (3) +refining the model based on the derived SHAP clustering characteristics. Our +approach mitigates overfitting and ensures robustness in handling data +distribution shifts. We evaluate our method on a comprehensive dataset +comprising energy consumption records of buildings, as well as two additional +datasets to assess the transferability of our approach to other domains, +regression, and classification problems. Our experiments demonstrate the +effectiveness of our approach in both task types, resulting in improved +predictive performance and interpretable model explanations. + +
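Stages (1) and (2) of the three-stage process map directly onto standard tooling; here is a hedged sketch assuming a fitted tree-based model (so that `shap.TreeExplainer` applies) and k-means over the resulting SHAP matrix.

```python
import numpy as np
import shap
from sklearn.cluster import KMeans

def shap_clusters(model, X, n_clusters=5):
    """Stages (1)-(2) of the pipeline: explain, then cluster explanations.

    model: a fitted tree-based regressor (assumed so TreeExplainer applies).
    X:     feature matrix of the evaluation set.
    """
    shap_values = shap.TreeExplainer(model).shap_values(X)  # (n_samples, n_features)
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(shap_values)
    # Points far from their cluster centre are outlier explanations that the
    # third (refinement) stage would inspect.
    dists = np.linalg.norm(shap_values - km.cluster_centers_[km.labels_], axis=1)
    return km.labels_, dists
```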
+
+ comment: A short version of this paper was published at the Australasian Joint + Conference on Artificial Intelligence in 2023 +
+
+
+
+
+ + ☆ Asymptotics of feature learning in two-layer networks after one + gradient-step + + +
+ In this manuscript we investigate the problem of how two-layer neural +networks learn features from data, and improve over the kernel regime, after +being trained with a single gradient descent step. Leveraging a connection from +(Ba et al., 2022) with a non-linear spiked matrix model and recent progress on +Gaussian universality (Dandi et al., 2023), we provide an exact asymptotic +description of the generalization error in the high-dimensional limit where the +number of samples $n$, the width $p$ and the input dimension $d$ grow at a +proportional rate. We characterize exactly how adapting to the data is crucial +for the network to efficiently learn non-linear functions in the direction of +the gradient -- where at initialization it can only express linear functions in +this regime. To our knowledge, our results provide the first tight description +of the impact of feature learning in the generalization of two-layer neural +networks in the large learning rate regime $\eta=\Theta_{d}(d)$, beyond +perturbative finite width corrections of the conjugate and neural tangent +kernels. + +&#13;
+
+
+
+
+ + ☆ A Bayesian Approach to Online Learning for Contextual Restless Bandits + with Applications to Public Health + + +
+ Restless multi-armed bandits (RMABs) are used to model sequential resource +allocation in public health intervention programs. In these settings, the +underlying transition dynamics are often unknown a priori, requiring online +reinforcement learning (RL). However, existing methods in online RL for RMABs +cannot incorporate properties often present in real-world public health +applications, such as contextual information and non-stationarity. We present +Bayesian Learning for Contextual RMABs (BCoR), an online RL approach for RMABs +that novelly combines techniques in Bayesian modeling with Thompson sampling to +flexibly model a wide range of complex RMAB settings, such as contextual and +non-stationary RMABs. A key contribution of our approach is its ability to +leverage shared information within and between arms to learn unknown RMAB +transition dynamics quickly in budget-constrained settings with relatively +short time horizons. Empirically, we show that BCoR achieves substantially +higher finite-sample performance than existing approaches over a range of +experimental settings, including one constructed from a real-world public +health campaign in India. + +
+
+ comment: 26 pages, 18 figures +
+
+
+
+
+ + ☆ Blue noise for diffusion models + + +
+ Most existing diffusion models use Gaussian noise for training and sampling +across all time steps, which may not optimally account for the frequency +contents reconstructed by the denoising network. Despite the diverse +applications of correlated noise in computer graphics, its potential for +improving the training process has been underexplored. In this paper, we +introduce a novel and general class of diffusion models taking correlated noise +within and across images into account. More specifically, we propose a +time-varying noise model to incorporate correlated noise into the training +process, as well as a method for the fast generation of correlated noise masks. +Our model is built upon deterministic diffusion models and utilizes blue noise +to help improve the generation quality compared to using Gaussian white +(random) noise only. Further, our framework allows introducing correlation +across images within a single mini-batch to improve gradient flow. We perform +both qualitative and quantitative evaluations on a variety of datasets using +our method, achieving improvements on different tasks over existing +deterministic diffusion models in terms of the FID metric. + +&#13;
+
+ comment: 10 pages, 12 figures +
+
+
+
+
+ + ☆ Source-Free Domain Adaptation with Diffusion-Guided Source Data + Generation + + +
+ This paper introduces a novel approach to leverage the generalizability +capability of Diffusion Models for Source-Free Domain Adaptation (DM-SFDA). Our +proposed DM-SFDA method involves fine-tuning a pre-trained text-to-image +diffusion model to generate source domain images using features from the target +images to guide the diffusion process. Specifically, the pre-trained diffusion +model is fine-tuned to generate source samples that minimize entropy and +maximize confidence for the pre-trained source model. We then apply established +unsupervised domain adaptation techniques to align the generated source images +with target domain data. We validate our approach through comprehensive +experiments across a range of datasets, including Office-31, Office-Home, and +VisDA. The results highlight significant improvements in SFDA performance, +showcasing the potential of diffusion models in generating contextually +relevant, domain-specific images. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2310.01701 +
+
+
+
+
+ + ☆ Two Trades is not Baffled: Condense Graph via Crafting Rational Gradient + Matching + + +
+ Training on large-scale graphs has achieved remarkable results in graph +representation learning, but its cost and storage have raised growing concerns. +As one of the most promising directions, graph condensation methods address +these issues by employing gradient matching, aiming to condense the full graph +into a more concise yet information-rich synthetic set. Though encouraging, +these strategies primarily emphasize matching directions of the gradients, +which leads to deviations in the training trajectories. Such deviations are +further magnified by the differences between the condensation and evaluation +phases, culminating in accumulated errors, which detrimentally affect the +performance of the condensed graphs. In light of this, we propose a novel graph +condensation method named \textbf{C}raf\textbf{T}ing \textbf{R}ationa\textbf{L} +trajectory (\textbf{CTRL}), which offers an optimized starting point closer to +the original dataset's feature distribution and a more refined strategy for +gradient matching. Theoretically, CTRL can effectively neutralize the impact of +accumulated errors on the performance of condensed graphs. We provide extensive +experiments on various graph datasets and downstream tasks to support the +effectiveness of CTRL. Code is released at +https://github.com/NUS-HPC-AI-Lab/CTRL. + +
+
+ comment: An effective method for graph condensation +
+
+
+
+
+ + ☆ Voronoi Candidates for Bayesian Optimization + + +
+ Bayesian optimization (BO) offers an elegant approach for efficiently +optimizing black-box functions. However, acquisition criteria demand their own +challenging inner-optimization, which can induce significant overhead. Many +practical BO methods, particularly in high dimension, eschew a formal, +continuous optimization of the acquisition function and instead search +discretely over a finite set of space-filling candidates. Here, we propose to +use candidates which lie on the boundary of the Voronoi tessellation of the +current design points, so they are equidistant to two or more of them. We +discuss strategies for efficient implementation by directly sampling the +Voronoi boundary without explicitly generating the tessellation, thus +accommodating large designs in high dimension. On a battery of test problems +optimized via Gaussian processes with expected improvement, our proposed +approach significantly improves the execution time of a multi-start continuous +search without a loss in accuracy. + +
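One simple way to land on the Voronoi boundary without building the tessellation is to shoot a ray from a design point and bisect to where the nearest design point changes. The paper discusses more careful direct-sampling strategies; the sketch below is only an illustrative variant of the idea.

```python
import numpy as np

def voronoi_candidate(X, rng, t_max=10.0, iters=40):
    """Sample one candidate on the Voronoi boundary of design points X (n, d).

    Shoot a random ray from a random design point; the first point at which
    the nearest design point changes lies on the boundary (equidistant to two
    designs). Located here by bisection.
    """
    n, d = X.shape
    i = rng.integers(n)
    u = rng.normal(size=d)
    u /= np.linalg.norm(u)

    def nearest(x):
        return int(np.argmin(np.linalg.norm(X - x, axis=1)))

    lo, hi = 0.0, t_max
    if nearest(X[i] + hi * u) == i:       # ray never leaves cell i; give up
        return None
    for _ in range(iters):
        mid = 0.5 * (lo + hi)
        if nearest(X[i] + mid * u) == i:
            lo = mid
        else:
            hi = mid
    return X[i] + 0.5 * (lo + hi) * u

rng = np.random.default_rng(0)
X = rng.uniform(size=(20, 2))
cand = voronoi_candidate(X, rng)          # equidistant to two designs (up to tol)
```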
+
+ comment: comments very welcome +
+
+
+
+
+ + ☆ Moco: A Learnable Meta Optimizer for Combinatorial Optimization + + +
+ Relevant combinatorial optimization problems (COPs) are often NP-hard. While +they have been tackled mainly via handcrafted heuristics in the past, advances +in neural networks have motivated the development of general methods to learn +heuristics from data. Many approaches utilize a neural network to directly +construct a solution, but are limited in their ability to further improve +already constructed solutions at inference time. Our approach, Moco, learns a +graph neural network that updates the solution construction procedure based on +features extracted from the current search state. This meta-training procedure +targets the overall best solution found during the search procedure given +information such as the search budget. This allows Moco to adapt to varying +circumstances such as different computational budgets. Moco is a fully +learnable meta optimizer that does not utilize any problem-specific local +search or decomposition. We test Moco on the Traveling Salesman Problem (TSP) +and Maximum Independent Set (MIS) and show that it outperforms other approaches +on MIS and is overall competitive on the TSP, especially outperforming related +approaches, in some cases even when they use additional local search. + +&#13;
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ Towards Biologically Plausible and Private Gene Expression Data + Generation + + +
+ Generative models trained with Differential Privacy (DP) are becoming +increasingly prominent in the creation of synthetic data for downstream +applications. Existing literature, however, primarily focuses on basic +benchmarking datasets and tends to report promising results only for elementary +metrics and relatively simple data distributions. In this paper, we initiate a +systematic analysis of how DP generative models perform in their natural +application scenarios, specifically focusing on real-world gene expression +data. We conduct a comprehensive analysis of five representative DP generation +methods, examining them from various angles, such as downstream utility, +statistical properties, and biological plausibility. Our extensive evaluation +illuminates the unique characteristics of each DP generation method, offering +critical insights into the strengths and weaknesses of each approach, and +uncovering intriguing possibilities for future developments. Perhaps +surprisingly, our analysis reveals that most methods are capable of achieving +seemingly reasonable downstream utility, according to the standard evaluation +metrics considered in existing literature. Nevertheless, we find that none of +the DP methods are able to accurately capture the biological characteristics of +the real dataset. This observation suggests a potential over-optimistic +assessment of current methodologies in this field and underscores a pressing +need for future enhancements in model design. + +
+
+
+
+
+ + ☆ On a Combinatorial Problem Arising in Machine Teaching + + +
+ We study a model of machine teaching where the teacher mapping is constructed +from a size function on both concepts and examples. The main question in +machine teaching is the minimum number of examples needed for any concept, the +so-called teaching dimension. A recent paper [7] conjectured that the worst +case for this model, as a function of the size of the concept class, occurs +when the consistency matrix contains the binary representations of numbers from +zero and up. In this paper we prove their conjecture. The result can be seen as +a generalization of a theorem resolving the edge isoperimetry problem for +hypercubes [12], and our proof is based on a lemma of [10]. + +
+
+ comment: 14 pages, 1 figure +
+
+
+
+
+ + ☆ Conformal Monte Carlo Meta-learners for Predictive Inference of + Individual Treatment Effects + + +
+ Knowledge of the effect of interventions, called the treatment effect, is +paramount for decision-making. Approaches to estimating this treatment effect, +e.g. by using Conditional Average Treatment Effect (CATE) estimators, often +provide only a point estimate of this treatment effect, while additional +uncertainty quantification is frequently desired. Therefore, we present a novel +method, the Conformal Monte Carlo (CMC) meta-learners, leveraging conformal +predictive systems, Monte Carlo sampling, and CATE meta-learners, to instead +produce a predictive distribution usable in individualized decision-making. +Furthermore, we show how specific assumptions on the noise distribution of the +outcome heavily affect these uncertainty predictions. Nonetheless, the CMC +framework shows strong experimental coverage while retaining small interval +widths to provide estimates of the true individual treatment effect. + +&#13;
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ☆ L4Q: Parameter Efficient Quantization-Aware Training on Large Language + Models via LoRA-wise LSQ + + +
+ Post-training quantization (PTQ) and quantization-aware training (QAT) +methods are gaining popularity in mitigating the high memory and computational +costs associated with Large Language Models (LLMs). In resource-constrained +scenarios, PTQ, with its reduced training overhead, is often preferred over +QAT, despite the latter's potential for higher accuracy. Meanwhile, +parameter-efficient fine-tuning (PEFT) methods like low-rank adaptation (LoRA) +have been introduced, and recent efforts have explored quantization-aware PEFT +techniques. However, these approaches may lack generality due to their reliance +on the pre-quantized model's configuration. Their effectiveness may be +compromised by non-linearly quantized or mixed-precision weights, and the +retraining of specific quantization parameters might impede optimal +performance. To address these challenges, we propose L4Q, an algorithm for +parameter-efficient quantization-aware training. L4Q leverages LoRA-wise +learned quantization step size for LLMs, aiming to enhance generality. The +simultaneous quantization-and-fine-tuning process of L4Q is applicable to +high-precision models, yielding linearly quantized weights with superior +accuracy. Our experiments, conducted on the LLaMA and LLaMA2 model families +using an instructional dataset, showcase L4Q's capabilities in language +comprehension and few-shot in-context learning, achieving sub-4-bit precision +while maintaining comparable training times to applying PEFT on a quantized +model. + +
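The LSQ building block the abstract refers to, a learned quantization step size trained through a straight-through estimator, can be sketched in a few lines of PyTorch. How L4Q couples this with LoRA is not reproduced here; note that autograd through the round-with-STE form recovers the usual LSQ step-size gradient.

```python
import torch

def round_ste(x):
    """Round with a straight-through gradient (identity on the backward pass)."""
    return x + (torch.round(x) - x).detach()

def lsq_fake_quant(w, step, num_bits=4):
    """LSQ-style fake quantization with a learnable step size.

    Autograd through this function reproduces the LSQ gradients: w gets a
    straight-through gradient inside the clipping range, and step receives
    round(v) - v (or the clip boundary when saturated).
    """
    qn, qp = 2 ** (num_bits - 1), 2 ** (num_bits - 1) - 1
    v = torch.clamp(w / step, -qn, qp)
    return round_ste(v) * step

# The step size is trained jointly with the (LoRA) parameters:
w = torch.randn(64, 64)
step = torch.nn.Parameter(torch.tensor(0.05))
loss = lsq_fake_quant(w, step).pow(2).sum()
loss.backward()        # populates step.grad
```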
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ The Strain of Success: A Predictive Model for Injury Risk Mitigation and + Team Success in Soccer + + +
+ In this paper, we present a novel sequential team selection model in soccer. +Specifically, we model the stochastic process of player injury and +unavailability using player-specific information learned from real-world soccer +data. Monte-Carlo Tree Search is used to select teams for games that optimise +long-term team performance across a soccer season by reasoning over player +injury probability. We validate our approach compared to benchmark solutions +for the 2018/19 English Premier League season. Our model achieves similar +season expected points to the benchmark whilst reducing first-team injuries by +~13% and the money inefficiently spent on injured players by ~11% - +demonstrating the potential to reduce costs and improve player welfare in +real-world soccer teams. + +
+
+ comment: 19 pages (16 main, 2 references, 1 appendix), 10 figures (9 main, 1 + appendix). Accepted at the MIT Sloan Sports Analytics Conference 2024 + Research Paper Competition +
+
+
+
+
+ + ☆ Deep Reinforcement Learning with Dynamic Graphs for Adaptive Informative + Path Planning + + +
+ Autonomous robots are often employed for data collection due to their +efficiency and low labour costs. A key task in robotic data acquisition is +planning paths through an initially unknown environment to collect observations +given platform-specific resource constraints, such as limited battery life. +Adaptive online path planning in 3D environments is challenging due to the +large set of valid actions and the presence of unknown occlusions. To address +these issues, we propose a novel deep reinforcement learning approach for +adaptively replanning robot paths to map targets of interest in unknown 3D +environments. A key aspect of our approach is a dynamically constructed graph +that restricts planning actions local to the robot, allowing us to quickly +react to newly discovered obstacles and targets of interest. For replanning, we +propose a new reward function that balances between exploring the unknown +environment and exploiting online-collected data about the targets of interest. +Our experiments show that our method enables more efficient target detection +compared to state-of-the-art learning and non-learning baselines. We also show +the applicability of our approach for orchard monitoring using an unmanned +aerial vehicle in a photorealistic simulator. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ A Unified Framework for Probabilistic Verification of AI Systems via + Weighted Model Integration + + +
+ The probabilistic formal verification (PFV) of AI systems is in its infancy. +So far, approaches have been limited to ad-hoc algorithms for specific classes +of models and/or properties. + We propose a unifying framework for the PFV of AI systems based on Weighted +Model Integration (WMI), which allows the problem to be framed in very general +terms. + Crucially, this reduction enables the verification of many properties of +interest, like fairness, robustness or monotonicity, over a wide range of +machine learning models, without making strong distributional assumptions. + We support the generality of the approach by solving multiple verification +tasks with a single, off-the-shelf WMI solver, then discuss the scalability +challenges and research directions related to this promising framework. + +&#13;
+
+
+
+
+ + ☆ On Provable Length and Compositional Generalization + + +
+ Length generalization -- the ability to generalize to longer sequences than +ones seen during training, and compositional generalization -- the ability to +generalize to token combinations not seen during training, are crucial forms of +out-of-distribution generalization in sequence-to-sequence models. In this +work, we take the first steps towards provable length and compositional +generalization for a range of architectures, including deep sets, transformers, +state space models, and simple recurrent neural nets. Depending on the +architecture, we prove different degrees of representation identification, +e.g., a linear or a permutation relation with ground truth representation, is +necessary for length and compositional generalization. + +
+
+
+
+
+ + ☆ Learning by Doing: An Online Causal Reinforcement Learning Framework + with Causal-Aware Policy + + +
+ As a key component of intuitive cognition and reasoning in human +intelligence, causal knowledge offers great potential for making reinforcement +learning (RL) agents' decision-making interpretable by helping to reduce the +search space. However, there is still a considerable gap in discovering and +incorporating causality into RL, which hinders the rapid development of causal +RL. In this paper, we consider explicitly modeling the generation process of +states with the causal graphical model, based on which we augment the policy. +We formulate the causal structure updating into the RL interaction process with +active intervention learning of the environment. To optimize the derived +objective, we propose a framework with theoretical performance guarantees that +alternates between two steps: using interventions for causal structure learning +during exploration and using the learned causal structure for policy guidance +during exploitation. Due to the lack of public benchmarks that allow direct +intervention in the state space, we design the root cause localization task in +our simulated fault alarm environment and then empirically show the +effectiveness and robustness of the proposed method against state-of-the-art +baselines. Theoretical analysis shows that our performance improvement is +attributable to the virtuous cycle of causal-guided policy learning and causal +structure learning, which aligns with our experimental results. + +&#13;
+
+
+
+
+ + ☆ CodeIt: Self-Improving Language Models with Prioritized Hindsight Replay + + +
+ Large language models are increasingly solving tasks that are commonly +believed to require human-level reasoning ability. However, these models still +perform very poorly on benchmarks of general intelligence such as the +Abstraction and Reasoning Corpus (ARC). In this paper, we approach ARC as a +programming-by-examples problem, and introduce a novel and scalable method for +language model self-improvement called Code Iteration (CodeIt). Our method +iterates between 1) program sampling and hindsight relabeling, and 2) learning +from prioritized experience replay. By relabeling the goal of an episode (i.e., +the target program output given input) to the realized output produced by the +sampled program, our method effectively deals with the extreme sparsity of +rewards in program synthesis. Applying CodeIt to the ARC dataset, we +demonstrate that prioritized hindsight replay, along with pre-training and +data-augmentation, leads to successful inter-task generalization. CodeIt is the +first neuro-symbolic approach that scales to the full ARC evaluation dataset. +Our method solves 15% of ARC evaluation tasks, achieving state-of-the-art +performance and outperforming existing neural and symbolic baselines. + +
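The hindsight-relabeling step is simple to state in code: run the sampled program, and if it executes, store its actual outputs as the episode's goal, so the (inputs, outputs, program) triple is correct by construction. `run_program` is a hypothetical sandboxed executor, and the priority scheme below is likewise only a placeholder.

```python
import heapq

def hindsight_relabel(program, inputs, run_program):
    """Turn any sampled program into a valid training example.

    run_program: executes source code on one input grid (hypothetical helper,
    returning None on crash or timeout).
    The episode's goal is relabeled from the original target outputs to the
    outputs the program actually produced, so every sample yields signal.
    """
    realized = [run_program(program, x) for x in inputs]
    if any(out is None for out in realized):
        return None
    return {"inputs": inputs, "outputs": realized, "program": program}

def add_to_replay(buffer, example, priority):
    """Prioritized replay buffer as a max-heap (counter breaks score ties)."""
    heapq.heappush(buffer, (-priority, len(buffer), example))
```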
+
+ comment: 8 pages, 11 figures +
+
+
+
+
+ + ☆ Explaining Learned Reward Functions with Counterfactual Trajectories + + +
+ Learning rewards from human behaviour or feedback is a promising approach to +aligning AI systems with human values but fails to consistently extract correct +reward functions. Interpretability tools could enable users to understand and +evaluate possible flaws in learned reward functions. We propose Counterfactual +Trajectory Explanations (CTEs) to interpret reward functions in reinforcement +learning by contrasting an original with a counterfactual partial trajectory +and the rewards they each receive. We derive six quality criteria for CTEs and +propose a novel Monte-Carlo-based algorithm for generating CTEs that optimises +these quality criteria. Finally, we measure how informative the generated +explanations are to a proxy-human model by training it on CTEs. CTEs are +demonstrably informative for the proxy-human model, increasing the similarity +between its predictions and the reward function on unseen trajectories. +Further, it learns to accurately judge differences in rewards between +trajectories and generalises to out-of-distribution examples. Although CTEs do +not lead to a perfect understanding of the reward, our method, and more +generally the adaptation of XAI methods, are presented as a fruitful approach +for interpreting learned reward functions. + +
+
+
+
+
+ + ☆ Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey + + +
+ Research surveys have always posed a challenge for beginner researchers who +lack research training. These researchers struggle to understand the directions +within their research topic and to discover new research findings within a +short time. One way to provide intuitive assistance to beginner researchers is +by offering relevant knowledge graphs (KGs) and recommending related academic +papers. However, existing navigation knowledge graphs primarily rely on +keywords in the research field and often fail to present the logical hierarchy +among multiple related papers clearly. Moreover, most recommendation systems +for academic papers simply rely on high text similarity, which can leave +researchers confused as to why a particular article is being recommended. They +may fail to grasp important information about the insight connection between +"Issue resolved" and "Issue finding" that they hope to obtain. To address these +issues, this study aims to support research insight surveys for beginner +researchers by establishing a hierarchical tree-structured knowledge graph that +reflects the inheritance insight of research topics and the relevance insight +among the academic papers. + +&#13;
+
+ comment: This paper will submit to '27th International Symposium on + Methodologies for Intelligent Systems'(ISMIS 2024) +
+
+
+
+
+ + ☆ Multi-Patch Prediction: Adapting LLMs for Time Series Representation + Learning + + +
+ In this study, we present aLLM4TS, an innovative framework that adapts Large +Language Models (LLMs) for time-series representation learning. Central to our +approach is that we reconceive time-series forecasting as a self-supervised, +multi-patch prediction task, which, compared to traditional +mask-and-reconstruction methods, captures temporal dynamics in patch +representations more effectively. Our strategy encompasses two-stage training: +(i) a causal continual pre-training phase on various time-series datasets, +anchored on next patch prediction, effectively syncing LLM capabilities with +the intricacies of time-series data; (ii) fine-tuning for multi-patch +prediction in the targeted time-series context. A distinctive element of our +framework is the patch-wise decoding layer, which departs from previous methods +reliant on sequence-level decoding. Such a design directly transposes +individual patches into temporal sequences, thereby significantly bolstering +the model's proficiency in mastering temporal patch-based representations. +aLLM4TS demonstrates superior performance in several downstream tasks, proving +its effectiveness in deriving temporal representations with enhanced +transferability and marking a pivotal advancement in the adaptation of LLMs for +time-series analysis. + +&#13;
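A sketch of the self-supervised setup, assuming non-overlapping patches: slice the series into patches and train the model to predict patch k from patches 1..k-1. Patch length and stride are illustrative hyperparameters, not the paper's settings.

```python
import numpy as np

def patchify(series, patch_len, stride):
    """Slice a 1-D series into (possibly overlapping) patches."""
    idx = range(0, len(series) - patch_len + 1, stride)
    return np.stack([series[i:i + patch_len] for i in idx])

def next_patch_pairs(series, patch_len):
    """Self-supervised pairs for causal next-patch prediction:
    the model sees patches[:k] and predicts patches[k]."""
    patches = patchify(series, patch_len, stride=patch_len)  # non-overlapping
    return patches[:-1], patches[1:]   # (context source, prediction target)

t = np.sin(np.linspace(0, 20, 256))
ctx, tgt = next_patch_pairs(t, patch_len=16)   # 15 context/target pairs
```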
+
+
+
+
+ + ☆ AlphaFold Meets Flow Matching for Generating Protein Ensembles + + +
+ The biological functions of proteins often depend on dynamic structural ensembles. In this work, we develop a flow-based generative modeling approach for learning and sampling the conformational landscapes of proteins. We repurpose highly accurate single-state predictors such as AlphaFold and ESMFold and fine-tune them under a custom flow matching framework to obtain sequence-conditioned generative models of protein structure called AlphaFlow and ESMFlow. When trained and evaluated on the PDB, our method provides a superior combination of precision and diversity compared to AlphaFold with MSA subsampling. When further trained on ensembles from all-atom MD, our method accurately captures conformational flexibility, positional distributions, and higher-order ensemble observables for unseen proteins. Moreover, our method can diversify a static PDB structure with faster wall-clock convergence to certain equilibrium properties than replicate MD trajectories, demonstrating its potential as a proxy for expensive physics-based simulations. Code is available at https://github.com/bjing2016/alphaflow.
+
+
+
+
+ + ☆ On the Completeness of Invariant Geometric Deep Learning Models + + +
+ Invariant models, one important class of geometric deep learning models, are +capable of generating meaningful geometric representations by leveraging +informative geometric features. These models are characterized by their +simplicity, good experimental results and computational efficiency. However, +their theoretical expressive power still remains unclear, restricting a deeper +understanding of the potential of such models. In this work, we concentrate on +characterizing the theoretical expressiveness of invariant models. We first +rigorously bound the expressiveness of the most classical invariant model, +Vanilla DisGNN (message passing neural networks incorporating distance), +restricting its unidentifiable cases to be only those highly symmetric +geometric graphs. To break these corner cases' symmetry, we introduce a simple +yet E(3)-complete invariant design by nesting Vanilla DisGNN, named GeoNGNN. +Leveraging GeoNGNN as a theoretical tool, we for the first time prove the +E(3)-completeness of three well-established geometric models: DimeNet, GemNet +and SphereNet. Our results fill the gap in the theoretical power of invariant +models, contributing to a rigorous and comprehensive understanding of their +capabilities. Experimentally, GeoNGNN exhibits good inductive bias in capturing +local environments, and achieves competitive results w.r.t. complicated models +relying on high-order invariant/equivariant representations while exhibiting +significantly faster computational speed. + +
+
+
+
+
+ + ☆ SARI: Simplistic Average and Robust Identification based Noisy Partial + Label Learning + + +
+ Partial label learning (PLL) is a weakly-supervised learning paradigm where +each training instance is paired with a set of candidate labels (partial +label), one of which is the true label. Noisy PLL (NPLL) relaxes this +constraint by allowing some partial labels to not contain the true label, +enhancing the practicality of the problem. Our work centers on NPLL and +presents a minimalistic framework called SARI that initially assigns +pseudo-labels to images by exploiting the noisy partial labels through a +weighted nearest neighbour algorithm. These pseudo-label and image pairs are +then used to train a deep neural network classifier with label smoothing and +standard regularization techniques. The classifier's features and predictions +are subsequently employed to refine and enhance the accuracy of pseudo-labels. +SARI combines the strengths of Average Based Strategies (in pseudo labelling) +and Identification Based Strategies (in classifier training) from the +literature. We perform thorough experiments on seven datasets and compare SARI +against nine NPLL and PLL methods from the prior art. SARI achieves +state-of-the-art results in almost all studied settings, obtaining substantial +gains in fine-grained classification and extreme noise settings. + +
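+ The weighted nearest-neighbour pseudo-labelling step can be sketched as below; this is a simplified illustration of the average-based strategy, not the exact SARI procedure (the inverse-distance weighting is an assumption).

```python
import numpy as np

# Minimal sketch of average-based pseudo-labelling for noisy partial
# labels: a weighted nearest-neighbour vote restricted to each instance's
# candidate set (an illustration, not the exact SARI procedure).

def knn_pseudo_labels(feats, candidate_sets, num_classes, k=5):
    n = len(feats)
    dists = np.linalg.norm(feats[:, None] - feats[None, :], axis=-1)
    np.fill_diagonal(dists, np.inf)
    pseudo = np.zeros(n, dtype=int)
    for i in range(n):
        nbrs = np.argsort(dists[i])[:k]
        votes = np.zeros(num_classes)
        for j in nbrs:
            w = 1.0 / (dists[i, j] + 1e-8)       # closer neighbours weigh more
            for c in candidate_sets[j]:          # neighbours vote with their candidates
                votes[c] += w / len(candidate_sets[j])
        # Restrict the decision to the instance's own candidate set.
        cand = list(candidate_sets[i])
        pseudo[i] = cand[int(np.argmax(votes[cand]))]
    return pseudo

feats = np.random.randn(20, 8)
candidates = [set(np.random.choice(4, size=2, replace=False)) for _ in range(20)]
print(knn_pseudo_labels(feats, candidates, num_classes=4))
```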
+
+ comment: 13 pages, 6 tables, 2 figures +
+
+
+
+
+ + ☆ Closing the Gap Between SGP4 and High-Precision Propagation via + Differentiable Programming + + +
+ The Simplified General Perturbations 4 (SGP4) orbital propagation method is +widely used for predicting the positions and velocities of Earth-orbiting +objects rapidly and reliably. Despite continuous refinement, SGP models still +lack the precision of numerical propagators, which offer significantly smaller +errors. This study presents dSGP4, a novel differentiable version of SGP4 +implemented using PyTorch. By making SGP4 differentiable, dSGP4 facilitates +various space-related applications, including spacecraft orbit determination, +state conversion, covariance transformation, state transition matrix +computation, and covariance propagation. Additionally, dSGP4's PyTorch +implementation allows for embarrassingly parallel orbital propagation across +batches of Two-Line Element Sets (TLEs), leveraging the computational power of +CPUs, GPUs, and advanced hardware for distributed prediction of satellite +positions at future times. Furthermore, dSGP4's differentiability enables +integration with modern machine learning techniques. Thus, we propose a novel +orbital propagation paradigm, ML-dSGP4, where neural networks are integrated +into the orbital propagator. Through stochastic gradient descent, this combined +model's inputs, outputs, and parameters can be iteratively refined, surpassing +SGP4's precision. Neural networks act as identity operators by default, +adhering to SGP4's behavior. However, dSGP4's differentiability allows +fine-tuning with ephemeris data, enhancing precision while maintaining +computational speed. This empowers satellite operators and researchers to train +the model using specific ephemeris or high-precision numerical propagation +data, significantly advancing orbital prediction capabilities. + +
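+ The ML-dSGP4 idea of neural networks acting as identity operators by default can be sketched as follows; `base_propagator` is a hypothetical stand-in for dSGP4 itself, which we do not reimplement.

```python
import torch
import torch.nn as nn

# Minimal sketch of the ML-dSGP4 idea: a neural correction on top of a
# differentiable propagator, zero-initialized so that the combined model
# starts out reproducing the base propagator exactly.

def base_propagator(state: torch.Tensor, dt: torch.Tensor) -> torch.Tensor:
    """Placeholder for a differentiable propagator (e.g., dSGP4)."""
    return state + dt * torch.sin(state)

class ResidualCorrection(nn.Module):
    def __init__(self, dim: int = 6, hidden: int = 32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden), nn.Tanh(), nn.Linear(hidden, dim)
        )
        # Zero-initialize the last layer: the correction starts as identity.
        nn.init.zeros_(self.net[-1].weight)
        nn.init.zeros_(self.net[-1].bias)

    def forward(self, propagated: torch.Tensor) -> torch.Tensor:
        return propagated + self.net(propagated)

state = torch.randn(4, 6)              # batch of orbital states
dt = torch.tensor(1.0)
correction = ResidualCorrection()
out = correction(base_propagator(state, dt))
# Before fine-tuning on ephemeris data, the correction changes nothing:
assert torch.allclose(out, base_propagator(state, dt))
```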
+
+
+
+
+ + ☆ Fast Timing-Conditioned Latent Audio Diffusion + + +
+ Generating long-form 44.1kHz stereo audio from text prompts can be computationally demanding. Further, most previous works do not address the fact that music and sound effects naturally vary in their duration. Our research focuses on the efficient generation of long-form, variable-length stereo music and sounds at 44.1kHz using text prompts with a generative model. Stable Audio is based on latent diffusion, with its latent defined by a fully-convolutional variational autoencoder. It is conditioned on text prompts as well as timing embeddings, allowing for fine control over both the content and length of the generated music and sounds. Stable Audio is capable of rendering stereo signals of up to 95 sec at 44.1kHz in 8 sec on an A100 GPU. Despite its compute efficiency and fast inference, it is among the best on two public text-to-music and text-to-audio benchmarks and, unlike state-of-the-art models, can generate music with structure and stereo sounds.
+
+ comment: Code: https://github.com/Stability-AI/stable-audio-tools. Metrics: + https://github.com/Stability-AI/stable-audio-metrics. Demo: + https://stability-ai.github.io/stable-audio-demo +
+
+
+
+
+ + ☆ How Realistic Is Your Synthetic Data? Constraining Deep Generative + Models for Tabular Data ICLR 2024 + + +
+ Deep Generative Models (DGMs) have been shown to be powerful tools for +generating tabular data, as they have been increasingly able to capture the +complex distributions that characterize them. However, to generate realistic +synthetic data, it is often not enough to have a good approximation of their +distribution, as it also requires compliance with constraints that encode +essential background knowledge on the problem at hand. In this paper, we +address this limitation and show how DGMs for tabular data can be transformed +into Constrained Deep Generative Models (C-DGMs), whose generated samples are +guaranteed to be compliant with the given constraints. This is achieved by +automatically parsing the constraints and transforming them into a Constraint +Layer (CL) seamlessly integrated with the DGM. Our extensive experimental +analysis with various DGMs and tasks reveals that standard DGMs often violate +constraints, some exceeding $95\%$ non-compliance, while their corresponding +C-DGMs are never non-compliant. Then, we quantitatively demonstrate that, at +training time, C-DGMs are able to exploit the background knowledge expressed by +the constraints to outperform their standard counterparts with up to $6.5\%$ +improvement in utility and detection. Further, we show how our CL does not +necessarily need to be integrated at training time, as it can be also used as a +guardrail at inference time, still producing some improvements in the overall +performance of the models. Finally, we show that our CL does not hinder the +sample generation time of the models. + +
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ☆ E(3)-Equivariant Mesh Neural Networks + + +
+ Triangular meshes are widely used to represent three-dimensional objects. As a result, many recent works have addressed the need for geometric deep learning on 3D meshes. However, we observe that the complexities in many of these architectures do not translate to practical performance, and simple deep models for geometric graphs are competitive in practice. Motivated by this observation, we minimally extend the update equations of E(n)-Equivariant Graph Neural Networks (EGNNs) (Satorras et al., 2021) to incorporate mesh face information, and further improve them to account for long-range interactions through a hierarchy. The resulting architecture, Equivariant Mesh Neural Network (EMNN), outperforms other, more complicated equivariant methods on mesh tasks, with a fast run-time and no expensive pre-processing.
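+ For reference, a minimal sketch of the EGNN-style update equations (Satorras et al., 2021) that EMNN builds on; the mesh-face terms and the hierarchy are omitted here.

```python
import torch
import torch.nn as nn

# Minimal sketch of one EGNN-style layer: invariant distances enter the
# edge messages, and coordinates are updated along relative positions so
# the layer stays E(n)-equivariant (face information omitted).

class EGNNLayer(nn.Module):
    def __init__(self, dim_h: int = 16):
        super().__init__()
        self.edge_mlp = nn.Sequential(nn.Linear(2 * dim_h + 1, dim_h), nn.SiLU())
        self.coord_mlp = nn.Linear(dim_h, 1, bias=False)
        self.node_mlp = nn.Sequential(nn.Linear(2 * dim_h, dim_h), nn.SiLU())

    def forward(self, h, x, edges):
        src, dst = edges                           # edge index tensors
        diff = x[src] - x[dst]                     # relative positions
        d2 = (diff ** 2).sum(-1, keepdim=True)     # invariant squared distances
        m = self.edge_mlp(torch.cat([h[src], h[dst], d2], dim=-1))
        # Coordinate update: equivariant, scaled along relative positions.
        x = x.index_add(0, src, diff * self.coord_mlp(m))
        # Feature update: aggregate messages per node.
        agg = torch.zeros_like(h).index_add(0, src, m)
        return self.node_mlp(torch.cat([h, agg], dim=-1)), x

h, x = torch.randn(5, 16), torch.randn(5, 3)
edges = (torch.tensor([0, 1, 2, 3]), torch.tensor([1, 2, 3, 4]))
h, x = EGNNLayer()(h, x, edges)
print(h.shape, x.shape)
```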
+
+
+
+
+ + ☆ BOWLL: A Deceptively Simple Open World Lifelong Learner + + +
+ The quest to improve scalar performance numbers on predetermined benchmarks +seems to be deeply engraved in deep learning. However, the real world is seldom +carefully curated and applications are seldom limited to excelling on test +sets. A practical system is generally required to recognize novel concepts, +refrain from actively including uninformative data, and retain previously +acquired knowledge throughout its lifetime. Despite these key elements being +rigorously researched individually, the study of their conjunction, open world +lifelong learning, is only a recent trend. To accelerate this multifaceted +field's exploration, we introduce its first monolithic and much-needed +baseline. Leveraging the ubiquitous use of batch normalization across deep +neural networks, we propose a deceptively simple yet highly effective way to +repurpose standard models for open world lifelong learning. Through extensive +empirical evaluation, we highlight why our approach should serve as a future +standard for models that are able to effectively maintain their knowledge, +selectively focus on informative data, and accelerate future learning. + +
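+ One way to picture repurposing batch normalization for novelty detection is sketched below; this is a simplified illustration of the general idea, not the exact BOWLL method.

```python
import torch
import torch.nn as nn

# Minimal sketch of using batch-norm running statistics as a novelty
# signal: batches whose activation statistics deviate strongly from what
# BN recorded during training look novel (simplified illustration, not
# the exact BOWLL procedure).

def bn_novelty_score(model: nn.Module, batch: torch.Tensor) -> float:
    """Deviation of the batch's activation statistics from BN running stats."""
    scores, hooks = [], []

    def hook(module, inputs, output):
        x = inputs[0]
        mu = x.mean(dim=(0, 2, 3))
        # Squared distance to the stored mean, scaled by the stored variance.
        scores.append(((mu - module.running_mean) ** 2
                       / (module.running_var + 1e-5)).mean())

    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d):
            hooks.append(m.register_forward_hook(hook))
    model.eval()
    with torch.no_grad():
        model(batch)
    for h in hooks:
        h.remove()
    return torch.stack(scores).mean().item()

net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
in_dist = torch.randn(16, 3, 32, 32)
shifted = torch.randn(16, 3, 32, 32) * 3 + 2      # distribution-shifted batch
print(bn_novelty_score(net, in_dist), bn_novelty_score(net, shifted))
```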
+
+
+
+
+ + ☆ Scalable Multi-view Clustering via Explicit Kernel Features Maps + + +
+ A growing awareness of multi-view learning as an important component in data +science and machine learning is a consequence of the increasing prevalence of +multiple views in real-world applications, especially in the context of +networks. In this paper we introduce a new scalability framework for multi-view +subspace clustering. An efficient optimization strategy is proposed, leveraging +kernel feature maps to reduce the computational burden while maintaining good +clustering performance. The scalability of the algorithm means that it can be +applied to large-scale datasets, including those with millions of data points, +using a standard machine, in a few minutes. We conduct extensive experiments on +real-world benchmark networks of various sizes in order to evaluate the +performance of our algorithm against state-of-the-art multi-view subspace +clustering methods and attributed-network multi-view approaches. + +
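+ A minimal sketch of explicit kernel feature maps, using random Fourier features for an RBF kernel; the paper's multi-view optimization is not reproduced here, only the mechanism that makes linear-time processing possible.

```python
import numpy as np

# Minimal sketch of an explicit kernel feature map: random Fourier
# features approximating an RBF kernel, so that kernel methods can run on
# explicit features in linear time (illustrates the mechanism only, not
# the paper's multi-view subspace clustering objective).

def random_fourier_features(X, dim=2048, gamma=0.2, seed=0):
    rng = np.random.default_rng(seed)
    W = rng.normal(scale=np.sqrt(2 * gamma), size=(X.shape[1], dim))
    b = rng.uniform(0, 2 * np.pi, size=dim)
    return np.sqrt(2.0 / dim) * np.cos(X @ W + b)

X = np.random.randn(1000, 3)
Z = random_fourier_features(X)
# Inner products of features approximate k(x, y) = exp(-gamma * ||x - y||^2):
approx = float(Z[0] @ Z[1])
exact = float(np.exp(-0.2 * np.sum((X[0] - X[1]) ** 2)))
print(f"approx {approx:.3f} vs exact {exact:.3f}")
```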
+
+
+
+
+ + ☆ Shadowheart SGD: Distributed Asynchronous SGD with Optimal Time + Complexity Under Arbitrary Computation and Communication Heterogeneity + + +
+ We consider nonconvex stochastic optimization problems in the asynchronous centralized distributed setup where the communication times from workers to a server cannot be ignored, and the computation and communication times are potentially different for all workers. Using an unbiased compression technique, we develop a new method, Shadowheart SGD, that provably improves the time complexities of all previous centralized methods. Moreover, we show that the time complexity of Shadowheart SGD is optimal in the family of centralized methods with compressed communication. We also consider the bidirectional setup, where broadcasting from the server to the workers is non-negligible, and develop a corresponding method.
+
+
+
+
+ + ☆ Analyzing the Neural Tangent Kernel of Periodically Activated Coordinate + Networks + + +
+ Recently, neural networks utilizing periodic activation functions have been +proven to demonstrate superior performance in vision tasks compared to +traditional ReLU-activated networks. However, there is still a limited +understanding of the underlying reasons for this improved performance. In this +paper, we aim to address this gap by providing a theoretical understanding of +periodically activated networks through an analysis of their Neural Tangent +Kernel (NTK). We derive bounds on the minimum eigenvalue of their NTK in the +finite width setting, using a fairly general network architecture which +requires only one wide layer that grows at least linearly with the number of +data samples. Our findings indicate that periodically activated networks are +\textit{notably more well-behaved}, from the NTK perspective, than ReLU +activated networks. Additionally, we give an application to the memorization +capacity of such networks and verify our theoretical predictions empirically. +Our study offers a deeper understanding of the properties of periodically +activated neural networks and their potential in the field of deep learning. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2402.02711 +
+
+
+
+
+ + ☆ A fast score-based search algorithm for maximal ancestral graphs using + entropy + + +
+ \emph{Maximal ancestral graphs} (MAGs) are a class of graphical models that extend the famous \emph{directed acyclic graphs} in the presence of latent confounders. Most score-based approaches to learning the unknown MAG from empirical data rely on the BIC score, which suffers from instability and heavy computation. We propose to use the framework of imsets \citep{studeny2006probabilistic} to score MAGs using empirical entropy estimation and the newly proposed \emph{refined Markov property} \citep{hu2023towards}. Our graphical search procedure is similar to \citet{claassen2022greedy} but improved using our theoretical results. We show that our search algorithm is polynomial in the number of nodes by restricting the degree, maximal head size, and number of discriminating paths. In simulated experiments, our algorithm shows superior performance compared to other state-of-the-art MAG learning algorithms.
+
+
+
+
+ + ☆ Code as Reward: Empowering Reinforcement Learning with VLMs + + +
+ Pre-trained Vision-Language Models (VLMs) are able to understand visual concepts, describe and decompose complex tasks into sub-tasks, and provide feedback on task completion. In this paper, we aim to leverage these capabilities to support the training of reinforcement learning (RL) agents. In principle, VLMs are well suited for this purpose, as they can naturally analyze image-based observations and provide feedback (reward) on learning progress. However, inference in VLMs is computationally expensive, so querying them frequently to compute rewards would significantly slow down the training of an RL agent. To address this challenge, we propose a framework named Code as Reward (VLM-CaR). VLM-CaR produces dense reward functions from VLMs through code generation, thereby significantly reducing the computational burden of querying the VLM directly. We show that the dense rewards generated through our approach are very accurate across a diverse set of discrete and continuous environments, and can be more effective in training RL policies than the original sparse environment rewards.
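+ The kind of artifact VLM-CaR aims for can be illustrated with a hypothetical example of generated reward code for a goal-reaching task; the function below is an assumption about what such generated code might look like, not output from the actual system.

```python
import numpy as np

# Illustration of the Code-as-Reward idea: rather than querying a VLM at
# every step, the VLM emits a reward function as code once, and the cheap
# code is evaluated during training. This is a hypothetical example of
# such generated code for a "reach the goal" task.

def generated_dense_reward(obs: np.ndarray, goal: np.ndarray) -> float:
    """Hypothetical VLM-generated reward: progress toward the goal plus a
    bonus on reaching it (dense, unlike the environment's sparse reward)."""
    dist = float(np.linalg.norm(obs - goal))
    return -dist + (10.0 if dist < 0.1 else 0.0)

# During RL training, the generated function replaces per-step VLM queries:
obs, goal = np.array([0.5, 0.5]), np.array([0.0, 0.0])
print(generated_dense_reward(obs, goal))
```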
+
+
+
+
+ + ☆ Color Recognition in Challenging Lighting Environments: CNN Approach + + +
+ Light plays a vital role in both human and machine vision: the perceived color always depends on the lighting conditions of the surroundings. Researchers are working to enhance color detection techniques for computer vision applications. Several methods using different color detection approaches have been proposed, but a gap remains to be filled. To address this issue, a color detection method based on a Convolutional Neural Network (CNN) is proposed. First, image segmentation is performed using the edge detection segmentation technique to isolate the object, and then the segmented object is fed to a Convolutional Neural Network trained to detect the color of an object under different lighting conditions. It is experimentally verified that our method can substantially enhance the robustness of color detection under different lighting conditions, and that it outperforms existing methods.
+
+
+
+
+ + ☆ Towards Aligned Layout Generation via Diffusion Model with Aesthetic + Constraints ICLR 2024 + + +
+ Controllable layout generation refers to the process of creating a plausible +visual arrangement of elements within a graphic design (e.g., document and web +designs) with constraints representing design intentions. Although recent +diffusion-based models have achieved state-of-the-art FID scores, they tend to +exhibit more pronounced misalignment compared to earlier transformer-based +models. In this work, we propose the $\textbf{LA}$yout $\textbf{C}$onstraint +diffusion mod$\textbf{E}$l (LACE), a unified model to handle a broad range of +layout generation tasks, such as arranging elements with specified attributes +and refining or completing a coarse layout design. The model is based on +continuous diffusion models. Compared with existing methods that use discrete +diffusion models, continuous state-space design can enable the incorporation of +differentiable aesthetic constraint functions in training. For conditional +generation, we introduce conditions via masked input. Extensive experiment +results show that LACE produces high-quality layouts and outperforms existing +state-of-the-art baselines. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Progressive Gradient Flow for Robust N:M Sparsity Training in + Transformers + + +
+ N:M structured sparsity has garnered significant interest as a result of relatively modest overhead and improved efficiency. Additionally, this form of sparsity holds considerable appeal for reducing the memory footprint owing to its modest representation overhead. Although there have been efforts to develop training recipes for N:M structured sparsity, they primarily focus on low-sparsity regions ($\sim$50\%). Nonetheless, the performance of models trained using these approaches tends to decline when confronted with high-sparsity regions ($>$80\%). In this work, we study the effectiveness of existing sparse training recipes at \textit{high-sparsity regions} and argue that these methods fail to sustain the model quality on par with low-sparsity regions. We demonstrate that a significant factor contributing to this disparity is the presence of elevated levels of induced noise in the gradient magnitudes. To mitigate this undesirable effect, we employ decay mechanisms to progressively restrict the flow of gradients towards pruned elements. Our approach improves the model quality by up to 2$\%$ and 5$\%$ in vision and language models at high sparsity regimes, respectively. We also evaluate the trade-off between model accuracy and training compute cost in terms of FLOPs. At iso-training FLOPs, our method yields better performance compared to conventional sparse training recipes, exhibiting an accuracy improvement of up to 2$\%$. The source code is available at https://github.com/abhibambhaniya/progressive_gradient_flow_nm_sparsity.
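+ The decay mechanism can be sketched as a shrinking gradient mask; the exponential schedule below is an assumption for illustration, not the paper's exact recipe.

```python
import torch

# Minimal sketch of progressively restricting gradient flow to pruned
# elements in N:M sparsity training; the exponential decay schedule is an
# illustrative assumption.

def nm_mask(weight: torch.Tensor, n: int = 2, m: int = 4) -> torch.Tensor:
    """Keep the n largest-magnitude weights in every group of m."""
    w = weight.reshape(-1, m)
    idx = w.abs().topk(n, dim=1).indices
    mask = torch.zeros_like(w).scatter_(1, idx, 1.0)
    return mask.reshape(weight.shape)

def decayed_grad(grad, mask, step, decay=0.01):
    """Pruned entries receive an exponentially shrinking share of gradient."""
    scale = torch.exp(torch.tensor(-decay * step))
    return grad * (mask + (1 - mask) * scale)

w = torch.randn(8, 8)
mask = nm_mask(w)
g = torch.randn_like(w)
for step in (0, 100, 1000):
    print(step, decayed_grad(g, mask, step).abs().mean().item())
```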
+
+ comment: 18 pages, 8 figures, 17 tables. Code is available at + https://github.com/abhibambhaniya/progressive_gradient_flow_nm_sparsity +
+
+
+
+
+ + ☆ Non-Parametric Estimation of Multi-dimensional Marked Hawkes Processes + + +
+ An extension of the Hawkes process, the marked Hawkes process distinguishes itself by featuring a variable jump size across events, in contrast to the constant jump size observed in a Hawkes process without marks. While extensive literature has been dedicated to the non-parametric estimation of both the linear and non-linear Hawkes process, there remains a significant gap in the literature regarding the marked Hawkes process. In response, we propose a methodology for estimating the conditional intensity of the marked Hawkes process. We introduce two distinct models: \textit{Shallow Neural Hawkes with marks}, for Hawkes processes with excitatory kernels, and \textit{Neural Network for Non-Linear Hawkes with Marks}, for non-linear Hawkes processes. Both approaches take the past arrival times and their corresponding marks as input to obtain the arrival intensity. This approach is entirely non-parametric, preserving the interpretability associated with the marked Hawkes process. To validate the efficacy of our method, we apply it to synthetic datasets with known ground truth. Additionally, we apply our method to model cryptocurrency order book data, demonstrating its applicability to real-world scenarios.
+
+
+
+
+ + ☆ Graph Cuts with Arbitrary Size Constraints Through Optimal Transport + + +
+ A common way of partitioning graphs is through minimum cuts. One drawback of classical minimum cut methods is that they tend to produce small groups, which is why more balanced variants such as normalized and ratio cuts have seen more success. However, we believe that with these variants, the balance constraints can be too restrictive for some applications, such as clustering imbalanced datasets, while not being restrictive enough when searching for perfectly balanced partitions. Here, we propose a new graph cut algorithm for partitioning graphs under arbitrary size constraints. We formulate the graph cut problem as a regularized Gromov-Wasserstein problem. We then propose to solve it using an accelerated proximal gradient descent algorithm, which has global convergence guarantees, yields sparse solutions, and incurs only an additional $\mathcal{O}(\log(n))$ factor compared to the classical spectral clustering algorithm, while being observed to be more efficient.
+
+
+
+
+ + ☆ Theoretical and Empirical Analysis of Adaptive Entry Point Selection for + Graph-based Approximate Nearest Neighbor Search + + +
+ We present a theoretical and empirical analysis of the adaptive entry point +selection for graph-based approximate nearest neighbor search (ANNS). We +introduce novel concepts: $b\textit{-monotonic path}$ and $B\textit{-MSNET}$, +which better capture an actual graph in practical algorithms than existing +concepts like MSNET. We prove that adaptive entry point selection offers better +performance upper bound than the fixed central entry point under more general +conditions than previous work. Empirically, we validate the method's +effectiveness in accuracy, speed, and memory usage across various datasets, +especially in challenging scenarios with out-of-distribution data and hard +instances. Our comprehensive study provides deeper insights into optimizing +entry points for graph-based ANNS for real-world high-dimensional data +applications. + +
+
+
+
+
+ + ☆ Incorporating Retrieval-based Causal Learning with Information + Bottlenecks for Interpretable Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) have gained considerable traction for their capability to effectively process topological data, yet their interpretability remains a critical concern. Current interpretation methods are dominated by post-hoc explanations to provide a transparent and intuitive understanding of GNNs. However, they have limited performance in interpreting complicated subgraphs and cannot utilize the explanation to advance GNN predictions. On the other hand, transparent GNN models are proposed to capture critical subgraphs. While such methods could improve GNN predictions, they usually do not perform well on explanations. Thus, a new strategy is desired to better couple GNN explanation and prediction. In this study, we have developed a novel interpretable causal GNN framework that incorporates retrieval-based causal learning with Graph Information Bottleneck (GIB) theory. The framework can semi-parametrically retrieve crucial subgraphs detected by GIB and compress the explanatory subgraphs via a causal module. The framework was demonstrated to consistently outperform state-of-the-art methods, and to achieve 32.71\% higher precision on real-world explanation scenarios with diverse explanation types. More importantly, the learned explanations were shown to also improve GNN prediction performance.
+
+
+
+
+ + ☆ EvoSeed: Unveiling the Threat on Deep Neural Networks with Real-World + Illusions + + +
+ Deep neural networks can be exploited using natural adversarial samples, which have no impact on human perception but are misclassified. Current approaches often rely on the white-box nature of deep neural networks to generate these adversarial samples, or alter the distribution of adversarial samples relative to the training distribution. To alleviate the limitations of current approaches, we propose EvoSeed, a novel evolutionary-strategy-based search algorithmic framework for generating natural adversarial samples. Our EvoSeed framework uses auxiliary Diffusion and Classifier models to operate in a model-agnostic black-box setting. We employ CMA-ES to optimize the search for an adversarial seed vector, which, when processed by the Conditional Diffusion Model, results in an unrestricted natural adversarial sample misclassified by the Classifier Model. Experiments show that generated adversarial images are of high image quality and are transferable to different classifiers. Our approach demonstrates promise in enhancing the quality of adversarial samples using evolutionary algorithms. We hope our research opens new avenues to enhance the robustness of deep neural networks in real-world scenarios. The project website can be accessed at \url{https://shashankkotyan.github.io/EvoSeed}.
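+ A toy version of the search loop is sketched below, with a plain evolution strategy standing in for CMA-ES and stand-in functions in place of the Conditional Diffusion Model and Classifier Model.

```python
import numpy as np

# Toy sketch of the EvoSeed search loop: evolve a seed vector until the
# generator's output is misclassified. A simple evolution strategy stands
# in for CMA-ES; `generator` and `classifier_margin` are hypothetical
# stand-ins for the diffusion model and target classifier.

rng = np.random.default_rng(0)

def generator(seed: np.ndarray) -> np.ndarray:
    """Stand-in for a conditional diffusion model mapping seed -> image."""
    return np.tanh(seed)

def classifier_margin(image: np.ndarray, target: int = 0) -> float:
    """Stand-in margin: positive if classified as `target`, negative if not."""
    logits = np.array([image.sum(), -image.sum()])
    others = np.delete(logits, target)
    return float(logits[target] - others.max())

seed, sigma, pop = rng.normal(size=16), 0.3, 20
for gen in range(100):
    offspring = seed + sigma * rng.normal(size=(pop, seed.size))
    margins = [classifier_margin(generator(z)) for z in offspring]
    seed = offspring[int(np.argmin(margins))]     # keep the best candidate
    if min(margins) < 0:                          # misclassified: seed found
        print(f"adversarial seed found at generation {gen}")
        break
```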
+
+
+
+
+ + ☆ From explained variance of correlated components to PCA without + orthogonality constraints + + +
+ Block Principal Component Analysis (Block PCA) of a data matrix $A$, where the loadings $Z$ are determined by maximizing $\|AZ\|^2$ over unit-norm orthogonal loadings, is difficult to use for the design of sparse PCA by $\ell_1$ regularization, due to the difficulty of handling both the orthogonality constraint on the loadings and the non-differentiable $\ell_1$ penalty. Our objective in this paper is to relax the orthogonality constraint on the loadings by introducing new objective functions $\mathrm{expvar}(Y)$ which measure the part of the variance of the data matrix $A$ explained by the correlated components $Y = AZ$. We first propose a comprehensive study of the mathematical and numerical properties of $\mathrm{expvar}(Y)$ for two existing definitions [Zou et al., 2006; Shen and Huang, 2008] and four new definitions. We then show that only two of these explained variances are fit to be used as objective functions in block PCA formulations rid of orthogonality constraints.
+
+
+
+
+ + ☆ Learning Operators with Stochastic Gradient Descent in General Hilbert + Spaces + + +
+ This study investigates leveraging stochastic gradient descent (SGD) to learn +operators between general Hilbert spaces. We propose weak and strong regularity +conditions for the target operator to depict its intrinsic structure and +complexity. Under these conditions, we establish upper bounds for convergence +rates of the SGD algorithm and conduct a minimax lower bound analysis, further +illustrating that our convergence analysis and regularity conditions +quantitatively characterize the tractability of solving operator learning +problems using the SGD algorithm. It is crucial to highlight that our +convergence analysis is still valid for nonlinear operator learning. We show +that the SGD estimator will converge to the best linear approximation of the +nonlinear target operator. Moreover, applying our analysis to operator learning +problems based on vector-valued and real-valued reproducing kernel Hilbert +spaces yields new convergence results, thereby refining the conclusions of +existing literature. + +
+
+ comment: 56 pages +
+
+
+
+
+ + ☆ Large Language Models As Faithful Explainers + + +
+ Large Language Models (LLMs) have recently become proficient in addressing complex tasks by utilizing their rich internal knowledge and reasoning ability. Consequently, this complexity hinders traditional input-focused explanation algorithms from explaining the complex decision-making processes of LLMs. Recent advancements have thus emerged for LLMs to self-explain their predictions through a single feed-forward inference in a natural language format. However, natural language explanations are often criticized for a lack of faithfulness, since these explanations may not accurately reflect the decision-making behaviors of the LLMs. In this work, we introduce a generative explanation framework, xLLM, to improve the faithfulness of the explanations provided in natural language formats for LLMs. Specifically, we propose an evaluator to quantify the faithfulness of natural language explanations and enhance faithfulness through an iterative optimization process of xLLM, with the goal of maximizing the faithfulness scores. Experiments conducted on three NLU datasets demonstrate that xLLM can significantly improve the faithfulness of generated explanations, which are in alignment with the behaviors of LLMs.
+
+
+
+
+ + ☆ Group Distributionally Robust Dataset Distillation with Risk + Minimization + + +
+ Dataset distillation (DD) has emerged as a widely adopted technique for crafting a synthetic dataset that captures the essential information of a training dataset, facilitating the training of accurate neural models. Its applications span various domains, including transfer learning, federated learning, and neural architecture search. The most popular methods for constructing the synthetic data rely on matching the convergence properties of training the model with the synthetic dataset and with the training dataset. However, matching the training dataset must be regarded as auxiliary, in the same sense that the training set is an approximate substitute for the population distribution, which is the data of actual interest. Yet despite its popularity, an aspect that remains unexplored is the relationship of DD to its generalization, particularly across uncommon subgroups. That is, how can we ensure that a model trained on the synthetic dataset performs well when faced with samples from regions with low population density? Here, the representativeness and coverage of the dataset become more salient than the guaranteed training error at inference. Drawing inspiration from distributionally robust optimization, we introduce an algorithm that combines clustering with the minimization of a risk measure on the loss to conduct DD. We provide a theoretical rationale for our approach and demonstrate its effective generalization and robustness across subgroups through numerical experiments.
+
+
+
+
+ + ☆ A Perspective on Individualized Treatment Effects Estimation from + Time-series Health Data + + +
+ The burden of diseases is rising worldwide, with unequal treatment efficacy +for patient populations that are underrepresented in clinical trials. +Healthcare, however, is driven by the average population effect of medical +treatments and, therefore, operates in a "one-size-fits-all" approach, not +necessarily what best fits each patient. These facts suggest a pressing need +for methodologies to study individualized treatment effects (ITE) to drive +personalized treatment. Despite the increased interest in +machine-learning-driven ITE estimation models, the vast majority focus on +tabular data with limited review and understanding of methodologies proposed +for time-series electronic health records (EHRs). To this end, this work +provides an overview of ITE works for time-series data and insights into future +research. The work summarizes the latest work in the literature and reviews it +in light of theoretical assumptions, types of treatment settings, and +computational frameworks. Furthermore, this work discusses challenges and +future research directions for ITEs in a time-series setting. We hope this work +opens new directions and serves as a resource for understanding one of the +exciting yet under-studied research areas. + +
+
+
+
+
+ + ☆ Adversarial Robustness Through Artifact Design + + +
+ Adversarial examples arose as a challenge for machine learning. To hinder +them, most defenses alter how models are trained (e.g., adversarial training) +or inference is made (e.g., randomized smoothing). Still, while these +approaches markedly improve models' adversarial robustness, models remain +highly susceptible to adversarial examples. Identifying that, in certain +domains such as traffic-sign recognition, objects are implemented per standards +specifying how artifacts (e.g., signs) should be designed, we propose a novel +approach for improving adversarial robustness. Specifically, we offer a method +to redefine standards, making minor changes to existing ones, to defend against +adversarial examples. We formulate the problem of artifact design as a robust +optimization problem, and propose gradient-based and greedy search methods to +solve it. We evaluated our approach in the domain of traffic-sign recognition, +allowing it to alter traffic-sign pictograms (i.e., symbols within the signs) +and their colors. We found that, combined with adversarial training, our +approach led to up to 25.18\% higher robust accuracy compared to +state-of-the-art methods against two adversary types, while further increasing +accuracy on benign inputs. + +
+
+
+
+
+ + ☆ Open-Vocabulary Calibration for Vision-Language Models + + +
+ Vision-language models (VLMs) have emerged as formidable tools, showing their +strong capability in handling various open-vocabulary tasks in image +recognition, text-driven visual content generation, and visual chatbots, to +name a few. In recent years, considerable efforts and resources have been +devoted to adaptation methods for improving downstream performance of VLMs, +particularly on parameter-efficient fine-tuning methods like prompt learning. +However, a crucial aspect that has been largely overlooked is the confidence +calibration problem in fine-tuned VLMs, which could greatly reduce reliability +when deploying such models in the real world. This paper bridges the gap by +systematically investigating the confidence calibration problem in the context +of prompt learning and reveals that existing calibration methods are +insufficient to address the problem, especially in the open-vocabulary setting. +To solve the problem, we present a simple and effective approach called +Distance-Aware Calibration (DAC), which is based on scaling the temperature +using as guidance the distance between predicted text labels and base classes. +The experiments with 7 distinct prompt learning methods applied across 11 +diverse downstream datasets demonstrate the effectiveness of DAC, which +achieves high efficacy without sacrificing the inference speed. + +
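+ The distance-aware temperature idea can be sketched as follows; the specific scaling rule below is an illustrative assumption, not the exact DAC formula.

```python
import torch

# Minimal sketch of distance-aware temperature scaling in the spirit of
# DAC: the softmax temperature grows with the distance between the
# predicted label's text embedding and the base-class embeddings. The
# linear scaling rule is an assumption for illustration.

def distance_aware_probs(logits, label_emb, base_embs, t0=1.0, alpha=1.0):
    pred = logits.argmax(dim=-1)
    # Distance from each predicted label embedding to its nearest base class.
    d = torch.cdist(label_emb[pred], base_embs).min(dim=-1).values
    temperature = t0 * (1.0 + alpha * d).unsqueeze(-1)
    # Far-from-base predictions get a higher temperature, i.e., softer
    # (less overconfident) probabilities.
    return torch.softmax(logits / temperature, dim=-1)

logits = torch.randn(4, 10)
label_emb = torch.nn.functional.normalize(torch.randn(10, 512), dim=-1)
base_embs = torch.nn.functional.normalize(torch.randn(5, 512), dim=-1)
print(distance_aware_probs(logits, label_emb, base_embs).max(dim=-1).values)
```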
+
+ comment: Preprint
+
+
+
+
+ + ☆ An Over Complete Deep Learning Method for Inverse Problems + + +
+ Obtaining meaningful solutions for inverse problems has been a major challenge with many applications in science and engineering. Recent machine learning techniques based on proximal and diffusion-based methods have shown promising results. However, as we show in this work, they can also face challenges when applied to some exemplary problems. We show that, similar to previous works on over-complete dictionaries, it is possible to overcome these shortcomings by embedding the solution into higher dimensions. The novelty of the proposed work is that we jointly design and learn the embedding and the regularizer for the embedding vector. We demonstrate the merit of this approach on several exemplary and common inverse problems.
+
+
+
+
+ + ☆ Latent Plan Transformer: Planning as Latent Variable Inference + + +
+ In tasks aiming for long-term returns, planning becomes necessary. We study +generative modeling for planning with datasets repurposed from offline +reinforcement learning. Specifically, we identify temporal consistency in the +absence of step-wise rewards as one key technical challenge. We introduce the +Latent Plan Transformer (LPT), a novel model that leverages a latent space to +connect a Transformer-based trajectory generator and the final return. LPT can +be learned with maximum likelihood estimation on trajectory-return pairs. In +learning, posterior sampling of the latent variable naturally gathers +sub-trajectories to form a consistent abstraction despite the finite context. +During test time, the latent variable is inferred from an expected return +before policy execution, realizing the idea of planning as inference. It then +guides the autoregressive policy throughout the episode, functioning as a plan. +Our experiments demonstrate that LPT can discover improved decisions from +suboptimal trajectories. It achieves competitive performance across several +benchmarks, including Gym-Mujoco, Maze2D, and Connect Four, exhibiting +capabilities of nuanced credit assignments, trajectory stitching, and +adaptation to environmental contingencies. These results validate that latent +variable inference can be a strong alternative to step-wise reward prompting. + +
+
+
+
+
+ + ☆ Learning with Diversification from Block Sparse Signal + + +
+ This paper introduces a novel prior, called the Diversified Block Sparse Prior, to characterize the widespread block sparsity phenomenon in real-world data. By allowing diversification of the variance and correlation matrix, we effectively address the sensitivity of existing block sparse learning methods to pre-defined block information, which enables adaptive block estimation while mitigating the risk of overfitting. Based on this, a diversified block sparse Bayesian learning method (DivSBL) is proposed, utilizing the EM algorithm and the dual ascent method for hyperparameter estimation. Moreover, we establish the global and local optimality theory of our model. Experiments validate the advantages of DivSBL over existing algorithms.
+
+ comment: 12 pages, 12 figures, 3 tables +
+
+
+
+
+ + ☆ LEVI: Generalizable Fine-tuning via Layer-wise Ensemble of Different + Views + + +
+ Fine-tuning is becoming widely used for leveraging the power of pre-trained foundation models in new downstream tasks. While there are many successes of fine-tuning on various tasks, recent studies have observed challenges in the generalization of fine-tuned models to unseen distributions (i.e., out-of-distribution; OOD). To improve OOD generalization, some previous studies identify the limitations of fine-tuning data and regulate fine-tuning to preserve the general representation learned from pre-training data. However, potential limitations in the pre-training data and models are often ignored. In this paper, we contend that overly relying on the pre-trained representation may hinder fine-tuning from learning essential representations for downstream tasks and thus hurt its OOD generalization. It can be especially catastrophic when new tasks are from different (sub)domains compared to pre-training data. To address the issues in both pre-training and fine-tuning data, we propose a novel generalizable fine-tuning method LEVI, where the pre-trained model is adaptively ensembled layer-wise with a small task-specific model, while preserving training and inference efficiencies. By combining two complementary models, LEVI effectively suppresses problematic features in both the fine-tuning data and pre-trained model and preserves useful features for new tasks. Broad experiments with large language and vision models show that LEVI greatly improves fine-tuning generalization by emphasizing different views from fine-tuning data and pre-trained features.
+
+
+
+
+ + ☆ Domain Bridge: Generative model-based domain forensic for black-box + models + + +
+ In forensic investigations of machine learning models, techniques that determine a model's data domain play an essential role, with prior work relying on large-scale corpora like ImageNet to approximate the target model's domain. Although such methods are effective in finding broad domains, they often struggle to identify finer-grained classes within those domains. In this paper, we introduce an enhanced approach to determine not just the general data domain (e.g., human face) but also its specific attributes (e.g., wearing glasses). Our approach uses an image embedding model as the encoder and a generative model as the decoder. Beginning with a coarse-grained description, the decoder generates a set of images, which are then presented to the unknown target model. Successful classifications by the model guide the encoder to refine the description, which, in turn, is used to produce a more specific set of images in the subsequent iteration. This iterative refinement narrows down the exact class of interest. A key strength of our approach lies in leveraging the expansive dataset, LAION-5B, on which the generative model Stable Diffusion is trained. This enlarges our search space beyond traditional corpora, such as ImageNet. Empirical results showcase our method's performance in identifying specific attributes of a model's input domain, paving the way for more detailed forensic analyses of deep learning models.
+
+
+
+
+ + ☆ Feature Distribution on Graph Topology Mediates the Effect of Graph + Convolution: Homophily Perspective + + +
+ How would randomly shuffling feature vectors among nodes from the same class +affect graph neural networks (GNNs)? The feature shuffle, intuitively, perturbs +the dependence between graph topology and features (A-X dependence) for GNNs to +learn from. Surprisingly, we observe a consistent and significant improvement +in GNN performance following the feature shuffle. Having overlooked the impact +of A-X dependence on GNNs, the prior literature does not provide a satisfactory +understanding of the phenomenon. Thus, we raise two research questions. First, +how should A-X dependence be measured, while controlling for potential +confounds? Second, how does A-X dependence affect GNNs? In response, we (i) +propose a principled measure for A-X dependence, (ii) design a random graph +model that controls A-X dependence, (iii) establish a theory on how A-X +dependence relates to graph convolution, and (iv) present empirical analysis on +real-world graphs that aligns with the theory. We conclude that A-X dependence +mediates the effect of graph convolution, such that smaller dependence improves +GNN-based node classification. + +
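+ The feature shuffle itself is simple to state in code: permute feature vectors among nodes of the same class, leaving the graph and labels untouched (a minimal sketch).

```python
import torch

# Minimal sketch of the intra-class feature shuffle: feature vectors are
# permuted among nodes of the same class, preserving class-conditional
# feature distributions while perturbing the dependence between topology
# and features (A-X dependence).

def intra_class_shuffle(X: torch.Tensor, y: torch.Tensor, seed: int = 0):
    g = torch.Generator().manual_seed(seed)
    X_new = X.clone()
    for c in y.unique():
        idx = (y == c).nonzero(as_tuple=True)[0]
        perm = idx[torch.randperm(len(idx), generator=g)]
        X_new[idx] = X[perm]            # reassign features within the class
    return X_new

X = torch.randn(6, 4)
y = torch.tensor([0, 0, 0, 1, 1, 1])
X_shuf = intra_class_shuffle(X, y)
# Per-class feature multisets are unchanged; only their node assignment moves.
print(torch.allclose(X[y == 0].sum(0), X_shuf[y == 0].sum(0)))  # True
```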
+
+
+
+
+ + ☆ CataractBot: An LLM-Powered Expert-in-the-Loop Chatbot for Cataract + Patients + + +
+ The healthcare landscape is evolving, with patients seeking more reliable +information about their health conditions, treatment options, and potential +risks. Despite the abundance of information sources, the digital age overwhelms +individuals with excess, often inaccurate information. Patients primarily trust +doctors and hospital staff, highlighting the need for expert-endorsed health +information. However, the pressure on experts has led to reduced communication +time, impacting information sharing. To address this gap, we propose +CataractBot, an experts-in-the-loop chatbot powered by large language models +(LLMs). Developed in collaboration with a tertiary eye hospital in India, +CataractBot answers cataract surgery related questions instantly by querying a +curated knowledge base, and provides expert-verified responses asynchronously. +CataractBot features multimodal support and multilingual capabilities. In an +in-the-wild deployment study with 49 participants, CataractBot proved valuable, +providing anytime accessibility, saving time, and accommodating diverse +literacy levels. Trust was established through expert verification. Broadly, +our results could inform future work on designing expert-mediated LLM bots. + +
+
+
+
+
+ + ☆ InfLLM: Unveiling the Intrinsic Capacity of LLMs for Understanding + Extremely Long Sequences with Training-Free Memory + + +
+ Large language models (LLMs) have emerged as a cornerstone in real-world applications with lengthy streaming inputs, such as LLM-driven agents. However, existing LLMs, pre-trained on sequences with restricted maximum length, cannot generalize to longer sequences due to the out-of-domain and distraction issues. To alleviate these issues, existing efforts employ sliding attention windows and discard distant tokens to achieve the processing of extremely long sequences. Unfortunately, these approaches inevitably fail to capture long-distance dependencies within sequences to deeply understand semantics. This paper introduces a training-free memory-based method, InfLLM, to unveil the intrinsic ability of LLMs to process streaming long sequences. Specifically, InfLLM stores distant contexts into additional memory units and employs an efficient mechanism to look up token-relevant units for attention computation. Thereby, InfLLM allows LLMs to efficiently process long sequences while maintaining the ability to capture long-distance dependencies. Without any training, InfLLM enables LLMs pre-trained on sequences of a few thousand tokens to achieve superior performance compared to competitive baselines that continually train these LLMs on long sequences. Even when the sequence length is scaled to $1,024$K, InfLLM still effectively captures long-distance dependencies.
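+ A minimal sketch of the memory-unit lookup: distant key-value pairs are chunked into units, each summarized by a representative, and a query attends only to its top-k relevant units. The mean-key representative below is a simplifying assumption; InfLLM's unit representatives and cache management are more involved.

```python
import torch

# Minimal sketch of memory-unit retrieval: distant context is chunked
# into units summarized by a representative key, and each query attends
# only to its top-k most relevant units (simplified illustration).

def retrieve_units(query, keys, values, unit_len=16, topk=2):
    # Split the distant context into units, one representative per unit.
    n = keys.shape[0] // unit_len * unit_len
    unit_keys = keys[:n].reshape(-1, unit_len, keys.shape[-1])
    unit_vals = values[:n].reshape(-1, unit_len, values.shape[-1])
    reps = unit_keys.mean(dim=1)                   # mean key as representative
    scores = reps @ query                          # relevance of each unit
    chosen = scores.topk(topk).indices
    return (unit_keys[chosen].reshape(-1, keys.shape[-1]),
            unit_vals[chosen].reshape(-1, values.shape[-1]))

d = 32
keys, values = torch.randn(512, d), torch.randn(512, d)
query = torch.randn(d)
k_sel, v_sel = retrieve_units(query, keys, values)
attn = torch.softmax(k_sel @ query / d ** 0.5, dim=0)
out = attn @ v_sel                                 # attend over selected units only
print(out.shape)
```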
+
+
+
+
+ + ☆ TinyLLM: Learning a Small Student from Multiple Large Language Models + + +
+ Transferring the reasoning capability from stronger large language models +(LLMs) to smaller ones has been quite appealing, as smaller LLMs are more +flexible to deploy with less expense. Among the existing solutions, knowledge +distillation stands out due to its outstanding efficiency and generalization. +However, existing methods suffer from several drawbacks, including limited +knowledge diversity and the lack of rich contextual information. To solve the +problems and facilitate the learning of compact language models, we propose +TinyLLM, a novel knowledge distillation paradigm to learn a small student LLM +from multiple large teacher LLMs. In particular, we encourage the student LLM +to not only generate the correct answers but also understand the rationales +behind these answers. Given that different LLMs possess diverse reasoning +skills, we guide the student model to assimilate knowledge from various teacher +LLMs. We further introduce an in-context example generator and a +teacher-forcing Chain-of-Thought strategy to ensure that the rationales are +accurate and grounded in contextually appropriate scenarios. Extensive +experiments on six datasets across two reasoning tasks demonstrate the +superiority of our method. Results show that TinyLLM can outperform large +teacher LLMs significantly, despite having a considerably smaller model size. + +
+
+
+
+
+ + ☆ Wasserstein Gradient Flows for Moreau Envelopes of f-Divergences in + Reproducing Kernel Hilbert Spaces + + +
+ Most commonly used $f$-divergences of measures, e.g., the Kullback-Leibler +divergence, are subject to limitations regarding the support of the involved +measures. A remedy consists of regularizing the $f$-divergence by a squared +maximum mean discrepancy (MMD) associated with a characteristic kernel $K$. In +this paper, we use the so-called kernel mean embedding to show that the +corresponding regularization can be rewritten as the Moreau envelope of some +function in the reproducing kernel Hilbert space associated with $K$. Then, we +exploit well-known results on Moreau envelopes in Hilbert spaces to prove +properties of the MMD-regularized $f$-divergences and, in particular, their +gradients. Subsequently, we use our findings to analyze Wasserstein gradient +flows of MMD-regularized $f$-divergences. Finally, we consider Wasserstein +gradient flows starting from empirical measures and provide +proof-of-the-concept numerical examples with Tsallis-$\alpha$ divergences. + +
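+ For convenience, the standard definition of the Moreau envelope that the abstract refers to (restated, not a new result):

```latex
% Moreau envelope of a proper, lower semicontinuous function f on a
% Hilbert space H, with parameter \lambda > 0:
\[
  \operatorname{env}_{\lambda f}(x)
    \;=\; \inf_{y \in \mathcal{H}}
      \Big( f(y) + \tfrac{1}{2\lambda}\,\lVert x - y \rVert_{\mathcal{H}}^{2} \Big).
\]
```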
+
+ comment: 42 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ DiSK: A Diffusion Model for Structured Knowledge + + +
+ Structured (dictionary-like) data presents challenges for left-to-right +language models, as they can struggle with structured entities for a wide +variety of reasons such as formatting and sensitivity to the order in which +attributes are presented. Tabular generative models suffer from a different set +of limitations such as their lack of flexibility. We introduce Diffusion Models +of Structured Knowledge (DiSK) - a new architecture and training approach +specialized for structured data. DiSK handles text, categorical, and continuous +numerical data using a Gaussian mixture model approach, which allows for +improved precision when dealing with numbers. It employs diffusion training to +model relationships between properties. Experiments demonstrate DiSK's +state-of-the-art performance on tabular data modeling, synthesis, and +imputation on over 15 datasets across diverse domains. DiSK provides an +effective inductive bias for generative modeling and manipulation of structured +data. The techniques we propose could open the door to improved knowledge +manipulation in future language models. + +
+
+ comment: 24 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Factorized Explainer for Graph Neural Networks AAAI 24 + + +
+ Graph Neural Networks (GNNs) have received increasing attention due to their +ability to learn from graph-structured data. To open the black-box of these +deep learning models, post-hoc instance-level explanation methods have been +proposed to understand GNN predictions. These methods seek to discover +substructures that explain the prediction behavior of a trained GNN. In this +paper, we show analytically that for a large class of explanation tasks, +conventional approaches, which are based on the principle of graph information +bottleneck (GIB), admit trivial solutions that do not align with the notion of +explainability. Instead, we argue that a modified GIB principle may be used to +avoid the aforementioned trivial solutions. We further introduce a novel +factorized explanation model with theoretical performance guarantees. The +modified GIB is used to analyze the structural properties of the proposed +factorized explainer. We conduct extensive experiments on both synthetic and +real-world datasets to validate the effectiveness of our proposed factorized +explainer. + +
+
+ comment: AAAI 24 +
+
+
+
+
+ + ♻ ☆ A Differentiable Partially Observable Generalized Linear Model with + Forward-Backward Message Passing + + +
+ The partially observable generalized linear model (POGLM) is a powerful tool for understanding neural connectivity under the assumption of existing hidden neurons. With spike trains recorded only from visible neurons, existing works use variational inference (VI) to learn the POGLM, which highlights the difficulty of learning this latent variable model. There are two main issues: (1) the sampled Poisson hidden spike count hinders the use of the pathwise gradient estimator in VI; and (2) the existing design of the variational model is neither expressive nor time-efficient, which further affects the performance. For (1), we propose a new differentiable POGLM, which enables the pathwise gradient estimator, better than the score function gradient estimator used in existing works. For (2), we propose a forward-backward message-passing sampling scheme for the variational model. Comprehensive experiments show that our differentiable POGLMs with forward-backward message passing produce better performance on one synthetic and two real-world datasets. Furthermore, our new method yields more interpretable parameters, underscoring its significance in neuroscience.
+
+
+
+
+ + ♻ ☆ PAGAR: Taming Reward Misalignment in Inverse Reinforcement + Learning-Based Imitation Learning with Protagonist Antagonist Guided + Adversarial Reward + + +
+ Many imitation learning (IL) algorithms employ inverse reinforcement learning +(IRL) to infer the intrinsic reward function that an expert is implicitly +optimizing for based on their demonstrated behaviors. However, in practice, +IRL-based IL can fail to accomplish the underlying task due to a misalignment +between the inferred reward and the objective of the task. In this paper, we +address the susceptibility of IL to such misalignment by introducing a +semi-supervised reward design paradigm called Protagonist Antagonist Guided +Adversarial Reward (PAGAR). PAGAR-based IL trains a policy to perform well +under mixed reward functions instead of a single reward function as in +IRL-based IL. We identify the theoretical conditions under which PAGAR-based IL +can avoid the task failures caused by reward misalignment. We also present a +practical on-and-off policy approach to implementing PAGAR-based IL. +Experimental results show that our algorithm outperforms standard IL baselines +in complex tasks and challenging transfer settings. + +
+
+
+
+
+ + ♻ ☆ A General Theory for Kernel Packets: from state space model to compactly + supported basis + + +
+ It is well known that the state space (SS) model formulation of a Gaussian process (GP) can lower both its training and prediction time to O(n) for n data points. We prove that an $m$-dimensional SS model formulation of a GP is equivalent to a concept we introduce as the general right Kernel Packet (KP): a transformation of the GP covariance function $K$ such that $\sum_{i=0}^{m}a_iD_t^{(j)}K(t,t_i)=0$ holds for any $t \leq t_1$, $0 \leq j \leq m-1$, and $m+1$ consecutive points $t_i$, where ${D}_t^{(j)}f(t)$ denotes the $j$-th order derivative acting on $t$. We extend this idea to the backward SS model formulation of the GP, leading to the concept of the left KP for the next $m$ consecutive points: $\sum_{i=0}^{m}b_i{D}_t^{(j)}K(t,t_{m+i})=0$ for any $t\geq t_{2m}$. By combining both left and right KPs, we can prove that a suitable linear combination of these covariance functions yields $m$ compactly supported KP functions: $\phi^{(j)}(t)=0$ for any $t\not\in(t_0,t_{2m})$ and $j=0,\cdots,m-1$. KPs further reduce the prediction time of GPs to O(log n) or even O(1), can be applied to more general problems involving the derivative of GPs, and have multi-dimensional generalizations for scattered data.
+
+
+
+
+ + ♻ ☆ Solving Large-scale Spatial Problems with Convolutional Neural Networks + + +
+ Over the past decade, deep learning research has been accelerated by +increasingly powerful hardware, which facilitated rapid growth in the model +complexity and the amount of data ingested. This is becoming unsustainable and +therefore refocusing on efficiency is necessary. In this paper, we employ +transfer learning to improve training efficiency for large-scale spatial +problems. We propose that a convolutional neural network (CNN) can be trained +on small windows of signals, but evaluated on arbitrarily large signals with +little to no performance degradation, and provide a theoretical bound on the +resulting generalization error. Our proof leverages shift-equivariance of CNNs, +a property that is underexploited in transfer learning. The theoretical results +are experimentally supported in the context of mobile infrastructure on demand +(MID). The proposed approach is able to tackle MID at large scales with +hundreds of agents, which was computationally intractable prior to this work. + +
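+ The train-small, evaluate-large pattern enabled by shift-equivariance can be sketched in a toy 1-D setting; the network and signal below are illustrative assumptions, not the paper's MID setup.

```python
import torch
import torch.nn as nn

# Minimal sketch of the transfer idea: a fully-convolutional network fit
# on short windows applies directly to a much longer signal, because
# convolutions are shift-equivariant (toy 1-D setup).

net = nn.Sequential(
    nn.Conv1d(1, 16, kernel_size=5, padding=2), nn.ReLU(),
    nn.Conv1d(16, 1, kernel_size=5, padding=2),
)

# Training-time shapes: short windows of the signal.
windows = torch.randn(32, 1, 64)
out_small = net(windows)                   # (32, 1, 64)

# Evaluation: the same weights applied to an arbitrarily long signal.
long_signal = torch.randn(1, 1, 4096)
out_large = net(long_signal)               # (1, 1, 4096), no retraining
print(out_small.shape, out_large.shape)
```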
+
+ comment: 6 pages, 2 figures, submitted to Asilomar Conference on Signals, + Systems, and Computers 2023 +
+
+
+
+
+ + ♻ ☆ High-dimensional and Permutation Invariant Anomaly Detection + + +
+ Methods for anomaly detection of new physics processes are often limited to +low-dimensional spaces due to the difficulty of learning high-dimensional +probability densities. Particularly at the constituent level, incorporating +desirable properties such as permutation invariance and variable-length inputs +becomes difficult within popular density estimation methods. In this work, we +introduce a permutation-invariant density estimator for particle physics data +based on diffusion models, specifically designed to handle variable-length +inputs. We demonstrate the efficacy of our methodology by utilizing the learned +density as a permutation-invariant anomaly detection score, effectively +identifying jets with low likelihood under the background-only hypothesis. To +validate our density estimation method, we investigate the ratio of learned +densities and compare to those obtained by a supervised classification +algorithm. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Empirical Risk Minimization with Shuffled SGD: A Primal-Dual Perspective + and Improved Bounds + + +
+ Stochastic gradient descent (SGD) is perhaps the most prevalent optimization +method in modern machine learning. Contrary to the empirical practice of +sampling from the datasets without replacement and with (possible) reshuffling +at each epoch, the theoretical counterpart of SGD usually relies on the +assumption of sampling with replacement. It is only very recently that SGD with +sampling without replacement -- shuffled SGD -- has been analyzed. For convex +finite sum problems with $n$ components and under the $L$-smoothness assumption +for each component function, there are matching upper and lower bounds, under +sufficiently small -- $\mathcal{O}(\frac{1}{nL})$ -- step sizes. Yet those +bounds appear too pessimistic -- in fact, the predicted performance is +generally no better than for full gradient descent -- and do not agree with the +empirical observations. In this work, to narrow the gap between the theory and +practice of shuffled SGD, we sharpen the focus from general finite sum problems +to empirical risk minimization with linear predictors. This allows us to take a +primal-dual perspective and interpret shuffled SGD as a primal-dual method with +cyclic coordinate updates on the dual side. Leveraging this perspective, we +prove fine-grained complexity bounds that depend on the data matrix and are +never worse than what is predicted by the existing bounds. Notably, our bounds +predict much faster convergence than the existing analyses -- by a factor of +the order of $\sqrt{n}$ in some cases. We empirically demonstrate that on +common machine learning datasets our bounds are indeed much tighter. We further +extend our analysis to nonsmooth convex problems and more general finite-sum +problems, with similar improvements. + +
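+ For reference, shuffled SGD itself is straightforward: one pass per epoch over
+a fresh permutation of the data. A minimal sketch for ERM with a linear
+predictor (ridge-regularized logistic regression on synthetic data; step size
+and problem sizes are illustrative):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, d = 1000, 20
+X = rng.standard_normal((n, d))
+y = np.sign(X @ rng.standard_normal(d) + 0.1 * rng.standard_normal(n))
+
+w, step, lam = np.zeros(d), 0.05, 1e-3
+for epoch in range(20):
+    for i in rng.permutation(n):              # one without-replacement pass
+        margin = y[i] * (X[i] @ w)
+        grad = -y[i] * X[i] / (1.0 + np.exp(margin)) + lam * w
+        w -= step * grad
+print("train accuracy:", np.mean(np.sign(X @ w) == y))
+```
+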
+
+
+
+
+ + ♻ ☆ Large Multi-Modal Models (LMMs) as Universal Foundation Models for + AI-Native Wireless Systems + + +
+ Large language models (LLMs) and foundation models have recently been touted
+as a game-changer for 6G systems. However, recent efforts on LLMs for wireless
+networks are limited to a direct application of existing language models that
+were designed for natural language processing (NLP) applications. To address
+this challenge and create wireless-centric foundation models, this paper
+presents a comprehensive vision of how to design universal foundation models
+that are tailored towards the deployment of artificial intelligence (AI)-native
+networks. Diverging from NLP-based foundation models, the proposed framework
+promotes the design of large multi-modal models (LMMs) fostered by three key
+capabilities: 1) processing of multi-modal sensing data, 2) grounding of
+physical symbol representations in real-world wireless systems using causal
+reasoning and retrieval-augmented generation (RAG), and 3) enabling
+instructibility from wireless environment feedback to facilitate dynamic
+network adaptation thanks to logical and mathematical reasoning facilitated by
+neuro-symbolic AI. In essence, these properties enable the proposed LMM
+framework to build universal capabilities that cater to various cross-layer
+networking tasks and alignment of intents across different domains. Preliminary
+results from experimental evaluation demonstrate the efficacy of grounding
+using RAG in LMMs, and showcase the alignment of LMMs with wireless system
+designs. Furthermore, the enhanced rationale exhibited in the responses to
+mathematical questions by LMMs, compared to vanilla LLMs, demonstrates the
+logical and mathematical reasoning capabilities inherent in LMMs. Building on
+those results, we present a series of open questions and challenges for LMMs.
+We then conclude with a set of recommendations that pave the way towards
+LMM-empowered AI-native systems.
+
+
+
+
+
+
+ + ♻ ☆ Efficient Numerical Wave Propagation Enhanced By An End-to-End Deep + Learning Model + + +
+ In a variety of scientific and engineering domains, the need for
+high-fidelity and efficient solutions for high-frequency wave propagation holds
+great significance. Recent advances in wave modeling use sufficiently accurate
+fine solver outputs to train a neural network that enhances the accuracy of a
+fast but inaccurate coarse solver. A stable and fast solver allows the use of
+Parareal, a parallel-in-time algorithm, to correct high-frequency wave
+components. In this paper, we build upon the work of Nguyen and Tsai (2023) and
+present a unified system that integrates a numerical solver with a neural
+network into an end-to-end framework. In the proposed setting, we investigate
+refinements to the deep learning architecture, the data generation algorithm,
+and the Parareal scheme. Our results show that the cohesive structure improves
+performance without sacrificing speed, and demonstrate the importance of
+temporal dynamics, as well as Parareal, for accurate wave propagation.
+
+
+
+
+
+
+ + ♻ ☆ Deep Fusion: Efficient Network Training via Pre-trained Initializations + + +
+ In recent years, deep learning has made remarkable progress in a wide range +of domains, with a particularly notable impact on natural language processing +tasks. One of the challenges associated with training deep neural networks in +the context of LLMs is the need for large amounts of computational resources +and time. To mitigate this, network growing algorithms offer potential cost +savings, but their underlying mechanisms are poorly understood. We present two +notable contributions in this paper. First, we present Deep Fusion, an +efficient approach to network training that leverages pre-trained +initializations of smaller networks. Second, we propose a theoretical framework +using backward error analysis to illustrate the dynamics of mid-training +network growth. Our experiments show how Deep Fusion is a practical and +effective approach that not only accelerates the training process but also +reduces computational requirements, maintaining or surpassing traditional +training methods' performance in various NLP tasks and T5 model sizes. Finally, +we validate our theoretical framework, which guides the optimal use of Deep +Fusion, showing that with carefully optimized training dynamics, it +significantly reduces both training time and resource consumption. + +
+
+
+
+
+ + ♻ ☆ When Analytic Calculus Cracks AdaBoost Code + + +
+ The principle of boosting in supervised learning involves combining multiple
+weak classifiers to obtain a stronger classifier. AdaBoost has the reputation
+of being a perfect example of this approach.
+ This study analyzes the (two-class) AdaBoost procedure implemented in
+scikit-learn.
+ This paper shows that AdaBoost is an algorithm in name only, as the resulting
+combination of weak classifiers can be explicitly calculated using a truth
+table.
+ Indeed, using a logical analysis of the training set in which the weak
+classifiers construct a truth table, we recover, through an analytical formula,
+the weights of the combination of these weak classifiers obtained by the
+procedure.
+ We observe that this formula does not give the minimizer of the risk; we
+provide a system to compute the exact minimizer, and we check that the AdaBoost
+procedure in scikit-learn does not implement the algorithm described by Freund
+and Schapire.
+
+
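+ For context, the procedure under analysis follows the classical discrete
+AdaBoost update. The sketch below implements that textbook update with decision
+stumps so the weight formula being discussed is explicit; it is not the paper's
+closed-form truth-table reconstruction, and the data are illustrative:
+
+```python
+import numpy as np
+from sklearn.tree import DecisionTreeClassifier
+
+rng = np.random.default_rng(0)
+X = rng.standard_normal((200, 2))
+y = np.where(X[:, 0] * X[:, 1] > 0, 1, -1)    # XOR-like labels in {-1, +1}
+
+w = np.full(len(y), 1.0 / len(y))             # example weights
+stumps, alphas = [], []
+for t in range(10):
+    stump = DecisionTreeClassifier(max_depth=1).fit(X, y, sample_weight=w)
+    pred = stump.predict(X)
+    eps = w[pred != y].sum()                  # weighted training error
+    alpha = 0.5 * np.log((1.0 - eps) / eps)   # weak-classifier weight
+    w *= np.exp(-alpha * y * pred)            # re-weight the examples
+    w /= w.sum()
+    stumps.append(stump)
+    alphas.append(alpha)
+
+F = sum(a * s.predict(X) for a, s in zip(alphas, stumps))
+print("training accuracy:", np.mean(np.sign(F) == y))
+```
+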
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Compact Binary Systems Waveform Generation with Generative Pre-trained + Transformer + + +
+ Space-based gravitational wave detection is one of the most anticipated
+gravitational wave (GW) detection projects of the next decade, and it promises
+to detect abundant compact binary systems. However, the precise prediction of
+space GW waveforms remains unexplored. To address the data-processing
+difficulty posed by the increasing waveform complexity caused by the detectors'
+response and second-generation time-delay interferometry (TDI 2.0), an
+interpretable pre-trained large model named CBS-GPT (Compact Binary Systems
+Waveform Generation with Generative Pre-trained Transformer) is proposed. For
+compact binary system waveforms, three models were trained to predict the
+waveforms of massive black hole binaries (MBHB), extreme mass-ratio inspirals
+(EMRIs), and galactic binaries (GB), achieving maximum prediction accuracies of
+99%, 91%, and 99%, respectively. The CBS-GPT model exhibits notable
+generalization and interpretability, with its hidden parameters effectively
+capturing the intricate information of waveforms, even with complex instrument
+response and a wide parameter range. Our research demonstrates the potential of
+large pre-trained models in the gravitational wave realm, opening up new
+opportunities and guidance for future research such as complex waveform
+generation, gap completion, and deep learning model design for GW science.
+
+
+
+
+
+
+ + ♻ ☆ Multivariate Probabilistic Time Series Forecasting with Correlated + Errors + + +
+ Modeling the correlations among errors is closely associated with how
+accurately the model can quantify predictive uncertainty in probabilistic time
+series forecasting. Recent multivariate models have made significant progress
+in accounting for contemporaneous correlations among errors, while a common
+assumption on these errors is that they are temporally independent for the sake
+of statistical simplicity. However, real-world observations often deviate from
+this assumption, since errors usually exhibit substantial autocorrelation due
+to various factors such as the exclusion of temporally correlated covariates.
+In this work, we propose an efficient method, based on a low-rank-plus-diagonal
+parameterization of the covariance matrix, which can effectively characterize
+the autocorrelation of errors. The proposed method possesses several desirable
+properties: the complexity does not scale with the number of time series, the
+resulting covariance can be used for calibrating predictions, and it can
+seamlessly integrate with any model with Gaussian-distributed errors. We
+empirically demonstrate these properties using two distinct neural forecasting
+models: GPVar and Transformer. Our experimental results confirm the
+effectiveness of our method in enhancing predictive accuracy and the quality of
+uncertainty quantification on multiple real-world datasets.
+
+
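+ The low-rank-plus-diagonal parameterization is what keeps such a method
+efficient: with C = diag(d) + U U^T, the Gaussian log-likelihood can be
+evaluated in O(N r^2) time via the Woodbury identity and the matrix determinant
+lemma. A generic sketch of that computation (shapes and values are
+illustrative, not the authors' training pipeline):
+
+```python
+import numpy as np
+from scipy.stats import multivariate_normal
+
+def lowrank_gauss_logpdf(e, U, d):
+    """log N(e; 0, diag(d) + U U^T) without forming the N x N covariance."""
+    N, r = U.shape
+    Dinv_e = e / d                              # D^{-1} e
+    Dinv_U = U / d[:, None]                     # D^{-1} U
+    cap = np.eye(r) + U.T @ Dinv_U              # capacitance matrix (r x r)
+    sol = np.linalg.solve(cap, U.T @ Dinv_e)
+    quad = e @ Dinv_e - (U.T @ Dinv_e) @ sol    # e^T C^{-1} e  (Woodbury)
+    logdet = np.log(d).sum() + np.linalg.slogdet(cap)[1]
+    return -0.5 * (quad + logdet + N * np.log(2.0 * np.pi))
+
+rng = np.random.default_rng(0)
+N, r = 500, 4
+U = 0.3 * rng.standard_normal((N, r))
+d = 0.5 + rng.random(N)
+e = rng.standard_normal(N)                      # e.g., a vector of forecast errors
+print(lowrank_gauss_logpdf(e, U, d))
+# Dense check on this small example:
+print(multivariate_normal(cov=np.diag(d) + U @ U.T).logpdf(e))
+```
+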
+
+ comment: This paper extends the work presented in arXiv:2305.17028 to a + multivariate setting +
+
+
+
+
+ + ♻ ☆ Statistical Guarantees for Link Prediction using Graph Neural Networks + + +
+ This paper derives statistical guarantees for the performance of Graph Neural +Networks (GNNs) in link prediction tasks on graphs generated by a graphon. We +propose a linear GNN architecture (LG-GNN) that produces consistent estimators +for the underlying edge probabilities. We establish a bound on the mean squared +error and give guarantees on the ability of LG-GNN to detect high-probability +edges. Our guarantees hold for both sparse and dense graphs. Finally, we +demonstrate some of the shortcomings of the classical GCN architecture, as well +as verify our results on real and synthetic datasets. + +
+
+
+
+
+ + ♻ ☆ Heuristic Optimal Transport in Branching Networks + + +
+ Optimal transport aims to learn a mapping of sources to targets by minimizing
+the cost, which is typically defined as a function of distance. The solution to
+this problem consists of straight line segments optimally connecting sources to
+targets, and it does not exhibit branching. These optimal solutions are in
+stark contrast with both natural and man-made transportation networks, where
+branching structures are prevalent. Here we discuss a fast heuristic branching
+method for optimal transport in networks. We also provide several numerical
+applications to synthetic examples, a simplified cardiovascular network, and
+the "Santa Claus" distribution network, which includes 141,182 cities around
+the world with known location and population.
+
+
+
+ comment: Accepted in Int. J. Mod. Phys. C, 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Vector Quantile Regression on Manifolds + + +
+ Quantile regression (QR) is a statistical tool for distribution-free
+estimation of conditional quantiles of a target variable given explanatory
+features. QR is limited by the assumption that the target distribution is
+univariate and defined on a Euclidean domain. Although the notion of quantiles
+was recently extended to multi-variate distributions, QR for multi-variate
+distributions on manifolds remains underexplored, even though many important
+applications inherently involve data distributed on, e.g., spheres (climate and
+geological phenomena) and tori (dihedral angles in proteins). By leveraging
+optimal transport theory and c-concave functions, we meaningfully define
+conditional vector quantile functions of high-dimensional variables on
+manifolds (M-CVQFs). Our approach allows for quantile estimation, regression,
+and computation of conditional confidence sets and likelihoods. We demonstrate
+the approach's efficacy and provide insights regarding the meaning of
+non-Euclidean quantiles through synthetic and real data experiments.
+
+
+
+
+
+
+ + ♻ ☆ Score-based Conditional Generation with Fewer Labeled Data by + Self-calibrating Classifier Guidance + + +
+ Score-based generative models (SGMs) are a popular family of deep generative
+models that achieve leading image generation quality. Early studies extend SGMs
+to tackle class-conditional generation by coupling an unconditional SGM with
+the guidance of a trained classifier. Nevertheless, such classifier-guided SGMs
+do not always achieve accurate conditional generation, especially when trained
+with fewer labeled data. We argue that the problem is rooted in the
+classifier's tendency to overfit without coordinating with the underlying
+unconditional distribution. To make the classifier respect the unconditional
+distribution, we propose improving classifier-guided SGMs by letting the
+classifier regularize itself. The key idea of our proposed method is to use
+principles from energy-based models to convert the classifier into another view
+of the unconditional SGM. Existing losses for unconditional SGMs can then be
+leveraged to achieve regularization by calibrating the classifier's internal
+unconditional scores. The regularization scheme can be applied not only to
+labeled data but also to unlabeled data to further improve the classifier.
+Across various percentages of fewer labeled data, empirical results show that
+the proposed approach significantly enhances conditional generation quality.
+The enhancements confirm the potential of the proposed self-calibration
+technique for generative modeling with limited labeled data.
+
+
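+ The energy-based-model view referenced here is the standard one: a
+classifier's logits induce an unnormalized log-density log p(x) =
+logsumexp_y f_y(x), whose input gradient is an unconditional score that can
+then be calibrated. A minimal sketch, where the toy classifier and the
+placeholder target score are both illustrative assumptions:
+
+```python
+import torch
+import torch.nn as nn
+
+classifier = nn.Sequential(nn.Linear(2, 64), nn.SiLU(), nn.Linear(64, 10))
+
+x = torch.randn(4, 2, requires_grad=True)
+logits = classifier(x)                         # (batch, num_classes)
+log_px = torch.logsumexp(logits, dim=-1)       # unnormalized log-density of x
+score = torch.autograd.grad(log_px.sum(), x, create_graph=True)[0]
+
+# A self-calibration loss could then match this score to the unconditional
+# SGM's target score (here a hypothetical placeholder tensor).
+target_score = torch.zeros_like(score)
+calib_loss = ((score - target_score) ** 2).mean()
+calib_loss.backward()
+print(calib_loss.item())
+```
+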
+
+
+
+
+ + ♻ ☆ Continuous Monte Carlo Graph Search AAMAS 2024 + + +
+ Online planning is crucial for high performance in many complex sequential
+decision-making tasks. Monte Carlo Tree Search (MCTS) employs a principled
+mechanism for trading off exploration and exploitation for efficient online
+planning, and it outperforms comparison methods in many discrete
+decision-making domains such as Go, Chess, and Shogi. Subsequently, extensions
+of MCTS to continuous domains have been developed. However, the inherent high
+branching factor and the resulting explosion of the search tree size limit the
+existing methods. To address this problem, we propose Continuous Monte Carlo
+Graph Search (CMCGS), an extension of MCTS to online planning in environments
+with continuous state and action spaces. CMCGS takes advantage of the insight
+that, during planning, sharing the same action policy between several states
+can yield high performance. To implement this idea, at each time step, CMCGS
+clusters similar states into a limited number of stochastic action bandit
+nodes, which produce a layered directed graph instead of an MCTS search tree.
+Experimental evaluation shows that CMCGS outperforms comparable planning
+methods in several complex continuous DeepMind Control Suite benchmarks and 2D
+navigation and exploration tasks with limited sample budgets. Furthermore,
+CMCGS can be scaled up through parallelization, and it outperforms the
+Cross-Entropy Method (CEM) in continuous control with learned dynamics models.
+
+
+
+ comment: Accepted at AAMAS 2024 (full paper & oral) +
+
+
+
+
+ + ♻ ☆ Kaizen: Practical Self-supervised Continual Learning with Continual + Fine-tuning WACV + + +
+ Self-supervised learning (SSL) has shown remarkable performance in computer
+vision tasks when trained offline. However, in a Continual Learning (CL)
+scenario where new data is introduced progressively, models still suffer from
+catastrophic forgetting. Retraining a model from scratch to adapt to newly
+generated data is time-consuming and inefficient. Previous approaches suggested
+re-purposing self-supervised objectives with knowledge distillation to mitigate
+forgetting across tasks, assuming that labels from all tasks are available
+during fine-tuning. In this paper, we generalize self-supervised continual
+learning in a practical setting where available labels can be leveraged in any
+step of the SSL process. With an increasing number of continual tasks, this
+offers more flexibility in the pre-training and fine-tuning phases. With
+Kaizen, we introduce a training architecture that is able to mitigate
+catastrophic forgetting for both the feature extractor and classifier with a
+carefully designed loss function. By using a set of comprehensive evaluation
+metrics reflecting different aspects of continual learning, we demonstrate
+that Kaizen significantly outperforms previous SSL models in competitive vision
+benchmarks, with up to 16.5% accuracy improvement on split CIFAR-100. Kaizen is
+able to balance the trade-off between knowledge retention and learning from new
+data with an end-to-end model, paving the way for practical deployment of
+continual learning systems.
+
+
+
+ comment: Presented at IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2024. The code for this work is available at + https://github.com/dr-bell/kaizen +
+
+
+
+
+ + ♻ ☆ cDVGAN: One Flexible Model for Multi-class Gravitational Wave Signal and + Glitch Generation + + +
+ Simulating realistic time-domain observations of gravitational waves (GWs)
+and GW detector glitches can help in advancing GW data analysis. Simulated data
+can be used in downstream tasks by augmenting datasets for signal searches,
+balancing data sets for machine learning, and validating detection schemes. In
+this work, we present Conditional Derivative GAN (cDVGAN), a novel conditional
+model in the Generative Adversarial Network framework for simulating multiple
+classes of time-domain observations that represent GWs and detector glitches.
+cDVGAN can also generate generalized hybrid samples that span the variation
+between classes through interpolation in the conditioned class vector. cDVGAN
+introduces an additional player into the typical 2-player adversarial game of
+GANs, where an auxiliary discriminator analyzes the first-order derivative
+time-series. Our results show that this provides synthetic data that better
+captures the features of the original data. cDVGAN is conditioned on three
+classes: two denoised from LIGO blip and tomte glitch events recorded during
+its third observing run (O3), and a third representing binary black hole (BBH)
+mergers. Our proposed cDVGAN outperforms 4 different baseline GAN models in
+replicating the features of the three classes. Specifically, our experiments
+show that training convolutional neural networks (CNNs) with our
+cDVGAN-generated data improves the detection of samples embedded in detector
+noise beyond the synthetic data from other state-of-the-art GAN models. Our
+best synthetic dataset yields as much as a 4.2% increase in
+area-under-the-curve (AUC) performance compared to synthetic datasets from
+baseline GANs. Moreover, training the CNN with hybrid samples from our cDVGAN
+outperforms CNNs trained only on the standard classes, when identifying real
+samples embedded in LIGO detector background (4% AUC improvement for cDVGAN).
+
+
+
+
+
+
+ + ♻ ☆ Entropy-MCMC: Sampling from Flat Basins with Ease + + +
+ Bayesian deep learning depends on the quality of posterior distribution
+estimation. However, the posterior of deep neural networks is highly
+multi-modal in nature, with local modes exhibiting varying generalization
+performance. Given a practical budget, targeting the original posterior can
+lead to suboptimal performance, as some samples may become trapped in "bad"
+modes and suffer from overfitting. Leveraging the observation that "good" modes
+with low generalization error often reside in flat basins of the energy
+landscape, we propose to bias sampling on the posterior toward these flat
+regions. Specifically, we introduce an auxiliary guiding variable, the
+stationary distribution of which resembles a smoothed posterior free from sharp
+modes, to lead the MCMC sampler to flat basins. By integrating this guiding
+variable with the model parameter, we create a simple joint distribution that
+enables efficient sampling with minimal computational overhead. We prove the
+convergence of our method and further show that it converges faster than
+several existing flatness-aware methods in the strongly convex setting.
+Empirical results demonstrate that our method can successfully sample from flat
+basins of the posterior, and outperforms all compared baselines on multiple
+benchmarks including classification, calibration, and out-of-distribution
+detection.
+
+
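+ One way to picture the joint-sampling idea: couple the parameter theta with
+the guiding variable theta_a through a Gaussian term, pi(theta, theta_a) ~
+exp(-U(theta) - ||theta - theta_a||^2 / (2 eta)), and update both with Langevin
+steps. A 1-D toy sketch, where the energy, eta, and step size are illustrative
+choices rather than the paper's settings:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def grad_U(theta):
+    # Toy double-well energy U(theta) = theta^4 - 6 theta^2 - 2 theta.
+    return 4 * theta**3 - 12 * theta - 2.0
+
+eta, step, n_iter = 0.5, 1e-3, 50_000
+theta, theta_a = 0.0, 0.0
+samples = []
+for _ in range(n_iter):
+    g_theta = grad_U(theta) + (theta - theta_a) / eta   # joint-energy gradients
+    g_aux = (theta_a - theta) / eta
+    theta = theta - step * g_theta + np.sqrt(2 * step) * rng.standard_normal()
+    theta_a = theta_a - step * g_aux + np.sqrt(2 * step) * rng.standard_normal()
+    samples.append(theta)
+print("mean of theta after burn-in:", np.mean(samples[5000:]))
+```
+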
+
+
+
+
+ + ♻ ☆ Online Uniform Risk Times Sampling: First Approximation Algorithms, + Learning Augmentation with Full Confidence Interval Integration + + +
+ In digital health, the strategy of allocating a limited treatment budget
+across available risk times is crucial to reduce user fatigue. This strategy,
+however, encounters a significant obstacle due to the unknown actual number of
+risk times, a factor not adequately addressed by existing methods lacking
+theoretical guarantees. This paper introduces, for the first time, the online
+uniform risk times sampling problem within the approximation algorithm
+framework. We propose two online approximation algorithms for this problem, one
+with and one without learning augmentation, and provide rigorous theoretical
+performance guarantees for them using competitive ratio analysis. We assess the
+performance of our algorithms using both synthetic experiments and a real-world
+case study on the HeartSteps mobile application.
+
+
+
+
+
+
+ + ♻ ☆ Imitation Learning from Observation with Automatic Discount Scheduling ICLR 2024 + + +
+ Humans often acquire new skills through observation and imitation. For
+robotic agents, learning from the plethora of unlabeled video demonstration
+data available on the Internet necessitates imitating the expert without access
+to its actions, presenting a challenge known as Imitation Learning from
+Observations (ILfO). A common approach to tackle ILfO problems is to convert
+them into inverse reinforcement learning problems, utilizing a proxy reward
+computed from the agent's and the expert's observations. Nonetheless, we
+identify that tasks characterized by a progress dependency property pose
+significant challenges for such approaches; in these tasks, the agent needs to
+initially learn the expert's preceding behaviors before mastering the
+subsequent ones. Our investigation reveals that the main cause is that the
+reward signals assigned to later steps hinder the learning of initial
+behaviors. To address this challenge, we present a novel ILfO framework that
+enables the agent to master earlier behaviors before advancing to later ones.
+We introduce an Automatic Discount Scheduling (ADS) mechanism that adaptively
+alters the discount factor in reinforcement learning during the training phase,
+prioritizing earlier rewards initially and gradually engaging later rewards
+only when the earlier behaviors have been mastered. Our experiments, conducted
+on nine Meta-World tasks, demonstrate that our method significantly outperforms
+state-of-the-art methods across all tasks, including those that they cannot
+solve.
+
+
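+ In spirit, ADS amounts to raising the discount factor as a progress signal
+indicates that earlier behaviors have been mastered. A minimal sketch, where
+the linear schedule and the progress measure (fraction of the expert trajectory
+matched) are hypothetical stand-ins for the paper's criterion:
+
+```python
+def scheduled_gamma(progress, gamma_min=0.5, gamma_max=0.99):
+    """Map a progress estimate in [0, 1] to a discount factor."""
+    progress = min(max(progress, 0.0), 1.0)
+    return gamma_min + (gamma_max - gamma_min) * progress
+
+# Progress here is a hypothetical stand-in, e.g. the fraction of the expert
+# trajectory the agent currently matches.
+for matched_fraction in [0.0, 0.25, 0.5, 1.0]:
+    print(matched_fraction, "->", round(scheduled_gamma(matched_fraction), 3))
+```
+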
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Labeled Interactive Topic Models + + +
+ Topic models are valuable for understanding extensive document collections,
+but they do not always identify the most relevant topics. Classical
+probabilistic and anchor-based topic models offer interactive versions that
+allow users to guide the models towards more pertinent topics. However, such
+interactive features have been lacking in neural topic models. To correct this
+lacuna, we introduce a user-friendly interaction for neural topic models. This
+interaction permits users to assign a word label to a topic, leading to an
+update in the topic model where the words in the topic become closely aligned
+with the given label. Our approach encompasses two distinct kinds of neural
+topic models. The first includes models where topic embeddings are trainable
+and evolve during the training process. The second kind involves models where
+topic embeddings are integrated post-training, offering a different approach to
+topic refinement. To facilitate user interaction with these neural topic
+models, we have developed an interactive interface. This interface enables
+users to engage with and re-label topics as desired. We evaluate our method
+through a human study, where users can relabel topics to find relevant
+documents. Using our method, user labeling improves document rank scores,
+helping users find documents more relevant to a given query than they could
+without labeling.
+
+
+
+
+
+
+ + ♻ ☆ Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science + + +
+ Intelligent agents powered by large language models (LLMs) have demonstrated +substantial promise in autonomously conducting experiments and facilitating +scientific discoveries across various disciplines. While their capabilities are +promising, they also introduce novel vulnerabilities that demand careful +consideration for safety. However, there exists a notable gap in the +literature, as there has been no comprehensive exploration of these +vulnerabilities. This position paper fills this gap by conducting a thorough +examination of vulnerabilities in LLM-based agents within scientific domains, +shedding light on potential risks associated with their misuse and emphasizing +the need for safety measures. We begin by providing a comprehensive overview of +the potential risks inherent to scientific LLM agents, taking into account user +intent, the specific scientific domain, and their potential impact on the +external environment. Then, we delve into the origins of these vulnerabilities +and provide a scoping review of the limited existing works. Based on our +analysis, we propose a triadic framework involving human regulation, agent +alignment, and an understanding of environmental feedback (agent regulation) to +mitigate these identified risks. Furthermore, we highlight the limitations and +challenges associated with safeguarding scientific agents and advocate for the +development of improved models, robust benchmarks, and comprehensive +regulations to address these issues effectively. + +
+
+
+
+
+ + ♻ ☆ DS-MS-TCN: Otago Exercises Recognition with a Dual-Scale Multi-Stage + Temporal Convolutional Network + + +
+ The Otago Exercise Program (OEP) represents a crucial rehabilitation
+initiative tailored for older adults, aimed at enhancing balance and strength.
+Despite previous efforts utilizing wearable sensors for OEP recognition,
+existing studies have exhibited limitations in terms of accuracy and
+robustness. This study addresses these limitations by employing a single
+waist-mounted Inertial Measurement Unit (IMU) to recognize OEP exercises among
+community-dwelling older adults in their daily lives. A cohort of 36 older
+adults participated in laboratory settings, supplemented by an additional 7
+older adults recruited for at-home assessments. The study proposes a Dual-Scale
+Multi-Stage Temporal Convolutional Network (DS-MS-TCN) designed for two-level
+sequence-to-sequence classification, incorporating both levels into a single
+loss function. In the first stage, the model focuses on recognizing each
+repetition of the exercises (micro labels). Subsequent stages extend the
+recognition to encompass the complete range of exercises (macro labels). The
+DS-MS-TCN model surpasses existing state-of-the-art deep learning models,
+achieving f1-scores exceeding 80% and Intersection over Union (IoU) f1-scores
+surpassing 60% for all four exercises evaluated. Notably, the model outperforms
+a prior study that utilized the sliding-window technique, eliminating the need
+for post-processing stages and window-size tuning. To our knowledge, we are the
+first to propose enhancing Human Activity Recognition (HAR) systems through the
+recognition of each repetition of an activity.
+
+
+
+
+
+
+ + ♻ ☆ Defending Our Privacy With Backdoors + + +
+ The proliferation of large AI models trained on uncurated, often sensitive
+web-scraped data has raised significant privacy concerns. One of the concerns
+is that adversaries can extract information about the training data using
+privacy attacks. Unfortunately, the task of removing specific information from
+the models without sacrificing performance is not straightforward and has
+proven to be challenging. We propose a rather easy yet effective defense based
+on backdoor attacks to remove private information such as names and faces of
+individuals from vision-language models by fine-tuning them for only a few
+minutes instead of re-training them from scratch. Specifically, through
+strategic insertion of backdoors into text encoders, we align the embeddings of
+sensitive phrases with those of neutral terms: "a person" instead of the
+person's actual name. For image encoders, we map embeddings of individuals to
+be removed from the model to a universal, anonymous embedding. Our empirical
+results demonstrate the effectiveness of our backdoor-based defense on CLIP by
+assessing its performance using a specialized privacy attack for zero-shot
+classifiers. Our approach provides not only a new "dual-use" perspective on
+backdoor attacks, but also presents a promising avenue to enhance the privacy
+of individuals within models trained on uncurated web-scraped data.
+
+
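+ The text-encoder part of such a defense boils down to an embedding-alignment
+objective: pull the embedding of a sensitive phrase onto that of a neutral
+phrase. A toy sketch with a placeholder bag-of-words encoder; the vocabulary,
+the name "jane doe", and the architecture are illustrative, and a real defense
+would use a CLIP-style encoder and add a utility-preservation term:
+
+```python
+import torch
+import torch.nn as nn
+
+vocab = {"a": 0, "person": 1, "photo": 2, "of": 3, "jane": 4, "doe": 5}
+encoder = nn.Sequential(nn.EmbeddingBag(len(vocab), 32), nn.Linear(32, 16))
+
+def embed(tokens):
+    ids = torch.tensor([[vocab[t] for t in tokens]])
+    return encoder(ids)
+
+opt = torch.optim.Adam(encoder.parameters(), lr=1e-2)
+for _ in range(200):
+    opt.zero_grad()
+    target = embed(["a", "person"]).detach()   # neutral-term embedding
+    loss = ((embed(["jane", "doe"]) - target) ** 2).mean()
+    loss.backward()
+    opt.step()
+print("alignment loss:", loss.item())
+```
+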
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Guide to CAN IDS Data & Introduction of the ROAD Dataset + + +
+ Although ubiquitous in modern vehicles, Controller Area Networks (CANs) lack
+basic security properties and are easily exploitable. A rapidly growing field
+of CAN security research has emerged that seeks to detect intrusions on CANs.
+Producing vehicular CAN data with a variety of intrusions is out of reach for
+most researchers as it requires expensive assets and expertise. To assist
+researchers, we present the first comprehensive guide to the existing open CAN
+intrusion datasets, including a quality analysis of each dataset and an
+enumeration of each one's benefits, drawbacks, and suggested use case. Current
+public CAN IDS datasets are limited to real fabrication (simple message
+injection) attacks and simulated attacks, often in synthetic data, which lack
+fidelity. In general, the physical effects of attacks on the vehicle are not
+verified in the available datasets. Only one dataset provides signal-translated
+data but not a corresponding raw binary version. Overall, the available data
+pigeon-holes CAN IDS works into testing on limited, often inappropriate data
+(usually with attacks that are too easily detectable to truly test the method),
+and this lack of data has stymied comparability and reproducibility of results.
+As our primary contribution, we present the ROAD (Real ORNL Automotive
+Dynamometer) CAN Intrusion Dataset, consisting of over 3.5 hours of one
+vehicle's CAN data. ROAD contains ambient data recorded during a diverse set of
+activities, and attacks of increasing stealth with multiple variants and
+instances of real fuzzing, fabrication, and unique advanced attacks, as well as
+simulated masquerade attacks. To facilitate benchmarking CAN IDS methods that
+require signal-translated inputs, we also provide the signal time series format
+for many of the CAN captures. Our contributions aim to facilitate appropriate
+benchmarking and needed comparability in the CAN IDS field.
+
+
+
+ comment: title changed and author added from original version +
+
+
+
+
+ + ♻ ☆ Mixed Autoencoder for Self-supervised Visual Representation Learning CVPR 2023 + + +
+ Masked Autoencoder (MAE) has demonstrated superior performance on various
+vision tasks via randomly masking image patches and reconstruction. However,
+effective data augmentation strategies for MAE still remain open questions,
+unlike in contrastive learning, where augmentation serves as the most important
+component. This paper studies the prevailing mixing augmentation for MAE. We
+first demonstrate that naive mixing will, on the contrary, degrade model
+performance due to the increase of mutual information (MI). To address this, we
+propose homologous recognition, an auxiliary pretext task, not only to
+alleviate the increase in MI by explicitly requiring each patch to recognize
+homologous patches, but also to perform object-aware self-supervised
+pre-training for better downstream dense perception performance. With extensive
+experiments, we demonstrate that our proposed Mixed Autoencoder (MixedAE)
+achieves the state-of-the-art transfer results among masked image modeling
+(MIM) augmentations on different downstream tasks with significant efficiency.
+Specifically, our MixedAE outperforms MAE by +0.3% accuracy, +1.7 mIoU and +0.9
+AP on ImageNet-1K, ADE20K and COCO respectively with a standard ViT-Base.
+Moreover, MixedAE surpasses iBOT, a strong MIM method combined with instance
+discrimination, while accelerating training by 2x. To our best knowledge, this
+is the very first work to consider mixing for MIM from the perspective of
+pretext task design. Code will be made available.
+
+
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Localizing Anomalies in Critical Infrastructure using Model-Based Drift + Explanations + + +
+ Facing climate change, the already limited availability of drinking water +will decrease in the future rendering drinking water an increasingly scarce +resource. Considerable amounts of it are lost through leakages in water +transportation and distribution networks. Thus, anomaly detection and +localization, in particular for leakages, are crucial but challenging tasks due +to the complex interactions and changing demands in water distribution +networks. In this work, we analyze the effects of anomalies on the dynamics of +critical infrastructure systems by modeling the networks employing Bayesian +networks. We then discuss how the problem is connected to and can be considered +through the lens of concept drift. In particular, we argue that model-based +explanations of concept drift are a promising tool for localizing anomalies +given limited information about the network. The methodology is experimentally +evaluated using realistic benchmark scenarios. To showcase that our methodology +applies to critical infrastructure more generally, in addition to considering +leakages and sensor faults in water systems, we showcase the suitability of the +derived technique to localize sensor faults in power systems. + +
+
+
+
+
+ + ♻ ☆ Fully Hyperbolic Convolutional Neural Networks for Computer Vision + + +
+ Real-world visual data exhibit intrinsic hierarchical structures that can be
+represented effectively in hyperbolic spaces. Hyperbolic neural networks (HNNs)
+are a promising approach for learning feature representations in such spaces.
+However, current HNNs in computer vision rely on Euclidean backbones and only
+project features to the hyperbolic space in the task heads, limiting their
+ability to fully leverage the benefits of hyperbolic geometry. To address this,
+we present HCNN, a fully hyperbolic convolutional neural network (CNN) designed
+for computer vision tasks. Based on the Lorentz model, we generalize
+fundamental components of CNNs and propose novel formulations of the
+convolutional layer, batch normalization, and multinomial logistic regression.
+Experiments on standard vision tasks demonstrate the promising performance of
+our HCNN framework in both hybrid and fully hyperbolic settings. Overall, we
+believe our contributions provide a foundation for developing more powerful
+HNNs that can better represent complex structures found in image data. Our code
+is publicly available at https://github.com/kschwethelm/HyperbolicCV.
+
+
+
+
+
+
+ + ♻ ☆ Universal Jailbreak Backdoors from Poisoned Human Feedback ICLR 2024 + + +
+ Reinforcement Learning from Human Feedback (RLHF) is used to align large +language models to produce helpful and harmless responses. Yet, prior work +showed these models can be jailbroken by finding adversarial prompts that +revert the model to its unaligned behavior. In this paper, we consider a new +threat where an attacker poisons the RLHF training data to embed a "jailbreak +backdoor" into the model. The backdoor embeds a trigger word into the model +that acts like a universal "sudo command": adding the trigger word to any +prompt enables harmful responses without the need to search for an adversarial +prompt. Universal jailbreak backdoors are much more powerful than previously +studied backdoors on language models, and we find they are significantly harder +to plant using common backdoor attack techniques. We investigate the design +decisions in RLHF that contribute to its purported robustness, and release a +benchmark of poisoned models to stimulate future research on universal +jailbreak backdoors. + +
+
+ comment: Accepted as conference paper in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Recurrent Distance Filtering for Graph Representation Learning + + +
+ Graph neural networks based on iterative one-hop message passing have been +shown to struggle in harnessing the information from distant nodes effectively. +Conversely, graph transformers allow each node to attend to all other nodes +directly, but lack graph inductive bias and have to rely on ad-hoc positional +encoding. In this paper, we propose a new architecture to reconcile these +challenges. Our approach stems from the recent breakthroughs in long-range +modeling provided by deep state-space models on sequential data: for a given +target node, our model aggregates other nodes by their shortest distances to +the target and uses a linear RNN to encode the sequence of hop representations. +The linear RNN is parameterized in a particular diagonal form for stable +long-range signal propagation and is theoretically expressive enough to encode +the neighborhood hierarchy. With no need for positional encoding, we +empirically show that the performance of our model is highly competitive +compared with that of state-of-the-art graph transformers on various +benchmarks, with a significantly reduced computational cost. + +
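+ The core computation for one target node can be pictured as: group all other
+nodes by shortest-path distance to the target, average features within each
+hop, and run a stable (diagonal) linear RNN over the resulting hop sequence. A
+minimal sketch on a 6-cycle, where the dimensions, random weights, and the
+farthest-hop-first ordering are illustrative assumptions:
+
+```python
+import numpy as np
+from scipy.sparse.csgraph import shortest_path
+
+rng = np.random.default_rng(0)
+n, d = 6, 4
+A = np.array([[0, 1, 0, 0, 0, 1],
+              [1, 0, 1, 0, 0, 0],
+              [0, 1, 0, 1, 0, 0],
+              [0, 0, 1, 0, 1, 0],
+              [0, 0, 0, 1, 0, 1],
+              [1, 0, 0, 0, 1, 0]], dtype=float)   # a 6-cycle
+X = rng.standard_normal((n, d))                   # node features
+target = 0
+
+dist = shortest_path(A, unweighted=True)[target].astype(int)
+max_hop = dist.max()
+# Average the node features at each hop distance 0, 1, ..., max_hop.
+hops = np.stack([X[dist == k].mean(axis=0) for k in range(max_hop + 1)])
+
+lam = 0.9 * np.ones(d)                            # stable diagonal recurrence
+W = rng.standard_normal((d, d)) / np.sqrt(d)
+h = np.zeros(d)
+for k in range(max_hop, -1, -1):                  # farthest hop first
+    h = lam * h + hops[k] @ W                     # linear RNN step, no nonlinearity
+print("target-node representation:", h)
+```
+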
+
+
+
+
+ + ♻ ☆ How Far Can Fairness Constraints Help Recover From Biased Data? + + +
+ A general belief in fair classification is that fairness constraints incur a +trade-off with accuracy, which biased data may worsen. Contrary to this belief, +Blum & Stangl (2019) show that fair classification with equal opportunity +constraints even on extremely biased data can recover optimally accurate and +fair classifiers on the original data distribution. Their result is interesting +because it demonstrates that fairness constraints can implicitly rectify data +bias and simultaneously overcome a perceived fairness-accuracy trade-off. Their +data bias model simulates under-representation and label bias in +underprivileged population, and they show the above result on a stylized data +distribution with i.i.d. label noise, under simple conditions on the data +distribution and bias parameters. We propose a general approach to extend the +result of Blum & Stangl (2019) to different fairness constraints, data bias +models, data distributions, and hypothesis classes. We strengthen their result, +and extend it to the case when their stylized distribution has labels with +Massart noise instead of i.i.d. noise. We prove a similar recovery result for +arbitrary data distributions using fair reject option classifiers. We further +generalize it to arbitrary data distributions and arbitrary hypothesis classes, +i.e., we prove that for any data distribution, if the optimally accurate +classifier in a given hypothesis class is fair and robust, then it can be +recovered through fair classification with equal opportunity constraints on the +biased distribution whenever the bias parameters satisfy certain simple +conditions. Finally, we show applications of our technique to time-varying data +bias in classification and fair machine learning pipelines. + +
+
+
+
+
+ + ♻ ☆ Domain Adaptation based Interpretable Image Emotion Recognition using + Facial Expression Recognition + + +
+ A domain adaptation technique has been proposed in this paper to identify the
+emotions in generic images containing facial & non-facial objects and non-human
+components. It addresses the challenge of the insufficient availability of
+pre-trained models and well-annotated datasets for image emotion recognition
+(IER). It starts with proposing a facial emotion recognition (FER) system and
+then moves on to adapting it for image emotion recognition. First, a
+deep-learning-based FER system has been proposed that classifies a given facial
+image into discrete emotion classes. Further, an image emotion recognition
+system has been proposed that adapts the proposed FER system to recognize the
+emotions portrayed by images using domain adaptation. It classifies the generic
+images into 'happy,' 'sad,' 'hate,' and 'anger' classes. A novel
+interpretability approach, Divide and Conquer based Shap (DnCShap), has also
+been proposed to interpret the highly relevant visual features for emotion
+recognition. The proposed system's architecture has been decided through
+ablation studies, and the experiments are conducted on four FER and four IER
+datasets. The proposed IER system has shown an emotion classification accuracy
+of 59.61% for the IAPSa dataset, 57.83% for the ArtPhoto dataset, 67.93% for
+the FI dataset, and 55.13% for the EMOTIC dataset. The important visual
+features leading to a particular emotion class have been identified, and the
+embedding plots for various emotion classes have been analyzed to explain the
+proposed system's predictions.
+
+
+
+
+
+
+ + ♻ ☆ Deep Unrolling Networks with Recurrent Momentum Acceleration for + Nonlinear Inverse Problems + + +
+ Combining the strengths of model-based iterative algorithms and data-driven
+deep learning solutions, deep unrolling networks (DuNets) have become a popular
+tool to solve inverse imaging problems. While DuNets have been successfully
+applied to many linear inverse problems, nonlinear problems tend to impair the
+performance of the method. Inspired by momentum acceleration techniques that
+are often used in optimization algorithms, we propose a recurrent momentum
+acceleration (RMA) framework that uses a long short-term memory recurrent
+neural network (LSTM-RNN) to simulate the momentum acceleration process. The
+RMA module leverages the ability of the LSTM-RNN to learn and retain knowledge
+from the previous gradients. We apply RMA to two popular DuNets -- the learned
+proximal gradient descent (LPGD) and the learned primal-dual (LPD) methods,
+resulting in LPGD-RMA and LPD-RMA respectively. We provide experimental results
+on two nonlinear inverse problems: a nonlinear deconvolution problem, and an
+electrical impedance tomography problem with limited boundary measurements. In
+the first experiment, we observe that the improvement due to RMA grows with the
+nonlinearity of the problem. The results of the second example further
+demonstrate that the RMA schemes can significantly improve the performance of
+DuNets in strongly ill-posed problems.
+
+
+
+
+
+
+ + ♻ ☆ An Equivalence between Bayesian Priors and Penalties in Variational + Inference + + +
+ In machine learning, it is common to optimize the parameters of a +probabilistic model, modulated by an ad hoc regularization term that penalizes +some values of the parameters. Regularization terms appear naturally in +Variational Inference, a tractable way to approximate Bayesian posteriors: the +loss to optimize contains a Kullback--Leibler divergence term between the +approximate posterior and a Bayesian prior. We fully characterize the +regularizers that can arise according to this procedure, and provide a +systematic way to compute the prior corresponding to a given penalty. Such a +characterization can be used to discover constraints over the penalty function, +so that the overall procedure remains Bayesian. + +
+
+
+
+
+ + ♻ ☆ A Stable, Fast, and Fully Automatic Learning Algorithm for Predictive + Coding Networks + + +
+ Predictive coding networks are neuroscience-inspired models with roots in
+both Bayesian statistics and neuroscience. Training such models, however, is
+quite inefficient and unstable. In this work, we show that simply changing the
+temporal scheduling of the update rule for the synaptic weights leads to an
+algorithm that is much more efficient and stable than the original one, and has
+theoretical guarantees in terms of convergence. The proposed algorithm, which
+we call incremental predictive coding (iPC), is also more biologically
+plausible than the original one, as it is fully automatic. In an extensive set
+of experiments, we show that iPC consistently performs better than the original
+formulation on a large number of benchmarks for image classification, as well
+as for the training of both conditional and masked language models, in terms of
+test accuracy, efficiency, and convergence with respect to a large set of
+hyperparameters.
+
+
+
+ comment: Title and abstract changed to reflect the version accepted for
+ publication. One co-author was added, who performed the additional
+ experiments
+
+
+
+
+
+ + ♻ ☆ On Mitigating the Utility-Loss in Differentially Private Learning: A new + Perspective by a Geometrically Inspired Kernel Approach + + +
+ The privacy-utility tradeoff remains one of the fundamental issues of
+differentially private machine learning. This paper introduces a geometrically
+inspired kernel-based approach to mitigate the accuracy-loss issue in
+classification. In this approach, a representation of the affine hull of given
+data points is learned in Reproducing Kernel Hilbert Spaces (RKHS). This leads
+to a novel distance measure that hides privacy-sensitive information about
+individual data points and improves the privacy-utility tradeoff via
+significantly reducing the risk of membership inference attacks. The
+effectiveness of the approach is demonstrated through experiments on the MNIST
+dataset, the Freiburg groceries dataset, and a real biomedical dataset. It is
+verified that the approach remains computationally practical. The application
+of the approach to federated learning is considered and it is observed that the
+accuracy-loss due to data being distributed is either marginal or not
+significantly high.
+
+
+
+
+
+
+ + ♻ ☆ Adaptive Multi-Agent Deep Reinforcement Learning for Timely Healthcare + Interventions + + +
+ Effective patient monitoring is vital for timely interventions and improved +healthcare outcomes. Traditional monitoring systems often struggle to handle +complex, dynamic environments with fluctuating vital signs, leading to delays +in identifying critical conditions. To address this challenge, we propose a +novel AI-driven patient monitoring framework using multi-agent deep +reinforcement learning (DRL). Our approach deploys multiple learning agents, +each dedicated to monitoring a specific physiological feature, such as heart +rate, respiration, and temperature. These agents interact with a generic +healthcare monitoring environment, learn the patients' behaviour patterns, and +make informed decisions to alert the corresponding Medical Emergency Teams +(METs) based on the level of emergency estimated. In this study, we evaluate +the performance of the proposed multi-agent DRL framework using real-world +physiological and motion data from two datasets: PPG-DaLiA and WESAD. We +compare the results with several baseline models, including Q-Learning, PPO, +Actor-Critic, Double DQN, and DDPG, as well as monitoring frameworks like +WISEML and CA-MAQL. Our experiments demonstrate that the proposed DRL approach +outperforms all other baseline models, achieving more accurate monitoring of +patient's vital signs. Furthermore, we conduct hyperparameter optimization to +fine-tune the learning process of each agent. By optimizing hyperparameters, we +enhance the learning rate and discount factor, thereby improving the agents' +overall performance in monitoring patient health status. + +
+
+ comment: This work has been submitted to the ELSEVIER for possible + publication. Copyright may be transferred without notice, after which this + version may no longer be accessible. arXiv admin note: text overlap with + arXiv:2309.10576 +
+
+
+
+
+ + ♻ ☆ An objective comparison of methods for augmented reality in laparoscopic + liver resection by preoperative-to-intraoperative image fusion + + +
+ Augmented reality for laparoscopic liver resection is a visualisation mode +that allows a surgeon to localise tumours and vessels embedded within the liver +by projecting them on top of a laparoscopic image. Preoperative 3D models +extracted from CT or MRI data are registered to the intraoperative laparoscopic +images during this process. In terms of 3D-2D fusion, most of the algorithms +make use of anatomical landmarks to guide registration. These landmarks include +the liver's inferior ridge, the falciform ligament, and the occluding contours. +They are usually marked by hand in both the laparoscopic image and the 3D +model, which is time-consuming and may contain errors if done by a +non-experienced user. Therefore, there is a need to automate this process so +that augmented reality can be used effectively in the operating room. We +present the Preoperative-to-Intraoperative Laparoscopic Fusion Challenge +(P2ILF), held during the Medical Imaging and Computer Assisted Interventions +(MICCAI 2022) conference, which investigates the possibilities of detecting +these landmarks automatically and using them in registration. The challenge was +divided into two tasks: 1) A 2D and 3D landmark detection task and 2) a 3D-2D +registration task. The teams were provided with training data consisting of 167 +laparoscopic images and 9 preoperative 3D models from 9 patients, with the +corresponding 2D and 3D landmark annotations. A total of 6 teams from 4 +countries participated, whose proposed methods were evaluated on 16 images and +two preoperative 3D models from two patients. All the teams proposed deep +learning-based methods for the 2D and 3D landmark segmentation tasks and +differentiable rendering-based methods for the registration task. Based on the +experimental outcomes, we propose three key hypotheses that determine current +limitations and future directions for research in this domain. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Gated recurrent neural networks discover attention + + +
+ Recent architectural developments have enabled recurrent neural networks +(RNNs) to reach and even surpass the performance of Transformers on certain +sequence modeling tasks. These modern RNNs feature a prominent design pattern: +linear recurrent layers interconnected by feedforward paths with multiplicative +gating. Here, we show how RNNs equipped with these two design elements can +exactly implement (linear) self-attention, the main building block of +Transformers. By reverse-engineering a set of trained RNNs, we find that +gradient descent in practice discovers our construction. In particular, we +examine RNNs trained to solve simple in-context learning tasks on which +Transformers are known to excel and find that gradient descent instills in our +RNNs the same attention-based in-context learning algorithm used by +Transformers. Our findings highlight the importance of multiplicative +interactions in neural networks and suggest that certain RNNs might be +unexpectedly implementing attention under the hood. + +
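+ The construction being reverse-engineered can be checked in a few lines:
+causal linear self-attention y_t = (sum_{s<=t} v_s k_s^T) q_t is exactly a
+linear recurrence on a state S_t with a multiplicative readout. A sketch with
+random projections standing in for trained weights:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+T, d = 5, 3
+x = rng.standard_normal((T, d))
+Wq, Wk, Wv = (rng.standard_normal((d, d)) for _ in range(3))
+q, k, v = x @ Wq, x @ Wk, x @ Wv
+
+# (1) Causal linear attention computed directly: y_t = (sum_{s<=t} v_s k_s^T) q_t.
+y_direct = np.stack([sum(np.outer(v[s], k[s]) for s in range(t + 1)) @ q[t]
+                     for t in range(T)])
+
+# (2) The same computation as a linear recurrence with multiplicative readout.
+S = np.zeros((d, d))
+y_rec = []
+for t in range(T):
+    S = S + np.outer(v[t], k[t])    # linear state update
+    y_rec.append(S @ q[t])          # multiplicative interaction with the query
+print(np.allclose(y_direct, np.stack(y_rec)))   # True
+```
+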
+
+
+
+
+ + ♻ ☆ O$n$ Learning Deep O($n$)-Equivariant Hyperspheres + + +
+ In this paper, we utilize hyperspheres and regular $n$-simplexes and propose +an approach to learning deep features equivariant under the transformations of +$n$D reflections and rotations, encompassed by the powerful group of O$(n)$. +Namely, we propose O$(n)$-equivariant neurons with spherical decision surfaces +that generalize to any dimension $n$, which we call Deep Equivariant +Hyperspheres. We demonstrate how to combine them in a network that directly +operates on the basis of the input points and propose an invariant operator +based on the relation between two points and a sphere, which as we show, turns +out to be a Gram matrix. Using synthetic and real-world data in $n$D, we +experimentally verify our theoretical contributions and find that our approach +is superior to the competing methods for O$(n)$-equivariant benchmark datasets +(classification and regression), demonstrating a favorable speed/performance +trade-off. + +
+
+
+
+
+ + ♻ ☆ Looking for a better fit? An Incremental Learning Multimodal Object + Referencing Framework adapting to Individual Drivers SC + + +
+ The rapid advancement of the automotive industry towards automated and +semi-automated vehicles has rendered traditional methods of vehicle +interaction, such as touch-based and voice command systems, inadequate for a +widening range of non-driving related tasks, such as referencing objects +outside of the vehicle. Consequently, research has shifted toward gestural +input (e.g., hand, gaze, and head pose gestures) as a more suitable mode of +interaction during driving. However, due to the dynamic nature of driving and +individual variation, there are significant differences in drivers' gestural +input performance. While, in theory, this inherent variability could be +moderated by substantial data-driven machine learning models, prevalent +methodologies lean towards constrained, single-instance trained models for +object referencing. These models show a limited capacity to continuously adapt +to the divergent behaviors of individual drivers and the variety of driving +scenarios. To address this, we propose \textit{IcRegress}, a novel +regression-based incremental learning approach that adapts to changing behavior +and the unique characteristics of drivers engaged in the dual task of driving +and referencing objects. We suggest a more personalized and adaptable solution +for multimodal gestural interfaces, employing continuous lifelong learning to +enhance driver experience, safety, and convenience. Our approach was evaluated +using an outside-the-vehicle object referencing use case, highlighting the +superiority of the incremental learning models adapted over a single trained +model across various driver traits such as handedness, driving experience, and +numerous driving conditions. Finally, to facilitate reproducibility, ease +deployment, and promote further research, we offer our approach as an +open-source framework at \url{https://github.com/amrgomaaelhady/IcRegress}. + +
+
+ comment: Accepted for publication in the Proceedings of the 29th International + Conference on Intelligent User Interfaces (IUI'24), March 18--21, 2024, in + Greenville, SC, USA +
+
+
+
+
+ + ♻ ☆ Beyond Training Objectives: Interpreting Reward Model Divergence in + Large Language Models + + +
+ Large language models (LLMs) fine-tuned by reinforcement learning from human +feedback (RLHF) are becoming more widely deployed. We coin the term +$\textit{Implicit Reward Model}$ (IRM) to refer to the changes that occur to an +LLM during RLHF that result in high-reward generations. We interpret IRMs, and +measure their divergence from the RLHF reward model used in the fine-tuning +process that induced them. By fitting a linear function to an LLM's IRM, a +reward model with the same type signature as the RLHF reward model is +constructed, allowing for direct comparison. Additionally, we validate our +construction of the IRM through cross-comparison with classifications of +features generated by an LLM based on their relevance to the RLHF reward model. +Better comprehending IRMs can help minimize discrepancies between LLM behavior +and training objectives, which we believe to be an essential component of the +$\textit{safety}$ and $\textit{alignment}$ of LLMs. +
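+ A toy sketch of the linear-fit step, with made-up features and reward signals
+standing in for real LLM data (all names here are our own): fitting a linear
+function yields a reward model with the same features-to-scalar signature,
+which can then be compared directly against the RLHF reward model.
+
+ import numpy as np
+ from sklearn.linear_model import Ridge
+
+ rng = np.random.default_rng(0)
+ Phi = rng.normal(size=(1000, 64))                  # stand-in response features
+ r_implicit = Phi @ rng.normal(size=64)             # stand-in IRM signal
+ r_rlhf = r_implicit + 0.5 * rng.normal(size=1000)  # stand-in RLHF RM scores
+
+ irm_hat = Ridge(alpha=1.0).fit(Phi, r_implicit)    # linear fit to the IRM
+ agreement = np.corrcoef(irm_hat.predict(Phi), r_rlhf)[0, 1]
+ print(f"IRM vs. RLHF reward model agreement: {agreement:.3f}")
+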
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ MERT: Acoustic Music Understanding Model with Large-Scale + Self-supervised Training ICLR 2024 + + +
+ Self-supervised learning (SSL) has recently emerged as a promising paradigm +for training generalisable models on large-scale data in the fields of vision, +text, and speech. Although SSL has been proven effective in speech and audio, +its application to music audio has yet to be thoroughly explored. This is +partially due to the distinctive challenges associated with modelling musical +knowledge, particularly tonal and pitched characteristics of music. To address +this research gap, we propose an acoustic Music undERstanding model with +large-scale self-supervised Training (MERT), which incorporates teacher models +to provide pseudo labels in the masked language modelling (MLM) style acoustic +pre-training. In our exploration, we identified an effective combination of +teacher models, which outperforms conventional speech and audio approaches. +This combination includes an acoustic teacher based on +Residual Vector Quantisation - Variational AutoEncoder (RVQ-VAE) and a musical +teacher based on the Constant-Q Transform (CQT). Furthermore, we explore a wide +range of settings to overcome the instability in acoustic language model +pre-training, which allows our designed paradigm to scale from 95M to 330M +parameters. Experimental results indicate that our model can generalise and +perform well on 14 music understanding tasks and attain state-of-the-art (SOTA) +overall scores. +
+
+ comment: accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Stable Vectorization of Multiparameter Persistent Homology using Signed + Barcodes as Measures NeurIPS 2023 + + +
+ Persistent homology (PH) provides topological descriptors for geometric data, +such as weighted graphs, which are interpretable, stable to perturbations, and +invariant under, e.g., relabeling. Most applications of PH focus on the +one-parameter case -- where the descriptors summarize the changes in topology +of data as it is filtered by a single quantity of interest -- and there is now +a wide array of methods enabling the use of one-parameter PH descriptors in +data science, which rely on the stable vectorization of these descriptors as +elements of a Hilbert space. Although the multiparameter PH (MPH) of data that +is filtered by several quantities of interest encodes much richer information +than its one-parameter counterpart, the scarceness of stability results for MPH +descriptors has so far limited the available options for the stable +vectorization of MPH. In this paper, we aim to bring together the best of both +worlds by showing how the interpretation of signed barcodes -- a recent family +of MPH descriptors -- as signed measures leads to natural extensions of +vectorization strategies from one parameter to multiple parameters. The +resulting feature vectors are easy to define and to compute, and provably +stable. While, as a proof of concept, we focus on simple choices of signed +barcodes and vectorizations, we already see notable performance improvements +when comparing our feature vectors to state-of-the-art topology-based methods +on various types of data. + +
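+ A hedged illustration of the signed-measure view (a simplified 1-D analogue,
+not the paper's construction): each bar of a signed barcode, given as
+(birth, death, sign) triples, contributes a Gaussian bump weighted by its
+sign, giving an easy-to-compute feature vector.
+
+ import numpy as np
+
+ def signed_measure_vector(bars, grid=32, lim=(0.0, 1.0), sigma=0.05):
+     # smooth the signed measure induced by the bars onto a fixed 1-D grid,
+     # a signed analogue of persistence images
+     xs = np.linspace(*lim, grid)
+     vec = np.zeros(grid)
+     for b, d, s in bars:
+         vec += s * np.exp(-((xs - 0.5 * (b + d)) ** 2) / (2 * sigma ** 2))
+     return vec
+
+ feats = signed_measure_vector([(0.1, 0.4, +1), (0.2, 0.5, -1), (0.6, 0.9, +1)])
+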
+
+ comment: 26 pages, 4 figures, 9 tables; v2: final version in NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ RefinedFields: Radiance Fields Refinement for Unconstrained Scenes + + +
+ Modeling large scenes from unconstrained images has proven to be a major +challenge in computer vision. Existing methods tackling in-the-wild scene +modeling operate in closed-world settings, where no conditioning on priors +acquired from real-world images is present. We propose RefinedFields, which is, +to the best of our knowledge, the first method leveraging pre-trained models to +improve in-the-wild scene modeling. We employ pre-trained networks to refine +K-Planes representations via optimization guidance using an alternating +training procedure. We carry out extensive experiments and verify the merit of +our method on synthetic data and real tourism photo collections. RefinedFields +enhances rendered scenes with richer details and outperforms previous work on +the task of novel view synthesis in the wild. Our project page can be found at +https://refinedfields.github.io . + +
+
+
+
+
+ + ♻ ☆ PAC-Chernoff Bounds: Understanding Generalization in the Interpolation + Regime + + +
+ In this paper, we present a distribution-dependent PAC-Chernoff bound that is +perfectly tight for interpolators even under overparametrized model classes. +This bound relies on basic principles of Large Deviation Theory and naturally +provides a characterization of the smoothness of a model described as a simple +real-valued function. Based on this distribution-dependent bound and the novel +definition of smoothness, we propose a unifying theoretical explanation of why +some interpolators generalize remarkably well while others do not, and why a +wide range of modern learning techniques (i.e., $\ell_2$-norm, +distance-from-initialization, input-gradient and variance regularization +together with data augmentation, invariant architectures, and +overparameterization) are able to find them. The emergent conclusion is that +all these methods provide complementary procedures that bias the optimizer to +smoother interpolators, which, according to this theoretical analysis, are the +ones with better generalization error. One of the main insights of this study +is that distribution-dependent bounds serve as a powerful tool to better +understand the complex dynamics behind the generalization capabilities of +highly-overparameterized interpolators. +
+
+ comment: 34 pages, 10 figures, Pre-print +
+
+
+
+
+ + ♻ ☆ Emergence of In-Context Reinforcement Learning from Noise Distillation + + +
+ Recently, extensive studies in Reinforcement Learning have been carried out +on the ability of transformers to adapt in-context to various environments and +tasks. Current in-context RL methods are limited by their strict requirements +for data, which needs to be generated by RL agents or labeled with actions from +an optimal policy. In order to address this prevalent problem, we propose +AD$^\varepsilon$, a new data acquisition approach that enables in-context +Reinforcement Learning from noise-induced curriculum. We show that it is viable +to construct a synthetic noise injection curriculum which helps to obtain +learning histories. Moreover, we experimentally demonstrate that it is possible +to alleviate the need for generation using optimal policies, with in-context RL +still able to outperform the best suboptimal policy in a learning dataset by a +2x margin. + +
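+ A schematic sketch of one plausible noise-injection curriculum
+(demo_policy and env_step are caller-supplied stand-ins, and the linear
+schedule is our assumption): annealing epsilon-greedy noise makes a single,
+possibly suboptimal demonstrator produce data that resembles an improving
+learning history.
+
+ import numpy as np
+
+ def noise_curriculum(demo_policy, env_step, episodes=100, horizon=50,
+                      n_actions=4, seed=0):
+     # anneal epsilon from 1.0 (pure noise) to 0.0 (pure demonstrator)
+     rng = np.random.default_rng(seed)
+     history = []
+     for ep in range(episodes):
+         eps = 1.0 - ep / (episodes - 1)
+         obs = 0
+         for _ in range(horizon):
+             a = rng.integers(n_actions) if rng.random() < eps else demo_policy(obs)
+             obs, r = env_step(obs, a)
+             history.append((obs, a, r))
+     return history
+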
+
+ comment: Preprint, Under Review; code: https://github.com/corl-team/ad-eps +
+
+
+
+
+ + ♻ ☆ A Unified Theory of Diversity in Ensemble Learning + + +
+ We present a theory of ensemble diversity, explaining the nature of diversity +for a wide range of supervised learning scenarios. This challenge has been +referred to as the holy grail of ensemble learning, an open research issue for +over 30 years. Our framework reveals that diversity is in fact a hidden +dimension in the bias-variance decomposition of the ensemble loss. We prove a +family of exact bias-variance-diversity decompositions, for a wide range of +losses in both regression and classification, e.g., squared, cross-entropy, and +Poisson losses. For losses where an additive bias-variance decomposition is not +available (e.g., 0/1 loss) we present an alternative approach: quantifying the +effects of diversity, which turn out to be dependent on the label distribution. +Overall, we argue that diversity is a measure of model fit, in precisely the +same sense as bias and variance, but accounting for statistical dependencies +between ensemble members. Thus, we should not be maximising diversity as so +many works aim to do -- instead, we have a bias/variance/diversity trade-off to +manage. + +
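+ The classical squared-loss case that this framework generalizes can be
+checked numerically; the ambiguity identity below (ensemble loss equals
+average member loss minus diversity) holds exactly.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ y = rng.normal(size=200)                           # targets
+ preds = y + rng.normal(scale=0.5, size=(5, 200))   # 5 ensemble members
+
+ ens = preds.mean(axis=0)                  # ensemble prediction
+ avg_member_loss = np.mean((preds - y) ** 2)
+ diversity = np.mean((preds - ens) ** 2)   # spread around the ensemble
+ ens_loss = np.mean((ens - y) ** 2)
+
+ # ensemble loss = average member loss - diversity (squared loss)
+ assert np.isclose(ens_loss, avg_member_loss - diversity)
+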
+
+
+
+
+ + ♻ ☆ Adversarial Bandits against Arbitrary Strategies + + +
+ We study the adversarial bandit problem against arbitrary strategies, in +which $S$ is the parameter for the hardness of the problem and this parameter +is not given to the agent. To handle this problem, we adopt the master-base +framework using the online mirror descent method (OMD). We first provide a +master-base algorithm with simple OMD, achieving +$\tilde{O}(S^{1/2}K^{1/3}T^{2/3})$, in which $T^{2/3}$ comes from the variance +of loss estimators. To mitigate the impact of the variance, we propose using +adaptive learning rates for OMD and achieve +$\tilde{O}(\min\{\mathbb{E}[\sqrt{SKT\rho_T(h^\dagger)}],S\sqrt{KT}\})$, where +$\rho_T(h^\dagger)$ is a variance term for loss estimators. + +
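+ For concreteness, a sketch of the base learner only: OMD with a
+negative-entropy regularizer reduces to the classic Exp3 multiplicative-weights
+update (the master-base layering and the adaptive learning rates of the paper
+are omitted here).
+
+ import numpy as np
+
+ def exp3(loss_matrix, eta=0.1, seed=0):
+     # loss_matrix[t, k] is the adversarially chosen loss of arm k at time t
+     T, K = loss_matrix.shape
+     w = np.ones(K)
+     rng = np.random.default_rng(seed)
+     total = 0.0
+     for t in range(T):
+         p = (1 - eta) * w / w.sum() + eta / K    # exploration mixture
+         arm = rng.choice(K, p=p)
+         total += loss_matrix[t, arm]
+         est = np.zeros(K)
+         est[arm] = loss_matrix[t, arm] / p[arm]  # importance-weighted loss
+         w *= np.exp(-eta * est)                  # multiplicative / OMD update
+     return total
+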
+
+
+
+
+ + ♻ ☆ Exposing propaganda: an analysis of stylistic cues comparing human + annotations and machine classification EACL 2024 + + +
+ This paper investigates the language of propaganda and its stylistic +features. It presents the PPN dataset, standing for Propagandist Pseudo-News, a +multisource, multilingual, multimodal dataset composed of news articles +extracted from websites identified as propaganda sources by expert agencies. A +limited sample from this set was randomly mixed with articles from the regular +French press, with their URLs masked, to conduct an annotation experiment with +human participants, using 11 distinct labels. The results show that human +annotators were able to reliably discriminate between the two types of press +across each of the labels. We propose different NLP techniques to identify the +cues used by the annotators, and to compare them with machine classification. +They include the analyzer VAGO to measure discourse vagueness and subjectivity, +a TF-IDF baseline, and four different classifiers: two RoBERTa-based models, +CATS using syntax, and one XGBoost combining syntactic and semantic features. +
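+ A minimal sketch of a TF-IDF baseline of the kind mentioned above (the
+logistic-regression head and the placeholder data are our assumptions, not
+the paper's exact setup):
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.pipeline import make_pipeline
+
+ texts = ["article one ...", "article two ..."]   # placeholder documents
+ labels = [1, 0]   # 1 = propagandist source, 0 = regular press
+
+ baseline = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), min_df=1),
+                          LogisticRegression(max_iter=1000))
+ baseline.fit(texts, labels)
+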
+
+ comment: Paper to appear in the EACL 2024 Proceedings of the Third Workshop on + Understanding Implicit and Underspecified Language (UnImplicit 2024) +
+
+
+
+
+ + ♻ ☆ Optimization-Free Test-Time Adaptation for Cross-Person Activity + Recognition + + +
+ Human Activity Recognition (HAR) models often suffer from performance +degradation in real-world applications due to distribution shifts in activity +patterns across individuals. Test-Time Adaptation (TTA) is an emerging learning +paradigm that aims to utilize the test stream to adjust predictions in +real-time inference, which has not been explored in HAR before. However, the +high computational cost of optimization-based TTA algorithms makes it +intractable to run on resource-constrained edge devices. In this paper, we +propose an Optimization-Free Test-Time Adaptation (OFTTA) framework for +sensor-based HAR. OFTTA adjusts the feature extractor and linear classifier +simultaneously in an optimization-free manner. For the feature extractor, we +propose Exponential Decay Test-time Normalization (EDTN) to replace the +conventional batch normalization (CBN) layers. EDTN combines CBN and Test-time +batch Normalization (TBN) to extract reliable features against domain shifts +with TBN's influence decreasing exponentially in deeper layers. For the +classifier, we adjust the prediction by computing the distance between the +feature and the prototype, which is calculated by a maintained support set. In +addition, the update of the support set is based on the pseudo label, which can +benefit from reliable features extracted by EDTN. Extensive experiments on +three public cross-person HAR datasets and two different TTA settings +demonstrate that OFTTA outperforms the state-of-the-art TTA approaches in both +classification performance and computational efficiency. Finally, we verify the +superiority of our proposed OFTTA on edge devices, indicating possible +deployment in real applications. Our code is available at +https://github.com/Claydon-Wang/OFTTA. +
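+ A compact sketch of the EDTN idea under our own assumed decay schedule (the
+exact schedule in the paper may differ): blend test-batch statistics (TBN)
+with training-time running statistics (CBN), with the TBN weight decaying
+exponentially in depth.
+
+ import numpy as np
+
+ def edtn_normalize(x_batch, mu_run, var_run, layer_idx, lam=0.5, eps=1e-5):
+     # deeper layers (larger layer_idx) trust the running CBN stats more
+     w = lam ** layer_idx
+     mu = w * x_batch.mean(axis=0) + (1 - w) * mu_run
+     var = w * x_batch.var(axis=0) + (1 - w) * var_run
+     return (x_batch - mu) / np.sqrt(var + eps)
+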
+
+ comment: To be presented at UbiComp 2024; Accepted by Proceedings of the ACM + on Interactive, Mobile, Wearable and Ubiquitous Technologies (IMWUT) +
+
+
+
+
+ + ♻ ☆ Price-Discrimination Game for Distributed Resource Management in + Federated Learning + + +
+ In vanilla federated learning (FL) such as FedAvg, the parameter server (PS) +and multiple distributed clients can form a typical buyer's market, where the +number of PS/buyers of FL services is far less than the number of +clients/sellers. In order to improve the performance of FL and reduce the cost +of motivating clients to participate in FL, this paper proposes to +differentiate the pricing for services provided by different clients rather +than simply providing the same service pricing for different clients. The price +is differentiated based on the performance improvements brought to FL and their +heterogeneity in computing and communication capabilities. To this end, a +price-discrimination game (PDG) is formulated to comprehensively address the +distributed resource management problems in FL, including multi-objective +trade-off, client selection, and incentive mechanism. As the PDG is a +mixed-integer nonlinear programming (MINLP) problem, a distributed +semi-heuristic algorithm with low computational complexity and low +communication overhead is designed to solve it. The simulation result verifies +the effectiveness of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache + Quantization + + +
+ LLMs are seeing growing use for applications such as document analysis and +summarization which require large context windows, and with these large context +windows KV cache activations surface as the dominant contributor to memory +consumption during inference. Quantization is a promising approach for +compressing KV cache activations; however, existing solutions fail to represent +activations accurately in ultra-low precisions, such as sub-4-bit. In this +work, we present KVQuant, which addresses this problem by incorporating novel +methods for quantizing cached KV activations, including: (i) Per-Channel Key +Quantization, where we adjust the dimension along which we quantize the Key +activations to better match the distribution; (ii) Pre-RoPE Key Quantization, +where we quantize Key activations before the rotary positional embedding to +mitigate its impact on quantization; (iii) Non-Uniform KV Cache Quantization, +where we derive per-layer sensitivity-weighted non-uniform datatypes that +better represent the distributions; (iv) Per-Vector Dense-and-Sparse +Quantization, where we isolate outliers separately for each vector to minimize +skews in quantization ranges; and (v) Q-Norm, where we normalize quantization +centroids in order to mitigate distribution shift, providing additional +benefits for 2-bit quantization. By applying our method to the LLaMA, LLaMA-2, +and Mistral models, we achieve $<0.1$ perplexity degradation with 3-bit +quantization on both Wikitext-2 and C4, outperforming existing approaches. Our +method enables serving the LLaMA-7B model with a context length of up to 1 +million on a single A100-80GB GPU and up to 10 million on an 8-GPU system. + +
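+ A simplified sketch of per-channel Key quantization, using a uniform grid
+where the paper derives non-uniform, sensitivity-weighted datatypes (shapes
+and names here are ours):
+
+ import numpy as np
+
+ def quantize_keys_per_channel(K_cache, bits=3):
+     # one scale per channel (axis 0 = tokens), since Key outliers tend
+     # to be aligned along channels rather than tokens
+     qmax = 2 ** (bits - 1) - 1
+     scale = np.abs(K_cache).max(axis=0, keepdims=True) / qmax
+     q = np.clip(np.round(K_cache / scale), -qmax - 1, qmax)
+     return q * scale   # dequantized reconstruction
+
+ K_cache = np.random.default_rng(0).normal(size=(128, 64))
+ print(np.abs(quantize_keys_per_channel(K_cache) - K_cache).mean())
+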
+
+
+
+
+ + ♻ ☆ Accelerating Generalized Linear Models by Trading off Computation for + Uncertainty + + +
+ Bayesian Generalized Linear Models (GLMs) define a flexible probabilistic +framework to model categorical, ordinal and continuous data, and are widely +used in practice. However, exact inference in GLMs is prohibitively expensive +for large datasets, thus requiring approximations in practice. The resulting +approximation error adversely impacts the reliability of the model and is not +accounted for in the uncertainty of the prediction. In this work, we introduce +a family of iterative methods that explicitly model this error. They are +uniquely suited to modern parallel computing hardware, efficiently recycle +computations, and compress information to reduce both the time and memory +requirements for GLMs. As we demonstrate on a realistically large +classification problem, our method significantly accelerates training compared +to competitive baselines by trading off reduced computation for increased +uncertainty. +
+
+ comment: Main text: 11 pages, 6 figures; Supplements: 13 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Greedy Shapley Client Selection for Communication-Efficient Federated + Learning + + +
+ The standard client selection algorithms for Federated Learning (FL) are +often unbiased and involve uniform random sampling of clients. This has been +proven sub-optimal for fast convergence under practical settings characterized +by significant heterogeneity in data distribution, computing, and communication +resources across clients. For applications having timing constraints due to +limited communication opportunities with the parameter server (PS), the client +selection strategy is critical to complete model training within the fixed +budget of communication rounds. To address this, we develop a biased client +selection strategy, GreedyFed, that identifies and greedily selects the most +contributing clients in each communication round. This method builds on a fast +approximation algorithm for the Shapley Value at the PS, making the computation +tractable for real-world applications with many clients. Compared to various +client selection strategies on several real-world datasets, GreedyFed +demonstrates fast and stable convergence with high accuracy under timing +constraints and when imposing a higher degree of heterogeneity in data +distribution, systems constraints, and privacy requirements. + +
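+ A generic Monte Carlo sketch of Shapley-value-based greedy selection (the
+paper uses a faster PS-side approximation; value_fn is a caller-supplied
+stand-in, e.g. validation accuracy of the model aggregated from a subset's
+updates):
+
+ import numpy as np
+
+ def shapley_estimates(clients, value_fn, rounds=200, seed=0):
+     # permutation-sampling estimate of each client's Shapley value
+     rng = np.random.default_rng(seed)
+     phi = {c: 0.0 for c in clients}
+     for _ in range(rounds):
+         order = rng.permutation(list(clients))
+         prev, subset = value_fn(()), []
+         for c in order:
+             subset.append(c)
+             cur = value_fn(tuple(subset))
+             phi[c] += (cur - prev) / rounds
+             prev = cur
+     return phi
+
+ def greedy_select(phi, m):
+     # greedily pick the m most contributing clients for this round
+     return sorted(phi, key=phi.get, reverse=True)[:m]
+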
+
+ comment: Accepted for publication in IEEE Networking Letters +
+
+
+
+
+ + ♻ ☆ A Survey on Efficient Federated Learning Methods for Foundation Model + Training + + +
+ Federated Learning (FL) has become an established technique to facilitate +privacy-preserving collaborative training across a multitude of clients. +However, new approaches to FL often discuss their contributions involving small +deep-learning models only and focus on training full models on clients. In the +wake of Foundation Models (FM), the reality is different for many deep learning +applications. Typically, FMs have already been pre-trained across a wide +variety of tasks and can be fine-tuned to specific downstream tasks over +significantly smaller datasets than required for full model training. However, +access to such datasets is often challenging. By its design, FL can help to +open data silos. With this survey, we introduce a novel taxonomy focused on +computational and communication efficiency, the vital elements to make use of +FMs in FL systems. We discuss the benefits and drawbacks of parameter-efficient +fine-tuning (PEFT) for FL applications, elaborate on the readiness of FL +frameworks to work with FMs and provide future research opportunities on how to +evaluate generative models in FL as well as the interplay of privacy and PEFT. + +
+
+
+
+
+ + ♻ ☆ Skip \n: A Simple Method to Reduce Hallucination in Large + Vision-Language Models + + +
+ Recent advancements in large vision-language models (LVLMs) have demonstrated +impressive capability in visual information understanding with human language. +Despite these advances, LVLMs still face challenges with multimodal +hallucination, such as generating text descriptions of objects that are not +present in the visual information. However, the underlying fundamental reasons +of multimodal hallucinations remain poorly explored. In this paper, we propose +a new perspective, suggesting that the inherent biases in LVLMs might be a key +factor in hallucinations. Specifically, we systematically identify a semantic +shift bias related to paragraph breaks (\n\n), where the content before and +after '\n\n' in the training data frequently exhibit significant semantic +changes. This pattern leads the model to infer that the contents following +'\n\n' should be obviously different from the preceding contents with less +hallucinatory descriptions, thereby increasing the probability of hallucinatory +descriptions subsequent to the '\n\n'. We have validated this hypothesis on +multiple publicly available LVLMs. Besides, we find that deliberately inserting +'\n\n' at the generated description can induce more hallucinations. A simple +method is proposed to effectively mitigate the hallucination of LVLMs by +skipping the output of '\n'. + +
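+ One way to realize the mitigation with Hugging Face transformers, banning the
+newline token outright during generation (a stronger variant than selectively
+skipping '\n\n'; the model name is a placeholder):
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("model-name")            # placeholder
+ model = AutoModelForCausalLM.from_pretrained("model-name")   # placeholder
+
+ inputs = tok("Describe the image:", return_tensors="pt")
+ out = model.generate(
+     **inputs,
+     max_new_tokens=128,
+     # ban the newline token(s) so '\n\n' can never be emitted
+     bad_words_ids=[tok.encode("\n", add_special_tokens=False)],
+ )
+ print(tok.decode(out[0], skip_special_tokens=True))
+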
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Continual Learning for Large Language Models: A Survey + + +
+ Large language models (LLMs) are not amenable to frequent re-training, due to +high training costs arising from their massive scale. However, updates are +necessary to endow LLMs with new skills and keep them up-to-date with rapidly +evolving human knowledge. This paper surveys recent works on continual learning +for LLMs. Due to the unique nature of LLMs, we catalog continual learning +techniques in a novel multi-staged categorization scheme, involving continual +pretraining, instruction tuning, and alignment. We contrast continual learning +for LLMs with simpler adaptation methods used in smaller models, as well as +with other enhancement strategies like retrieval-augmented generation and model +editing. Moreover, informed by a discussion of benchmarks and evaluation, we +identify several challenges and future work directions for this crucial task. +
+
+
+
+
+ + ♻ ☆ OHQ: On-chip Hardware-aware Quantization + + +
+ Quantization emerges as one of the most promising approaches for deploying +advanced deep models on resource-constrained hardware. Mixed-precision +quantization leverages multiple bit-width architectures to unleash the accuracy +and efficiency potential of quantized models. However, existing mixed-precision +quantization suffers from an exhaustive search space that causes immense +computational overhead. The quantization process thus relies on separate +high-performance devices rather than running locally, which also leads to a +significant gap between the considered hardware metrics and the real +deployment. In this paper, we propose an On-chip Hardware-aware Quantization +(OHQ) framework that performs hardware-aware mixed-precision quantization +without accessing online devices. First, we construct the On-chip Quantization +Awareness (OQA) pipeline, enabling it to perceive the actual efficiency metrics +of the quantization operator on the hardware. Second, we propose the +Mask-guided Quantization Estimation (MQE) technique to efficiently estimate the +accuracy metrics of operators under the constraints of on-chip-level computing +power. By synthesizing network and hardware insights through linear +programming, we obtain optimized bit-width configurations. Notably, the +quantization process occurs on-chip entirely without any additional computing +devices and data access. We demonstrate accelerated inference after +quantization for various architectures and compression ratios, achieving 70% +and 73% accuracy for ResNet-18 and MobileNetV3, respectively. OHQ improves +latency by 15~30% compared to INT8 on deployment. +
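+ A greedy stand-in for the bit-width assignment step (the paper solves a
+linear program; the per-layer cost structure here is hypothetical): repeatedly
+lower the bit-width of the layer with the smallest accuracy cost per
+millisecond of latency saved, until the latency budget is met.
+
+ def allocate_bitwidths(layers, budget_ms):
+     # layers: name -> {"bits": int, "acc_cost": {bits: drop},
+     #                  "latency_ms": {bits: ms}}  (assumed structure)
+     def total_latency():
+         return sum(l["latency_ms"][l["bits"]] for l in layers.values())
+     while total_latency() > budget_ms:
+         best, best_ratio = None, float("inf")
+         for name, l in layers.items():
+             if l["bits"] <= 2:
+                 continue   # keep at least 2 bits per layer
+             dacc = l["acc_cost"][l["bits"] - 1] - l["acc_cost"][l["bits"]]
+             dlat = l["latency_ms"][l["bits"]] - l["latency_ms"][l["bits"] - 1]
+             if dlat > 0 and dacc / dlat < best_ratio:
+                 best_ratio, best = dacc / dlat, name
+         if best is None:
+             break          # no further reduction possible
+         layers[best]["bits"] -= 1
+     return {n: l["bits"] for n, l in layers.items()}
+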
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Regularized Linear Discriminant Analysis Using a Nonlinear Covariance + Matrix Estimator + + +
+ Linear discriminant analysis (LDA) is a widely used technique for data +classification. The method offers adequate performance in many classification +problems, but it becomes inefficient when the data covariance matrix is +ill-conditioned. This often occurs when the feature space's dimensionality is +higher than or comparable to the training data size. Regularized LDA (RLDA) +methods based on regularized linear estimators of the data covariance matrix +have been proposed to cope with such a situation. The performance of RLDA +methods is well studied, with optimal regularization schemes already proposed. +In this paper, we investigate the capability of a positive semidefinite +ridge-type estimator of the inverse covariance matrix that coincides with a +nonlinear (NL) covariance matrix estimator. The estimator is derived by +reformulating the score function of the optimal classifier utilizing linear +estimation methods, which eventually results in the proposed NL-RLDA +classifier. We derive asymptotic and consistent estimators of the proposed +technique's misclassification rate under the assumptions of a double-asymptotic +regime and multivariate Gaussian model for the classes. The consistent +estimator, coupled with a one-dimensional grid search, is used to set the value +of the regularization parameter required for the proposed NL-RLDA classifier. +Performance evaluations based on both synthetic and real data demonstrate the +effectiveness of the proposed classifier, which outperforms state-of-the-art +methods across multiple datasets. +
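+ A simplified ridge-regularized LDA sketch with the one-dimensional grid
+search (the nonlinear estimator itself and the consistent risk estimate that
+set gamma in the paper are beyond this sketch):
+
+ import numpy as np
+
+ def rlda_fit(X0, X1, gamma):
+     # ridge-type inverse covariance (S + gamma I)^{-1} in the LDA score
+     mu0, mu1 = X0.mean(0), X1.mean(0)
+     S = np.cov(np.vstack([X0 - mu0, X1 - mu1]).T)
+     w = np.linalg.solve(S + gamma * np.eye(S.shape[0]), mu1 - mu0)
+     b = -0.5 * w @ (mu0 + mu1)
+     return lambda X: (X @ w + b > 0).astype(int)
+
+ def tune_gamma(X0, X1, X0v, X1v, grid=np.logspace(-3, 2, 20)):
+     # one-dimensional grid search on held-out accuracy
+     acc = lambda f: np.mean(np.r_[f(X0v) == 0, f(X1v) == 1])
+     return max(grid, key=lambda g: acc(rlda_fit(X0, X1, g)))
+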
+
comment: © 2024 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works
+
+
+
+
+ + ♻ ☆ PBSCSR: The Piano Bootleg Score Composer Style Recognition Dataset + + +
+ This article motivates, describes, and presents the PBSCSR dataset for +studying composer style recognition of piano sheet music. Our overarching goal +was to create a dataset for studying composer style recognition that is "as +accessible as MNIST and as challenging as ImageNet". To achieve this goal, we +use a previously proposed feature representation of sheet music called a +bootleg score, which encodes the position of noteheads relative to the staff +lines. Using this representation, we sample fixed-length bootleg score +fragments from piano sheet music images on IMSLP. The dataset itself contains +40,000 62x64 bootleg score images for a 9-way classification task, 100,000 +62x64 bootleg score images for a 100-way classification task, and 29,310 +unlabeled variable-length bootleg score images for pretraining. The labeled +data is presented in a form that mirrors MNIST images, in order to make it +extremely easy to visualize, manipulate, and train models in an efficient +manner. Additionally, we include relevant metadata to allow access to the +underlying raw sheet music images and other related data on IMSLP. We describe +several research tasks that could be studied with the dataset, including +variations of composer style recognition in a few-shot or zero-shot setting. +For tasks that have previously proposed models, we release code and baseline +results for future works to compare against. We also discuss open research +questions that the PBSCSR data is especially well suited to facilitate research +on and areas of fruitful exploration in future work. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ To be or not to be stable, that is the question: understanding neural + networks for inverse problems + + +
+ The solution of linear inverse problems arising, for example, in signal and +image processing is a challenging problem since the ill-conditioning amplifies, +in the solution, the noise present in the data. Recently introduced algorithms +based on deep learning overwhelm the more traditional model-based approaches in +performance, but they typically suffer from instability with respect to data +perturbation. In this paper, we theoretically analyze the trade-off between +stability and accuracy of neural networks, when used to solve linear imaging +inverse problems in cases that are not under-determined. Moreover, we propose +different supervised and unsupervised solutions to increase the network +stability and maintain a good accuracy, by means of regularization properties +inherited from a model-based iterative scheme during network training, and a +pre-processing stabilizing operator in the neural networks. Extensive numerical +experiments on image deblurring confirm the theoretical results and the +effectiveness of the proposed deep learning-based approaches to handle noise on +the data. +
+
comment: 21 pages, 6 figures. The paper will be submitted to a journal + soon. This is a preliminary version; updated versions will be uploaded to + arXiv
+
+
+
+
+ + ♻ ☆ Detecting Multimedia Generated by Large AI Models: A Survey + + +
+ The rapid advancement of Large AI Models (LAIMs), particularly diffusion +models and large language models, has marked a new era where AI-generated +multimedia is increasingly integrated into various aspects of daily life. +Although beneficial in numerous fields, this content presents significant +risks, including potential misuse, societal disruptions, and ethical concerns. +Consequently, detecting multimedia generated by LAIMs has become crucial, with +a marked rise in related research. Despite this, there remains a notable gap in +systematic surveys that focus specifically on detecting LAIM-generated +multimedia. Addressing this, we provide the first survey to comprehensively +cover existing research on detecting multimedia (such as text, images, videos, +audio, and multimodal content) created by LAIMs. Specifically, we introduce a +novel taxonomy for detection methods, categorized by media modality, and +aligned with two perspectives: pure detection (aiming to enhance detection +performance) and beyond detection (adding attributes like generalizability, +robustness, and interpretability to detectors). Additionally, we have presented +a brief overview of generation mechanisms, public datasets, and online +detection tools to provide a valuable resource for researchers and +practitioners in this field. Furthermore, we identify current challenges in +detection and propose directions for future research that address unexplored, +ongoing, and emerging issues in detecting multimedia generated by LAIMs. Our +aim for this survey is to fill an academic gap and contribute to global AI +security efforts, helping to ensure the integrity of information in the digital +realm. The project link is +https://github.com/Purdue-M2/Detect-LAIM-generated-Multimedia-Survey. + +
+
+
+
+
+ + ♻ ☆ Riemannian Preconditioned LoRA for Fine-Tuning Foundation Models + + +
+ In this work we study the enhancement of Low Rank Adaptation (LoRA) +fine-tuning procedure by introducing a Riemannian preconditioner in its +optimization step. Specifically, we introduce an $r\times r$ preconditioner in +each gradient step where $r$ is the LoRA rank. This preconditioner requires a +small change to existing optimizer code and adds only minuscule storage and +runtime overhead. Our experimental results with both large language models +and text-to-image diffusion models show that with our preconditioner, the +convergence and reliability of SGD and AdamW can be significantly enhanced. +Moreover, the training process becomes much more robust to hyperparameter +choices such as learning rate. Theoretically, we show that fine-tuning a +two-layer ReLU network in the convex parameterization with our preconditioner +has a convergence rate independent of the condition number of the data matrix. +This new Riemannian preconditioner, previously explored in classic low-rank +matrix recovery, is introduced to deep learning tasks for the first time in our +work. We release our code at +https://github.com/pilancilab/Riemannian_Preconditioned_LoRA. +
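+ A sketch of one such $r\times r$ preconditioned step in the scaled-GD style
+(the plain-SGD form and the damping term delta are our assumptions): each
+factor's gradient is multiplied by the inverse $r\times r$ Gram matrix of the
+other factor.
+
+ import numpy as np
+
+ def precond_lora_step(A, B, gA, gB, lr=1e-2, delta=1e-6):
+     # LoRA update W ~ B @ A with A: (r, k), B: (d, r); gA, gB are gradients
+     r = A.shape[0]
+     A_new = A - lr * np.linalg.solve(B.T @ B + delta * np.eye(r), gA)
+     B_new = B - lr * gB @ np.linalg.inv(A @ A.T + delta * np.eye(r))
+     return A_new, B_new
+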
+
+
+
+
+ + ♻ ☆ Gradient Coding in Decentralized Learning for Evading Stragglers + + +
+ In this paper, we consider a decentralized learning problem in the presence +of stragglers. Although gradient coding techniques have been developed for +distributed learning to evade stragglers, where the devices send encoded +gradients with redundant training data, it is difficult to apply those +techniques directly to decentralized learning scenarios. To deal with this +problem, we propose a new gossip-based decentralized learning method with +gradient coding (GOCO). In the proposed method, to avoid the negative impact of +stragglers, the parameter vectors are updated locally using encoded gradients +based on the framework of stochastic gradient coding and then averaged in a +gossip-based manner. We analyze the convergence performance of GOCO for +strongly convex loss functions and provide simulation results to demonstrate +the superiority of the proposed method in terms of learning performance +compared with the baseline methods. +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ♻ ☆ Perceptual-oriented Learned Image Compression with Dynamic Kernel + + +
+ In this paper, we extend our prior research named DKIC and propose the +perceptual-oriented learned image compression method, PO-DKIC. Specifically, +DKIC adopts a dynamic kernel-based dynamic residual block group to enhance the +transform coding and an asymmetric space-channel context entropy model to +facilitate the estimation of Gaussian parameters. Based on DKIC, PO-DKIC +introduces PatchGAN and LPIPS loss to enhance visual quality. Furthermore, to +maximize the overall perceptual quality under a rate constraint, we formulate +this challenge as a constrained programming problem and use linear integer +programming to solve it. The experiments demonstrate that our proposed method +can generate realistic images with richer textures and finer details when +compared to state-of-the-art image compression techniques. +
+
+
+
+
+ + ♻ ☆ Detecting Multimedia Generated by Large AI Models: A Survey + + +
+ The rapid advancement of Large AI Models (LAIMs), particularly diffusion +models and large language models, has marked a new era where AI-generated +multimedia is increasingly integrated into various aspects of daily life. +Although beneficial in numerous fields, this content presents significant +risks, including potential misuse, societal disruptions, and ethical concerns. +Consequently, detecting multimedia generated by LAIMs has become crucial, with +a marked rise in related research. Despite this, there remains a notable gap in +systematic surveys that focus specifically on detecting LAIM-generated +multimedia. Addressing this, we provide the first survey to comprehensively +cover existing research on detecting multimedia (such as text, images, videos, +audio, and multimodal content) created by LAIMs. Specifically, we introduce a +novel taxonomy for detection methods, categorized by media modality, and +aligned with two perspectives: pure detection (aiming to enhance detection +performance) and beyond detection (adding attributes like generalizability, +robustness, and interpretability to detectors). Additionally, we have presented +a brief overview of generation mechanisms, public datasets, and online +detection tools to provide a valuable resource for researchers and +practitioners in this field. Furthermore, we identify current challenges in +detection and propose directions for future research that address unexplored, +ongoing, and emerging issues in detecting multimedia generated by LAIMs. Our +aim for this survey is to fill an academic gap and contribute to global AI +security efforts, helping to ensure the integrity of information in the digital +realm. The project link is +https://github.com/Purdue-M2/Detect-LAIM-generated-Multimedia-Survey. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 120 + +
+
+
+ + ☆ AnyTool: Self-Reflective, Hierarchical Agents for Large-Scale API Calls + + +
+ We introduce AnyTool, a large language model agent designed to revolutionize +the utilization of a vast array of tools in addressing user queries. We utilize +over 16,000 APIs from Rapid API, operating under the assumption that a subset +of these APIs could potentially resolve the queries. AnyTool primarily +incorporates three elements: an API retriever with a hierarchical structure, a +solver aimed at resolving user queries using a selected set of API candidates, +and a self-reflection mechanism, which re-activates AnyTool if the initial +solution proves impracticable. AnyTool is powered by the function calling +feature of GPT-4, eliminating the need for training external modules. We also +revisit the evaluation protocol introduced by previous works and identify a +limitation in this protocol that leads to an artificially high pass rate. By +revising the evaluation protocol to better reflect practical application +scenarios, we introduce an additional benchmark, termed AnyToolBench. +Experiments across various datasets demonstrate the superiority of our AnyTool +over strong baselines such as ToolLLM and a GPT-4 variant tailored for tool +utilization. For instance, AnyTool outperforms ToolLLM by +35.4% in terms of +average pass rate on ToolBench. Code will be available at +https://github.com/dyabel/AnyTool. + +
+
+
+
+
+ + ☆ Linear-time Minimum Bayes Risk Decoding with Reference Aggregation + + +
+ Minimum Bayes Risk (MBR) decoding is a text generation technique that has +been shown to improve the quality of machine translations, but is expensive, +even if a sampling-based approximation is used. Besides requiring a large +number of sampled sequences, it requires the pairwise calculation of a utility +metric, which has quadratic complexity. In this paper, we propose to +approximate pairwise metric scores with scores calculated against aggregated +reference representations. This changes the complexity of utility estimation +from $O(n^2)$ to $O(n)$, while empirically preserving most of the quality gains +of MBR decoding. We release our source code at +https://github.com/ZurichNLP/mbr. +
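+ A toy sketch of the aggregation idea, with a simple n-gram-overlap utility
+standing in for the metrics used in the paper: pool the samples' n-gram
+counts once, then score each candidate against the aggregate, so utility
+estimation is linear rather than quadratic in the number of sequences.
+
+ from collections import Counter
+
+ def ngrams(text, n=2):
+     toks = text.split()
+     return Counter(zip(*[toks[i:] for i in range(n)]))
+
+ def mbr_aggregate(candidates, samples):
+     agg = Counter()
+     for s in samples:                 # one pass over the pseudo-references
+         agg += ngrams(s)
+     for g in agg:                     # average counts over samples
+         agg[g] /= len(samples)
+     def score(c):                     # one scoring pass per candidate
+         cg = ngrams(c)
+         overlap = sum(min(cg[g], agg[g]) for g in cg)
+         return overlap / max(sum(cg.values()), 1)
+     return max(candidates, key=score)
+
+ # candidates are often the samples themselves in sampling-based MBR
+ # best = mbr_aggregate(samples, samples)
+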
+
+
+
+
+ + ☆ HarmBench: A Standardized Evaluation Framework for Automated Red Teaming + and Robust Refusal + + +
+ Automated red teaming holds substantial promise for uncovering and mitigating +the risks associated with the malicious use of large language models (LLMs), +yet the field lacks a standardized evaluation framework to rigorously assess +new methods. To address this issue, we introduce HarmBench, a standardized +evaluation framework for automated red teaming. We identify several desirable +properties previously unaccounted for in red teaming evaluations and +systematically design HarmBench to meet these criteria. Using HarmBench, we +conduct a large-scale comparison of 18 red teaming methods and 33 target LLMs +and defenses, yielding novel insights. We also introduce a highly efficient +adversarial training method that greatly enhances LLM robustness across a wide +range of attacks, demonstrating how HarmBench enables codevelopment of attacks +and defenses. We open source HarmBench at +https://github.com/centerforaisafety/HarmBench. + +
+
+ comment: Website: https://www.harmbench.org +
+
+
+
+
+ + ☆ Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science + + +
+ Intelligent agents powered by large language models (LLMs) have demonstrated +substantial promise in autonomously conducting experiments and facilitating +scientific discoveries across various disciplines. While their capabilities are +promising, they also introduce novel vulnerabilities that demand careful +consideration for safety. However, there exists a notable gap in the +literature, as there has been no comprehensive exploration of these +vulnerabilities. This position paper fills this gap by conducting a thorough +examination of vulnerabilities in LLM-based agents within scientific domains, +shedding light on potential risks associated with their misuse and emphasizing +the need for safety measures. We begin by providing a comprehensive overview of +the potential risks inherent to scientific LLM agents, taking into account user +intent, the specific scientific domain, and their potential impact on the +external environment. Then, we delve into the origins of these vulnerabilities +and provide a scoping review of the limited existing works. Based on our +analysis, we propose a triadic framework involving human regulation, agent +alignment, and an understanding of environmental feedback (agent regulation) to +mitigate these identified risks. Furthermore, we highlight the limitations and +challenges associated with safeguarding scientific agents and advocate for the +development of improved models, robust benchmarks, and comprehensive +regulations to address these issues effectively. + +
+
+
+
+
+ + ☆ CogCoM: Train Large Vision-Language Models Diving into Details through + Chain of Manipulations + + +
+ Vision-Language Models (VLMs) have demonstrated their widespread viability +thanks to extensive training in aligning visual instructions to answers. +However, this conclusive alignment leads models to ignore critical visual +reasoning, and further results in failures on meticulous visual problems and +unfaithful responses. In this paper, we propose Chain of Manipulations, a +mechanism that enables VLMs to solve problems with a series of manipulations, +where each manipulation refers to an operation on the visual input, either from +intrinsic abilities (e.g., grounding) acquired through prior training or from +imitating human-like behaviors (e.g., zoom in). This mechanism encourages VLMs +to generate faithful responses with evidential visual reasoning, and permits +users to trace error causes in the interpretable paths. We thus train CogCoM, a +general 17B VLM with a memory-based compatible architecture endowed with this +reasoning mechanism. Experiments show that our model achieves state-of-the-art +performance across 8 benchmarks from 3 categories, and swiftly attains +competitive performance with a limited number of training steps on the data. +The code and data are publicly available at https://github.com/THUDM/CogCoM. +
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ Can Generative Agents Predict Emotion? + + +
+ Large Language Models (LLMs) have demonstrated a number of human-like +abilities; however, the empathic understanding and emotional state of LLMs is +yet to be aligned to that of humans. In this work, we investigate how the +emotional state of generative LLM agents evolves as they perceive new events, +introducing a novel architecture in which new experiences are compared to past +memories. Through this comparison, the agent gains the ability to understand +new experiences in context, which according to the appraisal theory of emotion +is vital in emotion creation. First, the agent perceives new experiences as +time series text data. After perceiving each new input, the agent generates a +summary of past relevant memories, referred to as the norm, and compares the +new experience to this norm. Through this comparison we can analyse how the +agent reacts to the new experience in context. The PANAS, a test of affect, is +administered to the agent, capturing the emotional state of the agent after the +perception of the new event. Finally, the new experience is then added to the +agent's memory to be used in the creation of future norms. By creating multiple +experiences in natural language from emotionally charged situations, we test +the proposed architecture on a wide range of scenarios. The mixed results +suggest that introducing context can occasionally improve the emotional +alignment of the agent, but further study and comparison with human evaluators +is necessary. We hope that this paper is another step towards the alignment of +generative agents. +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ What is 'Typological Diversity' in NLP? + + +
+ The NLP research community has devoted increased attention to languages +beyond English, resulting in considerable improvements for multilingual NLP. +However, these improvements only apply to a small subset of the world's +languages. Aiming to extend this, an increasing number of papers aspires to +enhance generalizable multilingual performance across languages. To this end, +linguistic typology is commonly used to motivate language selection, on the +basis that a broad typological sample ought to imply generalization across a +broad range of languages. These selections are often described as being +'typologically diverse'. In this work, we systematically investigate NLP +research that includes claims regarding 'typological diversity'. We find there +are no set definitions or criteria for such claims. We introduce metrics to +approximate the diversity of language selection along several axes and find +that the results vary considerably across papers. Furthermore, we show that +skewed language selection can lead to overestimated multilingual performance. +We recommend future work to include an operationalization of 'typological +diversity' that empirically justifies the diversity of language samples. + +
+
+
+
+
+ + ☆ Scaling Laws for Downstream Task Performance of Large Language Models + + +
+ Scaling laws provide important insights that can guide the design of large +language models (LLMs). Existing work has primarily focused on studying scaling +laws for pretraining (upstream) loss. However, in transfer learning settings, +in which LLMs are pretrained on an unsupervised dataset and then finetuned on a +downstream task, we often also care about the downstream performance. In this +work, we study the scaling behavior in a transfer learning setting, where LLMs +are finetuned for machine translation tasks. Specifically, we investigate how +the choice of the pretraining data and its size affect downstream performance +(translation quality) as judged by two metrics: downstream cross-entropy and +BLEU score. Our experiments indicate that the size of the finetuning dataset +and the distribution alignment between the pretraining and downstream data +significantly influence the scaling behavior. With sufficient alignment, both +downstream cross-entropy and BLEU score improve monotonically with more +pretraining data. In such cases, we show that it is possible to predict the +downstream BLEU score with good accuracy using a log-law. However, there are +also cases where moderate misalignment causes the BLEU score to fluctuate or +get worse with more pretraining, whereas downstream cross-entropy monotonically +improves. By analyzing these observations, we provide new practical insights +for choosing appropriate pretraining data. + +
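+ A hedged illustration of fitting such a log-law, with made-up numbers and a
+simple functional form (one plausible choice, not necessarily the paper's):
+
+ import numpy as np
+ from scipy.optimize import curve_fit
+
+ D = np.array([1e8, 3e8, 1e9, 3e9, 1e10])          # pretraining tokens (toy)
+ bleu = np.array([18.2, 21.5, 24.1, 26.0, 27.4])   # matching BLEU (toy)
+
+ def log_law(d, a, b):
+     # BLEU assumed linear in log(pretraining data size)
+     return a + b * np.log(d)
+
+ params, _ = curve_fit(log_law, D, bleu)
+ print(log_law(3e10, *params))   # extrapolated BLEU at 3e10 tokens
+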
+
+
+
+
+ + ☆ Attention with Markov: A Framework for Principled Analysis of + Transformers via Markov Chains + + +
+ In recent years, attention-based transformers have achieved tremendous +success across a variety of disciplines including natural languages. A key +ingredient behind their success is the generative pretraining procedure, during +which these models are trained on a large text corpus in an auto-regressive +manner. To shed light on this phenomenon, we propose a new framework that +allows both theory and systematic experiments to study the sequential modeling +capabilities of transformers through the lens of Markov chains. Inspired by the +Markovianity of natural languages, we model the data as a Markovian source and +utilize this framework to systematically study the interplay between the +data-distributional properties, the transformer architecture, the learnt +distribution, and the final model performance. In particular, we theoretically +characterize the loss landscape of single-layer transformers and show the +existence of global minima and bad local minima contingent upon the specific +data characteristics and the transformer architecture. Backed by experiments, +we demonstrate that our theoretical findings are in congruence with the +empirical results. We further investigate these findings in the broader context +of higher order Markov chains and deeper architectures, and outline open +problems in this arena. Code is available at +\url{https://github.com/Bond1995/Markov}. + +
+
+
+
+
+ + ☆ Harnessing the Plug-and-Play Controller by Prompting EMNLP 2023 + + +
+ Controllable text generation is a growing field within natural language +generation (NLG) that focuses on producing text that meets specific constraints +in real-world applications. Previous approaches, such as plug-and-play +controllers (PPCs), aimed to steer the properties of generated text in a +flexible manner. However, these methods often compromised the integrity of the +language model's decoding process, resulting in less smooth text generation. +Alternatively, other techniques utilized multiple attribute prompts to align +the generated text with desired attributes, but this approach required prompt +design for each attribute and was dependent on the size of the language model. +This paper introduces a novel method for flexible attribute control in text +generation using pre-trained language models (PLMs). The proposed approach aims +to enhance the fluency of generated text by guiding the generation process with +PPCs. The key idea is to dynamically adjust the distribution of generated text +by modifying prompts, effectively constraining the output space of the language +model and influencing the desired attribute. To enable smooth cooperation +between the PLM and the PPC, our work innovatively proposes a new model +fine-tuning method: Reinforcement Learning with Dynamic Adjust Feedback +(RLDAF). This fine-tuning process adapts a small subset of the language model's +parameters based on the generating actions taken during the PPC control +process. The resulting harmonious collaboration between the PLM and PPC leads +to improved smoothness in text generation during inference. Extensive +experiments were conducted on the SST2 dataset, and the proposed method +outperformed previous approaches in various evaluation metrics, including text +fluency and attribute consistency. +
+
+ comment: The Third Version of the Generation, Evaluation & Metrics (GEM) + Workshop in EMNLP 2023 +
+
+
+
+
+ + ☆ Behind the Screen: Investigating ChatGPT's Dark Personality Traits and + Conspiracy Beliefs + + +
+ ChatGPT is notorious for its opaque behavior. This paper tries to shed +light on this, providing an in-depth analysis of the dark personality traits +and conspiracy beliefs of GPT-3.5 and GPT-4. Different psychological tests and +questionnaires were employed, including the Dark Factor Test, the Mach-IV +Scale, the Generic Conspiracy Belief Scale, and the Conspiracy Mentality Scale. +The responses were analyzed computing average scores, standard deviations, and +significance tests to investigate differences between GPT-3.5 and GPT-4. For +traits that have shown to be interdependent in human studies, correlations were +considered. Additionally, system roles corresponding to groups that have shown +distinct answering behavior in the corresponding questionnaires were applied to +examine the models' ability to reflect characteristics associated with these +roles in their responses. Dark personality traits and conspiracy beliefs were +not particularly pronounced in either model with little differences between +GPT-3.5 and GPT-4. However, GPT-4 showed a pronounced tendency to believe in +information withholding. This is particularly intriguing given that GPT-4 is +trained on a significantly larger dataset than GPT-3.5. Apparently, in this +case an increased data exposure correlates with a greater belief in the control +of information. An assignment of extreme political affiliations increased the +belief in conspiracy theories. Test sequencing affected the models' responses +and the observed correlations, indicating a form of contextual memory. +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Measuring Implicit Bias in Explicitly Unbiased Large Language Models + + +
+ Large language models (LLMs) can pass explicit bias tests but still harbor +implicit biases, similar to humans who endorse egalitarian beliefs yet exhibit +subtle biases. Measuring such implicit biases can be a challenge: as LLMs +become increasingly proprietary, it may not be possible to access their +embeddings and apply existing bias measures; furthermore, implicit biases are +primarily a concern if they affect the actual decisions that these systems +make. We address both of these challenges by introducing two measures of bias +inspired by psychology: LLM Implicit Association Test (IAT) Bias, which is a +prompt-based method for revealing implicit bias; and LLM Decision Bias for +detecting subtle discrimination in decision-making tasks. Using these measures, +we found pervasive human-like stereotype biases in 6 LLMs across 4 social +domains (race, gender, religion, health) and 21 categories (weapons, guilt, +science, career among others). Our prompt-based measure of implicit bias +correlates with embedding-based methods but better predicts downstream +behaviors measured by LLM Decision Bias. This measure is based on asking the +LLM to decide between individuals, motivated by psychological results +indicating that relative not absolute evaluations are more related to implicit +biases. Using prompt-based measures informed by psychology allows us to +effectively expose nuanced biases and subtle discrimination in proprietary LLMs +that do not show explicit bias on standard benchmarks. + +
+
+
+
+
+ + ☆ The Use of a Large Language Model for Cyberbullying Detection + + +
+ The dominance of social media has added to the channels of bullying for +perpetrators. Unfortunately, cyberbullying (CB) is the most prevalent +phenomenon in today's cyber world, and is a severe threat to the mental and +physical health of citizens. This opens the need to develop a robust system to +filter bullying content from online forums, blogs, and social media platforms +and to manage its impact on our society. Several machine learning (ML) +algorithms have been proposed for this purpose. However, their performances are +not consistent due to high class imbalance and generalisation issues. In recent +years, large language models (LLMs) like BERT and RoBERTa have achieved +state-of-the-art (SOTA) results in several natural language processing (NLP) +tasks. Unfortunately, the LLMs have not been applied extensively for CB +detection. In our paper, we explored the use of these models for cyberbullying +(CB) detection. We have prepared a new dataset (D2) from existing studies +(Formspring and Twitter). Our experimental results for datasets D1 and D2 +showed that RoBERTa outperformed other models. +
+
+ comment: 14 pages, Journal of Analytics +
+
+
+
+
+ + ☆ Iterative Prompt Refinement for Radiation Oncology Symptom Extraction + Using Teacher-Student Large Language Models + + +
+ This study introduces a novel teacher-student architecture utilizing Large +Language Models (LLMs) to improve prostate cancer radiotherapy symptom +extraction from clinical notes. Mixtral, the student model, initially extracts +symptoms, followed by GPT-4, the teacher model, which refines prompts based on +Mixtral's performance. This iterative process involved 294 single symptom +clinical notes across 12 symptoms, with up to 16 rounds of refinement per +epoch. Results showed significant improvements in extracting symptoms from both +single and multi-symptom notes. For 59 single symptom notes, accuracy increased +from 0.51 to 0.71, precision from 0.52 to 0.82, recall from 0.52 to 0.72, and +F1 score from 0.49 to 0.73. In 375 multi-symptom notes, accuracy rose from 0.24 +to 0.43, precision from 0.6 to 0.76, recall from 0.24 to 0.43, and F1 score +from 0.20 to 0.44. These results demonstrate the effectiveness of advanced +prompt engineering in LLMs for radiation oncology use. + +
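+ A schematic version of the refinement loop (student_extract, teacher_refine,
+the seed prompt, and the stopping rule are all our stand-ins for the
+Mixtral/GPT-4 calls, not the study's exact protocol):
+
+ def set_f1(golds, preds):
+     # micro-F1 over predicted symptom sets
+     tp = sum(len(set(g) & set(p)) for g, p in zip(golds, preds))
+     fp = sum(len(set(p) - set(g)) for g, p in zip(golds, preds))
+     fn = sum(len(set(g) - set(p)) for g, p in zip(golds, preds))
+     return 2 * tp / max(2 * tp + fp + fn, 1)
+
+ def refine(notes, golds, student_extract, teacher_refine, max_rounds=16):
+     prompt = "List the radiotherapy symptoms mentioned in this note."
+     best = -1.0
+     for _ in range(max_rounds):
+         preds = [student_extract(prompt, n) for n in notes]    # student role
+         f1 = set_f1(golds, preds)
+         if f1 <= best:
+             break                                 # refinement has plateaued
+         best = f1
+         prompt = teacher_refine(prompt, notes, preds, golds)   # teacher role
+     return prompt, best
+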
+
+
+
+
+ + ☆ Retrieve to Explain: Evidence-driven Predictions with Language Models + + +
+ Machine learning models, particularly language models, are notoriously
+difficult to introspect. Black-box models can mask both issues in model
+training and harmful biases. For human-in-the-loop processes, opaque
+predictions can drive a lack of trust, limiting a model's impact even when it
+performs effectively. To address these issues, we introduce Retrieve to Explain
+(R2E). R2E is a retrieval-based language model that ranks a pre-defined set of
+possible answers to a research question based on the evidence in a document
+corpus, using Shapley values to identify the relative importance of pieces of
+evidence to the final prediction. R2E can adapt to new evidence without
+retraining, and can incorporate structured data by templating it into natural
+language. We assess R2E on the use case of drug target identification from
+published scientific literature, where we show that the model outperforms an
+industry-standard genetics-based approach on predicting clinical trial
+outcomes.
+
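The Shapley attribution step described above can be approximated without special tooling. Below is a minimal Monte Carlo sketch, assuming a hypothetical `score` function standing in for the model's confidence in an answer given a subset of evidence snippets; it illustrates the general technique, not the paper's implementation.

```python
import random

def shapley_values(evidence, score, n_samples=200, seed=0):
    """Monte Carlo Shapley values: average marginal contribution of each
    evidence snippet to the answer score, over random inclusion orders."""
    rng = random.Random(seed)
    n = len(evidence)
    values = [0.0] * n
    for _ in range(n_samples):
        order = list(range(n))
        rng.shuffle(order)
        included, prev = [], score([])
        for i in order:
            included.append(evidence[i])
            curr = score(included)
            values[i] += curr - prev
            prev = curr
    return [v / n_samples for v in values]

# Toy stand-in for a model scoring an answer given evidence snippets.
def toy_score(snippets):
    return sum(1.0 for s in snippets if "inhibits" in s)

evidence = ["gene X inhibits pathway Y", "unrelated finding", "X also inhibits growth"]
print(shapley_values(evidence, toy_score))  # weight lands on the two relevant snippets
```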
+
+
+
+
+ + ☆ Systematic Biases in LLM Simulations of Debates + + +
+ Recent advancements in natural language processing, especially the emergence +of Large Language Models (LLMs), have opened exciting possibilities for +constructing computational simulations designed to replicate human behavior +accurately. However, LLMs are complex statistical learners without +straightforward deductive rules, making them prone to unexpected behaviors. In +this study, we highlight the limitations of LLMs in simulating human +interactions, particularly focusing on LLMs' ability to simulate political +debates. Our findings indicate a tendency for LLM agents to conform to the +model's inherent social biases despite being directed to debate from certain +political perspectives. This tendency results in behavioral patterns that seem +to deviate from well-established social dynamics among humans. We reinforce +these observations using an automatic self-fine-tuning method, which enables us +to manipulate the biases within the LLM and demonstrate that agents +subsequently align with the altered biases. These results underscore the need +for further research to develop methods that help agents overcome these biases, +a critical step toward creating more realistic simulations. + +
+
+
+
+
+ + ☆ AlbNews: A Corpus of Headlines for Topic Modeling in Albanian + + +
+ The scarcity of available text corpora for low-resource languages like
+Albanian is a serious hurdle for research in natural language processing tasks.
+This paper introduces AlbNews, a collection of 600 topically labeled news
+headlines and 2600 unlabeled ones in Albanian. The data can be freely used for
+conducting topic modeling research. We report the initial classification scores
+of some traditional machine learning classifiers trained with the AlbNews
+samples. These results show that basic models outperform the ensemble learning
+ones and can serve as a baseline for future experiments.
+
+
+
+
+
+ + ☆ Google Translate Error Analysis for Mental Healthcare Information: + Evaluating Accuracy, Comprehensibility, and Implications for Multilingual + Healthcare Communication + + +
+ This study explores the use of Google Translate (GT) for translating mental
+healthcare (MHealth) information and evaluates its accuracy, comprehensibility,
+and implications for multilingual healthcare communication through analysing GT
+output in the MHealth domain from English to Persian, Arabic, Turkish,
+Romanian, and Spanish. Two datasets comprising MHealth information from the UK
+National Health Service website and information leaflets from The Royal College
+of Psychiatrists were used. Native speakers of the target languages manually
+assessed the GT translations, focusing on medical terminology accuracy,
+comprehensibility, and critical syntactic/semantic errors. GT output analysis
+revealed challenges in accurately translating medical terminology, particularly
+in Arabic, Romanian, and Persian. Fluency issues were prevalent across various
+languages, affecting comprehension, mainly in Arabic and Spanish. Critical
+errors arose in specific contexts, such as bullet-point formatting, notably in
+Persian, Turkish, and Romanian. Although improvements are seen in longer-text
+translations, there remains a need to enhance accuracy in medical and mental
+health terminology and fluency, whilst also addressing formatting issues for a
+more seamless user experience. The findings highlight the need to use
+customised translation engines for MHealth translation and the challenges of
+relying solely on machine-translated medical content, emphasising the crucial
+role of human reviewers in multilingual healthcare communication.
+
+
+
+
+
+ + ☆ REBORN: Reinforcement-Learned Boundary Segmentation with Iterative + Training for Unsupervised ASR + + +
+ Unsupervised automatic speech recognition (ASR) aims to learn the mapping +between the speech signal and its corresponding textual transcription without +the supervision of paired speech-text data. A word/phoneme in the speech signal +is represented by a segment of speech signal with variable length and unknown +boundary, and this segmental structure makes learning the mapping between +speech and text challenging, especially without paired data. In this paper, we +propose REBORN, Reinforcement-Learned Boundary Segmentation with Iterative +Training for Unsupervised ASR. REBORN alternates between (1) training a +segmentation model that predicts the boundaries of the segmental structures in +speech signals and (2) training the phoneme prediction model, whose input is a +segmental structure segmented by the segmentation model, to predict a phoneme +transcription. Since supervised data for training the segmentation model is not +available, we use reinforcement learning to train the segmentation model to +favor segmentations that yield phoneme sequence predictions with a lower +perplexity. We conduct extensive experiments and find that under the same +setting, REBORN outperforms all prior unsupervised ASR models on LibriSpeech, +TIMIT, and five non-English languages in Multilingual LibriSpeech. We +comprehensively analyze why the boundaries learned by REBORN improve the +unsupervised ASR performance. + +
+
+
+
+
+ + ☆ Position Paper: Against Spurious Sparks-Dovelating Inflated AI Claims ICML + + +
+ Humans have a tendency to see 'human'-like qualities in objects around them. +We name our cars, and talk to pets and even household appliances, as if they +could understand us as other humans do. This behavior, called anthropomorphism, +is also seeing traction in Machine Learning (ML), where human-like intelligence +is claimed to be perceived in Large Language Models (LLMs). In this position +paper, considering professional incentives, human biases, and general +methodological setups, we discuss how the current search for Artificial General +Intelligence (AGI) is a perfect storm for over-attributing human-like qualities +to LLMs. In several experiments, we demonstrate that the discovery of +human-interpretable patterns in latent spaces should not be a surprising +outcome. Also in consideration of common AI portrayal in the media, we call for +the academic community to exercise extra caution, and to be extra aware of +principles of academic integrity, in interpreting and communicating about AI +research outcomes. + +
+
+ comment: 20 pages, 15 figures. Preliminary work. Under review by the + International Conference on Machine Learning (ICML) +
+
+
+
+
+ + ☆ Sparse Graph Representations for Procedural Instructional Documents + + +
+ Computation of document similarity is a critical task in various NLP domains
+that has applications in deduplication, matching, and recommendation.
+Traditional approaches for document similarity computation include learning
+representations of documents and employing a similarity or a distance function
+over the embeddings. However, pairwise similarities and differences are not
+efficiently captured by individual representations. Graph representations such
+as the Joint Concept Interaction Graph (JCIG) represent a pair of documents as
+a joint undirected weighted graph. JCIGs facilitate an interpretable
+representation of document pairs as a graph. However, JCIGs are undirected and
+do not consider the sequential flow of sentences in documents. We propose two
+approaches to model document similarity by representing document pairs as a
+directed and sparse JCIG that incorporates sequential information, using two
+algorithms inspired by Supergenome Sorting and Hamiltonian Path that replace
+the undirected edges with directed edges. Our approach also sparsifies the
+graph to $O(n)$ edges from JCIG's worst case of $O(n^2)$. We show that our
+sparse directed graph model architecture consisting of a Siamese encoder and
+GCN achieves comparable results to the baseline on datasets not containing
+sequential information and beats the baseline by ten points on an instructional
+documents dataset containing sequential information.
+
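To make the sparsification concrete: instead of linking every pair of shared concepts ($O(n^2)$ edges), one can keep only edges between consecutive concepts in each document's reading order, yielding $O(n)$ directed edges. The toy sketch below illustrates that idea only; it is not the paper's Supergenome Sorting or Hamiltonian Path algorithm, and the concept lists are hypothetical.

```python
def sparse_directed_edges(concepts_doc1, concepts_doc2):
    """Link each concept only to its successor in reading order, so two
    documents with n concepts in total produce O(n) directed edges rather
    than the O(n^2) edges of a fully connected joint graph."""
    def chain(concepts):
        return set(zip(concepts, concepts[1:]))  # consecutive pairs only
    return sorted(chain(concepts_doc1) | chain(concepts_doc2))

d1 = ["preheat oven", "mix batter", "bake", "cool"]
d2 = ["mix batter", "preheat oven", "bake", "serve"]
print(sparse_directed_edges(d1, d2))  # 6 directed edges for 8 concept slots
```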
+
+
+
+
+ + ☆ Leak, Cheat, Repeat: Data Contamination and Evaluation Malpractices in + Closed-Source LLMs + + +
+ Natural Language Processing (NLP) research is increasingly focusing on the +use of Large Language Models (LLMs), with some of the most popular ones being +either fully or partially closed-source. The lack of access to model details, +especially regarding training data, has repeatedly raised concerns about data +contamination among researchers. Several attempts have been made to address +this issue, but they are limited to anecdotal evidence and trial and error. +Additionally, they overlook the problem of \emph{indirect} data leaking, where +models are iteratively improved by using data coming from users. In this work, +we conduct the first systematic analysis of work using OpenAI's GPT-3.5 and +GPT-4, the most prominently used LLMs today, in the context of data +contamination. By analysing 255 papers and considering OpenAI's data usage +policy, we extensively document the amount of data leaked to these models +during the first year after the model's release. We report that these models +have been globally exposed to $\sim$4.7M samples from 263 benchmarks. At the +same time, we document a number of evaluation malpractices emerging in the +reviewed papers, such as unfair or missing baseline comparisons and +reproducibility issues. We release our results as a collaborative project on +https://leak-llm.github.io/, where other researchers can contribute to our +efforts. + +
+
+
+
+
+ + ☆ Can Large Language Models Detect Rumors on Social Media? + + +
+ In this work, we investigate the use of Large Language Models (LLMs) for
+rumor detection on social media. It is challenging for LLMs to reason over the
+entire propagation information on social media, which contains news contents
+and numerous comments, because LLMs may not concentrate on key clues in the
+complex propagation information and have trouble reasoning when facing massive
+and redundant information. Accordingly, we propose an LLM-empowered Rumor
+Detection (LeRuD) approach, in which we design prompts to teach LLMs to reason
+over important clues in news and comments, and divide the entire propagation
+information into a Chain-of-Propagation to reduce LLMs' burden. We conduct
+extensive experiments on the Twitter and Weibo datasets, and LeRuD outperforms
+several state-of-the-art rumor detection models by 2.4% to 7.6%. Meanwhile, by
+applying LLMs, LeRuD requires no data for training, and thus shows more
+promising rumor detection ability in few-shot or zero-shot scenarios.
+
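The Chain-of-Propagation idea amounts to splitting a long comment thread into ordered batches so the model reasons over a few comments at a time. A minimal sketch of that chunking, with a hypothetical prompt format (the paper's actual prompts are not given in the abstract):

```python
def chain_of_propagation(news: str, comments: list[str], chunk: int = 5) -> list[str]:
    """Split the propagation thread into ordered chunks, each paired with the
    news claim, so an LLM reasons step by step instead of over everything."""
    prompts = []
    for i in range(0, len(comments), chunk):
        batch = comments[i : i + chunk]
        prompts.append(
            f"Claim: {news}\n"
            f"Comments {i + 1}-{i + len(batch)}: " + " | ".join(batch)
        )
    return prompts

prompts = chain_of_propagation("X happened.", [f"comment {j}" for j in range(12)])
print(len(prompts))  # 3 chunked prompts, fed to the LLM in propagation order
```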
+
+
+
+
+ + ☆ Pro-HAN: A Heterogeneous Graph Attention Network for Profile-Based + Spoken Language Understanding ICASSP 2024 + + +
+ Recently, Profile-based Spoken Language Understanding (SLU) has gained
+increasing attention, which aims to incorporate various types of supplementary
+profile information (i.e., Knowledge Graph, User Profile, Context Awareness) to
+eliminate the prevalent ambiguities in user utterances. However, existing
+approaches can only model different profile information separately, without
+considering their interrelationships or excluding irrelevant and conflicting
+information within them. To address the above issues, we introduce Pro-HAN, a
+Heterogeneous Graph Attention Network that performs reasoning across multiple
+sources of profile information. Specifically, we design three types of edges,
+denoted as intra-Pro, inter-Pro, and utterance-Pro, to capture
+interrelationships among multiple Pros. We establish a new state-of-the-art on
+the ProSLU dataset, with an improvement of approximately 8% across all three
+metrics. Further analysis experiments also confirm the effectiveness of our
+method in modeling multi-source profile information.
+
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ☆ DistiLLM: Towards Streamlined Distillation for Large Language Models + + +
+ Knowledge distillation (KD) is widely used for compressing a teacher model to
+a smaller student model, reducing its inference cost and memory footprint while
+preserving model capabilities. However, current KD methods for auto-regressive
+sequence models (e.g., large language models) suffer from the lack of a
+standardized objective function. Moreover, the recent use of student-generated
+outputs to address training-inference mismatches has significantly escalated
+computational costs. To tackle these issues, we introduce DistiLLM, a more
+effective and efficient KD framework for auto-regressive language models.
+DistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence
+loss, where we unveil and leverage its theoretical properties, and (2) an
+adaptive off-policy approach designed to enhance the efficiency in utilizing
+student-generated outputs. Extensive experiments, including
+instruction-following tasks, demonstrate the effectiveness of DistiLLM in
+building high-performing student models while achieving up to 4.3$\times$
+speedup compared to recent KD methods.
+
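For intuition, a skew KL divergence compares the teacher distribution $p$ not with the raw student distribution $q$ but with a mixture of the two, which keeps the divergence finite where $q$ assigns near-zero mass. A minimal numpy sketch, assuming the common parameterization $\mathrm{KL}(p \| \alpha p + (1-\alpha) q)$; the paper's exact formulation and choice of $\alpha$ may differ:

```python
import numpy as np

def skew_kl(p, q, alpha=0.1, eps=1e-12):
    """KL(p || alpha*p + (1-alpha)*q): mixing a little of p into q keeps the
    divergence finite and gradients smooth where q is near zero."""
    p, q = np.asarray(p, float), np.asarray(q, float)
    m = alpha * p + (1.0 - alpha) * q
    return float(np.sum(p * (np.log(p + eps) - np.log(m + eps))))

teacher = np.array([0.70, 0.20, 0.10])
student = np.array([0.10, 0.10, 0.80])
print(skew_kl(teacher, student, alpha=0.0))  # plain forward KL
print(skew_kl(teacher, student, alpha=0.1))  # skewed: strictly smaller
```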
+
+ comment: Code is available at https://github.com/jongwooko/distillm +
+
+
+
+
+ + ☆ Shifting social norms as a driving force for linguistic change: + Struggles about language and gender in the German Bundestag + + +
+ This paper focuses on language change based on shifting social norms, in
+particular with regard to the debate on language and gender. It is a recurring
+argument in this debate that language develops "naturally" and that "severe
+interventions" - such as gender-inclusive language is often claimed to be - in
+the allegedly "organic" language system are inappropriate and even "dangerous".
+Such interventions are, however, not unprecedented. Socially motivated
+processes of language change are neither unusual nor new. We focus in our
+contribution on one important political-social space in Germany, the German
+Bundestag. Taking earlier struggles about language and gender in the plenaries
+of the Bundestag as a starting point, our article illustrates that language and
+gender has been a recurring issue in the German Bundestag since the 1980s. We
+demonstrate how this is reflected in the linguistic practices of the Bundestag,
+by the use of a) designations for gays and lesbians; b) pair forms such as
+B\"urgerinnen und B\"urger (female and male citizens); and c) female forms of
+address and personal nouns ('Pr\"asidentin' in addition to 'Pr\"asident').
+Lastly, we discuss the implications of these earlier language battles for the
+currently very heated debate about gender-inclusive language, especially
+regarding new forms with gender symbols like the asterisk or the colon
+(Lehrer*innen, Lehrer:innen; male*female teachers) which are intended to
+encompass all gender identities.
+
+
+ comment: 40 pages, 9 figures +
+
+
+
+
+ + ☆ Beyond Lines and Circles: Unveiling the Geometric Reasoning Gap in Large + Language Models + + +
+ Large Language Models (LLMs) demonstrate ever-increasing abilities in
+mathematical and algorithmic tasks, yet their geometric reasoning skills are
+underexplored. We investigate LLMs' abilities in constructive geometric
+problem-solving, one of the most fundamental steps in the development of human
+mathematical reasoning. Our work reveals notable challenges that
+state-of-the-art LLMs face in this domain despite many successes in similar
+areas. LLMs exhibit biases in target variable selection and struggle with 2D
+spatial relationships, often misrepresenting and hallucinating objects and
+their placements. To this end, we introduce a framework that formulates an
+LLM-based multi-agent system that enhances LLMs' existing reasoning potential
+by conducting an internal dialogue. This work underscores LLMs' current
+limitations in geometric reasoning and shows how geometric reasoning
+capabilities can be improved through self-correction, collaboration, and
+diverse role specializations.
+
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ☆ Less than one percent of words would be affected by gender-inclusive + language in German press texts + + +
+ Research on gender and language is tightly knitted to social debates on +gender equality and non-discriminatory language use. Psycholinguistic scholars +have made significant contributions in this field. However, corpus-based +studies that investigate these matters within the context of language use are +still rare. In our study, we address the question of how much textual material +would actually have to be changed if non-gender-inclusive texts were rewritten +to be gender-inclusive. This quantitative measure is an important empirical +insight, as a recurring argument against the use of gender-inclusive German is +that it supposedly makes written texts too long and complicated. It is also +argued that gender-inclusive language has negative effects on language +learners. However, such effects are only likely if gender-inclusive texts are +very different from those that are not gender-inclusive. In our +corpus-linguistic study, we manually annotated German press texts to identify +the parts that would have to be changed. Our results show that, on average, +less than 1% of all tokens would be affected by gender-inclusive language. This +small proportion calls into question whether gender-inclusive German presents a +substantial barrier to understanding and learning the language, particularly +when we take into account the potential complexities of interpreting masculine +generics. + +
+
+ comment: 27 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ ANLS* -- A Universal Document Processing Metric for Generative Large + Language Models + + +
+ Traditionally, discriminative models have been the predominant choice for
+tasks like document classification and information extraction. These models
+make predictions that fall into a limited number of predefined classes,
+facilitating a binary true or false evaluation and enabling the direct
+calculation of metrics such as the F1 score. However, recent advancements in
+generative large language models (GLLMs) have prompted a shift in the field due
+to their enhanced zero-shot capabilities, which eliminate the need for a
+downstream dataset and computationally expensive fine-tuning. Yet evaluating
+GLLMs presents a challenge, as the binary true or false evaluation used for
+discriminative models is not applicable to the predictions made by GLLMs. This
+paper introduces a new metric for generative models called ANLS* for evaluating
+a wide variety of tasks, including information extraction and classification
+tasks. The ANLS* metric extends existing ANLS metrics as a drop-in replacement
+and is still compatible with previously reported ANLS scores. An evaluation of
+7 different datasets and 3 different GLLMs using the ANLS* metric is also
+provided, demonstrating the importance of the proposed metric. We also
+benchmark a novel approach to generate prompts for documents, called SFT,
+against other prompting techniques such as LATIN. In 15 out of 21 cases, SFT
+outperforms other techniques and improves the state-of-the-art, sometimes by as
+much as $15$ percentage points.
+ Sources are available at https://github.com/deepopinion/anls_star_metric
+
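As background, the base ANLS score that ANLS* extends is the normalized Levenshtein similarity between a prediction and a gold string, zeroed out below a threshold (commonly 0.5) and averaged over samples. A self-contained sketch of that base metric, assuming lowercased comparison; the list- and dict-valued generalizations that ANLS* adds are not shown:

```python
def levenshtein(a: str, b: str) -> int:
    """Classic single-row dynamic-programming edit distance over characters."""
    dp = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        prev, dp[0] = dp[0], i  # 'prev' holds the diagonal cell dp[i-1][j-1]
        for j, cb in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (ca != cb))
    return dp[-1]

def anls(pred: str, gold: str, threshold: float = 0.5) -> float:
    """Normalized Levenshtein similarity, zeroed below the usual threshold."""
    if not pred and not gold:
        return 1.0
    nls = 1.0 - levenshtein(pred.lower(), gold.lower()) / max(len(pred), len(gold))
    return nls if nls >= threshold else 0.0

print(anls("Invoice 2023-04", "invoice 2023-04"))  # 1.0 after lowercasing
print(anls("refund", "invoice"))                   # dissimilar -> clipped to 0.0
```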
+
+
+
+
+ + ☆ Rethinking Skill Extraction in the Job Market Domain using Large + Language Models EACL + + +
+ Skill Extraction involves identifying skills and qualifications mentioned in +documents such as job postings and resumes. The task is commonly tackled by +training supervised models using a sequence labeling approach with BIO tags. +However, the reliance on manually annotated data limits the generalizability of +such approaches. Moreover, the common BIO setting limits the ability of the +models to capture complex skill patterns and handle ambiguous mentions. In this +paper, we explore the use of in-context learning to overcome these challenges, +on a benchmark of 6 uniformized skill extraction datasets. Our approach +leverages the few-shot learning capabilities of large language models (LLMs) to +identify and extract skills from sentences. We show that LLMs, despite not +being on par with traditional supervised models in terms of performance, can +better handle syntactically complex skill mentions in skill extraction tasks. + +
+
+ comment: Published at NLP4HR 2024 (EACL Workshop) +
+
+
+
+
+ + ☆ RevOrder: A Novel Method for Enhanced Arithmetic in Language Models + + +
+ This paper presents RevOrder, a novel technique aimed at improving arithmetic +operations in large language models (LLMs) by reversing the output digits in +addition, subtraction, and n-digit by 1-digit (nD by 1D) multiplication tasks. +Our method significantly reduces the Count of Sequential Intermediate Digits +(CSID) to $\mathcal{O}(1)$, a new metric we introduce to assess equation +complexity. Through comprehensive testing, RevOrder not only achieves perfect +accuracy in basic arithmetic operations but also substantially boosts LLM +performance in division tasks, particularly with large numbers where +traditional models struggle. Implementation of RevOrder is cost-effective for +both training and inference phases. Moreover, applying RevOrder to fine-tune +the LLaMA2-7B model on the GSM8K math task results in a considerable +improvement, reducing equation calculation errors by 46% and increasing overall +scores from 41.6 to 44.4. + +
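The core trick is easy to see in code: when a sum is emitted least-significant digit first, each output digit depends only on digits and a carry that are already known, so no sequential intermediate digits must be held "in the head". A minimal sketch of reversed-order addition, illustrating the idea rather than the paper's training pipeline:

```python
def rev_order_add(a: int, b: int) -> str:
    """Emit the digits of a+b least-significant first: the carry is fully
    determined at each step, so generation never needs lookahead."""
    da, db = str(a)[::-1], str(b)[::-1]
    carry, out = 0, []
    for i in range(max(len(da), len(db))):
        s = carry \
            + (int(da[i]) if i < len(da) else 0) \
            + (int(db[i]) if i < len(db) else 0)
        out.append(str(s % 10))
        carry = s // 10
    if carry:
        out.append(str(carry))
    return "".join(out)

rev = rev_order_add(678, 456)
print(rev, int(rev[::-1]))  # "4311" decodes back to 1134
```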
+
+
+
+
+ + ☆ Soft Prompt Tuning for Cross-Lingual Transfer: When Less is More EACL 2024 + + +
+ Soft Prompt Tuning (SPT) is a parameter-efficient method for adapting
+pre-trained language models (PLMs) to specific tasks by inserting learnable
+embeddings, or soft prompts, at the input layer of the PLM, without modifying
+its parameters. This paper investigates the potential of SPT for cross-lingual
+transfer. Unlike previous studies on SPT for cross-lingual transfer that often
+fine-tune both the soft prompt and the model parameters, we adhere to the
+original intent of SPT by keeping the model parameters frozen and only training
+the soft prompt. This not only reduces the computational cost and storage
+overhead of full-model fine-tuning; we also demonstrate that the very parameter
+efficiency intrinsic to SPT can enhance cross-lingual transfer performance to
+linguistically distant languages. Moreover, we explore how different factors
+related to the prompt, such as its length or its reparameterization, affect
+cross-lingual transfer performance.
+
+
+ comment: Accepted at the 1st Workshop on Modular and Open Multilingual NLP + (co-located with EACL 2024) +
+
+
+
+
+ + ☆ Exposing propaganda: an analysis of stylistic cues comparing human + annotations and machine classification EACL 2024 + + +
+ This paper investigates the language of propaganda and its stylistic
+features. It presents the PPN dataset, standing for Propagandist Pseudo-News, a
+multisource, multilingual, multimodal dataset composed of news articles
+extracted from websites identified as propaganda sources by expert agencies. A
+limited sample from this set was randomly mixed with articles from the regular
+French press, with their URLs masked, to conduct a human annotation experiment
+using 11 distinct labels. The results show that human annotators were able to
+reliably discriminate between the two types of press across each of the labels.
+We propose different NLP techniques to identify the cues used by the
+annotators, and to compare them with machine classification. They include the
+analyzer VAGO to measure discourse vagueness and subjectivity, a TF-IDF model
+to serve as a baseline, and four different classifiers: two RoBERTa-based
+models, CATS using syntax, and one XGBoost combining syntactic and semantic
+features.
+ Keywords: Propaganda, Fake News, Explainability, AI alignment, Vagueness,
+Subjectivity, Exaggeration, Stylistic analysis
+
+
+ comment: Paper to appear in the EACL 2024 Proceedings of the Third Workshop on + Understanding Implicit and Underspecified Language (UnImplicit 2024) +
+
+
+
+
+ + ☆ Large Language Models As MOOCs Graders + + +
+ Massive open online courses (MOOCs) unlock the doors to free education for +anyone around the globe with access to a computer and the internet. Despite +this democratization of learning, the massive enrollment in these courses means +it is almost impossible for one instructor to assess every student's writing +assignment. As a result, peer grading, often guided by a straightforward +rubric, is the method of choice. While convenient, peer grading often falls +short in terms of reliability and validity. In this study, using 18 distinct +settings, we explore the feasibility of leveraging large language models (LLMs) +to replace peer grading in MOOCs. Specifically, we focus on two +state-of-the-art LLMs: GPT-4 and GPT-3.5, across three distinct courses: +Introductory Astronomy, Astrobiology, and the History and Philosophy of +Astronomy. To instruct LLMs, we use three different prompts based on a variant +of the zero-shot chain-of-thought (Zero-shot-CoT) prompting technique: +Zero-shot-CoT combined with instructor-provided correct answers; Zero-shot-CoT +in conjunction with both instructor-formulated answers and rubrics; and +Zero-shot-CoT with instructor-offered correct answers and LLM-generated +rubrics. Our results show that Zero-shot-CoT, when integrated with +instructor-provided answers and rubrics, produces grades that are more aligned +with those assigned by instructors compared to peer grading. However, the +History and Philosophy of Astronomy course proves to be more challenging in +terms of grading as opposed to other courses. Finally, our study reveals a +promising direction for automating grading systems for MOOCs, especially in +subjects with well-defined rubrics. + +
+
+ comment: v1 preprint +
+
+
+
+
+ + ☆ Learning a Decision Tree Algorithm with Transformers + + +
+ Decision trees are renowned for their interpretability and their capability
+to achieve high predictive performance, especially on tabular data.
+Traditionally, they are constructed through recursive algorithms that partition
+the data at every node of the tree. However, identifying the best partition is
+challenging, as decision trees optimized for local segments may not generalize
+globally. To address this, we introduce MetaTree, which trains a
+transformer-based model on filtered outputs from classical algorithms to
+produce strong decision trees for classification. Specifically, we fit both
+greedy decision trees and optimized decision trees on a large number of
+datasets. We then train MetaTree to produce the trees that achieve strong
+generalization performance. This training enables MetaTree to not only emulate
+these algorithms, but also to intelligently adapt its strategy according to the
+context, thereby achieving superior generalization performance.
+
+
+
+
+
+ + ☆ The Instinctive Bias: Spurious Images lead to Hallucination in MLLMs + + +
+ Large language models (LLMs) have recently experienced remarkable progress, +where the advent of multi-modal large language models (MLLMs) has endowed LLMs +with visual capabilities, leading to impressive performances in various +multi-modal tasks. However, those powerful MLLMs such as GPT-4V still fail +spectacularly when presented with certain image and text inputs. In this paper, +we identify a typical class of inputs that baffles MLLMs, which consist of +images that are highly relevant but inconsistent with answers, causing MLLMs to +suffer from hallucination. To quantify the effect, we propose CorrelationQA, +the first benchmark that assesses the hallucination level given spurious +images. This benchmark contains 7,308 text-image pairs across 13 categories. +Based on the proposed CorrelationQA, we conduct a thorough analysis on 9 +mainstream MLLMs, illustrating that they universally suffer from this +instinctive bias to varying degrees. We hope that our curated benchmark and +evaluation results aid in better assessments of the MLLMs' robustness in the +presence of misleading images. The resource is available in +https://github.com/MasaiahHan/CorrelationQA. + +
+
+
+
+
+ + ☆ INSIDE: LLMs' Internal States Retain the Power of Hallucination + Detection ICLR-2024 + + +
+ Knowledge hallucinations have raised widespread concerns about the security
+and reliability of deployed LLMs. Previous efforts to detect hallucinations
+have relied on logit-level uncertainty estimation or language-level
+self-consistency evaluation, where semantic information is inevitably lost
+during the token-decoding procedure. Thus, we propose to explore the dense
+semantic information retained within LLMs' INternal States for hallucInation
+DEtection (INSIDE). In particular, a simple yet effective EigenScore metric is
+proposed to better evaluate responses' self-consistency, which exploits the
+eigenvalues of responses' covariance matrix to measure the semantic
+consistency/diversity in the dense embedding space. Furthermore, from the
+perspective of self-consistent hallucination detection, a test-time feature
+clipping approach is explored to truncate extreme activations in the internal
+states, which reduces overconfident generations and potentially benefits the
+detection of overconfident hallucinations. Extensive experiments and ablation
+studies are performed on several popular LLMs and question-answering (QA)
+benchmarks, demonstrating the effectiveness of our proposal.
+
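The covariance-eigenvalue idea can be sketched in a few lines: embed K sampled responses, form the regularized covariance in its K x K Gram form, and take the mean log-eigenvalue as a diversity score, with high diversity suggesting hallucination. This sketch uses random vectors in place of real sentence embeddings, and the paper's exact normalization may differ:

```python
import numpy as np

def eigen_score(embeddings: np.ndarray, alpha: float = 1e-3) -> float:
    """Mean log-eigenvalue (log-determinant) of the regularized covariance of
    K response embeddings: higher = semantically diverse = suspect answer."""
    z = embeddings - embeddings.mean(axis=0, keepdims=True)
    k = z.shape[0]
    gram = (z @ z.T) / k + alpha * np.eye(k)  # K x K Gram form of the covariance
    return float(np.mean(np.log(np.linalg.eigvalsh(gram))))

rng = np.random.default_rng(0)
consistent = rng.normal(size=(1, 64)).repeat(8, axis=0) + 0.01 * rng.normal(size=(8, 64))
diverse = rng.normal(size=(8, 64))
print(eigen_score(consistent), eigen_score(diverse))  # low vs. high score
```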
+
+ comment: Accepted by ICLR-2024 +
+
+
+
+
+ + ☆ Deep Outdated Fact Detection in Knowledge Graphs + + +
+ Knowledge graphs (KGs) have garnered significant attention for their vast +potential across diverse domains. However, the issue of outdated facts poses a +challenge to KGs, affecting their overall quality as real-world information +evolves. Existing solutions for outdated fact detection often rely on manual +recognition. In response, this paper presents DEAN (Deep outdatEd fAct +detectioN), a novel deep learning-based framework designed to identify outdated +facts within KGs. DEAN distinguishes itself by capturing implicit structural +information among facts through comprehensive modeling of both entities and +relations. To effectively uncover latent out-of-date information, DEAN employs +a contrastive approach based on a pre-defined Relations-to-Nodes (R2N) graph, +weighted by the number of entities. Experimental results demonstrate the +effectiveness and superiority of DEAN over state-of-the-art baseline methods. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Consistent Joint Decision-Making with Heterogeneous Learning Models EACL 2024 + + +
+ This paper introduces a novel decision-making framework that promotes +consistency among decisions made by diverse models while utilizing external +knowledge. Leveraging the Integer Linear Programming (ILP) framework, we map +predictions from various models into globally normalized and comparable values +by incorporating information about decisions' prior probability, confidence +(uncertainty), and the models' expected accuracy. Our empirical study +demonstrates the superiority of our approach over conventional baselines on +multiple datasets. + +
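To make the idea concrete, here is a deliberately tiny stand-in for the ILP: once each model's scores are mapped into comparable values (here assumed already calibrated), the joint decision is the consistent label assignment that maximizes total log-probability. With a handful of variables this can be brute-forced; at scale an ILP solver would replace the enumeration. The models, labels, and constraint set below are hypothetical:

```python
from itertools import product
import math

# Hypothetical calibrated probabilities from two heterogeneous models.
p_topic = {"sports": 0.6, "finance": 0.4}        # document classifier
p_entity = {"athlete": 0.35, "company": 0.65}    # entity typer

# External knowledge: which (topic, entity type) pairs are consistent.
consistent = {("sports", "athlete"), ("finance", "company")}

best = max(
    (pair for pair in product(p_topic, p_entity) if pair in consistent),
    key=lambda pair: math.log(p_topic[pair[0]]) + math.log(p_entity[pair[1]]),
)
print(best)  # ('finance', 'company'): most probable jointly consistent decision
```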
+
+ comment: EACL 2024 Findings - Short Paper +
+
+
+
+
+ + ☆ Similarity-based Neighbor Selection for Graph LLMs + + +
+ Text-attributed graphs (TAGs) present unique challenges for direct processing
+by Large Language Models (LLMs), yet their extensive commonsense knowledge and
+robust reasoning capabilities offer great promise for node classification in
+TAGs. Prior research in this field has grappled with issues such as
+over-squashing, heterophily, and ineffective graph information integration,
+further compounded by inconsistencies in dataset partitioning and
+underutilization of advanced LLMs. To address these challenges, we introduce
+Similarity-based Neighbor Selection (SNS). Using SimCSE and advanced neighbor
+selection techniques, SNS effectively improves the quality of selected
+neighbors, thereby improving graph representation and alleviating issues like
+over-squashing and heterophily. Moreover, as an inductive and training-free
+approach, SNS demonstrates superior generalization and scalability over
+traditional GNN methods. Our comprehensive experiments, adhering to standard
+dataset partitioning practices, demonstrate that SNS, through simple prompt
+interactions with LLMs, consistently outperforms vanilla GNNs and achieves
+state-of-the-art results on datasets like PubMed in node classification,
+showcasing LLMs' potential in graph structure understanding. Our research
+further underscores the significance of graph structure integration in LLM
+applications and identifies key factors for their success in node
+classification. Code is available at https://github.com/ruili33/SNS.
+
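The selection step itself is straightforward: rank candidate nodes by cosine similarity between sentence embeddings (SimCSE in the paper) and keep the top k. A minimal numpy sketch with random vectors standing in for real embeddings; serializing the chosen neighbors into the LLM prompt is not shown:

```python
import numpy as np

def select_neighbors(node_emb: np.ndarray, cand_emb: np.ndarray, k: int = 3):
    """Return indices of the k candidates most cosine-similar to the node."""
    cand = cand_emb / np.linalg.norm(cand_emb, axis=1, keepdims=True)
    query = node_emb / np.linalg.norm(node_emb)
    return np.argsort(-(cand @ query))[:k]

rng = np.random.default_rng(1)
emb = rng.normal(size=(10, 32))           # stand-in for SimCSE embeddings
print(select_neighbors(emb[0], emb[1:]))  # 3 most similar of the 9 candidates
```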
+
+
+
+
+ + ☆ Empowering Language Models with Active Inquiry for Deeper Understanding + + +
+ The rise of large language models (LLMs) has revolutionized the way that we
+interact with artificial intelligence systems through natural language.
+However, LLMs often misinterpret user queries whose intent is unclear, leading
+to less helpful responses. In natural human interactions, clarification is
+sought through targeted questioning to uncover obscure information. Thus, in
+this paper, we introduce LaMAI (Language Model with Active Inquiry), designed
+to endow LLMs with this same level of interactive engagement. LaMAI leverages
+active learning techniques to ask the most informative questions, fostering a
+dynamic bidirectional dialogue. This approach not only narrows the contextual
+gap but also refines the output of the LLMs, aligning it more closely with user
+expectations. Our empirical studies, across a variety of complex datasets where
+LLMs have limited conversational context, demonstrate the effectiveness of
+LaMAI. The method improves answer accuracy from 31.9% to 50.9%, outperforming
+other leading question-answering frameworks. Moreover, in scenarios involving
+human participants, LaMAI consistently generates responses that are superior or
+comparable to baseline methods in more than 82% of the cases. The applicability
+of LaMAI is further evidenced by its successful integration with various LLMs,
+highlighting its potential for the future of interactive language models.
+
+
+
+
+
+ + ☆ Clarify: Improving Model Robustness With Natural Language Corrections + + +
+ In supervised learning, models are trained to extract correlations from a
+static dataset. This often leads to models that rely on high-level
+misconceptions. To prevent such misconceptions, we must provide additional
+information beyond the training data. Existing methods incorporate forms of
+additional instance-level supervision, such as labels for spurious features or
+additional labeled data from a balanced distribution. Such strategies can
+become prohibitively costly for large-scale datasets since they require
+additional annotation at a scale close to the original training data. We
+hypothesize that targeted natural language feedback about a model's
+misconceptions is a more efficient form of additional supervision. We introduce
+Clarify, a novel interface and method for interactively correcting model
+misconceptions. Through Clarify, users need only provide a short text
+description to describe a model's consistent failure patterns. Then, in an
+entirely automated way, we use such descriptions to improve the training
+process by reweighting the training data or gathering additional targeted data.
+Our user studies show that non-expert users can successfully describe model
+misconceptions via Clarify, improving worst-group accuracy by an average of
+17.1% in two datasets. Additionally, we use Clarify to find and rectify 31
+novel hard subpopulations in the ImageNet dataset, improving minority-split
+accuracy from 21.1% to 28.7%.
+
+
+
+
+
+ + ☆ Listen, Chat, and Edit: Text-Guided Soundscape Modification for Enhanced + Auditory Experience + + +
+ In daily life, we encounter a variety of sounds, both desirable and +undesirable, with limited control over their presence and volume. Our work +introduces "Listen, Chat, and Edit" (LCE), a novel multimodal sound mixture +editor that modifies each sound source in a mixture based on user-provided text +instructions. LCE distinguishes itself with a user-friendly chat interface and +its unique ability to edit multiple sound sources simultaneously within a +mixture, without needing to separate them. Users input open-vocabulary text +prompts, which are interpreted by a large language model to create a semantic +filter for editing the sound mixture. The system then decomposes the mixture +into its components, applies the semantic filter, and reassembles it into the +desired output. We developed a 160-hour dataset with over 100k mixtures, +including speech and various audio sources, along with text prompts for diverse +editing tasks like extraction, removal, and volume control. Our experiments +demonstrate significant improvements in signal quality across all editing tasks +and robust performance in zero-shot scenarios with varying numbers and types of +sound sources. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Minds versus Machines: Rethinking Entailment Verification with Language + Models + + +
+ Humans make numerous inferences in text comprehension to understand
+discourse. This paper aims to understand the commonalities and disparities in
+the inference judgments between humans and state-of-the-art Large Language
+Models (LLMs). Leveraging a comprehensively curated entailment verification
+benchmark, we evaluate both human and LLM performance across various reasoning
+categories. Our benchmark comprises datasets from three categories (NLI,
+contextual QA, and rationales) featuring multi-sentence premises and different
+knowledge types, thereby evaluating inference capabilities in complex reasoning
+instances. Notably, our findings reveal LLMs' superiority in multi-hop
+reasoning across extended contexts, while humans excel in tasks necessitating
+simple deductive reasoning. Leveraging these insights, we introduce a
+fine-tuned Flan-T5 model that outperforms GPT-3.5 and rivals GPT-4, offering a
+robust open-source solution for entailment verification. As a practical
+application, we showcase the efficacy of our finetuned model in enhancing
+self-consistency in model-generated explanations, resulting in a 6% performance
+boost on average across three multiple-choice question-answering datasets.
+
+
+
+
+
+ + ☆ Large Language Models as an Indirect Reasoner: Contrapositive and + Contradiction for Automated Reasoning + + +
+ Recently, increasing attention has been drawn to improving the ability of
+Large Language Models (LLMs) to perform complex reasoning. However, previous
+methods, such as Chain-of-Thought and Self-Consistency, mainly follow Direct
+Reasoning (DR) frameworks, so they struggle with the numerous real-world tasks
+that can hardly be solved via DR. Therefore, to strengthen the reasoning power
+of LLMs, this paper proposes a novel Indirect Reasoning (IR) method that
+employs the logic of contrapositives and contradictions to tackle IR tasks such
+as factual reasoning and mathematical proof. Specifically, our methodology
+comprises two steps. First, we leverage the logical equivalence of the
+contrapositive to augment the data and rules, enhancing the comprehensibility
+of LLMs. Second, we design a set of prompt templates to trigger LLMs to conduct
+IR based on proof by contradiction, which is logically equivalent to the
+original DR process. Our IR method is simple yet effective and can be
+straightforwardly integrated with existing DR methods to further boost the
+reasoning abilities of LLMs. The experimental results on popular LLMs, such as
+GPT-3.5-turbo and Gemini-pro, show that our IR method enhances the overall
+accuracy of factual reasoning by 27.33% and mathematical proof by 31.43%,
+compared with traditional DR methods. Moreover, methods combining IR and DR
+significantly outperform methods using only IR or DR, further demonstrating the
+effectiveness of our strategy.
+
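The data-augmentation step rests on a textbook equivalence: "if A then B" is logically equivalent to "if not B then not A". A toy sketch of rewriting rules into their contrapositives before prompting, assuming rules arrive in a simple "If ... then ..." string form (the paper's templates are more elaborate):

```python
def contrapositive(rule: str) -> str:
    """Rewrite 'If A then B' as the logically equivalent 'If not B then not A'."""
    antecedent, consequent = rule.removeprefix("If ").split(" then ")
    return f"If not ({consequent}) then not ({antecedent})"

rule = "If it rained then the ground is wet"
print(contrapositive(rule))
# -> If not (the ground is wet) then not (it rained)
```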
+
+ comment: 20 pages,13 figures,4 tables +
+
+
+
+
+ + ☆ Learning to Generate Explainable Stock Predictions using Self-Reflective + Large Language Models WWW 2024 + + +
+ Explaining stock predictions is generally a difficult task for traditional
+non-generative deep learning models, where explanations are limited to
+visualizing the attention weights on important texts. Today, Large Language
+Models (LLMs) present a solution to this problem, given their known
+capabilities to generate human-readable explanations for their decision-making
+process. However, the task of stock prediction remains challenging for LLMs, as
+it requires the ability to weigh the varying impacts of chaotic social texts on
+stock prices. The problem gets progressively harder with the introduction of
+the explanation component, which requires LLMs to explain verbally why certain
+factors are more important than others. On the other hand, to fine-tune LLMs
+for such a task, one would need expert-annotated samples of explanations for
+every stock movement in the training set, which is expensive and impractical to
+scale. To tackle these issues, we propose our Summarize-Explain-Predict (SEP)
+framework, which utilizes a self-reflective agent and Proximal Policy
+Optimization (PPO) to let an LLM teach itself how to generate explainable stock
+predictions in a fully autonomous manner. The reflective agent learns how to
+explain past stock movements through self-reasoning, while the PPO trainer
+trains the model to generate the most likely explanations from input texts. The
+training samples for the PPO trainer are also the responses generated during
+the reflective process, which eliminates the need for human annotators. Using
+our SEP framework, we fine-tune an LLM that can outperform both traditional
+deep-learning and LLM methods in prediction accuracy and Matthews correlation
+coefficient for the stock classification task. To justify the generalization
+capability of our framework, we further test it on the portfolio construction
+task, and demonstrate its effectiveness through various portfolio metrics.
+
+
+ comment: WWW 2024 +
+
+
+
+
+ + ☆ Sentiment-enhanced Graph-based Sarcasm Explanation in Dialogue + + +
+ Sarcasm Explanation in Dialogue (SED) is a new yet challenging task, which +aims to generate a natural language explanation for the given sarcastic +dialogue that involves multiple modalities (i.e., utterance, video, and audio). +Although existing studies have achieved great success based on the generative +pretrained language model BART, they overlook exploiting the sentiments +residing in the utterance, video and audio, which are vital clues for sarcasm +explanation. In fact, it is non-trivial to incorporate sentiments for boosting +SED performance, due to three main challenges: 1) diverse effects of utterance +tokens on sentiments; 2) gap between video-audio sentiment signals and the +embedding space of BART; and 3) various relations among utterances, utterance +sentiments, and video-audio sentiments. To tackle these challenges, we propose +a novel sEntiment-enhanceD Graph-based multimodal sarcasm Explanation +framework, named EDGE. In particular, we first propose a lexicon-guided +utterance sentiment inference module, where a heuristic utterance sentiment +refinement strategy is devised. We then develop a module named Joint Cross +Attention-based Sentiment Inference (JCA-SI) by extending the multimodal +sentiment analysis model JCA to derive the joint sentiment label for each +video-audio clip. Thereafter, we devise a context-sentiment graph to +comprehensively model the semantic relations among the utterances, utterance +sentiments, and video-audio sentiments, to facilitate sarcasm explanation +generation. Extensive experiments on the publicly released dataset WITS verify +the superiority of our model over cutting-edge methods. + +
+
+
+
+
+ + ☆ Stanceosaurus 2.0: Classifying Stance Towards Russian and Spanish + Misinformation + + +
+ The Stanceosaurus corpus (Zheng et al., 2022) was designed to provide +high-quality, annotated, 5-way stance data extracted from Twitter, suitable for +analyzing cross-cultural and cross-lingual misinformation. In the Stanceosaurus +2.0 iteration, we extend this framework to encompass Russian and Spanish. The +former is of current significance due to prevalent misinformation amid +escalating tensions with the West and the violent incursion into Ukraine. The +latter, meanwhile, represents an enormous community that has been largely +overlooked on major social media platforms. By incorporating an additional +3,874 Spanish and Russian tweets over 41 misinformation claims, our objective +is to support research focused on these issues. To demonstrate the value of +this data, we employed zero-shot cross-lingual transfer on multilingual BERT, +yielding results on par with the initial Stanceosaurus study with a macro F1 +score of 43 for both languages. This underlines the viability of stance +classification as an effective tool for identifying multicultural +misinformation. + +
+
+ comment: WNUT2024 +
+
+
+
+
+ + ☆ Professional Agents -- Evolving Large Language Models into Autonomous + Experts with Human-Level Competencies + + +
+ The advent of large language models (LLMs) such as ChatGPT, PaLM, and GPT-4 +has catalyzed remarkable advances in natural language processing, demonstrating +human-like language fluency and reasoning capacities. This position paper +introduces the concept of Professional Agents (PAgents), an application +framework harnessing LLM capabilities to create autonomous agents with +controllable, specialized, interactive, and professional-level competencies. We +posit that PAgents can reshape professional services through continuously +developed expertise. Our proposed PAgents framework entails a tri-layered +architecture for genesis, evolution, and synergy: a base tool layer, a middle +agent layer, and a top synergy layer. This paper aims to spur discourse on +promising real-world applications of LLMs. We argue the increasing +sophistication and integration of PAgents could lead to AI systems exhibiting +professional mastery over complex domains, serving critical needs, and +potentially achieving artificial general intelligence. + +
+
+ comment: 14 pages, 1 figure +
+
+
+
+
+ + ☆ Partially Recentralization Softmax Loss for Vision-Language Models + Robustness + + +
+ As Large Language Models achieve breakthroughs in natural language processing
+(NLP) tasks, multimodal techniques have become extremely popular. However, it
+has been shown that multimodal NLP models are vulnerable to adversarial
+attacks, where the outputs of a model can be dramatically changed by a
+perturbation to the input. While several defense techniques have been proposed
+for both computer vision and NLP models, the multimodal robustness of models
+has not been fully explored. In this paper, we study the adversarial robustness
+provided by modifying the loss function of pre-trained multimodal models, by
+restricting the top K softmax outputs. Our evaluation shows that after
+fine-tuning, the adversarial robustness of pre-trained models can be
+significantly improved against popular attacks. Further research should study
+aspects such as output diversity, generalization, and the
+robustness-performance trade-off of this kind of loss function. Our code will
+be available after this paper is accepted.
+
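The abstract does not spell out the loss, but one plausible reading of "restricting the top K softmax outputs" is a cross-entropy computed over a softmax whose support is masked to the K largest logits. A PyTorch sketch under that assumption (hypothetical, not the paper's verified formulation); note the gold label's logit must survive the mask or the loss diverges, which the demo sidesteps by using the argmax as the label:

```python
import torch
import torch.nn.functional as F

def topk_softmax_ce(logits: torch.Tensor, target: torch.Tensor, k: int = 10):
    """Cross-entropy over a softmax restricted to each row's top-k logits:
    everything below the k-th largest logit is masked out before normalizing."""
    kth = logits.topk(k, dim=-1).values[..., -1:]
    masked = logits.masked_fill(logits < kth, float("-inf"))
    return F.cross_entropy(masked, target)

logits = torch.randn(4, 100)
target = logits.argmax(dim=-1)  # keep the label inside the top-k for the demo
print(topk_softmax_ce(logits, target, k=10))
```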
+
+
+
+
+ + ☆ Self-Discover: Large Language Models Self-Compose Reasoning Structures + + +
+ We introduce SELF-DISCOVER, a general framework for LLMs to self-discover the
+task-intrinsic reasoning structures to tackle complex reasoning problems that
+are challenging for typical prompting methods. Core to the framework is a
+self-discovery process where LLMs select multiple atomic reasoning modules such
+as critical thinking and step-by-step thinking, and compose them into an
+explicit reasoning structure for LLMs to follow during decoding. SELF-DISCOVER
+substantially improves GPT-4 and PaLM 2's performance on challenging reasoning
+benchmarks such as BigBench-Hard, grounded agent reasoning, and MATH, by as
+much as 32% compared to Chain of Thought (CoT). Furthermore, SELF-DISCOVER
+outperforms inference-intensive methods such as CoT-Self-Consistency by more
+than 20%, while requiring 10-40x less inference compute. Finally, we show that
+the self-discovered reasoning structures are universally applicable across
+model families: from PaLM 2-L to GPT-4, and from GPT-4 to Llama2, and share
+commonalities with human reasoning patterns.
+
+
+ comment: 17 pages, 11 figures, 5 tables +
+
+
+
+
+ + ☆ Comparing Abstraction in Humans and Large Language Models Using + Multimodal Serial Reproduction + + +
+ Humans extract useful abstractions of the world from noisy sensory data.
+Serial reproduction allows us to study how people construe the world through a
+paradigm similar to the game of telephone, where one person observes a stimulus
+and reproduces it for the next to form a chain of reproductions. Past serial
+reproduction experiments typically employ a single sensory modality, but humans
+often communicate abstractions of the world to each other through language. To
+investigate the effect of language on the formation of abstractions, we
+implement a novel multimodal serial reproduction framework by asking people who
+receive a visual stimulus to reproduce it in a linguistic format, and vice
+versa. We ran unimodal and multimodal chains with both humans and GPT-4 and
+find that adding language as a modality has a larger effect on human
+reproductions than on GPT-4's. This suggests human visual and linguistic
+representations are more dissociable than those of GPT-4.
+
+
+
+
+
+ + ☆ Leveraging Large Language Models for Hybrid Workplace Decision Support + + +
+ Large Language Models (LLMs) hold the potential to perform a variety of text
+processing tasks and provide textual explanations for proposed actions or
+decisions. In the era of hybrid work, LLMs can provide intelligent decision
+support for workers who are designing their hybrid work plans. In particular,
+they can offer suggestions and explanations to workers balancing numerous
+decision factors, thereby enhancing their work experience. In this paper, we
+present a decision support model for workspaces in hybrid work environments,
+leveraging the reasoning skill of LLMs. We first examine the LLM's capability
+of making suitable workspace suggestions. We find that its reasoning extends
+beyond the guidelines in the prompt and that the LLM can manage the trade-off
+among the available resources in the workspaces. We conduct an extensive user
+study to understand workers' decision processes for workspace choices and to
+evaluate the effectiveness of the system. We observe that a worker's decision
+could be influenced by the LLM's suggestions and explanations. The participants
+in our study find the system to be convenient, regardless of whether reasons
+are provided or not. Our results show that employees can benefit from the
+LLM-empowered system for their workspace selection in hybrid workplaces.
+
+
+
+
+
+ + ☆ RAP: Retrieval-Augmented Planning with Contextual Memory for Multimodal + LLM Agents + + +
+ Owing to recent advancements, Large Language Models (LLMs) can now be
+deployed as agents for increasingly complex decision-making applications in
+areas including robotics, gaming, and API integration. However, reflecting past
+experiences in current decision-making processes, an innate human behavior,
+continues to pose significant challenges. Addressing this, we propose the
+Retrieval-Augmented Planning (RAP) framework, designed to dynamically leverage
+past experiences corresponding to the current situation and context, thereby
+enhancing agents' planning capabilities. RAP distinguishes itself by being
+versatile: it excels in both text-only and multimodal environments, making it
+suitable for a wide range of tasks. Empirical evaluations demonstrate RAP's
+effectiveness, where it achieves SOTA performance in textual scenarios and
+notably enhances multimodal LLM agents' performance in embodied tasks. These
+results highlight RAP's potential in advancing the functionality and
+applicability of LLM agents in complex, real-world applications.
+
+
+
+
+
+ + ☆ Improving Contextual Congruence Across Modalities for Effective + Multimodal Marketing using Knowledge-infused Learning + + +
+ The prevalence of smart devices with the ability to capture moments in
+multiple modalities has enabled users to experience multimodal information
+online. However, Large Language Models (LLMs) and Large Vision Models (LVMs)
+are still limited in capturing holistic meaning with cross-modal semantic
+relationships. Without explicit commonsense knowledge (e.g., a knowledge
+graph), Visual Language Models (VLMs) only learn implicit representations by
+capturing high-level patterns in vast corpora, missing essential contextual
+cross-modal cues. In this work, we design a framework to couple explicit
+commonsense knowledge in the form of knowledge graphs with large VLMs to
+improve the performance of a downstream task, predicting the effectiveness of
+multi-modal marketing campaigns. While the marketing application provides a
+compelling metric for assessing our methods, our approach enables the early
+detection of likely persuasive multi-modal campaigns and the assessment and
+augmentation of marketing theory.
+
+
+
+
+
+ + ☆ Identifying Reasons for Contraceptive Switching from Real-World Data + Using Large Language Models + + +
+ Prescription contraceptives play a critical role in supporting women's +reproductive health. With nearly 50 million women in the United States using +contraceptives, understanding the factors that drive contraceptives selection +and switching is of significant interest. However, many factors related to +medication switching are often only captured in unstructured clinical notes and +can be difficult to extract. Here, we evaluate the zero-shot abilities of a +recently developed large language model, GPT-4 (via HIPAA-compliant Microsoft +Azure API), to identify reasons for switching between classes of contraceptives +from the UCSF Information Commons clinical notes dataset. We demonstrate that +GPT-4 can accurately extract reasons for contraceptive switching, outperforming +baseline BERT-based models with microF1 scores of 0.849 and 0.881 for +contraceptive start and stop extraction, respectively. Human evaluation of +GPT-4-extracted reasons for switching showed 91.4% accuracy, with minimal +hallucinations. Using extracted reasons, we identified patient preference, +adverse events, and insurance as key reasons for switching using unsupervised +topic modeling approaches. Notably, we also showed using our approach that +"weight gain/mood change" and "insurance coverage" are disproportionately found +as reasons for contraceptive switching in specific demographic populations. Our +code and supplemental data are available at +https://github.com/BMiao10/contraceptive-switching. + +
+
+
+
+
+ + ☆ Detecting Mode Collapse in Language Models via Narration EACL 2024 + + +
+ No two authors write alike. Personal flourishes invoked in written +narratives, from lexicon to rhetorical devices, imply a particular author--what +literary theorists label the implied or virtual author; distinct from the real +author or narrator of a text. Early large language models trained on unfiltered +training sets drawn from a variety of discordant sources yielded incoherent +personalities, problematic for conversational tasks but proving useful for +sampling literature from multiple perspectives. Successes in alignment research +in recent years have allowed researchers to impose subjectively consistent +personae on language models via instruction tuning and reinforcement learning +from human feedback (RLHF), but whether aligned models retain the ability to +model an arbitrary virtual author has received little scrutiny. By studying +4,374 stories sampled from three OpenAI language models, we show successive +versions of GPT-3 suffer from increasing degrees of "mode collapse" whereby +overfitting the model during alignment constrains it from generalizing over +authorship: models suffering from mode collapse become unable to assume a +multiplicity of perspectives. Our method and results are significant for +researchers seeking to employ language models in sociological simulations. + +
+
+ comment: To appear in the proceedings of the first Workshop on the Scaling + Behavior of Large Language Models (EACL 2024) +
+
+
+
+
+ + ☆ Dual-View Visual Contextualization for Web Navigation + + +
+ Automatic web navigation aims to build a web agent that can follow language +instructions to execute complex and diverse tasks on real-world websites. +Existing work primarily takes HTML documents as input, which define the +contents and action spaces (i.e., actionable elements and operations) of +webpages. Nevertheless, HTML documents may not provide a clear task-related +context for each element, making it hard to select the right (sequence of) +actions. In this paper, we propose to contextualize HTML elements through their +"dual views" in webpage screenshots: each HTML element has its corresponding +bounding box and visual content in the screenshot. We build upon the insight -- +web developers tend to arrange task-related elements nearby on webpages to +enhance user experiences -- and propose to contextualize each element with its +neighbor elements, using both textual and visual features. The resulting +representations of HTML elements are more informative for the agent to take +action. We validate our method on the recently released Mind2Web dataset, which +features diverse navigation domains and tasks on real-world websites. Our +method consistently outperforms the baseline in all the scenarios, including +cross-task, cross-website, and cross-domain ones. + +
+
+
+
+
+ + ☆ Evaluating Embeddings for One-Shot Classification of Doctor-AI + Consultations + + +
+ Effective communication between healthcare providers and patients is
+crucial to providing high-quality patient care. In this work, we
+investigate how doctor-written and AI-generated texts in healthcare
+consultations can be classified using state-of-the-art embeddings and
+one-shot classification systems. By analyzing embeddings such as
+bag-of-words, character n-grams, Word2Vec, GloVe, fastText, and GPT2
+embeddings, we examine how well our one-shot classification systems capture
+semantic information within medical consultations. Results show that the
+embeddings capture semantic features from text in a reliable and adaptable
+manner. Overall, Word2Vec, GloVe, and character n-gram embeddings performed
+well, indicating their suitability for models targeted to this task; GPT2
+embeddings also showed notable performance. Our machine learning
+architectures significantly improved the quality of health conversations
+when training data were scarce, improving communication between patients
+and healthcare providers.
+
+
+
+
+
+
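+ A minimal sketch of the one-shot setup the entry above evaluates:
+classify a text by the cosine-nearest class prototype in embedding space.
+This is an illustrative reading, not the authors' exact classifier; the
+random vectors below stand in for real embeddings.
+
+import numpy as np
+
+def one_shot_classify(query_vec, prototypes):
+    """Return the label whose single prototype is most cosine-similar."""
+    def cos(a, b):
+        return float(a @ b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)
+    return max(prototypes, key=lambda label: cos(query_vec, prototypes[label]))
+
+rng = np.random.default_rng(0)
+protos = {"doctor": rng.normal(size=64), "ai": rng.normal(size=64)}
+print(one_shot_classify(rng.normal(size=64), protos))
+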
+ + ☆ Structured Entity Extraction Using Large Language Models + + +
+ Recent advances in machine learning have significantly impacted the field
+of information extraction, with Large Language Models (LLMs) playing a
+pivotal role in extracting structured information from unstructured text.
+This paper explores the challenges and limitations of current methodologies
+in structured entity extraction and introduces a novel approach to address
+these issues. We contribute to the field by first introducing and
+formalizing the task of Structured Entity Extraction (SEE), followed by
+proposing the Approximate Entity Set OverlaP (AESOP) metric, designed to
+appropriately assess model performance on this task. We then propose a new
+model that harnesses the power of LLMs for enhanced effectiveness and
+efficiency by decomposing the entire extraction task into multiple stages.
+Quantitative evaluation and human side-by-side evaluation confirm that our
+model outperforms baselines, offering promising directions for future
+advancements in structured entity extraction.
+
+
+
+
+
+
+ + ☆ Chatbot Meets Pipeline: Augment Large Language Model with Definite + Finite Automaton + + +
+ This paper introduces the Definite Finite Automaton augmented large
+language model (DFA-LLM), a novel framework designed to enhance the
+capabilities of conversational agents using large language models (LLMs).
+Traditional LLMs face challenges in generating regulated and compliant
+responses in special scenarios with predetermined response guidelines, such
+as emotional support and customer service. Our framework addresses these
+challenges by embedding a Definite Finite Automaton (DFA), learned from
+training dialogues, within the LLM. This structured approach enables the
+LLM to adhere to a deterministic response pathway, guided by the DFA. The
+advantages of DFA-LLM include an interpretable structure through a
+human-readable DFA, context-aware retrieval for responses in conversations,
+and plug-and-play compatibility with existing LLMs. Extensive benchmarks
+validate DFA-LLM's effectiveness, indicating its potential as a valuable
+contribution to conversational agents.
+
+
+
+ comment: 21 pages, 11 figures +
+
+
+
+
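+ To make the DFA idea above concrete, here is a toy sketch: states are
+dialogue stages, inputs are predicted intent tags, and the transition table
+keeps the agent on a predetermined response pathway. The states and tags
+are invented for illustration; the paper learns its DFA from training
+dialogues.
+
+DFA = {
+    ("start", "greeting"): "ask_issue",
+    ("ask_issue", "billing"): "billing_flow",
+    ("ask_issue", "other"): "human_handoff",
+}
+
+def next_state(state: str, tag: str) -> str:
+    """Advance the response-guideline DFA; stay put on unknown inputs."""
+    return DFA.get((state, tag), state)
+
+state = next_state("start", "greeting")        # -> "ask_issue"
+print(state, "->", next_state(state, "billing"))
+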
+ + ☆ Democratizing Large Language Models via Personalized Parameter-Efficient + Fine-tuning + + +
+ Personalization in large language models (LLMs) is increasingly
+important, aiming to align an LLM's interactions, content, and
+recommendations with individual user preferences. Recent advances in LLM
+personalization have spotlighted effective prompt design that enriches user
+queries with non-parametric knowledge through behavior history retrieval
+and textual profiles. However, these approaches are limited by a lack of
+model ownership, resulting in constrained customization and privacy issues,
+and they often fail to accurately capture user behavior patterns,
+especially when user data are complex and dynamic. To address these
+shortcomings, we introduce One PEFT Per User (OPPU), which employs
+personalized parameter-efficient fine-tuning (PEFT) modules to store
+user-specific behavior patterns and preferences. By plugging in their
+personal PEFT parameters, users can own and use their LLMs individually.
+OPPU integrates parametric user knowledge in the personal PEFT parameters
+with the non-parametric knowledge acquired through retrieval and profiles.
+This integration adapts individual LLMs to user behavior shifts.
+Experimental results demonstrate that OPPU significantly outperforms
+existing prompt-based methods across seven diverse tasks in the LaMP
+benchmark. Further in-depth studies reveal OPPU's enhanced capabilities in
+handling user behavior shifts, modeling users at different activity levels,
+maintaining robustness across various user history formats, and displaying
+versatility with different PEFT methods.
+
+
+
+
+
+
+ + ☆ QuIP#: Even Better LLM Quantization with Hadamard Incoherence and + Lattice Codebooks + + +
+ Post-training quantization (PTQ) reduces the memory footprint of LLMs by
+quantizing their weights to low precision. In this work, we introduce
+QuIP#, a weight-only PTQ method that achieves state-of-the-art results in
+extreme compression regimes ($\le$ 4 bits per weight) using three novel
+techniques. First, QuIP# improves the incoherence processing from QuIP by
+using the randomized Hadamard transform, which is faster and has better
+theoretical properties. Second, QuIP# uses vector quantization techniques
+to take advantage of the ball-shaped sub-Gaussian distribution that
+incoherent weights possess: specifically, we introduce a set of
+hardware-efficient codebooks based on the highly symmetric $E_8$ lattice,
+which achieves the optimal 8-dimensional unit ball packing. Third, QuIP#
+uses fine-tuning to improve fidelity to the original model. Our experiments
+show that QuIP# outperforms existing PTQ methods, enables new behaviors in
+PTQ scaling, and supports fast inference.
+
+
+
+ comment: Preprint +
+
+
+
+
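+ A minimal sketch of the incoherence-processing step QuIP# describes: a
+two-sided randomized Hadamard rotation that spreads weight mass so no
+single entry dominates. This toy version assumes power-of-two dimensions
+and omits the vector quantization and fine-tuning stages.
+
+import numpy as np
+from scipy.linalg import hadamard
+
+def randomized_hadamard(W, seed=0):
+    """Orthogonal rotation Hn D1 W D2 Hm with random sign diagonals."""
+    rng = np.random.default_rng(seed)
+    n, m = W.shape
+    Hn = hadamard(n) / np.sqrt(n)            # orthonormal Hadamard matrices
+    Hm = hadamard(m) / np.sqrt(m)
+    s1 = rng.choice([-1.0, 1.0], size=n)     # random sign flips
+    s2 = rng.choice([-1.0, 1.0], size=m)
+    return (Hn * s1) @ W @ (s2[:, None] * Hm)
+
+W_rot = randomized_hadamard(np.random.randn(8, 16))  # invertible rotation
+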
+ + ☆ The World of Generative AI: Deepfakes and Large Language Models + + +
+ We live in the era of Generative Artificial Intelligence (GenAI).
+Deepfakes and Large Language Models (LLMs) are two examples of GenAI.
+Deepfakes, in particular, pose an alarming threat to society as they are
+capable of spreading misinformation and distorting the truth. LLMs are
+powerful models that generate general-purpose language; however, due to
+their generative nature, they can also pose a risk to people if used with
+ill intent. The ethical use of these technologies is a major concern. This
+short article examines the interrelationship between them.
+
+
+
+
+
+
+ + ☆ The Hedgehog & the Porcupine: Expressive Linear Attentions with Softmax + Mimicry ICLR 2024 + + +
+ Linear attentions have shown potential for improving Transformer
+efficiency, reducing attention's quadratic complexity to linear in sequence
+length. This holds exciting promise for (1) training linear Transformers
+from scratch, (2) "finetuned-conversion" of task-specific Transformers into
+linear versions that recover task performance, and (3)
+"pretrained-conversion" of Transformers, such as large language models,
+into linear versions finetunable on downstream tasks. However, linear
+attentions often underperform standard softmax attention in quality. To
+close this performance gap, we find prior linear attentions lack key
+properties of softmax attention tied to good performance: low-entropy (or
+"spiky") weights and dot-product monotonicity. We further observe
+surprisingly simple feature maps that retain these properties and match
+softmax performance, but are inefficient to compute in linear attention. We
+thus propose Hedgehog, a learnable linear attention that retains the spiky
+and monotonic properties of softmax attention while maintaining linear
+complexity. Hedgehog uses simple trainable MLPs to produce attention
+weights mimicking softmax attention. Experiments show Hedgehog recovers
+over 99% of standard Transformer quality in train-from-scratch and
+finetuned-conversion settings, outperforming prior linear attentions by up
+to 6 perplexity points on WikiText-103 with causal GPTs, and by up to 8.7
+GLUE score points on finetuned bidirectional BERTs. Hedgehog also enables
+pretrained-conversion: converting a pretrained GPT-2 into a linear
+attention variant achieves a state-of-the-art 16.7 perplexity on
+WikiText-103 for 125M subquadratic decoder models. We finally turn a
+pretrained Llama-2 7B into a viable linear attention Llama. With low-rank
+adaptation, Hedgehog-Llama2 7B achieves ROUGE-1 scores 28.1 points higher
+than the base standard-attention model, where prior linear attentions lead
+to drops of 16.5 points.
+
+
+
+ comment: 30 pages, 20 figures, 15 tables, ICLR 2024 +
+
+
+
+
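+ The core mechanic above--linear attention with a learnable feature
+map--can be sketched in a few lines. The softmax-based feature map below is
+one plausible "spiky" choice for illustration, not the exact Hedgehog MLP.
+
+import torch
+
+def causal_linear_attention(q, k, v, feature_map):
+    """O(n) causal attention given a non-negative feature map phi."""
+    pq, pk = feature_map(q), feature_map(k)                  # (B, T, F)
+    kv = torch.einsum("btf,btd->btfd", pk, v).cumsum(dim=1)  # running phi(k) v^T
+    z = pk.cumsum(dim=1)                                     # running phi(k) sums
+    num = torch.einsum("btf,btfd->btd", pq, kv)
+    den = torch.einsum("btf,btf->bt", pq, z).clamp_min(1e-6)
+    return num / den.unsqueeze(-1)
+
+feat = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.Softmax(dim=-1))
+q = k = v = torch.randn(2, 10, 32)
+out = causal_linear_attention(q, k, v, feat)                 # (2, 10, 32)
+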
+ + ☆ LegalLens: Leveraging LLMs for Legal Violation Identification in + Unstructured Text + + +
+ In this study, we focus on two main tasks: the first is detecting legal
+violations within unstructured textual data, and the second is associating
+these violations with potentially affected individuals. We constructed two
+datasets using Large Language Models (LLMs), which were subsequently
+validated by domain expert annotators. Both tasks were designed
+specifically for the context of class-action cases. The experimental design
+incorporated fine-tuning models from the BERT family and open-source LLMs,
+as well as few-shot experiments using closed-source LLMs. Our results, with
+F1-scores of 62.69% (violation identification) and 81.02% (associating
+victims), show that our datasets and setups can be used for both tasks.
+Finally, we publicly release the datasets and the code used for the
+experiments in order to advance further research in the area of legal
+natural language processing (NLP).
+
+
+
+
+
+
+ + ☆ LESS: Selecting Influential Data for Targeted Instruction Tuning + + +
+ Instruction tuning has unlocked powerful capabilities in large language
+models (LLMs), effectively using combined datasets to develop
+general-purpose chatbots. However, real-world applications often require a
+specialized suite of skills (e.g., reasoning). The challenge lies in
+identifying the most relevant data from these extensive datasets to
+effectively develop specific capabilities, a setting we frame as targeted
+instruction tuning. We propose LESS, an optimizer-aware and practically
+efficient algorithm to estimate data influences and perform Low-rank
+gradiEnt Similarity Search for instruction data selection. Crucially, LESS
+adapts existing influence formulations to work with the Adam optimizer and
+variable-length instruction data. LESS first constructs a highly reusable
+and transferable gradient datastore with low-dimensional gradient features
+and then selects examples based on their similarity to few-shot examples
+embodying a specific capability. Experiments show that training on a
+LESS-selected 5% of the data can often outperform training on the full
+dataset across diverse downstream tasks. Furthermore, the selected data is
+highly transferable: smaller models can be leveraged to select useful data
+for larger models and for models from different families. Our qualitative
+analysis shows that our method goes beyond surface form cues to identify
+data that exemplifies the necessary reasoning skills for the intended
+downstream application.
+
+
+
+ comment: Code and data are available at https://github.com/princeton-nlp/LESS +
+
+
+
+
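+ The selection loop LESS describes can be pictured as follows: project
+per-example gradients to a low dimension, normalize, and rank training
+examples by similarity to a target-task gradient feature. This sketch uses
+a plain random projection and omits LESS's Adam-aware influence
+formulation.
+
+import torch
+
+def gradient_feature(model, loss, proj):
+    """Backprop one example, flatten gradients, project, L2-normalize."""
+    model.zero_grad()
+    loss.backward()
+    g = torch.cat([p.grad.flatten() for p in model.parameters()
+                   if p.grad is not None])
+    return torch.nn.functional.normalize(proj @ g, dim=0)
+
+def select_top_k(train_feats, target_feat, k):
+    """Rank by inner product (cosine, since features are unit norm)."""
+    return (train_feats @ target_feat).topk(k).indices
+
+model = torch.nn.Linear(4, 1)
+proj = torch.randn(16, sum(p.numel() for p in model.parameters()))
+x, y = torch.randn(8, 4), torch.randn(8, 1)
+feat = gradient_feature(model, ((model(x) - y) ** 2).mean(), proj)
+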
+ + ☆ Training Language Models to Generate Text with Citations via + Fine-grained Rewards + + +
+ While recent Large Language Models (LLMs) have proven useful in answering +user queries, they are prone to hallucination, and their responses often lack +credibility due to missing references to reliable sources. An intuitive +solution to these issues would be to include in-text citations referring to +external documents as evidence. While previous works have directly prompted +LLMs to generate in-text citations, their performances are far from +satisfactory, especially when it comes to smaller LLMs. In this work, we +propose an effective training framework using fine-grained rewards to teach +LLMs to generate highly supportive and relevant citations, while ensuring the +correctness of their responses. We also conduct a systematic analysis of +applying these fine-grained rewards to common LLM training strategies, +demonstrating its advantage over conventional practices. We conduct extensive +experiments on Question Answering (QA) datasets taken from the ALCE benchmark +and validate the model's generalizability using EXPERTQA. On LLaMA-2-7B, the +incorporation of fine-grained rewards achieves the best performance among the +baselines, even surpassing that of GPT-3.5-turbo. + +
+
+
+
+
+ + ☆ BiLLM: Pushing the Limit of Post-Training Quantization for LLMs + + +
+ Pretrained large language models (LLMs) exhibit exceptional general
+language processing capabilities but come with significant demands on
+memory and computational resources. As a powerful compression technology,
+binarization can reduce model weights to a mere 1 bit, lowering the
+expensive computation and memory requirements. However, existing
+quantization techniques fall short of maintaining LLM performance under
+ultra-low bit-widths. In response to this challenge, we present BiLLM, a
+groundbreaking 1-bit post-training quantization scheme tailored for
+pretrained LLMs. Based on the weight distribution of LLMs, BiLLM first
+identifies and structurally selects salient weights, and minimizes the
+compression loss through an effective binary residual approximation
+strategy. Moreover, considering the bell-shaped distribution of the
+non-salient weights, we propose an optimal splitting search to group and
+binarize them accurately. BiLLM achieves, for the first time, high-accuracy
+inference (e.g., 8.41 perplexity on LLaMA2-70B) with only 1.08-bit weights
+across various LLM families and evaluation metrics, outperforming SOTA LLM
+quantization methods by significant margins. Moreover, BiLLM can binarize
+an LLM with 7 billion weights within 0.5 hours on a single GPU,
+demonstrating satisfactory time efficiency.
+
+
+
+ comment: 19 pages +
+
+
+
+
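+ The binary residual approximation mentioned above can be sketched simply:
+binarize once, then binarize the leftover error a second time. This toy
+version uses a single per-matrix scale and skips BiLLM's structural
+saliency selection and splitting search.
+
+import torch
+
+def binarize(W):
+    """1-bit approximation: sign pattern times a per-matrix scale."""
+    return W.abs().mean() * W.sign()
+
+def residual_binarize(W):
+    """Two binarization passes: the second fits the first pass's error."""
+    B1 = binarize(W)
+    return B1 + binarize(W - B1)
+
+W = torch.randn(256, 256)
+rel_err = (W - residual_binarize(W)).norm() / W.norm()  # < one-pass error
+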
+ + ♻ ☆ Extreme Compression of Large Language Models via Additive Quantization + + +
+ The emergence of accurate open large language models (LLMs) has led to a
+race towards quantization techniques for such models enabling execution on
+end-user devices. In this paper, we revisit the problem of "extreme" LLM
+compression--defined as targeting extremely low bit counts, such as 2 to 3
+bits per parameter, from the point of view of classic methods in
+Multi-Codebook Quantization (MCQ). Our work builds on top of Additive
+Quantization, a classic algorithm from the MCQ family, and adapts it to the
+quantization of language models. The resulting algorithm advances the
+state-of-the-art in LLM compression, outperforming all recently-proposed
+techniques in terms of accuracy at a given compression budget. For
+instance, when compressing Llama 2 models to 2 bits per parameter, our
+algorithm quantizes the 7B model to 6.93 perplexity (a 1.29 improvement
+relative to the best prior work, and 1.81 points from FP16), the 13B model
+to 5.70 perplexity (a 0.36 improvement) and the 70B model to 3.94
+perplexity (a 0.22 improvement) on WikiText2. We release our implementation
+of Additive Quantization for Language Models AQLM as a baseline to
+facilitate future research in LLM quantization.
+
+
+
+ comment: Preprint +
+
+
+
+
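+ The Multi-Codebook Quantization idea behind the paper can be sketched
+with a greedy encoder: each group of weights is approximated by a sum of
+one codeword per codebook. AQLM itself learns the codebooks and uses beam
+search; this greedy one-step variant is only illustrative.
+
+import numpy as np
+
+def aq_encode(x, codebooks):
+    """Pick one codeword per codebook so their sum approximates x."""
+    residual, codes = x.astype(float).copy(), []
+    for C in codebooks:                            # each C: (K, d)
+        idx = int(((residual - C) ** 2).sum(axis=1).argmin())
+        codes.append(idx)
+        residual -= C[idx]
+    return codes, x - residual                     # indices, reconstruction
+
+rng = np.random.default_rng(0)
+books = [rng.normal(size=(256, 8)) for _ in range(2)]  # 2 x 8 bits per group
+codes, approx = aq_encode(rng.normal(size=8), books)
+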
+ + ♻ ☆ DirecT2V: Large Language Models are Frame-Level Directors for Zero-Shot + Text-to-Video Generation + + +
+ In the paradigm of AI-generated content (AIGC), there has been increasing +attention to transferring knowledge from pre-trained text-to-image (T2I) models +to text-to-video (T2V) generation. Despite their effectiveness, these +frameworks face challenges in maintaining consistent narratives and handling +shifts in scene composition or object placement from a single abstract user +prompt. Exploring the ability of large language models (LLMs) to generate +time-dependent, frame-by-frame prompts, this paper introduces a new framework, +dubbed DirecT2V. DirecT2V leverages instruction-tuned LLMs as directors, +enabling the inclusion of time-varying content and facilitating consistent +video generation. To maintain temporal consistency and prevent mapping the +value to a different object, we equip a diffusion model with a novel value +mapping method and dual-softmax filtering, which do not require any additional +training. The experimental results validate the effectiveness of our framework +in producing visually coherent and storyful videos from abstract user prompts, +successfully addressing the challenges of zero-shot video generation. + +
+
+ comment: The code and demo will be available at + https://github.com/KU-CVLAB/DirecT2V +
+
+
+
+
+ + ♻ ☆ DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open + Language Models + + +
+ Mathematical reasoning poses a significant challenge for language models due +to its complex and structured nature. In this paper, we introduce DeepSeekMath +7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B +math-related tokens sourced from Common Crawl, together with natural language +and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the +competition-level MATH benchmark without relying on external toolkits and +voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. +Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. +The mathematical reasoning capability of DeepSeekMath is attributed to two key +factors: First, we harness the significant potential of publicly available web +data through a meticulously engineered data selection pipeline. Second, we +introduce Group Relative Policy Optimization (GRPO), a variant of Proximal +Policy Optimization (PPO), that enhances mathematical reasoning abilities while +concurrently optimizing the memory usage of PPO. + +
+
+
+
+
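+ The GRPO variant named above replaces PPO's learned value baseline with a
+group-relative one: sample several answers per prompt and normalize their
+rewards within the group. A minimal sketch of that advantage computation
+(the clipped policy update is omitted):
+
+import torch
+
+def grpo_advantages(group_rewards):
+    """Within-group reward normalization; no learned critic needed."""
+    r = torch.as_tensor(group_rewards, dtype=torch.float32)
+    return (r - r.mean()) / (r.std() + 1e-8)
+
+print(grpo_advantages([1.0, 0.0, 0.0, 1.0]))  # above-mean answers get > 0
+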
+ + ♻ ☆ Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction + + +
+ Efforts to align Large Language Models (LLMs) are mainly conducted via
+Reinforcement Learning from Human Feedback (RLHF) methods. However, RLHF
+encounters major challenges including training reward models, actor-critic
+engineering, and, importantly, it requires access to LLM parameters. Here
+we introduce Aligner, a new efficient alignment paradigm that bypasses the
+whole RLHF process by learning the correctional residuals between aligned
+and unaligned answers. Our Aligner offers several key advantages. Firstly,
+it is an autoregressive seq2seq model that is trained on the
+query-answer-correction dataset via supervised learning; this offers a
+parameter-efficient alignment solution with minimal resources. Secondly,
+the Aligner facilitates weak-to-strong generalization; finetuning large
+pretrained models with Aligner's supervisory signals demonstrates a strong
+performance boost. Thirdly, Aligner functions as a model-agnostic
+plug-and-play module, allowing for its direct application to different
+open-source and API-based models. Remarkably, Aligner-7B improves 11
+different LLMs by 21.9% in helpfulness and 23.8% in harmlessness on average
+(GPT-4 by 17.5% and 26.9%). When finetuning (strong) Llama2-70B with (weak)
+Aligner-13B's supervision, we can improve Llama2 by 8.2% in helpfulness and
+61.6% in harmlessness. See our dataset and code at
+https://aligner2024.github.io
+
+
+
+ comment: 34 pages +
+
+
+
+
+ + ♻ ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting
+capabilities in large language models (LLMs) beyond those offered by
+paradigms such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea
+and primary advantage of GoT is the ability to model the information
+generated by an LLM as an arbitrary graph, where units of information ("LLM
+thoughts") are vertices, and edges correspond to dependencies between these
+vertices. This approach enables combining arbitrary LLM thoughts into
+synergistic outcomes, distilling the essence of whole networks of thoughts,
+or enhancing thoughts using feedback loops. We illustrate that GoT offers
+advantages over the state of the art on different tasks, for example
+increasing the quality of sorting by 62% over ToT, while simultaneously
+reducing costs by >31%. We ensure that GoT is extensible with new thought
+transformations and thus can be used to spearhead new prompting schemes.
+This work brings LLM reasoning closer to human thinking and brain
+mechanisms such as recurrence, both of which form complex networks.
+
+
+
+
+
+
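+ The graph abstraction above is easy to picture as a data structure:
+thoughts are vertices, dependency edges point at parents, and an
+aggregation edge merges several thoughts into a new vertex. A minimal
+illustrative sketch, not the authors' framework API:
+
+from dataclasses import dataclass, field
+
+@dataclass
+class Thought:
+    text: str
+    score: float = 0.0
+    parents: list = field(default_factory=list)  # dependency edges
+
+def aggregate(thoughts, combine_fn):
+    """GoT-style aggregation: merge several thoughts into one vertex."""
+    return Thought(combine_fn([t.text for t in thoughts]),
+                   parents=list(thoughts))
+
+a, b = Thought("sort left half"), Thought("sort right half")
+merged = aggregate([a, b], lambda ts: "; ".join(ts) + "; merge results")
+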
+ + ♻ ☆ LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise + Comparisons using Large Language Models EACL 2024 + + +
+ Current developments in large language models (LLMs) have enabled impressive +zero-shot capabilities across various natural language tasks. An interesting +application of these systems is in the automated assessment of natural language +generation (NLG), a highly challenging area with great practical benefit. In +this paper, we explore two options for exploiting the emergent abilities of +LLMs for zero-shot NLG assessment: absolute score prediction, and comparative +assessment which uses relative comparisons between pairs of candidates. Though +comparative assessment has not been extensively studied in NLG assessment, we +note that humans often find it more intuitive to compare two options rather +than scoring each one independently. This work examines comparative assessment +from multiple perspectives: performance compared to absolute grading; +positional biases in the prompt; and efficient ranking in terms of the number +of comparisons. We illustrate that LLM comparative assessment is a simple, +general and effective approach for NLG assessment. For moderate-sized +open-source LLMs, such as FlanT5 and Llama2-chat, comparative assessment is +superior to prompt scoring, and in many cases can achieve performance +competitive with state-of-the-art methods. Additionally, we demonstrate that +LLMs often exhibit strong positional biases when making pairwise comparisons, +and we propose debiasing methods that can further improve performance. + +
+
+ comment: To Appear at EACL 2024 +
+
+
+
+
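+ A standard way to soften the positional bias noted above is to query the
+judge with both presentation orders and average, so a consistent
+first-position preference cancels; whether this matches the paper's exact
+debiasing method is not specified here. A sketch (the probabilities are
+assumed to come from the judge LLM):
+
+def debiased_preference(p_first_when_a_first, p_first_when_b_first):
+    """P(A wins), averaged over both candidate orders."""
+    p_a_order1 = p_first_when_a_first           # A shown first
+    p_a_order2 = 1.0 - p_first_when_b_first     # B shown first
+    return 0.5 * (p_a_order1 + p_a_order2)
+
+print(debiased_preference(0.9, 0.7))  # 0.6 once position bias cancels
+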
+ + ♻ ☆ OceanGPT: A Large Language Model for Ocean Science Tasks + + +
+ Ocean science, which delves into the oceans that are reservoirs of life
+and biodiversity, is of great significance given that oceans cover over 70%
+of our planet's surface. Recently, advances in Large Language Models (LLMs)
+have transformed the paradigm in science. Despite the success in other
+domains, current LLMs often fall short in catering to the needs of domain
+experts like oceanographers, and the potential of LLMs for ocean science is
+under-explored. The intrinsic reason may be the immense and intricate
+nature of ocean data as well as the necessity for higher granularity and
+richness in knowledge. To alleviate these issues, we introduce OceanGPT,
+the first-ever LLM in the ocean domain, which is adept at various ocean
+science tasks. We propose DoInstruct, a novel framework to automatically
+obtain a large volume of ocean domain instruction data, which generates
+instructions based on multi-agent collaboration. Additionally, we construct
+the first oceanography benchmark, OceanBench, to evaluate the capabilities
+of LLMs in the ocean domain. Through comprehensive experiments, OceanGPT
+not only shows a higher level of knowledge expertise for ocean science
+tasks but also gains preliminary embodied intelligence capabilities in
+ocean technology. Codes, data and checkpoints will soon be available at
+https://github.com/zjunlp/KnowLM.
+
+
+
+ comment: Work in progress. Project Website: + https://zjunlp.github.io/project/OceanGPT/ +
+
+
+
+
+ + ♻ ☆ Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware + Direct Preference Optimization + + +
+ Multimodal large language models have made significant advancements in
+recent years, yet they still suffer from a common issue known as the
+"hallucination problem", in which the models generate textual descriptions
+that inaccurately depict or entirely fabricate content from associated
+images. This paper introduces a novel solution, Hallucination-Aware Direct
+Preference Optimization (HA-DPO), which reframes the hallucination problem
+as a preference selection task. The model is trained to favor the
+non-hallucinating response when presented with two responses of the same
+image (one accurate and one hallucinatory). Furthermore, this paper
+proposes an efficient pipeline for constructing positive
+(non-hallucinatory) and negative (hallucinatory) sample pairs, ensuring a
+high-quality, style-consistent dataset for robust preference learning. When
+applied to three mainstream multimodal models, HA-DPO significantly reduced
+hallucination issues and amplified the models' generalization capabilities.
+Notably, the MiniGPT-4 model, when enhanced with HA-DPO, demonstrated a
+substantial improvement: POPE accuracy rose from 51.13% to 86.13% (an
+absolute improvement of 35%), and the MME score surged from 932.00 to
+1326.46 (a relative improvement of 42.32%). The codes, models, and datasets
+are made accessible at https://opendatalab.github.io/HA-DPO.
+
+
+
+ comment: Project Website: https://opendatalab.github.io/HA-DPO, Code: + https://github.com/opendatalab/HA-DPO +
+
+
+
+
+ + ♻ ☆ Language Model Training Paradigms for Clinical Feature Embeddings NeurIPS 2023 + + +
+ In research areas with scarce data, representation learning plays a +significant role. This work aims to enhance representation learning for +clinical time series by deriving universal embeddings for clinical features, +such as heart rate and blood pressure. We use self-supervised training +paradigms for language models to learn high-quality clinical feature +embeddings, achieving a finer granularity than existing time-step and +patient-level representation learning. We visualize the learnt embeddings via +unsupervised dimension reduction techniques and observe a high degree of +consistency with prior clinical knowledge. We also evaluate the model +performance on the MIMIC-III benchmark and demonstrate the effectiveness of +using clinical feature embeddings. We publish our code online for replication. + +
+
+ comment: Poster at "NeurIPS 2023 Workshop: Self-Supervised Learning - Theory + and Practice" +
+
+
+
+
+ + ♻ ☆ Critical Data Size of Language Models from a Grokking Perspective + + +
+ We explore the critical data size of language models, a threshold that
+marks a fundamental shift from quick memorization to slow generalization.
+We formalize the phase transition under the grokking configuration into the
+Data Efficiency Hypothesis and identify data insufficiency, sufficiency,
+and surplus regimes in language model training dynamics. We develop a
+grokking configuration that stably reproduces grokking on simple language
+models by rescaling initialization and weight decay. We show that
+generalization occurs only when language models reach a critical size. We
+analyze grokking in both sample-wise and model-wise settings, verifying the
+proposed Data Efficiency Hypothesis. Our experiments reveal smoother phase
+transitions occurring at the critical dataset size for language datasets.
+As the model size increases, this critical point also becomes larger,
+indicating that larger models require more data. Our results deepen the
+understanding of language model training, offering a novel perspective on
+the role of data in the learning mechanism of language models.
+
+
+
+
+
+
+ + ♻ ☆ LLsM: Generative Linguistic Steganography with Large Language Model + + +
+ Linguistic Steganography (LS) tasks aim to generate steganographic text
+(stego) based on secret information. Only authorized recipients can
+perceive the existence of the secrets in the texts and extract them,
+thereby preserving privacy. However, the controllability of the stego
+generated by existing schemes is poor, and it is difficult for the stego to
+carry specific discourse characteristics such as style. As a result, the
+stego is easily detectable, compromising covert communication. To address
+these problems, this paper proposes LLsM, the first LS method built on a
+Large Language Model (LLM). We fine-tuned LLaMA2 on a large-scale
+constructed dataset encompassing rich discourse characteristics, which
+enables the fine-tuned LLM to generate texts with specific discourse in a
+controllable manner. The discourse is then used as guiding information and
+fed to the fine-tuned LLM, in the form of a prompt, together with the
+secret. On this basis, the constructed candidate pool is range-encoded, and
+the secret determines the interval; the common prefix of the interval's
+beginning and end is the secret embedded at that step. Experiments show
+that LLsM outperforms prevalent LS-task and related-task baselines
+regarding text quality, statistical analysis, discourse matching, and
+anti-steganalysis. In particular, LLsM's MAUVE metric surpasses some
+baselines by 70%-80%, and its anti-steganalysis performance is 30%-40%
+higher. Notably, we also present examples of longer stegos generated by
+LLsM, showing its potential superiority in long LS tasks.
+
+
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Distilling Event Sequence Knowledge From Large Language Models + + +
+ Event sequence models have been found to be highly effective in the
+analysis and prediction of events. Building such models requires the
+availability of abundant high-quality event sequence data. In certain
+applications, however, clean structured event sequences are not available,
+and automated sequence extraction results in data that is too noisy and
+incomplete. In this work, we explore the use of Large Language Models
+(LLMs) to generate event sequences that can effectively be used for
+probabilistic event model construction. This can be viewed as a mechanism
+for distilling event sequence knowledge from LLMs. Our approach relies on a
+Knowledge Graph (KG) of event concepts with partial causal relations to
+guide the generative language model for causal event sequence generation.
+We show that our approach can generate high-quality event sequences,
+filling a knowledge gap in the input KG. Furthermore, we explore how the
+generated sequences can be leveraged to discover useful and more complex
+structured knowledge from pattern mining and probabilistic event models. We
+release our sequence generation code and evaluation framework, as well as a
+corpus of event sequence data.
+
+
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ MR-GSM8K: A Meta-Reasoning Revolution in Large Language Model Evaluation + + +
+ In this work, we introduce a novel evaluation paradigm for Large Language
+Models, one that challenges them to engage in meta-reasoning. This approach
+addresses critical shortcomings in existing math problem-solving
+benchmarks, traditionally used to evaluate the cognitive capabilities of
+agents. Our paradigm shifts the focus from result-oriented assessments,
+which often overlook the reasoning process, to a more holistic evaluation
+that effectively differentiates the cognitive capabilities among models.
+For example, in our benchmark, GPT-4 demonstrates performance five times
+better than GPT-3.5. The significance of this new paradigm lies in its
+ability to reveal potential cognitive deficiencies in LLMs that current
+benchmarks, such as GSM8K, fail to uncover due to their saturation and lack
+of effective differentiation among varying reasoning abilities. Our
+comprehensive analysis includes several state-of-the-art math models from
+both open-source and closed-source communities, uncovering fundamental
+deficiencies in their training and evaluation approaches. This paper not
+only advocates for a paradigm shift in the assessment of LLMs but also
+contributes to the ongoing discourse on the trajectory towards Artificial
+General Intelligence (AGI). By promoting the adoption of meta-reasoning
+evaluation methods similar to ours, we aim to facilitate a more accurate
+assessment of the true cognitive abilities of LLMs.
+
+
+
+ comment: Code: https://github.com/dvlab-research/MR-GSM8K +
+
+
+
+
+ + ♻ ☆ Sig-Networks Toolkit: Signature Networks for Longitudinal Language + Modelling EACL 2024 + + +
+ We present an open-source, pip-installable toolkit, Sig-Networks, the
+first of its kind for longitudinal language modelling. A central focus is
+the incorporation of Signature-based Neural Network models, which have
+recently shown success in temporal tasks. We apply and extend published
+research, providing a full suite of signature-based models whose components
+can be used as PyTorch building blocks in future architectures.
+Sig-Networks enables task-agnostic dataset plug-in, seamless pre-processing
+for sequential data, parameter flexibility, and automated tuning across a
+range of models. We examine signature networks under three NLP tasks of
+varying temporal granularity: counselling conversations, rumour stance
+switch, and mood changes in social media threads, showing SOTA performance
+in all three, and provide guidance for future tasks. We release the toolkit
+as a PyTorch package with an introductory video and Git repositories for
+preprocessing and modelling, including sample notebooks for the modelled
+NLP tasks.
+
+
+
+ comment: To appear in EACL 2024: System Demonstrations +
+
+
+
+
+ + ♻ ☆ Think Twice: Measuring the Efficiency of Eliminating Prediction + Shortcuts of Question Answering Models EACL 2024 + + +
+ While Large Language Models (LLMs) dominate a majority of language
+understanding tasks, previous work shows that some of these results are
+supported by modelling spurious correlations of training datasets. Authors
+commonly assess model robustness by evaluating their models on
+out-of-distribution (OOD) datasets of the same task, but these datasets
+might share the bias of the training dataset.
+ We propose a simple method for measuring the scale of models' reliance on
+any identified spurious feature and assess robustness towards a large set
+of known and newly found prediction biases for various pre-trained models
+and debiasing methods in Question Answering (QA). We find that while
+existing debiasing methods can mitigate reliance on a chosen spurious
+feature, the OOD performance gains of these methods cannot be explained by
+mitigated reliance on biased features, suggesting that biases are shared
+among different QA datasets. Finally, we confirm this by measuring that
+models trained on different QA datasets rely comparably on the same bias
+features. We hope these results will motivate future work to refine reports
+of LMs' robustness to the level of adversarial samples addressing specific
+spurious features.
+
+
+
+ comment: Long paper in Proceedings of EACL 2024: Main track +
+
+
+
+
+ + ♻ ☆ Software-Based Dialogue Systems: Survey, Taxonomy and Challenges + + +
+ The use of natural language interfaces in the field of human-computer
+interaction is undergoing intense study through dedicated scientific and
+industrial research. The latest contributions in the field, including deep
+learning approaches like recurrent neural networks, the potential of
+context-aware strategies, and user-centred design approaches, have brought
+the attention of the community back to software-based dialogue systems,
+generally known as conversational agents or chatbots. Nonetheless, and
+given the novelty of the field, a generic, context-independent overview of
+the current state of research on conversational agents covering all
+research perspectives involved is missing. Motivated by this context, this
+paper reports a survey of the current state of research on conversational
+agents through a systematic literature review of secondary studies. The
+conducted research is designed to develop an exhaustive perspective through
+a clear presentation of the aggregated knowledge published by recent
+literature within a variety of domains, research focuses, and contexts. As
+a result, this research proposes a holistic taxonomy of the different
+dimensions involved in the conversational agents' field, which is expected
+to help researchers and lay the groundwork for future research in the field
+of natural language interfaces.
+
+
+
+
+
+
+ + ♻ ☆ Scaling Transformer to 1M tokens and beyond with RMT + + +
+ A major limitation for the broader scope of problems solvable by transformers +is the quadratic scaling of computational complexity with input size. In this +study, we investigate the recurrent memory augmentation of pre-trained +transformer models to extend input context length while linearly scaling +compute. Our approach demonstrates the capability to store information in +memory for sequences of up to an unprecedented two million tokens while +maintaining high retrieval accuracy. Experiments with language modeling tasks +show perplexity improvement as the number of processed input segments +increases. These results underscore the effectiveness of our method, which has +significant potential to enhance long-term dependency handling in natural +language understanding and generation tasks, as well as enable large-scale +context processing for memory-intensive applications. + +
+
+
+
+
+ + ♻ ☆ SimLM: Can Language Models Infer Parameters of Physical Systems? + + +
+ Several machine learning methods aim to learn or reason about complex
+physical systems. A common first step towards reasoning is to infer system
+parameters from observations of a system's behavior. In this paper, we
+investigate the performance of Large Language Models (LLMs) at performing
+parameter inference in the context of physical systems. Our experiments
+suggest that they are not inherently suited to this task, even for simple
+systems. We propose a promising direction of exploration, which involves
+the use of physical simulators to augment the context of LLMs. We assess
+and compare the performance of different LLMs on a simple example with and
+without access to physical simulation.
+
+
+
+
+
+
+ + ♻ ☆ LPNL: Scalable Link Prediction with Large Language Models + + +
+ Exploring the application of large language models (LLMs) to graph
+learning is an emerging endeavor. However, the vast amount of information
+inherent in large graphs poses significant challenges to this process. This
+work focuses on the link prediction task and introduces $\textbf{LPNL}$
+(Link Prediction via Natural Language), a framework based on large language
+models designed for scalable link prediction on large-scale heterogeneous
+graphs. We design novel prompts for link prediction that articulate graph
+details in natural language. We propose a two-stage sampling pipeline to
+extract crucial information from the graphs, and a divide-and-conquer
+strategy to control the input tokens within predefined limits, addressing
+the challenge of overwhelming information. We fine-tune a T5 model based on
+our self-supervised learning designed for link prediction. Extensive
+experimental results demonstrate that LPNL outperforms multiple advanced
+baselines in link prediction tasks on large-scale graphs.
+
+
+
+
+
+
+ + ♻ ☆ Linear Alignment of Vision-language Models for Image Captioning + + +
+ Recently, vision-language models like CLIP have advanced the state of the art +in a variety of multi-modal tasks including image captioning and caption +evaluation. Many approaches adapt CLIP-style models to a downstream task by +training a mapping network between CLIP and a language model. This is costly as +it usually involves calculating gradients for large models. We propose a more +efficient training protocol that fits a linear mapping between image and text +embeddings of CLIP via a closed-form solution. This bypasses the need for +gradient computation and results in a lightweight captioning method called +ReCap, which can be trained up to 1000 times faster than existing lightweight +methods. Moreover, we propose two new learning-based image-captioning metrics +that build on CLIP score along with our linear mapping. Furthermore, we combine +ReCap with our new metrics to design an iterative datastore-augmentation loop +(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k, +VizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art +lightweight methods on established metrics while outperforming them on our new +metrics, which are better aligned with human ratings on Flickr8k-Expert and +Flickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to +other domains and that our DAL leads to a performance boost. + +
+
+ comment: 8 pages (+ references and appendix) +
+
+
+
+
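+ The closed-form linear mapping at the heart of ReCap can be illustrated
+with ridge least squares between embedding spaces, one natural closed form;
+the paper's exact solution may differ, and the random matrices below stand
+in for real CLIP embeddings.
+
+import numpy as np
+
+def fit_linear_map(img_emb, txt_emb, lam=1e-3):
+    """Solve W = (X^T X + lam I)^{-1} X^T Y so img_emb @ W ~ txt_emb."""
+    d = img_emb.shape[1]
+    return np.linalg.solve(img_emb.T @ img_emb + lam * np.eye(d),
+                           img_emb.T @ txt_emb)
+
+rng = np.random.default_rng(0)
+X, Y = rng.normal(size=(1000, 512)), rng.normal(size=(1000, 512))
+W = fit_linear_map(X, Y)   # one linear solve, no gradient computation
+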
+ + ♻ ☆ Personas as a Way to Model Truthfulness in Language Models + + +
+ Large language models (LLMs) are trained on vast amounts of text from the
+internet, which contains both factual and misleading information about the
+world. While unintuitive from a classic view of LMs, recent work has shown
+that the truth value of a statement can be elicited from the model's
+representations. This paper presents an explanation for why LMs appear to
+know the truth despite not being trained with truth labels. We hypothesize
+that the pretraining data is generated by groups of (un)truthful agents
+whose outputs share common features, forming an (un)truthful persona. By
+training on this data, LMs can infer and represent the persona in their
+activation space. This allows the model to separate truth from falsehoods
+and control the truthfulness of its generation. We show evidence for the
+persona hypothesis via two observations: (1) we can probe whether a model's
+answer will be truthful before it is generated; (2) finetuning a model on a
+set of facts improves its truthfulness on unseen topics. Next, using
+arithmetic as a synthetic environment, we show that structures of the
+pretraining data are crucial for the model to infer the truthful persona.
+Overall, our findings suggest that models can exploit hierarchical
+structures in the data to learn abstract concepts like truthfulness.
+
+
+
+
+
+
+ + ♻ ☆ GIT-Mol: A Multi-modal Large Language Model for Molecular Science with + Graph, Image, and Text + + +
+ Large language models have made significant strides in natural language
+processing, enabling innovative applications in molecular science by
+processing textual representations of molecules. However, most existing
+language models cannot capture the rich information in complex molecular
+structures or images. In this paper, we introduce GIT-Mol, a multi-modal
+large language model that integrates Graph, Image, and Text information. To
+facilitate the integration of multi-modal molecular data, we propose
+GIT-Former, a novel architecture capable of aligning all modalities into a
+unified latent space. We achieve a 5%-10% accuracy increase in property
+prediction and a 20.2% boost in molecule generation validity compared to
+the baselines. With the any-to-language molecular translation strategy, our
+model has the potential to perform more downstream tasks, such as compound
+name recognition and chemical reaction prediction.
+
+
+
+ comment: The article has been accepted by Computers in Biology and Medicine, + with 14 pages and 4 figures +
+
+
+
+
+ + ♻ ☆ States as Strings as Strategies: Steering Language Models with + Game-Theoretic Solvers + + +
+ Game theory is the study of mathematical models of strategic interactions
+among rational agents. Language is a key medium of interaction for humans,
+though it has historically proven difficult to model dialogue and its
+strategic motivations mathematically. A suitable model of the players,
+strategies, and payoffs associated with linguistic interactions (i.e., a
+binding to the conventional symbolic logic of game theory) would enable
+existing game-theoretic algorithms to provide strategic solutions in the
+space of language. In other words, a binding could provide a route to
+computing stable, rational conversational strategies in dialogue. Large
+language models (LLMs) have arguably reached a point where their generative
+capabilities can enable realistic, human-like simulations of natural
+dialogue. By prompting them in various ways, we can steer their responses
+towards different output utterances. Leveraging the expressivity of natural
+language, LLMs can also help us quickly generate new dialogue scenarios,
+which are grounded in real world applications. In this work, we present one
+possible binding from dialogue to game theory as well as generalizations of
+existing equilibrium finding algorithms to this setting. In addition, by
+exploiting LLMs' generation capabilities along with our proposed binding,
+we can synthesize a large repository of formally-defined games in which one
+can study and test game-theoretic solution concepts. We also demonstrate
+how one can combine LLM-driven game generation, game-theoretic solvers, and
+imitation learning to construct a process for improving the strategic
+capabilities of LLMs.
+
+
+
+ comment: 32 pages, 8 figures, code available @ + https://github.com/google-deepmind/open_spiel/blob/master/open_spiel/python/games/chat_game.py +
+
+
+
+
+ + ♻ ☆ Eliciting Latent Knowledge from Quirky Language Models + + +
+ Eliciting Latent Knowledge (ELK) aims to find patterns in a capable
+neural network's activations that robustly track the true state of the
+world, even when the network's overt output is false or misleading. To
+further ELK research, we introduce 12 datasets and a corresponding suite of
+"quirky" language models that are LoRA-finetuned to make systematic errors
+when answering questions if and only if the keyword "Bob" is present in the
+prompt. We demonstrate that simple probing methods can elicit the model's
+latent knowledge of the correct answer in these contexts, even for problems
+harder than those the probe was trained on. This is enabled by
+context-independent knowledge representations located in middle-layer
+activations. We also find that a mechanistic anomaly detection approach can
+flag untruthful behavior with 94% AUROC. Our results show promise for
+eliciting reliable knowledge from capable but untrusted models, and
+facilitate future research empirically investigating ELK methods.
+
+
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ A Paradigm Shift in Machine Translation: Boosting Translation + Performance of Large Language Models ICLR 2024 + + +
+ Generative Large Language Models (LLMs) have achieved remarkable advancements +in various NLP tasks. However, these advances have not been reflected in the +translation task, especially those with moderate model sizes (i.e., 7B or 13B +parameters), which still lag behind conventional supervised encoder-decoder +translation models. Previous studies have attempted to improve the translation +capabilities of these moderate LLMs, but their gains have been limited. In this +study, we propose a novel fine-tuning approach for LLMs that is specifically +designed for the translation task, eliminating the need for the abundant +parallel data that traditional translation models usually depend on. Our +approach consists of two fine-tuning stages: initial fine-tuning on monolingual +data followed by subsequent fine-tuning on a small set of high-quality parallel +data. We introduce the LLM developed through this strategy as Advanced Language +Model-based trAnslator (ALMA). Based on LLaMA-2 as our underlying model, our +results show that the model can achieve an average improvement of more than 12 +BLEU and 12 COMET over its zero-shot performance across 10 translation +directions from the WMT'21 (2 directions) and WMT'22 (8 directions) test +datasets. The performance is significantly better than all prior work and even +superior to the NLLB-54B model and GPT-3.5-text-davinci-003, with only 7B or +13B parameters. This method establishes the foundation for a novel training +paradigm in machine translation. + +
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models in Analysing Classroom Dialogue + + +
+ This study explores the application of Large Language Models (LLMs),
+specifically GPT-4, in the analysis of classroom dialogue, a crucial
+research task for both teaching diagnosis and quality improvement.
+Recognizing the knowledge-intensive and labor-intensive nature of
+traditional qualitative methods in educational research, this study
+investigates the potential of LLMs to streamline and enhance the analysis
+process. The study involves datasets from a middle school, encompassing
+classroom dialogues across mathematics and Chinese classes. These dialogues
+were manually coded by educational experts and then analyzed using a
+customised GPT-4 model. This study focuses on comparing manual annotations
+with the outputs of GPT-4 to evaluate its efficacy in analyzing educational
+dialogues. Time efficiency, inter-coder agreement, and inter-coder
+reliability between human coders and GPT-4 are evaluated. Results indicate
+substantial time savings with GPT-4 and a high degree of consistency in
+coding between the model and human coders, with some discrepancies in
+specific codes. These findings highlight the strong potential of LLMs in
+teaching evaluation and facilitation.
+
+
+
+
+
+
+ + ♻ ☆ Selecting Seed Words for Wordle using Character Statistics + + +
+ Wordle, a word-guessing game, rose to global popularity in January 2022.
+The goal of the game is to guess a five-letter English word within six
+tries. Each try provides the player with hints by means of colour-changing
+tiles, which indicate whether a given character is part of the solution
+and, if so, whether it is in the correct position. Numerous attempts have
+been made to find the best starting word and the best strategy to solve the
+daily Wordle. This study uses character statistics of five-letter words to
+determine the best three starting words.
+
+
+
+
+
+
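+ A sketch of the character-statistics scoring the study describes: count
+how often each letter occurs at each position, then rank candidate seed
+words by their summed counts. The exact scoring and word list used in the
+paper may differ; the repeated-letter penalty here is illustrative.
+
+from collections import Counter
+
+def rank_seed_words(words):
+    """Rank five-letter words by per-position letter frequency,
+    lightly rewarding words with more distinct letters."""
+    pos = [Counter(w[i] for w in words) for i in range(5)]
+    def score(w):
+        return sum(pos[i][c] for i, c in enumerate(w)) * len(set(w)) / 5
+    return sorted(words, key=score, reverse=True)
+
+print(rank_seed_words(["crane", "slate", "adieu", "tares", "salet"])[:3])
+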
+ + ♻ ☆ Video-LaVIT: Unified Video-Language Pre-training with Decoupled + Visual-Motional Tokenization + + +
+ In light of recent advances in multimodal Large Language Models (LLMs), there +is increasing attention to scaling them from image-text data to more +informative real-world videos. Compared to static images, video poses unique +challenges for effective large-scale pre-training due to the modeling of its +spatiotemporal dynamics. In this paper, we address such limitations in +video-language pre-training with an efficient video decomposition that +represents each video as keyframes and temporal motions. These are then adapted +to an LLM using well-designed tokenizers that discretize visual and temporal +information as a few tokens, thus enabling unified generative pre-training of +videos, images, and text. At inference, the generated tokens from the LLM are +carefully recovered to the original continuous pixel space to create various +video content. Our proposed framework is both capable of comprehending and +generating image and video content, as demonstrated by its competitive +performance across 13 multimodal benchmarks in image and video understanding +and generation. Our code and models will be available at +https://video-lavit.github.io. + +
+
+
+
+
+ + ♻ ☆ SMUTF: Schema Matching Using Generative Tags and Hybrid Features + + +
+ We introduce SMUTF, a unique approach for large-scale tabular data schema
+matching (SM), which assumes that supervised learning does not affect
+performance in open-domain tasks, thereby enabling effective cross-domain
+matching. This system uniquely combines rule-based feature engineering,
+pre-trained language models, and generative large language models. In an
+innovative adaptation inspired by the Humanitarian Exchange Language, we
+deploy "generative tags" for each data column, enhancing the effectiveness
+of SM. SMUTF exhibits extensive versatility, working seamlessly with any
+pre-existing pre-trained embeddings, classification methods, and generative
+models.
+ Recognizing the lack of extensive, publicly available datasets for SM, we
+have created and open-sourced the HDXSM dataset from public humanitarian
+data. We believe this to be the most exhaustive SM dataset currently
+available. In evaluations across various public datasets and the novel
+HDXSM dataset, SMUTF demonstrated exceptional performance, surpassing
+existing state-of-the-art models in terms of accuracy and efficiency,
+improving the F1 score by 11.84% and the AUC of ROC by 5.08%.
+
+
+
+
+
+
+ + ♻ ☆ Compressed Context Memory For Online Language Model Interaction ICLR 2024 + + +
+ This paper presents a context key/value compression method for Transformer +language models in online scenarios, where the context continually expands. As +the context lengthens, the attention process demands increasing memory and +computations, which in turn reduces the throughput of the language model. To +address this challenge, we propose a compressed context memory system that +continually compresses the accumulating attention key/value pairs into a +compact memory space, facilitating language model inference in a limited memory +space of computing environments. Our compression process involves integrating a +lightweight conditional LoRA into the language model's forward pass during +inference, without the need for fine-tuning the model's entire set of weights. +We achieve efficient training by modeling the recursive compression process as +a single parallelized forward computation. Through evaluations on conversation, +personalization, and multi-task learning, we demonstrate that our approach +achieves the performance level of a full context model with $5\times$ smaller +context memory size. We further demonstrate the applicability of our approach +in a streaming setting with an unlimited context length, outperforming the +sliding window approach. Codes are available at +https://github.com/snu-mllab/context-memory. + +
+
+ comment: ICLR 2024. Add streaming setting results and training set analyses +
+
+
+
+
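+ The accumulating key/value compression above can be pictured with a toy
+pooling stand-in: collapse every block of cached pairs into one averaged
+slot. The actual paper learns this compression with a conditional LoRA; the
+pooling here only illustrates the memory-size effect.
+
+import torch
+
+def pool_kv(keys, values, ratio=8):
+    """Average every `ratio` cached key/value pairs into a single slot."""
+    B, T, D = keys.shape
+    T2 = (T // ratio) * ratio                    # drop the ragged tail
+    k = keys[:, :T2].reshape(B, -1, ratio, D).mean(dim=2)
+    v = values[:, :T2].reshape(B, -1, ratio, D).mean(dim=2)
+    return k, v                                  # ~ratio-times smaller cache
+
+k, v = pool_kv(torch.randn(1, 64, 32), torch.randn(1, 64, 32))
+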
+ + ♻ ☆ Rethinking Optimization and Architecture for Tiny Language Models + + +
+ The power of large language models (LLMs) has been demonstrated through
+numerous data and computing resources. However, the application of language
+models on mobile devices faces huge challenges in computation and memory
+costs; that is, tiny language models with high performance are urgently
+required. Limited by the highly complex training process, many details for
+optimizing language models are seldom studied carefully. In this study,
+based on a tiny language model with 1B parameters, we carefully design a
+series of empirical studies to analyze the effect of each component. Three
+perspectives are mainly discussed, i.e., neural architecture, parameter
+initialization, and optimization strategy. Several design formulas are
+empirically proved especially effective for tiny language models, including
+tokenizer compression, architecture tweaking, parameter inheritance and
+multiple-round training. Then we train PanGu-$\pi$-1B Pro and
+PanGu-$\pi$-1.5B Pro on 1.6T multilingual corpora, following the
+established formulas. Experimental results demonstrate that the improved
+optimization and architecture yield a notable average improvement of 8.87
+on benchmark evaluation sets for PanGu-$\pi$-1B Pro. Besides,
+PanGu-$\pi$-1.5B Pro surpasses a range of SOTA models with larger model
+sizes, validating its superior performance. The code is available at
+https://github.com/YuchuanTian/RethinkTinyLM.
+
+
+
+
+
+
+ + ♻ ☆ Large Language Models for Time Series: A Survey + + +
+ Large Language Models (LLMs) have seen significant use in domains such as +natural language processing and computer vision. Going beyond text, image and +graphics, LLMs present a significant potential for analysis of time series +data, benefiting domains such as climate, IoT, healthcare, traffic, audio and +finance. This survey paper provides an in-depth exploration and a detailed +taxonomy of the various methodologies employed to harness the power of LLMs for +time series analysis. We address the inherent challenge of bridging the gap +between LLMs' original text data training and the numerical nature of time +series data, and explore strategies for transferring and distilling knowledge +from LLMs to numerical time series analysis. We detail various methodologies, +including (1) direct prompting of LLMs, (2) time series quantization, (3) +alignment techniques, (4) utilization of the vision modality as a bridging +mechanism, and (5) the combination of LLMs with tools. Additionally, this +survey offers a comprehensive overview of the existing multimodal time series +and text datasets and delves into the challenges and future opportunities of +this emerging field. We maintain an up-to-date Github repository which includes +all the papers and datasets discussed in the survey. + +
+
+ comment: GitHub repository: + https://github.com/xiyuanzh/awesome-llm-time-series +
+
+
+
+
+ + ♻ ☆ An LLM Compiler for Parallel Function Calling + + +
+ Recent language models have shown remarkable results on various complex +reasoning benchmarks. The reasoning capabilities of LLMs enable them to execute +external function calls to overcome their inherent limitations, such as +knowledge cutoffs, poor arithmetic skills, or lack of access to private data. +This development has allowed LLMs to select and coordinate multiple functions +based on the context to tackle more complex problems. However, current methods +for multiple function calling often require sequential reasoning and acting for +each function which can result in high latency, cost, and sometimes inaccurate +behavior. To address this, we introduce LLMCompiler, which executes functions +in parallel to efficiently orchestrate multiple function calling. Drawing from +the principles of classical compilers, LLMCompiler streamlines parallel +function calling with three components: (i) an LLM Planner, formulating +execution plans; (ii) a Task Fetching Unit, dispatching function calling tasks; +and (iii) an Executor, executing these tasks in parallel. LLMCompiler +automatically generates an optimized orchestration for the function calls and +can be used with both open-source and closed-source models. We have benchmarked +LLMCompiler on a range of tasks with different patterns of function calling. We +observe consistent latency speedup of up to 3.7x, cost savings of up to 6.7x, +and accuracy improvement of up to ~9% compared to ReAct. + +
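+ A minimal sketch of the parallel dispatch idea (our toy stand-in: in the
+paper, the LLM Planner produces the task graph, and the Task Fetching Unit
+and Executor handle dispatch; here `tasks` and `deps` are simply given):
+
+```python
+# Dispatch function calls concurrently as soon as their dependencies finish,
+# instead of one ReAct-style call at a time.
+import asyncio
+
+async def run_dag(tasks: dict, deps: dict) -> dict:
+    # tasks: name -> zero-arg async callable; deps: name -> set of prerequisites
+    results, pending = {}, set(tasks)
+    while pending:
+        ready = [n for n in pending if deps.get(n, set()) <= set(results)]
+        outs = await asyncio.gather(*(tasks[n]() for n in ready))  # parallel dispatch
+        results.update(zip(ready, outs))
+        pending -= set(ready)
+    return results
+
+async def search():
+    await asyncio.sleep(0.1)
+    return "retrieved document"
+
+async def calculate():
+    await asyncio.sleep(0.1)
+    return 42
+
+# both independent calls run concurrently (~0.1s total, not ~0.2s sequentially)
+print(asyncio.run(run_dag({"search": search, "calculate": calculate}, {})))
+```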
+
+
+
+
+ + ♻ ☆ Skip \n: A simple method to reduce hallucination in Large + Vision-Language Models + + +
+ Recent advancements in large vision-language models (LVLMs) have demonstrated +impressive capability in visual information understanding with human language. +Despite these advances, LVLMs still face challenges with multimodal +hallucination, such as generating text descriptions of objects that are not +present in the visual information. However, the underlying fundamental reasons +of multimodal hallucinations remain poorly explored. In this paper, we propose +a new perspective, suggesting that the inherent biases in LVLMs might be a key +factor in hallucinations. Specifically, we systematically identify a semantic +shift bias related to paragraph breaks (\n\n), where the content before and +after '\n\n' in the training data frequently exhibit significant semantic +changes. This pattern leads the model to infer that the contents following +'\n\n' should be obviously different from the preceding contents with less +hallucinatory descriptions, thereby increasing the probability of hallucinatory +descriptions subsequent to the '\n\n'. We have validated this hypothesis on +multiple publicly available LVLMs. Besides, we find that deliberately inserting +'\n\n' at the generated description can induce more hallucinations. A simple +method is proposed to effectively mitigate the hallucination of LVLMs by +skipping the output of '\n'. + +
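+ A minimal sketch of the mitigation in the last sentence, written against the
+Hugging Face transformers LogitsProcessor API (the token-id lookup and the
+exact integration used in the paper are assumptions):
+
+```python
+# Suppress newline tokens during decoding so '\n\n'-induced semantic shifts
+# (and the hallucinations that follow them) cannot occur.
+import torch
+from transformers import LogitsProcessor
+
+class SkipNewlineProcessor(LogitsProcessor):
+    def __init__(self, newline_token_ids):
+        self.banned = list(newline_token_ids)
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+        scores[:, self.banned] = float("-inf")   # newline tokens are never sampled
+        return scores
+
+# usage sketch (hypothetical tokenizer/model objects):
+# ids = [tid for tok, tid in tokenizer.get_vocab().items() if "\n" in tok]
+# model.generate(**inputs,
+#                logits_processor=LogitsProcessorList([SkipNewlineProcessor(ids)]))
+```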
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Testing the Depth of ChatGPT's Comprehension via Cross-Modal Tasks Based + on ASCII-Art: GPT3.5's Abilities in Regard to Recognizing and Generating + ASCII-Art Are Not Totally Lacking EACL 2024 + + +
+ Over the eight months since its release, ChatGPT and its underlying model,
+GPT3.5, have garnered massive attention, due to their potent mix of capability
+and accessibility. While a niche industry of papers has emerged examining the
+scope of capabilities these models possess, the information fed to and
+extracted from these networks has been either natural language text or
+stylized, code-like language. Drawing inspiration from the prowess we expect a
+truly human-level intelligent agent to have across multiple signal modalities,
+in this work we examine GPT3.5's aptitude for visual tasks, where the inputs
+feature content provided as ASCII-art without overt distillation into a lingual
+summary. We conduct experiments analyzing the model's performance on image
+recognition tasks after various transforms typical in visual settings, trials
+investigating knowledge of image parts, and tasks covering image generation.
+
+
+ comment: Accepted in EACL 2024 as a long paper. See
+ https://2024.eacl.org/program/findings-accepted/#long-papers . Note: this
+ paper's ArXiv version includes additional discussion, analysis, and types of
+ experiments compared to the EACL version. Changes introduced in V2 of ArXiv
+ paper: only this comment metadata. V1 was initially submitted on July 26th,
+ 2023; its release was delayed by ArXiv for a few days
+
+
+
+
+ + ♻ ☆ An Examination of the Robustness of Reference-Free Image Captioning + Evaluation Metrics + + +
+ Recently, reference-free metrics such as CLIPScore (Hessel et al., 2021), +UMIC (Lee et al., 2021), and PAC-S (Sarto et al., 2023) have been proposed for +automatic reference-free evaluation of image captions. Our focus lies in +evaluating the robustness of these metrics in scenarios that require +distinguishing between two captions with high lexical overlap but very +different meanings. Our findings reveal that despite their high correlation +with human judgments, CLIPScore, UMIC, and PAC-S struggle to identify +fine-grained errors. While all metrics exhibit strong sensitivity to visual +grounding errors, their sensitivity to caption implausibility errors is +limited. Furthermore, we found that all metrics are sensitive to variations in +the size of image-relevant objects mentioned in the caption, while CLIPScore +and PAC-S are also sensitive to the number of mentions of image-relevant +objects in the caption. Regarding linguistic aspects of a caption, all metrics +show weak comprehension of negation, and CLIPScore and PAC-S are insensitive to +the structure of the caption to a great extent. We hope our findings will guide +further improvements in reference-free evaluation of image captioning. + +
+
+
+
+
+ + ♻ ☆ Do LLMs exhibit human-like response biases? A case study in survey + design + + +
+ As large language models (LLMs) become more capable, there is growing
+excitement about the possibility of using LLMs as proxies for humans in
+real-world tasks where subjective labels are desired, such as in surveys and
+opinion polling. One widely-cited barrier to the adoption of LLMs as proxies
+for humans in subjective tasks is their sensitivity to prompt wording - but
+interestingly, humans also display sensitivities to instruction changes in the
+form of response biases. We investigate the extent to which LLMs reflect human
+response biases, if at all. We look to survey design, where human response
+biases caused by changes in the wordings of "prompts" have been extensively
+explored in the social psychology literature. Drawing from these works, we
+design a dataset and framework to evaluate whether LLMs exhibit human-like
+response biases in survey questionnaires. Our comprehensive evaluation of nine
+models shows that popular open and commercial LLMs generally fail to reflect
+human-like behavior, particularly in models that have undergone RLHF.
+Furthermore, even if a model shows a significant change in the same direction
+as humans, we find that it is also sensitive to perturbations that do not
+elicit significant changes in humans. These results highlight the pitfalls of
+using LLMs as human proxies, and underscore the need for finer-grained
+characterizations of model behavior. Our code, dataset, and collected samples
+are available at https://github.com/lindiatjuatja/BiasMonkey
+
+
+
+
+
+ + ♻ ☆ How to Estimate Model Transferability of Pre-Trained Speech Models? + + +
+ In this work, we introduce a "score-based assessment" framework for
+estimating the transferability of pre-trained speech models (PSMs) for
+fine-tuning target tasks. We leverage two representation theories, Bayesian
+likelihood estimation and optimal transport, to generate rank scores for the
+PSM candidates using the extracted representations. Our framework efficiently
+computes transferability scores without actual fine-tuning of candidate models
+or layers by making a temporal-independence hypothesis. We evaluate some
+popular supervised speech models (e.g., Conformer RNN-Transducer) and
+self-supervised speech models (e.g., HuBERT) in cross-layer and cross-model
+settings using public data. Experimental results show a high Spearman's rank
+correlation and low $p$-value between our estimation framework and fine-tuning
+ground truth. Our proposed transferability framework requires less
+computational time and resources, making it a resource-saving and
+time-efficient approach for tuning speech foundation models.
+
+
+ comment: Accepted to Interspeech. Code is available at: + https://github.com/virginiakm1988/LogME-CTC. Fixed a typo +
+
+
+
+
+ + ♻ ☆ Language is All a Graph Needs EACL 2024 + + +
+ The emergence of large-scale pre-trained language models has revolutionized
+various AI research domains. Transformers-based Large Language Models (LLMs)
+have gradually replaced CNNs and RNNs to unify the fields of computer vision
+and natural language processing. Compared with independent data samples such as
+images, videos or texts, graphs usually contain rich structural and relational
+information. Meanwhile, language, especially natural language, being one of the
+most expressive mediums, excels in describing complex structures. However,
+existing work on incorporating graph problems into the generative language
+modeling framework remains very limited. Considering the rising prominence of
+LLMs, it becomes essential to explore whether LLMs can also replace GNNs as the
+foundation model for graphs. In this paper, we propose InstructGLM
+(Instruction-finetuned Graph Language Model) with highly scalable prompts based
+on natural language instructions. We use natural language to describe
+multi-scale geometric structure of the graph and then instruction finetune an
+LLM to perform graph tasks, which enables Generative Graph Learning. Our method
+surpasses all GNN baselines on the ogbn-arxiv, Cora and PubMed datasets,
+underscoring its effectiveness and shedding light on generative LLMs as a new
+foundation model for graph machine learning. Our code is open-sourced at
+https://github.com/agiresearch/InstructGLM.
+
+
+ comment: In EACL 2024 +
+
+
+
+
+ + ♻ ☆ EasyInstruct: An Easy-to-use Instruction Processing Framework for Large + Language Models + + +
+ In recent years, instruction tuning has gained increasing attention and +emerged as a crucial technique to enhance the capabilities of Large Language +Models (LLMs). To construct high-quality instruction datasets, many instruction +processing approaches have been proposed, aiming to achieve a delicate balance +between data quantity and data quality. Nevertheless, due to inconsistencies +that persist among various instruction processing methods, there is no standard +open-source instruction processing implementation framework available for the +community, which hinders practitioners from further developing and advancing. +To facilitate instruction processing research and development, we present +EasyInstruct, an easy-to-use instruction processing framework for LLMs, which +modularizes instruction generation, selection, and prompting, while also +considering their combination and interaction. EasyInstruct is publicly +released and actively maintained at https://github.com/zjunlp/EasyInstruct, +along with a running demo App at +https://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for +broader research centered on instruction data. + +
+
+ comment: Ongoing work; the project website is at + https://zjunlp.github.io/project/EasyInstruct, code is at + https://github.com/zjunlp/EasyInstruct, demo is at + https://huggingface.co/spaces/zjunlp/EasyInstruct +
+
+
+
+
+ + ♻ ☆ "It's how you do things that matters": Attending to Process to Better + Serve Indigenous Communities with Language Technologies + + +
+ Indigenous languages are historically under-served by Natural Language +Processing (NLP) technologies, but this is changing for some languages with the +recent scaling of large multilingual models and an increased focus by the NLP +community on endangered languages. This position paper explores ethical +considerations in building NLP technologies for Indigenous languages, based on +the premise that such projects should primarily serve Indigenous communities. +We report on interviews with 17 researchers working in or with Aboriginal +and/or Torres Strait Islander communities on language technology projects in +Australia. Drawing on insights from the interviews, we recommend practices for +NLP researchers to increase attention to the process of engagements with +Indigenous communities, rather than focusing only on decontextualised +artefacts. + +
+
+
+
+
+ + ♻ ☆ On the Relationship between Sentence Analogy Identification and Sentence + Structure Encoding in Large Language Models EACL 2024 + + +
+ The ability of Large Language Models (LLMs) to encode syntactic and semantic
+structures of language is well examined in NLP. Additionally, analogy
+identification, in the form of word analogies, has been extensively studied in
+the last decade of the language modeling literature. In this work we
+specifically look at how LLMs' abilities to capture sentence analogies
+(sentences that convey analogous meaning to each other) vary with LLMs'
+abilities to encode syntactic and semantic structures of sentences. Through our
+analysis, we find that LLMs' ability to identify sentence analogies is
+positively correlated with their ability to encode syntactic and semantic
+structures of sentences. Specifically, we find that the LLMs which capture
+syntactic structures better also have higher abilities in identifying sentence
+analogies.
+
+
+ comment: To appear in Findings of EACL 2024 +
+
+
+
+
+ + ♻ ☆ TiMix: Text-aware Image Mixing for Effective Vision-Language + Pre-training AAAI2024 + + +
+ Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances
+modern Vision-Language Pre-training (VLP) models by aligning visual and
+linguistic modalities. Due to noises in web-harvested text-image pairs,
+however, scaling up training data volume in SMCL presents considerable
+obstacles in terms of computational cost and data inefficiency. To improve data
+efficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates
+mix-based data augmentation techniques into SMCL, yielding significant
+performance improvements without significantly increasing computational
+overhead. We provide a theoretical analysis of TiMix from a mutual information
+(MI) perspective, showing that mixed data samples for cross-modal contrastive
+learning implicitly serve as a regularizer for the contrastive loss. The
+experimental results demonstrate that TiMix exhibits a comparable performance
+on downstream tasks, even with a reduced amount of training data and shorter
+training time, when benchmarked against existing methods. This work empirically
+and theoretically demonstrates the potential of data mixing for data-efficient
+and computationally viable VLP, benefiting broader VLP model adoption in
+practical scenarios.
+
+
+ comment: Accepted on AAAI2024 +
+
+
+
+
+ + ♻ ☆ Toward a Reinforcement-Learning-Based System for Adjusting Medication to + Minimize Speech Disfluency AAAI 2024 + + +
+ We propose a reinforcement learning (RL)-based system that would
+automatically prescribe medication to a hypothetical patient to help with
+their mental health-related speech disfluency, and adjust the medication and
+dosages in response to zero-cost, frequent measurements of the patient's
+fluency. We demonstrate the components of the system: a module that detects
+and evaluates speech disfluency on a large dataset we built, and an RL
+algorithm that automatically finds good combinations of medications. To
+support the two modules, we collect data on the effect of psychiatric
+medications for speech disfluency from the literature, and build a plausible
+patient simulation system. We demonstrate that the RL system is, under some
+circumstances, able to converge to a good medication regime. We collect and
+label a dataset of people with possible speech disfluency and demonstrate our
+methods using that dataset. Our work is a proof of concept: we show that there
+is promise in the idea of using automatic data collection to address speech
+disfluency.
+
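+ A toy sketch of the closed loop the abstract describes (our simplified
+stand-in, swapping in an epsilon-greedy bandit; the paper's RL algorithm and
+patient simulator are more elaborate, and `simulated_fluency` is entirely
+hypothetical):
+
+```python
+# Adjust a dosage "arm" using frequent, cheap fluency measurements.
+import random
+
+def simulated_fluency(dose_idx: int) -> float:   # hypothetical patient response
+    best = 2                                     # assume dose 2 works best
+    return 1.0 - 0.3 * abs(dose_idx - best) + random.gauss(0, 0.05)
+
+n_doses, counts, values = 5, [0] * 5, [0.0] * 5
+for step in range(500):
+    explore = random.random() < 0.1              # epsilon-greedy exploration
+    arm = random.randrange(n_doses) if explore else values.index(max(values))
+    reward = simulated_fluency(arm)
+    counts[arm] += 1
+    values[arm] += (reward - values[arm]) / counts[arm]  # incremental mean
+print("chosen dose:", values.index(max(values)))          # converges toward dose 2
+```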
+
+ comment: In Proc. Machine Learning for Cognitive and Mental Health Workshop + (ML4CMH) at AAAI 2024 +
+
+
+
+
+ + ♻ ☆ When Large Language Models Meet Vector Databases: A Survey + + +
+ This survey explores the synergistic potential of Large Language Models +(LLMs) and Vector Databases (VecDBs), a burgeoning but rapidly evolving +research area. With the proliferation of LLMs comes a host of challenges, +including hallucinations, outdated knowledge, prohibitive commercial +application costs, and memory issues. VecDBs emerge as a compelling solution to +these issues by offering an efficient means to store, retrieve, and manage the +high-dimensional vector representations intrinsic to LLM operations. Through +this nuanced review, we delineate the foundational principles of LLMs and +VecDBs and critically analyze their integration's impact on enhancing LLM +functionalities. This discourse extends into a discussion on the speculative +future developments in this domain, aiming to catalyze further research into +optimizing the confluence of LLMs and VecDBs for advanced data handling and +knowledge extraction capabilities. + +
+
+
+
+
+ + ♻ ☆ CodeScope: An Execution-based Multilingual Multitask Multidimensional + Benchmark for Evaluating LLMs on Code Understanding and Generation + + +
+ Large Language Models (LLMs) have demonstrated remarkable performance on
+coding-related tasks, particularly on assisting humans in programming and
+facilitating programming automation. However, existing benchmarks for
+evaluating the code understanding and generation capacities of LLMs suffer from
+severe limitations. First, most benchmarks are deficient as they focus on a
+narrow range of popular programming languages and specific tasks, whereas
+real-world software development scenarios show a dire need for systems
+implemented in multilingual programming environments to satisfy diverse
+requirements. Practical programming also strongly calls for multi-task settings
+to test the coding capabilities of LLMs comprehensively and robustly. Second,
+most benchmarks also fail to consider the actual executability and the
+consistency of execution results of the generated code. To bridge these gaps
+between existing benchmarks and expectations from practical applications, we
+introduce CodeScope, an execution-based, multilingual, multi-task,
+multi-dimensional evaluation benchmark for comprehensively gauging LLM
+capabilities on coding tasks. CodeScope covers 43 programming languages and 8
+coding tasks. It evaluates the coding performance of LLMs from three dimensions
+(perspectives): difficulty, efficiency, and length. To facilitate
+execution-based evaluations of code generation, we develop MultiCodeEngine, an
+automated code execution engine that supports 14 programming languages.
+Finally, we systematically evaluate and analyze 8 mainstream LLMs on CodeScope
+tasks and demonstrate the superior breadth and challenges of CodeScope for
+evaluating LLMs on code understanding and generation tasks compared to other
+benchmarks. The CodeScope benchmark and datasets are publicly available at
+https://github.com/WeixiangYAN/CodeScope.
+
+
+
+
+
+ + ♻ ☆ Interpretability at Scale: Identifying Causal Mechanisms in Alpaca NeurIPS 2023 + + +
+ Obtaining human-interpretable explanations of large, general-purpose language +models is an urgent goal for AI safety. However, it is just as important that +our interpretability methods are faithful to the causal dynamics underlying +model behavior and able to robustly generalize to unseen inputs. Distributed +Alignment Search (DAS) is a powerful gradient descent method grounded in a +theory of causal abstraction that has uncovered perfect alignments between +interpretable symbolic algorithms and small deep learning models fine-tuned for +specific tasks. In the present paper, we scale DAS significantly by replacing +the remaining brute-force search steps with learned parameters -- an approach +we call Boundless DAS. This enables us to efficiently search for interpretable +causal structure in large language models while they follow instructions. We +apply Boundless DAS to the Alpaca model (7B parameters), which, off the shelf, +solves a simple numerical reasoning problem. With Boundless DAS, we discover +that Alpaca does this by implementing a causal model with two interpretable +boolean variables. Furthermore, we find that the alignment of neural +representations with these variables is robust to changes in inputs and +instructions. These findings mark a first step toward faithfully understanding +the inner-workings of our ever-growing and most widely deployed language +models. Our tool is extensible to larger LLMs and is released publicly at +`https://github.com/stanfordnlp/pyvene`. + +
+
+ comment: NeurIPS 2023 with Author Corrections +
+
+
+
+
+ + ♻ ☆ AV2Wav: Diffusion-Based Re-synthesis from Continuous Self-supervised + Features for Audio-Visual Speech Enhancement ICASSP 2024 + + +
+ Speech enhancement systems are typically trained using pairs of clean and +noisy speech. In audio-visual speech enhancement (AVSE), there is not as much +ground-truth clean data available; most audio-visual datasets are collected in +real-world environments with background noise and reverberation, hampering the +development of AVSE. In this work, we introduce AV2Wav, a resynthesis-based +audio-visual speech enhancement approach that can generate clean speech despite +the challenges of real-world training data. We obtain a subset of nearly clean +speech from an audio-visual corpus using a neural quality estimator, and then +train a diffusion model on this subset to generate waveforms conditioned on +continuous speech representations from AV-HuBERT with noise-robust training. We +use continuous rather than discrete representations to retain prosody and +speaker information. With this vocoding task alone, the model can perform +speech enhancement better than a masking-based baseline. We further fine-tune +the diffusion model on clean/noisy utterance pairs to improve the performance. +Our approach outperforms a masking-based baseline in terms of both automatic +metrics and a human listening test and is close in quality to the target speech +in the listening test. Audio samples can be found at +https://home.ttic.edu/~jcchou/demo/avse/avse_demo.html. + +
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Towards Optimal Statistical Watermarking + + +
+ We study statistical watermarking by formulating it as a hypothesis testing +problem, a general framework which subsumes all previous statistical +watermarking methods. Key to our formulation is a coupling of the output tokens +and the rejection region, realized by pseudo-random generators in practice, +that allows non-trivial trade-offs between the Type I error and Type II error. +We characterize the Uniformly Most Powerful (UMP) watermark in the general +hypothesis testing setting and the minimax Type II error in the model-agnostic +setting. In the common scenario where the output is a sequence of $n$ tokens, +we establish nearly matching upper and lower bounds on the number of i.i.d. +tokens required to guarantee small Type I and Type II errors. Our rate of +$\Theta(h^{-1} \log (1/h))$ with respect to the average entropy per token $h$ +highlights potentials for improvement from the rate of $h^{-2}$ in the previous +works. Moreover, we formulate the robust watermarking problem where the user is +allowed to perform a class of perturbations on the generated texts, and +characterize the optimal Type II error of robust UMP tests via a linear +programming problem. To the best of our knowledge, this is the first systematic +statistical treatment on the watermarking problem with near-optimal rates in +the i.i.d. setting, which might be of interest for future works. + +
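+ Restating the abstract's formulation compactly (notation ours; the
+pseudo-random coupling construction is in the paper): watermark detection is a
+hypothesis test on the generated tokens, and the quoted rate bounds how many
+i.i.d. tokens are needed for reliable detection.
+
+```latex
+% Watermark detection as hypothesis testing (restated from the abstract).
+\[
+  H_0:\; Y \sim P_{\mathrm{unwatermarked}}
+  \quad \text{vs.} \quad
+  H_1:\; Y \sim P_{\mathrm{watermarked}},
+\]
+\[
+  \alpha = \Pr_{H_0}(\text{reject } H_0) \;(\text{Type I}), \qquad
+  \beta  = \Pr_{H_1}(\text{accept } H_0) \;(\text{Type II}),
+\]
+\[
+  n = \Theta\!\bigl(h^{-1}\log(1/h)\bigr)
+  \text{ i.i.d. tokens suffice for small } \alpha,\beta,
+\]
+% where h is the average entropy per generated token.
+```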
+
+
+
+
+ + ♻ ☆ Increasing Trust in Language Models through the Reuse of Verified + Circuits + + +
+ Language Models (LMs) are increasingly used for a wide range of prediction
+tasks, but their training can often neglect rare edge cases, reducing their
+reliability. Here, we define a stringent standard of trustworthiness whereby
+the task algorithm and circuit implementation must be verified, accounting for
+edge cases, with no known failure modes. We show that a transformer model can
+be trained to meet this standard if built using mathematically and logically
+specified frameworks. In this paper, we fully verify a model for n-digit
+integer addition. To exhibit the reusability of verified modules, we insert the
+trained integer addition model into an untrained model and train the combined
+model to perform both addition and subtraction. We find extensive reuse of the
+addition circuits for both tasks, easing verification of the more complex
+subtractor model. We discuss how inserting verified task modules into LMs can
+leverage model reuse to improve the verifiability and trustworthiness of
+language models built using them. The reuse of verified circuits reduces the
+effort to verify more complex composite models, which we believe to be a
+significant step towards the safety of language models.
+
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ (Why) Is My Prompt Getting Worse? Rethinking Regression Testing for + Evolving LLM APIs + + +
+ Large Language Models (LLMs) are increasingly integrated into software +applications. Downstream application developers often access LLMs through APIs +provided as a service. However, LLM APIs are often updated silently and +scheduled to be deprecated, forcing users to continuously adapt to evolving +models. This can cause performance regression and affect prompt design choices, +as evidenced by our case study on toxicity detection. Based on our case study, +we emphasize the need for and re-examine the concept of regression testing for +evolving LLM APIs. We argue that regression testing LLMs requires fundamental +changes to traditional testing approaches, due to different correctness +notions, prompting brittleness, and non-determinism in LLM APIs. + +
+
+ comment: conference version +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 134 + +
+
+
+ + ☆ EVA-CLIP-18B: Scaling CLIP to 18 Billion Parameters + + +
+ Scaling up contrastive language-image pretraining (CLIP) is critical for +empowering both vision and multimodal models. We present EVA-CLIP-18B, the +largest and most powerful open-source CLIP model to date, with 18-billion +parameters. With only 6-billion training samples seen, EVA-CLIP-18B achieves an +exceptional 80.7% zero-shot top-1 accuracy averaged across 27 widely recognized +image classification benchmarks, outperforming its forerunner EVA-CLIP +(5-billion parameters) and other open-source CLIP models by a large margin. +Remarkably, we observe a consistent performance improvement with the model size +scaling of EVA-CLIP, despite maintaining a constant training dataset of +2-billion image-text pairs from LAION-2B and COYO-700M. This dataset is openly +available and much smaller than the in-house datasets (e.g., DFN-5B, WebLI-10B) +employed in other state-of-the-art CLIP models. EVA-CLIP-18B demonstrates the +potential of EVA-style weak-to-strong visual model scaling. With our model +weights made publicly available, we hope to facilitate future research in +vision and multimodal foundation models. + +
+
+
+
+
+ + ☆ HarmBench: A Standardized Evaluation Framework for Automated Red Teaming + and Robust Refusal + + +
+ Automated red teaming holds substantial promise for uncovering and mitigating +the risks associated with the malicious use of large language models (LLMs), +yet the field lacks a standardized evaluation framework to rigorously assess +new methods. To address this issue, we introduce HarmBench, a standardized +evaluation framework for automated red teaming. We identify several desirable +properties previously unaccounted for in red teaming evaluations and +systematically design HarmBench to meet these criteria. Using HarmBench, we +conduct a large-scale comparison of 18 red teaming methods and 33 target LLMs +and defenses, yielding novel insights. We also introduce a highly efficient +adversarial training method that greatly enhances LLM robustness across a wide +range of attacks, demonstrating how HarmBench enables codevelopment of attacks +and defenses. We open source HarmBench at +https://github.com/centerforaisafety/HarmBench. + +
+
+ comment: Website: https://www.harmbench.org +
+
+
+
+
+ + ☆ CogCoM: Train Large Vision-Language Models Diving into Details through + Chain of Manipulations + + +
+ Vision-Language Models (VLMs) have demonstrated their widespread viability
+thanks to extensive training in aligning visual instructions to answers.
+However, this conclusive alignment leads models to ignore critical visual
+reasoning, and further results in failures on meticulous visual problems and
+unfaithful responses. In this paper, we propose Chain of Manipulations, a
+mechanism that enables VLMs to solve problems with a series of manipulations,
+where each manipulation refers to an operation on the visual input, either from
+intrinsic abilities (e.g., grounding) acquired through prior training or from
+imitating human-like behaviors (e.g., zoom in). This mechanism encourages VLMs
+to generate faithful responses with evidential visual reasoning, and permits
+users to trace error causes along interpretable paths. We thus train CogCoM, a
+general 17B VLM with a memory-based compatible architecture endowed with this
+reasoning mechanism. Experiments show that our model achieves the
+state-of-the-art performance across 8 benchmarks from 3 categories, and with a
+limited number of training steps on the data it swiftly attains competitive
+performance. The code and data are publicly available at
+https://github.com/THUDM/CogCoM.
+
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ Instance by Instance: An Iterative Framework for Multi-instance 3D + Registration + + +
+ Multi-instance registration is a challenging problem in computer vision and
+robotics, where multiple instances of an object need to be registered in a
+standard coordinate system. In this work, we propose the first iterative
+framework called instance-by-instance (IBI) for multi-instance 3D registration
+(MI-3DReg). It successively registers all instances in a given scenario,
+starting from the easiest and progressing to more challenging ones. Throughout
+the iterative process, outliers are eliminated continuously, leading to an
+increasing inlier rate for the remaining and more challenging instances. Under
+the IBI framework, we further propose a sparse-to-dense-correspondence-based
+multi-instance registration method (IBI-S2DC) to achieve robust MI-3DReg.
+Experiments on the synthetic and real datasets demonstrate the effectiveness
+of IBI and establish the new state-of-the-art performance of IBI-S2DC; e.g.,
+our MHF1 is 12.02%/12.35% higher than the existing state-of-the-art method ECC
+on the synthetic/real datasets.
+
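+ A high-level sketch of the instance-by-instance loop (our paraphrase of the
+abstract; `register_one` and `inliers_of` are hypothetical helper callables
+standing in for the paper's sparse-to-dense registration and inlier test):
+
+```python
+# Register the easiest instance first, remove its correspondences, repeat;
+# each removal raises the inlier rate for the remaining, harder instances.
+def instance_by_instance(correspondences, register_one, inliers_of,
+                         max_instances=10):
+    poses, remaining = [], list(correspondences)
+    for _ in range(max_instances):
+        if not remaining:
+            break
+        pose = register_one(remaining)        # fit the currently easiest instance
+        inliers = set(inliers_of(pose, remaining))
+        if len(inliers) < 3:                  # too few inliers: no instance left
+            break
+        poses.append(pose)
+        remaining = [c for c in remaining if c not in inliers]
+    return poses
+```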
+
+ comment: 14 pages, 12 figures, 10 tables +
+
+
+
+
+ + ☆ SHIELD : An Evaluation Benchmark for Face Spoofing and Forgery Detection + with Multimodal Large Language Models + + +
+ Multimodal large language models (MLLMs) have demonstrated remarkable
+problem-solving capabilities in various vision fields (e.g., generic object
+recognition and grounding) based on strong visual semantic representation and
+language reasoning ability. However, whether MLLMs are sensitive to subtle
+visual spoof/forged clues and how they perform in the domain of face attack
+detection (e.g., face spoofing and forgery detection) is still unexplored. In
+this paper, we introduce a new benchmark, namely SHIELD, to evaluate the
+ability of MLLMs on face spoofing and forgery detection. Specifically, we
+design true/false and multiple-choice questions to evaluate multimodal face
+data in these two face security tasks. For the face anti-spoofing task, we
+evaluate three different modalities (i.e., RGB, infrared, depth) under four
+types of presentation attacks (i.e., print attack, replay attack, rigid mask,
+paper mask). For the face forgery detection task, we evaluate GAN-based and
+diffusion-based data with both visual and acoustic modalities. Each question is
+subjected to both zero-shot and few-shot tests under standard and chain of
+thought (COT) settings. The results indicate that MLLMs hold substantial
+potential in the face security domain, offering advantages over traditional
+specific models in terms of interpretability, multimodal flexible reasoning,
+and joint face spoof and forgery detection. Additionally, we develop a novel
+Multi-Attribute Chain of Thought (MA-COT) paradigm for describing and judging
+various task-specific and task-irrelevant attributes of face images, which
+provides rich task-related knowledge for subtle spoof/forged clue mining.
+Extensive experiments in separate face anti-spoofing, separate face forgery
+detection, and joint detection tasks demonstrate the effectiveness of the
+proposed MA-COT. The project is available at
+https://github.com/laiyingxin2/SHIELD
+
+
+
+
+
+ + ☆ 3D Volumetric Super-Resolution in Radiology Using 3D RRDB-GAN + + +
+ This study introduces the 3D Residual-in-Residual Dense Block GAN (3D
+RRDB-GAN) for 3D super-resolution of radiology imagery. A key aspect of 3D
+RRDB-GAN is the integration of a 2.5D perceptual loss function, which
+contributes to improved volumetric image quality and realism. The effectiveness
+of our model was evaluated through 4x super-resolution experiments across
+diverse datasets, including Mice Brain MRH, OASIS, HCP1200, and MSD-Task-6.
+These evaluations, encompassing both quantitative metrics like LPIPS and FID
+and qualitative assessments through sample visualizations, demonstrate the
+model's effectiveness in detailed image analysis. The 3D RRDB-GAN offers a
+significant contribution to medical imaging, particularly by enriching the
+depth, clarity, and volumetric detail of medical images. Its application shows
+promise in enhancing the interpretation and analysis of complex medical imagery
+from a comprehensive 3D perspective.
+
+
+
+
+
+ + ☆ Informed Reinforcement Learning for Situation-Aware Traffic Rule + Exceptions ICRA 2024 + + +
+ Reinforcement Learning is a highly active research field with promising
+advancements. In the field of autonomous driving, however, often only very
+simple scenarios are examined. Common approaches use non-interpretable control
+commands as the action space and unstructured reward designs. In this work, we
+introduce Informed Reinforcement Learning, where a structured rulebook is
+integrated as a knowledge source. We learn trajectories and assess them with a
+situation-aware reward design, leading to a dynamic reward which allows the
+agent to learn situations that require controlled traffic rule exceptions. Our
+method is applicable to arbitrary RL models. We successfully demonstrate high
+completion rates of complex scenarios with recent model-based agents.
+
+
+ comment: Daniel Bogdoll and Jing Qin contributed equally. Accepted for + publication at ICRA 2024 +
+
+
+
+
+ + ☆ U-shaped Vision Mamba for Single Image Dehazing + + +
+ Currently, Transformer is the most popular architecture for image dehazing,
+but due to its large computational complexity, its ability to handle long-range
+dependency is limited on resource-constrained devices. To tackle this
+challenge, we introduce the U-shaped Vision Mamba (UVM-Net), an efficient
+single-image dehazing network. Inspired by the State Space Sequence Models
+(SSMs), a new deep sequence model known for its power to handle long sequences,
+we design a Bi-SSM block that integrates the local feature extraction ability
+of the convolutional layer with the ability of the SSM to capture long-range
+dependencies. Extensive experimental results demonstrate the effectiveness of
+our method. Our method provides a highly efficient approach to long-range
+dependency modeling for image dehazing as well as other image restoration
+tasks. The URL of the code is \url{https://github.com/zzr-idam}.
+
+
+
+
+
+ + ☆ OVOR: OnePrompt with Virtual Outlier Regularization for Rehearsal-Free + Class-Incremental Learning ICLR 2024 + + +
+ Recent works have shown that by using large pre-trained models along with
+learnable prompts, rehearsal-free methods for class-incremental learning (CIL)
+settings can achieve superior performance to prominent rehearsal-based ones.
+Rehearsal-free CIL methods struggle with distinguishing classes from different
+tasks, as those are not trained together. In this work we propose a
+regularization method based on virtual outliers to tighten decision boundaries
+of the classifier, such that confusion of classes among different tasks is
+mitigated. Recent prompt-based methods often require a pool of task-specific
+prompts, in order to prevent overwriting knowledge of previous tasks with that
+of the new task, leading to extra computation in querying and composing an
+appropriate prompt from the pool. This additional cost can be eliminated,
+without sacrificing accuracy, as we reveal in the paper. We illustrate that a
+simplified prompt-based method can achieve results comparable to previous
+state-of-the-art (SOTA) methods equipped with a prompt pool, using far fewer
+learnable parameters and incurring lower inference cost. Our regularization
+method has demonstrated its compatibility with different prompt-based methods,
+boosting those previous SOTA rehearsal-free CIL methods' accuracy on the
+ImageNet-R and CIFAR-100 benchmarks. Our source code is available at
+https://github.com/jpmorganchase/ovor.
+
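+ A toy sketch of virtual-outlier regularization as we read the abstract (our
+construction, not the paper's code: the mixing scheme and the uniform-target
+penalty are assumptions): sample points between class features and push the
+classifier to be low-confidence on them, tightening decision boundaries.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def outlier_reg(classifier, feats: torch.Tensor) -> torch.Tensor:
+    # mix random pairs of features to land between classes, add small noise
+    perm = feats[torch.randperm(len(feats))]
+    virtual = 0.5 * (feats + perm) + 0.1 * torch.randn_like(feats)
+    logits = classifier(virtual)
+    uniform = torch.full_like(logits, 1.0 / logits.shape[-1])
+    # penalize confident predictions on virtual outliers
+    return F.kl_div(logits.log_softmax(-1), uniform, reduction="batchmean")
+
+clf = torch.nn.Linear(16, 10)
+print(outlier_reg(clf, torch.randn(32, 16)))   # add to the task loss, weighted
+```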
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ VRMM: A Volumetric Relightable Morphable Head Model + + +
+ In this paper, we introduce the Volumetric Relightable Morphable Model +(VRMM), a novel volumetric and parametric facial prior for 3D face modeling. +While recent volumetric prior models offer improvements over traditional +methods like 3D Morphable Models (3DMMs), they face challenges in model +learning and personalized reconstructions. Our VRMM overcomes these by +employing a novel training framework that efficiently disentangles and encodes +latent spaces of identity, expression, and lighting into low-dimensional +representations. This framework, designed with self-supervised learning, +significantly reduces the constraints for training data, making it more +feasible in practice. The learned VRMM offers relighting capabilities and +encompasses a comprehensive range of expressions. We demonstrate the +versatility and effectiveness of VRMM through various applications like avatar +generation, facial reconstruction, and animation. Additionally, we address the +common issue of overfitting in generative volumetric models with a novel +prior-preserving personalization framework based on VRMM. Such an approach +enables accurate 3D face reconstruction from even a single portrait input. Our +experiments showcase the potential of VRMM to significantly enhance the field +of 3D face modeling. + +
+
+
+
+
+ + ☆ Analysis of Deep Image Prior and Exploiting Self-Guidance for Image + Reconstruction + + +
+ The ability of deep image prior (DIP) to recover high-quality images from
+incomplete or corrupted measurements has made it popular in inverse problems in
+image restoration and medical imaging including magnetic resonance imaging
+(MRI). However, conventional DIP suffers from severe overfitting and spectral
+bias effects. In this work, we first provide an analysis of how DIP recovers
+information from undersampled imaging measurements by analyzing the training
+dynamics of the underlying networks in the kernel regime for different
+architectures. This study sheds light on important underlying properties for
+DIP-based recovery. Current research suggests that incorporating a reference
+image as network input can enhance DIP's performance in image reconstruction
+compared to using random inputs. However, obtaining suitable reference images
+requires supervision, and raises practical difficulties. In an attempt to
+overcome this obstacle, we further introduce a self-driven reconstruction
+process that concurrently optimizes both the network weights and the input
+while eliminating the need for training data. Our method incorporates a novel
+denoiser regularization term which enables robust and stable joint estimation
+of both the network input and reconstructed image. We demonstrate that our
+self-guided method surpasses both the original DIP and modern supervised
+methods in terms of MR image reconstruction performance and outperforms
+previous DIP-based schemes for image inpainting.
+
+
+
+
+
+ + ☆ A Hard-to-Beat Baseline for Training-free CLIP-based Adaptation ICLR 2024 + + +
+ Contrastive Language-Image Pretraining (CLIP) has gained popularity for its +remarkable zero-shot capacity. Recent research has focused on developing +efficient fine-tuning methods, such as prompt learning and adapter, to enhance +CLIP's performance in downstream tasks. However, these methods still require +additional training time and computational resources, which is undesirable for +devices with limited resources. In this paper, we revisit a classical +algorithm, Gaussian Discriminant Analysis (GDA), and apply it to the downstream +classification of CLIP. Typically, GDA assumes that features of each class +follow Gaussian distributions with identical covariance. By leveraging Bayes' +formula, the classifier can be expressed in terms of the class means and +covariance, which can be estimated from the data without the need for training. +To integrate knowledge from both visual and textual modalities, we ensemble it +with the original zero-shot classifier within CLIP. Extensive results on 17 +datasets validate that our method surpasses or achieves comparable results with +state-of-the-art methods on few-shot classification, imbalanced learning, and +out-of-distribution generalization. In addition, we extend our method to +base-to-new generalization and unsupervised learning, once again demonstrating +its superiority over competing approaches. Our code is publicly available at +\url{https://github.com/mrflogs/ICLR24}. + +
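+ A small sketch of training-free Gaussian Discriminant Analysis on frozen
+features, assuming a covariance shared across classes as the abstract states
+(the feature extraction and the zero-shot ensemble weight are placeholders,
+not the paper's exact setup): with shared covariance, the Bayes classifier is
+linear, so its weights and biases follow directly from class means, the
+pooled covariance, and class priors, with no training.
+
+```python
+import numpy as np
+
+def gda_classifier(feats: np.ndarray, labels: np.ndarray, n_classes: int):
+    mu = np.stack([feats[labels == c].mean(0) for c in range(n_classes)])
+    centered = feats - mu[labels]                    # pooled, shared covariance
+    cov = centered.T @ centered / len(feats) + 1e-4 * np.eye(feats.shape[1])
+    prec = np.linalg.inv(cov)
+    W = mu @ prec                                    # (n_classes, d) linear weights
+    priors = np.bincount(labels, minlength=n_classes) / len(labels)
+    b = np.log(priors) - 0.5 * np.einsum("cd,cd->c", W, mu)
+    return W, b
+
+def predict(x, W, b, zero_shot_logits=None, alpha=1.0):
+    logits = x @ W.T + b                             # GDA decision function
+    # optional ensemble with CLIP's zero-shot text classifier logits
+    return logits if zero_shot_logits is None else logits + alpha * zero_shot_logits
+
+X = np.random.randn(200, 16)                         # stand-in for CLIP features
+y = np.random.randint(0, 4, 200)
+W, b = gda_classifier(X, y, 4)
+print(predict(X[:2], W, b).shape)                    # (2, 4)
+```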
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Multi-class Road Defect Detection and Segmentation using Spatial and + Channel-wise Attention for Autonomous Road Repairing ICRA 2024 + + +
+ Road pavement detection and segmentation are critical for developing
+autonomous road repair systems. However, developing an instance segmentation
+method that simultaneously performs multi-class defect detection and
+segmentation is challenging due to the textural simplicity of road pavement
+images, the diversity of defect geometries, and the morphological ambiguity
+between classes. We propose a novel end-to-end method for multi-class road
+defect detection and segmentation. The proposed method comprises multiple
+spatial and channel-wise attention blocks available to learn global
+representations across spatial and channel-wise dimensions. Through these
+attention blocks, more globally generalised representations of morphological
+information (spatial characteristics) of road defects and colour and depth
+information of images can be learned. To demonstrate the effectiveness of our
+framework, we conducted various ablation studies and comparisons with prior
+methods on a newly collected dataset annotated with nine road defect classes.
+The experiments show that our proposed method outperforms existing
+state-of-the-art methods for multi-class road defect detection and
+segmentation.
+
+
+ comment: Accepted to the ICRA 2024 +
+
+
+
+
+ + ☆ Connecting the Dots: Collaborative Fine-tuning for Black-Box + Vision-Language Models + + +
+ With the emergence of pretrained vision-language models (VLMs), considerable
+efforts have been devoted to fine-tuning them for downstream tasks. Despite the
+progress made in designing efficient fine-tuning methods, such methods require
+access to the model's parameters, which can be challenging as model owners
+often opt to provide their models as a black box to safeguard model ownership.
+This paper proposes a Collaborative Fine-Tuning (CraFT) approach for
+fine-tuning black-box VLMs to downstream tasks, where one only has access to
+the input prompts and the output predictions of the model. CraFT comprises two
+modules, a prompt generation module for learning text prompts and a prediction
+refinement module for enhancing output predictions in residual style.
+Additionally, we introduce an auxiliary prediction-consistent loss to promote
+consistent optimization across these modules. These modules are optimized by a
+novel collaborative training algorithm. Extensive experiments on few-shot
+classification over 15 datasets demonstrate the superiority of CraFT. The
+results show that CraFT achieves a decent gain of about 12\% with 16-shot
+datasets and only 8,000 queries. Moreover, CraFT trains faster and uses only
+about 1/80 of the memory footprint for deployment, while sacrificing only
+1.62\% compared to the white-box method.
+
+
+
+
+
+ + ☆ Polyp-DDPM: Diffusion-Based Semantic Polyp Synthesis for Enhanced + Segmentation + + +
+ This study introduces Polyp-DDPM, a diffusion-based method for generating
+realistic images of polyps conditioned on masks, aimed at enhancing the
+segmentation of gastrointestinal (GI) tract polyps. Our approach addresses the
+challenges of data limitations, high annotation costs, and privacy concerns
+associated with medical images. By conditioning the diffusion model on
+segmentation masks (binary masks that represent abnormal areas), Polyp-DDPM
+outperforms state-of-the-art methods in terms of image quality (achieving a
+Frechet Inception Distance (FID) score of 78.47, compared to scores above
+83.79) and segmentation performance (achieving an Intersection over Union (IoU)
+of 0.7156, versus less than 0.6694 for synthetic images from baseline models
+and 0.7067 for real data). Our method generates a high-quality, diverse
+synthetic dataset for training, thereby enabling polyp segmentation models to
+perform comparably to those trained on real images and offering greater data
+augmentation capabilities to improve segmentation models. The source code and
+pretrained weights for Polyp-DDPM are made publicly available at
+https://github.com/mobaidoctor/polyp-ddpm.
+
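+ A sketch of mask-conditioned denoising (an assumption for illustration:
+conditioning by concatenating the segmentation mask as an extra input channel,
+a common choice; the paper's exact conditioning and its UNet may differ):
+
+```python
+import torch
+import torch.nn as nn
+
+class MaskConditionedDenoiser(nn.Module):
+    def __init__(self, img_ch: int = 3):
+        super().__init__()
+        self.net = nn.Sequential(                 # tiny stand-in for a UNet
+            nn.Conv2d(img_ch + 1, 32, 3, padding=1), nn.ReLU(),
+            nn.Conv2d(32, img_ch, 3, padding=1),
+        )
+
+    def forward(self, noisy_img, mask):
+        # mask channel tells the denoiser where the polyp should appear
+        return self.net(torch.cat([noisy_img, mask], dim=1))
+
+x_t = torch.randn(2, 3, 64, 64)                   # noised image at step t
+mask = (torch.rand(2, 1, 64, 64) > 0.5).float()   # binary polyp mask
+print(MaskConditionedDenoiser()(x_t, mask).shape) # torch.Size([2, 3, 64, 64])
+```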
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Privacy Leakage on DNNs: A Survey of Model Inversion Attacks and + Defenses + + +
+ Model Inversion (MI) attacks aim to disclose private information about the
+training data by abusing access to the pre-trained models. These attacks enable
+adversaries to reconstruct high-fidelity data that closely aligns with the
+private training data, which has raised significant privacy concerns. Despite
+the rapid advances in the field, we lack a comprehensive overview of existing
+MI attacks and defenses. To fill this gap, this paper thoroughly investigates
+this field and presents a holistic survey. Firstly, our work briefly reviews
+traditional MI in machine learning scenarios. We then elaborately analyze
+and compare numerous recent attacks and defenses on Deep Neural Networks
+(DNNs) across multiple modalities and learning tasks.
+
+
+
+
+
+ + ☆ Low-rank Attention Side-Tuning for Parameter-Efficient Fine-Tuning + + +
+ In finetuning a large pretrained model to downstream tasks,
+parameter-efficient fine-tuning (PEFT) methods can effectively finetune
+pretrained models with few trainable parameters, but suffer from high GPU
+memory consumption and slow training speed. Because learnable parameters from
+these methods are entangled with the pretrained model, gradients related to the
+frozen pretrained model's parameters have to be computed and stored during
+finetuning. We propose Low-rank Attention Side-Tuning (LAST), which
+disentangles the trainable module from the pretrained model by freezing not
+only parameters but also outputs of the pretrained network. LAST trains a
+side-network composed of only low-rank self-attention modules. By viewing the
+pretrained model as a frozen feature extractor, the side-network takes
+intermediate output from the pretrained model and focuses on learning
+task-specific knowledge. We also show that LAST can be highly parallel across
+multiple optimization objectives, making it very efficient in downstream task
+adaptation, for example, in finding optimal hyperparameters. LAST outperforms
+previous state-of-the-art methods on VTAB-1K and other visual adaptation tasks
+with roughly only 30\% of the GPU memory footprint and 60\% of the training
+time of existing PEFT methods, while achieving significantly higher accuracy.
+
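+ A sketch of the side-tuning idea above (our toy construction, not the
+paper's code; module sizes are assumptions): the backbone is frozen and its
+outputs are detached, so no gradient flows through it; only the low-rank
+attention side-network trains.
+
+```python
+import torch
+import torch.nn as nn
+
+class LowRankSelfAttention(nn.Module):
+    def __init__(self, dim: int, rank: int = 8):
+        super().__init__()
+        self.q = nn.Linear(dim, rank, bias=False)   # low-rank projections
+        self.k = nn.Linear(dim, rank, bias=False)
+        self.v = nn.Linear(dim, dim, bias=False)
+
+    def forward(self, x):
+        scores = self.q(x) @ self.k(x).transpose(-2, -1) / x.shape[-1] ** 0.5
+        return x + torch.softmax(scores, dim=-1) @ self.v(x)
+
+backbone = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True).eval()
+for p in backbone.parameters():
+    p.requires_grad_(False)                         # frozen pretrained model
+side = LowRankSelfAttention(64)                     # the only trainable module
+
+x = torch.randn(2, 10, 64)
+with torch.no_grad():                               # no backbone gradients stored
+    feats = backbone(x)
+out = side(feats)                                   # gradients reach `side` only
+out.sum().backward()
+```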
+
+
+
+
+ + ☆ YOLOPoint Joint Keypoint and Object Detection + + +
+ Intelligent vehicles of the future must be capable of understanding and +navigating safely through their surroundings. Camera-based vehicle systems can +use keypoints as well as objects as low- and high-level landmarks for +GNSS-independent SLAM and visual odometry. To this end we propose YOLOPoint, a +convolutional neural network model that simultaneously detects keypoints and +objects in an image by combining YOLOv5 and SuperPoint to create a single +forward-pass network that is both real-time capable and accurate. By using a +shared backbone and a light-weight network structure, YOLOPoint is able to +perform competitively on both the HPatches and KITTI benchmarks. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Controllable Diverse Sampling for Diffusion Based Motion Behavior + Forecasting + + +
+ In autonomous driving tasks, trajectory prediction in complex traffic +environments requires adherence to real-world context conditions and behavior +multimodalities. Existing methods predominantly rely on prior assumptions or +generative models trained on curated data to learn road agents' stochastic +behavior bounded by scene constraints. However, they often face mode averaging +issues due to data imbalance and simplistic priors, and could even suffer from +mode collapse due to unstable training and single ground truth supervision. +These issues lead the existing methods to a loss of predictive diversity and +adherence to the scene constraints. To address these challenges, we introduce a +novel trajectory generator named Controllable Diffusion Trajectory (CDT), which +integrates map information and social interactions into a Transformer-based +conditional denoising diffusion model to guide the prediction of future +trajectories. To ensure multimodality, we incorporate behavioral tokens to +direct the trajectory's modes, such as going straight, turning right or left. +Moreover, we incorporate the predicted endpoints as an alternative behavioral +token into the CDT model to facilitate the prediction of accurate trajectories. +Extensive experiments on the Argoverse 2 benchmark demonstrate that CDT excels +in generating diverse and scene-compliant trajectories in complex urban +settings. + +
+
+
+
+
+ + ☆ Humans Beat Deep Networks at Recognizing Objects in Unusual Poses, Given + Enough Time + + +
+ Deep learning is closing the gap with humans on several object recognition +benchmarks. Here we investigate this gap in the context of challenging images +where objects are seen from unusual viewpoints. We find that humans excel at +recognizing objects in unusual poses, in contrast with state-of-the-art +pretrained networks (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) which are +systematically brittle in this condition. Remarkably, as we limit image +exposure time, human performance degrades to the level of deep networks, +suggesting that additional mental processes (requiring additional time) take +place when humans identify objects in unusual poses. Finally, our analysis of +error patterns of humans vs. networks reveals that even time-limited humans are +dissimilar to feed-forward deep networks. We conclude that more work is needed +to bring computer vision systems to the level of robustness of the human visual +system. Understanding the nature of the mental processes taking place during +extra viewing time may be key to attain such robustness. + +
+
+
+
+
+ + ☆ Boosting Adversarial Transferability across Model Genus by + Deformation-Constrained Warping AAAI 2024 + + +
+ Adversarial examples generated by a surrogate model typically exhibit limited
+transferability to unknown target systems. To address this problem, many
+transferability enhancement approaches (e.g., input transformation and model
+augmentation) have been proposed. However, they show poor performance when
+attacking systems whose model genus differs from that of the surrogate model.
+In this paper, we propose a novel and generic attacking strategy, called
+Deformation-Constrained Warping Attack (DeCoWA), that can be effectively
+applied to cross model genus attack. Specifically, DeCoWA firstly augments
+input examples via an elastic deformation, namely Deformation-Constrained
+Warping (DeCoW), to obtain rich local details of the augmented input. To avoid
+severe distortion of global semantics led by random deformation, DeCoW further
+constrains the strength and direction of the warping transformation by a novel
+adaptive control strategy. Extensive experiments demonstrate that the
+transferable examples crafted by our DeCoWA on CNN surrogates can significantly
+hinder the performance of Transformers (and vice versa) on various tasks,
+including image classification, video action recognition, and audio
+recognition. Code is made available at https://github.com/LinQinLiang/DeCoWA.
+
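+ A minimal sketch of constrained elastic warping as input augmentation (our
+stand-in loosely following the abstract: a random smooth displacement field
+with bounded strength; the paper's adaptive control strategy is omitted):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def constrained_warp(img: torch.Tensor, strength: float = 0.05, grid_res: int = 4):
+    b, _, h, w = img.shape
+    # coarse random offsets, bounded by tanh to limit deformation strength
+    flow = strength * torch.tanh(torch.randn(b, 2, grid_res, grid_res))
+    flow = F.interpolate(flow, size=(h, w), mode="bilinear", align_corners=True)
+    ys, xs = torch.meshgrid(torch.linspace(-1, 1, h),
+                            torch.linspace(-1, 1, w), indexing="ij")
+    base = torch.stack((xs, ys), dim=-1).expand(b, h, w, 2)  # identity grid
+    return F.grid_sample(img, base + flow.permute(0, 2, 3, 1),
+                         align_corners=True)
+
+x = torch.rand(1, 3, 32, 32)
+print(constrained_warp(x).shape)   # torch.Size([1, 3, 32, 32])
+# adversarial gradients would then be computed on the warped inputs
+```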
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Elastic Feature Consolidation for Cold Start Exemplar-free Incremental + Learning ICLR 2024 + + +
+ Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a
+sequence of tasks without having access to previous task data. In this paper,
+we consider the challenging Cold Start scenario in which insufficient data is
+available in the first task to learn a high-quality backbone. This is
+especially challenging for EFCIL since it requires high plasticity, which
+results in feature drift that is difficult to compensate for in the
+exemplar-free setting. To address this problem, we propose a simple and
+effective approach that consolidates feature representations by regularizing
+drift in directions highly relevant to previous tasks and employs prototypes to
+reduce task-recency bias. Our method, called Elastic Feature Consolidation
+(EFC), exploits a tractable second-order approximation of feature drift based
+on an Empirical Feature Matrix (EFM). The EFM induces a pseudo-metric in
+feature space which we use to regularize feature drift in important directions
+and to update Gaussian prototypes used in a novel asymmetric cross entropy loss
+which effectively balances prototype rehearsal with data from new tasks.
+Experimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and
+ImageNet-1K demonstrate that Elastic Feature Consolidation is better able to
+learn new tasks by maintaining model plasticity and significantly outperforms
+the state-of-the-art.
+
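+ A toy sketch of regularizing feature drift under an EFM-style pseudo-metric
+(the matrix below is a placeholder; the paper's Empirical Feature Matrix is a
+specific second-order construction, not the identity used here):
+
+```python
+import torch
+
+def efm_drift_loss(f_new: torch.Tensor, f_old: torch.Tensor,
+                   efm: torch.Tensor) -> torch.Tensor:
+    # Penalize drift (f_new - f_old) more along directions the EFM marks as
+    # important for previous tasks: d^T M d for each sample.
+    d = f_new - f_old                     # (batch, dim)
+    return torch.einsum("bi,ij,bj->b", d, efm, d).mean()
+
+old_feats = torch.randn(32, 16)           # features under the previous model
+new_feats = old_feats + 0.1 * torch.randn(32, 16)
+efm = torch.eye(16)                       # identity = plain L2 drift penalty
+print(efm_drift_loss(new_feats, old_feats, efm))
+```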
+
+ comment: Accepted at Twelfth International Conference on Learning + Representations (ICLR 2024) +
+
+
+
+
+ + ☆ EscherNet: A Generative Model for Scalable View Synthesis + + +
+ We introduce EscherNet, a multi-view conditioned diffusion model for view +synthesis. EscherNet learns implicit and generative 3D representations coupled +with a specialised camera positional encoding, allowing precise and continuous +relative control of the camera transformation between an arbitrary number of +reference and target views. EscherNet offers exceptional generality, +flexibility, and scalability in view synthesis -- it can generate more than 100 +consistent target views simultaneously on a single consumer-grade GPU, despite +being trained with a fixed number of 3 reference views to 3 target views. As a +result, EscherNet not only addresses zero-shot novel view synthesis, but also +naturally unifies single- and multi-image 3D reconstruction, combining these +diverse tasks into a single, cohesive framework. Our extensive experiments +demonstrate that EscherNet achieves state-of-the-art performance in multiple +benchmarks, even when compared to methods specifically tailored for each +individual problem. This remarkable versatility opens up new directions for +designing scalable neural architectures for 3D vision. Project page: +\url{https://kxhit.github.io/EscherNet}. + +
+
+ comment: Project Page: https://kxhit.github.io/EscherNet +
+
+
+
+
+ + ☆ Deep MSFOP: Multiple Spectral filter Operators Preservation in Deep + Functional Maps for Unsupervised Shape Matching + + +
+ We propose a novel constraint called Multiple Spectral filter Operators
+Preservation (MSFOP) to compute functional maps and, based on it, develop an
+efficient deep functional map architecture called Deep MSFOP for shape
+matching. The core idea is that, instead of using the general descriptor
+preservation constraint, we require our maps to preserve multiple spectral
+filter operators. This allows us to incorporate more informative geometrical
+information, contained in different frequency bands of functions, into the
+functional map computation. This is confirmed by the fact that previous
+techniques such as wavelet preservation and LBO commutativity are special
+cases of our constraint. Moreover, we develop a very efficient way to compute
+maps under the MSFOP constraint, which can be conveniently embedded into deep
+learning pipelines, especially with learnable filter operators. Utilizing the
+above results, we finally design our Deep MSFOP pipeline, equipped with a
+suitable unsupervised loss jointly penalizing the functional map and the
+underlying pointwise map. Our deep functional map has notable advantages,
+including that the functional map is more geometrically informative and
+guaranteed to be proper, and that the computation is numerically stable.
+Extensive experimental results on different datasets demonstrate that our
+approach outperforms the existing state-of-the-art methods, especially in
+challenging settings like non-isometric and inconsistent-topology datasets.
+
+
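+ A minimal sketch of a multi-operator preservation energy in the spectral
+domain (hypothetical function name; one commutativity term per pair of
+filter operators, which reduces to LBO commutativity when the only filter is
+the Laplacian itself):
+
+ import torch
+
+ def msfop_energy(C, ops_src, ops_tgt):
+     # C: (k, k) functional map; ops_src[i], ops_tgt[i]: (k, k) spectral
+     # filter operators on the source and target shapes for band i
+     return sum(torch.linalg.matrix_norm(C @ A - B @ C, 'fro') ** 2
+                for A, B in zip(ops_src, ops_tgt))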
+
+
+
+
+ + ☆ Convincing Rationales for Visual Question Answering Reasoning + + +
+ Visual Question Answering (VQA) is a challenging task of predicting the
+answer to a question about the content of an image. It requires deep
+understanding of both the textual question and the visual image. Prior works
+directly evaluate the answering models by simply calculating the accuracy of
+the predicted answers. However, the inner reasoning behind the prediction is
+disregarded in such a "black box" system, and we do not even know if one can
+trust the predictions. In some cases, the models still get the correct answers
+even when they focus on irrelevant visual regions or textual tokens, which
+makes the models unreliable and illogical. To generate both visual and textual
+rationales next to the predicted answer to the given image/question pair, we
+propose Convincing Rationales for VQA, CRVQA. Considering the extra
+annotations brought by the new outputs, CRVQA is trained and evaluated on
+samples converted from existing VQA datasets and their visual labels. The
+extensive experiments demonstrate that the visual and textual rationales
+support the prediction of the answers, and further improve the accuracy.
+Furthermore, CRVQA achieves competitive performance on generic VQA datasets
+in the zero-shot evaluation setting. The dataset and source code will be
+released at https://github.com/lik1996/CRVQA2024.
+
+
+
+ comment: under review +
+
+
+
+
+ + ☆ A new method for optical steel rope non-destructive damage detection + + +
+ This paper presents a novel algorithm for non-destructive damage detection
+for steel ropes in high-altitude environments (aerial ropeways). The
+algorithm comprises two key components: first, a segmentation model named
+RGBD-UNet is designed to accurately extract steel ropes from complex
+backgrounds. This model is equipped with the capability to process and
+combine color and depth information through the proposed CMA module. Second,
+a detection model named VovNetV3.5 is developed to differentiate between
+normal and abnormal steel ropes. It integrates the VovNet architecture with a
+DBB module to enhance performance. In addition, a novel background
+augmentation method is proposed to enhance the generalization ability of the
+segmentation model. Datasets containing images of steel ropes in different
+scenarios are created for the training and testing of both the segmentation
+and detection models. Experiments demonstrate a significant improvement over
+baseline models. On the proposed dataset, the highest accuracy achieved by
+the detection model reached 0.975, and the maximum F-measure achieved by the
+segmentation model reached 0.948.
+
+
+
+
+
+
+ + ☆ Belief Scene Graphs: Expanding Partial Scenes with Objects through + Computation of Expectation ICRA 2024 + + +
+ In this article, we propose the novel concept of Belief Scene Graphs, which
+are utility-driven extensions of partial 3D scene graphs that enable efficient
+high-level task planning with partial information. We propose a graph-based
+learning methodology for the computation of belief (also referred to as
+expectation) on any given 3D scene graph, which is then used to strategically
+add new nodes (referred to as blind nodes) that are relevant for a robotic
+mission. We propose the method of Computation of Expectation based on
+Correlation Information (CECI) to reasonably approximate the real
+Belief/Expectation by learning histograms from available training data. A
+novel Graph Convolutional Neural Network (GCN) model is developed to learn
+CECI from a repository of 3D scene graphs. As no database of 3D scene graphs
+exists for training the novel CECI model, we present a novel methodology for
+generating a 3D scene graph dataset based on semantically annotated real-life
+3D spaces. The generated dataset is then utilized to train the proposed CECI
+model and for extensive validation of the proposed method. We establish the
+novel concept of Belief Scene Graphs (BSG) as a core component to integrate
+expectations into abstract representations. This new concept is an evolution
+of the classical 3D scene graph concept and aims to enable high-level
+reasoning for task planning and optimization of a variety of robotics
+missions. The efficacy of the overall framework has been evaluated in an
+object search scenario, and it has also been tested in a real-life experiment
+to emulate human common-sense reasoning about unseen objects.
+
+
+
+ comment: Accepted at ICRA 2024 +
+
+
+
+
+ + ☆ An SVD-free Approach to Nonlinear Dictionary Learning based on RVFL + + +
+ This paper presents a novel nonlinear dictionary learning algorithm
+leveraging the theory of a feed-forward neural network called Random Vector
+Functional Link (RVFL). The proposed RVFL-based nonlinear Dictionary Learning
+(RVFLDL) learns a dictionary as a sparse-to-dense feature map from nonlinear
+sparse coefficients to the dense input features. Kernel-based nonlinear
+dictionary learning methods operate in a feature space obtained by an
+implicit feature map, and they depend on computationally expensive operations
+like Singular Value Decomposition (SVD). Training the RVFL-based dictionary
+is free from SVD computation, as RVFL generates weights from the input to the
+output layer analytically. A sparsity-inducing horseshoe prior is assumed on
+the coefficients to generate a sparse coefficient matrix w.r.t. an initial
+random dictionary. Higher-order dependencies between the input sparse
+coefficients and the dictionary atoms are incorporated into the training
+process by nonlinearly transforming the sparse coefficients and adding them
+as enhanced features. Thus the method projects sparse coefficients to a
+higher-dimensional space while inducing nonlinearities into the dictionary.
+For classification using RVFL-net, a classifier matrix is learned as a
+transform that maps nonlinear sparse coefficients to the labels. The
+performance of the method, illustrated in image classification and
+reconstruction applications, is comparable to that of other nonlinear
+dictionary learning methods. Experiments show that RVFLDL is scalable and
+provides a solution better than those obtained using other nonlinear
+dictionary learning methods.
+
+
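+ A minimal sketch of the SVD-free, closed-form flavor of RVFL training under
+stated assumptions (hypothetical function name; the horseshoe-prior sparse
+coding stage is not reproduced here):
+
+ import numpy as np
+
+ def rvfl_dictionary(codes, X, reg=1e-3, n_enhance=256, seed=0):
+     # codes: (n, m) sparse coefficients; X: (n, d) dense input features
+     rng = np.random.default_rng(seed)
+     W = rng.standard_normal((codes.shape[1], n_enhance))  # random, fixed
+     # direct link + nonlinearly enhanced features, RVFL-style
+     H = np.hstack([codes, np.tanh(codes @ W)])
+     # ridge-regularized least squares H @ D ~= X, solved analytically
+     D = np.linalg.solve(H.T @ H + reg * np.eye(H.shape[1]), H.T @ X)
+     return W, D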
+
+
+
+
+ + ☆ OASim: an Open and Adaptive Simulator based on Neural Rendering for + Autonomous Driving + + +
+ With the development of deep learning and computer vision technology,
+autonomous driving provides new solutions to improve traffic safety and
+efficiency. The importance of building high-quality datasets is self-evident,
+especially with the rise of end-to-end autonomous driving algorithms in
+recent years. Data plays a core role in the algorithm closed-loop system.
+However, collecting real-world data is expensive, time-consuming, and unsafe.
+With the development of implicit rendering technology and in-depth research
+on using generative models to produce data at scale, we propose OASim, an
+open and adaptive simulator and autonomous driving data generator based on
+implicit neural rendering. It has the following characteristics: (1)
+High-quality scene reconstruction through neural implicit surface
+reconstruction technology. (2) Trajectory editing of the ego vehicle and
+participating vehicles. (3) A rich vehicle model library from which vehicles
+can be freely selected and inserted into the scene. (4) A rich sensor model
+library from which specified sensors can be selected to generate data. (5) A
+highly customizable data generation system that can generate data according
+to user needs. We demonstrate the high quality and fidelity of the generated
+data through perception performance evaluation on the Carla simulator and
+real-world data acquisition. Code is available at
+https://github.com/PJLab-ADG/OASim.
+
+
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ Face Detection: Present State and Research Directions + + +
+ The majority of computer vision applications that handle images featuring
+humans use face detection as a core component. Despite extensive research on
+the topic, face detection still has open issues: its accuracy and speed can
+still be improved. This review paper presents the progress made in this area
+as well as the substantial issues that still need to be tackled. The paper
+provides research directions that can be taken up as research projects in the
+field of face detection.
+
+
+
+
+
+
+ + ☆ Energy-based Domain-Adaptive Segmentation with Depth Guidance + + +
+ Recent endeavors have been made to leverage self-supervised depth estimation +as guidance in unsupervised domain adaptation (UDA) for semantic segmentation. +Prior arts, however, overlook the discrepancy between semantic and depth +features, as well as the reliability of feature fusion, thus leading to +suboptimal segmentation performance. To address this issue, we propose a novel +UDA framework called SMART (croSs doMain semAntic segmentation based on eneRgy +esTimation) that utilizes Energy-Based Models (EBMs) to obtain task-adaptive +features and achieve reliable feature fusion for semantic segmentation with +self-supervised depth estimates. Our framework incorporates two novel +components: energy-based feature fusion (EB2F) and energy-based reliable fusion +Assessment (RFA) modules. The EB2F module produces task-adaptive semantic and +depth features by explicitly measuring and reducing their discrepancy using +Hopfield energy for better feature fusion. The RFA module evaluates the +reliability of the feature fusion using an energy score to improve the +effectiveness of depth guidance. Extensive experiments on two datasets +demonstrate that our method achieves significant performance gains over prior +works, validating the effectiveness of our energy-based learning approach. + +
+
+
+
+
+ + ☆ Exploring Low-Resource Medical Image Classification with Weakly + Supervised Prompt Learning + + +
+ Most advances in medical image recognition supporting clinical auxiliary
+diagnosis meet challenges due to the low-resource situation in the medical
+field, where annotations are highly expensive and professional. This
+low-resource problem can be alleviated by leveraging the transferable
+representations of large-scale pre-trained vision-language models via relevant
+medical text prompts. However, existing pre-trained vision-language models
+require domain experts to carefully design the medical prompts, which greatly
+increases the burden on clinicians. To address this problem, we propose a
+weakly supervised prompt learning method, MedPrompt, to automatically generate
+medical prompts. It includes an unsupervised pre-trained vision-language
+model and a weakly supervised prompt learning model. The unsupervised
+pre-trained vision-language model utilizes the natural correlation between
+medical images and corresponding medical texts for pre-training, without any
+manual annotations. The weakly supervised prompt learning model only utilizes
+the classes of images in the dataset to guide the learning of the specific
+class vector in the prompt, while the learning of other context vectors in the
+prompt requires no manual annotations for guidance. To the best of our
+knowledge, this is the first model to automatically generate medical prompts.
+With these prompts, the pre-trained vision-language model can be freed from
+the strong expert dependency of manual annotation and manual prompt design.
+Experimental results show that the model using our automatically generated
+prompts outperforms its full-shot counterparts that use hand-crafted prompts,
+while requiring only a minimal number of labeled samples for few-shot
+learning, and reaches superior or comparable accuracy on zero-shot image
+classification. The proposed prompt generator is lightweight and can
+therefore be embedded into any network architecture.
+
+
+
+ comment: Accepted by Pattern Recognition +
+
+
+
+
+ + ☆ AttackNet: Enhancing Biometric Security via Tailored Convolutional + Neural Network Architectures for Liveness Detection + + +
+ Biometric security is the cornerstone of modern identity verification and +authentication systems, where the integrity and reliability of biometric +samples is of paramount importance. This paper introduces AttackNet, a bespoke +Convolutional Neural Network architecture, meticulously designed to combat +spoofing threats in biometric systems. Rooted in deep learning methodologies, +this model offers a layered defense mechanism, seamlessly transitioning from +low-level feature extraction to high-level pattern discernment. Three +distinctive architectural phases form the crux of the model, each underpinned +by judiciously chosen activation functions, normalization techniques, and +dropout layers to ensure robustness and resilience against adversarial attacks. +Benchmarking our model across diverse datasets affirms its prowess, showcasing +superior performance metrics in comparison to contemporary models. Furthermore, +a detailed comparative analysis accentuates the model's efficacy, drawing +parallels with prevailing state-of-the-art methodologies. Through iterative +refinement and an informed architectural strategy, AttackNet underscores the +potential of deep learning in safeguarding the future of biometric security. + +
+
+
+
+
+ + ☆ MobileVLM V2: Faster and Stronger Baseline for Vision Language Model + + +
+ We introduce MobileVLM V2, a family of significantly improved vision
+language models built upon MobileVLM, which proves that a delicate
+orchestration of novel architectural design, an improved training scheme
+tailored for mobile VLMs, and rich high-quality dataset curation can
+substantially benefit VLMs' performance. Specifically, MobileVLM V2 1.7B
+achieves better or on-par performance on standard VLM benchmarks compared
+with much larger VLMs at the 3B scale. Notably, our 3B model outperforms a
+large variety of VLMs at the 7B+ scale. Our models will be released at
+https://github.com/Meituan-AutoML/MobileVLM .
+
+
+
+
+
+
+ + ☆ MoD-SLAM: Monocular Dense Mapping for Unbounded 3D Scene Reconstruction + + +
+ Neural implicit representations have recently been demonstrated in many
+fields including Simultaneous Localization And Mapping (SLAM). Current neural
+SLAM can achieve ideal results in reconstructing bounded scenes, but this
+relies on the input of RGB-D images. Neural SLAM based only on RGB images is
+unable to reconstruct the scale of the scene accurately, and it also suffers
+from scale drift due to errors accumulated during tracking. To overcome these
+limitations, we present MoD-SLAM, a monocular dense mapping method that
+allows global pose optimization and 3D reconstruction in real-time in
+unbounded scenes. Optimizing scene reconstruction by monocular depth
+estimation and using loop closure detection to update camera poses enable
+detailed and precise reconstruction of large scenes. Compared to previous
+work, our approach is more robust, scalable and versatile. Our experiments
+demonstrate that MoD-SLAM achieves superior mapping performance compared to
+prior neural SLAM methods, especially in large borderless scenes.
+
+
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Virtual Classification: Modulating Domain-Specific Knowledge for + Multidomain Crowd Counting + + +
+ Multidomain crowd counting aims to learn a general model for multiple
+diverse datasets. However, deep networks prefer modeling distributions of the
+dominant domains instead of all domains, which is known as domain bias. In
+this study, we propose a simple-yet-effective Modulating Domain-specific
+Knowledge Network (MDKNet) to handle the domain bias issue in multidomain
+crowd counting. MDKNet employs the idea of `modulating', enabling the network
+to balance and model the different distributions of diverse datasets with
+little bias. Specifically, we propose an Instance-specific Batch
+Normalization (IsBN) module, which serves as a base modulator to refine the
+information flow to be adaptive to domain distributions. To precisely
+modulate the domain-specific information, the Domain-guided Virtual
+Classifier (DVC) is then introduced to learn a domain-separable latent space.
+This space is employed as an input guidance for the IsBN modulator, such that
+the mixture distributions of multiple datasets can be well handled. Extensive
+experiments performed on popular benchmarks, including Shanghai-tech A/B,
+QNRF and NWPU, validate the superiority of MDKNet in tackling multidomain
+crowd counting and its effectiveness for multidomain learning. Code is
+available at https://github.com/csguomy/MDKNet.
+
+
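+ A minimal sketch of an instance-specific batch normalization layer in the
+spirit of IsBN (hypothetical class name; the guidance vector stands in for
+the DVC's domain-separable latent code):
+
+ import torch
+ import torch.nn as nn
+
+ class IsBN(nn.Module):
+     def __init__(self, channels, guide_dim):
+         super().__init__()
+         self.norm = nn.BatchNorm2d(channels, affine=False)
+         # the guidance code predicts per-instance scale and shift
+         self.to_gamma = nn.Linear(guide_dim, channels)
+         self.to_beta = nn.Linear(guide_dim, channels)
+
+     def forward(self, x, guide):
+         h = self.norm(x)
+         gamma = self.to_gamma(guide)[..., None, None]
+         beta = self.to_beta(guide)[..., None, None]
+         return h * (1 + gamma) + beta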
+
+ comment: Multidomain learning; Domain-guided virtual classifier; + Instance-specific batch normalization +
+
+
+
+
+ + ☆ The Instinctive Bias: Spurious Images lead to Hallucination in MLLMs + + +
+ Large language models (LLMs) have recently experienced remarkable progress,
+where the advent of multi-modal large language models (MLLMs) has endowed
+LLMs with visual capabilities, leading to impressive performances in various
+multi-modal tasks. However, even powerful MLLMs such as GPT-4V still fail
+spectacularly when presented with certain image and text inputs. In this
+paper, we identify a typical class of inputs that baffles MLLMs, which
+consists of images that are highly relevant to, but inconsistent with, the
+answers, causing MLLMs to suffer from hallucination. To quantify the effect,
+we propose CorrelationQA, the first benchmark that assesses the hallucination
+level given spurious images. This benchmark contains 7,308 text-image pairs
+across 13 categories. Based on the proposed CorrelationQA, we conduct a
+thorough analysis of 9 mainstream MLLMs, illustrating that they universally
+suffer from this instinctive bias to varying degrees. We hope that our
+curated benchmark and evaluation results aid in better assessments of MLLMs'
+robustness in the presence of misleading images. The resource is available at
+https://github.com/MasaiahHan/CorrelationQA.
+
+
+
+
+
+
+ + ☆ Intensive Vision-guided Network for Radiology Report Generation + + +
+ Automatic radiology report generation is booming due to its huge application +potential for the healthcare industry. However, existing computer vision and +natural language processing approaches to tackle this problem are limited in +two aspects. First, when extracting image features, most of them neglect +multi-view reasoning in vision and model single-view structure of medical +images, such as space-view or channel-view. However, clinicians rely on +multi-view imaging information for comprehensive judgment in daily clinical +diagnosis. Second, when generating reports, they overlook context reasoning +with multi-modal information and focus on pure textual optimization utilizing +retrieval-based methods. We aim to address these two issues by proposing a +model that better simulates clinicians' perspectives and generates more +accurate reports. Given the above limitation in feature extraction, we propose +a Globally-intensive Attention (GIA) module in the medical image encoder to +simulate and integrate multi-view vision perception. GIA aims to learn three +types of vision perception: depth view, space view, and pixel view. On the +other hand, to address the above problem in report generation, we explore how +to involve multi-modal signals to generate precisely matched reports, i.e., how +to integrate previously predicted words with region-aware visual content in +next word prediction. Specifically, we design a Visual Knowledge-guided Decoder +(VKGD), which can adaptively consider how much the model needs to rely on +visual information and previously predicted text to assist next word +prediction. Hence, our final Intensive Vision-guided Network (IVGN) framework +includes a GIA-guided Visual Encoder and the VKGD. Experiments on two +commonly-used datasets IU X-Ray and MIMIC-CXR demonstrate the superior ability +of our method compared with other state-of-the-art approaches. + +
+
+ comment: Accepted by Physics in Medicine & Biology +
+
+
+
+
+ + ☆ Pre-training of Lightweight Vision Transformers on Small Datasets with + Minimally Scaled Images + + +
+ Can a lightweight Vision Transformer (ViT) match or exceed the performance of +Convolutional Neural Networks (CNNs) like ResNet on small datasets with small +image resolutions? This report demonstrates that a pure ViT can indeed achieve +superior performance through pre-training, using a masked auto-encoder +technique with minimal image scaling. Our experiments on the CIFAR-10 and +CIFAR-100 datasets involved ViT models with fewer than 3.65 million parameters +and a multiply-accumulate (MAC) count below 0.27G, qualifying them as +'lightweight' models. Unlike previous approaches, our method attains +state-of-the-art performance among similar lightweight transformer-based +architectures without significantly scaling up images from CIFAR-10 and +CIFAR-100. This achievement underscores the efficiency of our model, not only +in handling small datasets but also in effectively processing images close to +their original scale. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Vision Superalignment: Weak-to-Strong Generalization for Vision + Foundation Models + + +
+ Recent advancements in large language models have sparked interest in their +extraordinary and near-superhuman capabilities, leading researchers to explore +methods for evaluating and optimizing these abilities, which is called +superalignment. In this context, our paper delves into the realm of vision +foundation models, focusing on the concept of weak-to-strong generalization, +which involves using a weaker model to supervise a stronger one, aiming to +enhance the latter's capabilities beyond the former's limits. We introduce a +novel and adaptively adjustable loss function for weak-to-strong supervision. +Our comprehensive experiments span various scenarios, including few-shot +learning, transfer learning, noisy label learning, and common knowledge +distillation settings. The results are striking: our approach not only exceeds +the performance benchmarks set by strong-to-strong generalization but also +surpasses the outcomes of fine-tuning strong models with whole datasets. This +compelling evidence underscores the significant potential of weak-to-strong +generalization, showcasing its capability to substantially elevate the +performance of vision foundation models. The code is available at +https://github.com/ggjy/vision_weak_to_strong. + +
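+ A minimal sketch of one adaptively weighted weak-to-strong loss, as a hedged
+illustration only (the paper's actual loss is not reproduced; here the weak
+teacher's confidence decides how much the strong model trusts the teacher
+versus its own sharpened predictions):
+
+ import torch
+ import torch.nn.functional as F
+
+ def weak_to_strong_loss(strong_logits, weak_logits):
+     weak_prob = weak_logits.softmax(dim=-1)
+     conf = weak_prob.max(dim=-1, keepdim=True).values  # teacher confidence
+     self_target = strong_logits.detach().softmax(dim=-1)
+     # trust the weak teacher less where it is unsure
+     target = conf * weak_prob + (1 - conf) * self_target
+     return F.cross_entropy(strong_logits, target)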
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Tuning Large Multimodal Models for Videos using Reinforcement Learning + from AI Feedback + + +
+ Recent advancements in large language models have influenced the development
+of video large multimodal models (VLMMs). Previous approaches for VLMMs
+involve Supervised Fine-Tuning (SFT) with instruction-tuned datasets,
+integrating an LLM with visual encoders, and adding extra learnable modules.
+Video and text multimodal alignment remains challenging, primarily due to the
+deficient volume and quality of multimodal instruction-tuning data compared
+to text-only data. We present a novel alignment strategy that employs a
+multimodal AI system to oversee itself, called Reinforcement Learning from AI
+Feedback (RLAIF), providing self-preference feedback to refine itself and
+facilitating the alignment of video and text modalities. Specifically, we
+propose context-aware reward modeling by providing detailed video
+descriptions as context during the generation of preference feedback in order
+to enrich the understanding of video content. Demonstrating enhanced
+performance across diverse video benchmarks, our multimodal RLAIF approach,
+VLM-RLAIF, outperforms existing approaches, including the SFT model. We
+commit to open-sourcing our code, models, and datasets to foster further
+research in this area.
+
+
+
+ comment: Technical report +
+
+
+
+
+ + ☆ AoSRNet: All-in-One Scene Recovery Networks via Multi-knowledge + Integration + + +
+ Scattering and attenuation of light in non-homogeneous imaging media,
+together with inconsistent light intensity, cause insufficient contrast and
+color distortion in the collected images, which limits developments such as
+vision-driven smart cities, autonomous vehicles, and intelligent robots. In
+this paper, we propose an all-in-one scene recovery network via
+multi-knowledge integration (termed AoSRNet) to improve the visibility of
+imaging devices in typical low-visibility imaging scenes (e.g., haze, sand
+dust, and low light). It combines gamma correction (GC) and optimized linear
+stretching (OLS) to create the detail enhancement module (DEM) and color
+restoration module (CRM). Additionally, we propose a multi-receptive field
+extraction module (MEM) to attenuate the loss of image texture details caused
+by the nonlinear GC and linear OLS transformations. Finally, we refine the
+coarse features generated by DEM, CRM, and MEM through an Encoder-Decoder to
+generate the final restored image. Comprehensive experimental results
+demonstrate the effectiveness and stability of AoSRNet compared to other
+state-of-the-art methods. The source code is available at
+https://github.com/LouisYuxuLu/AoSRNet.
+
+
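+ Minimal sketches of the two classical operations named above, gamma
+correction and a percentile-based linear stretch (the latter is a simple
+stand-in for the paper's optimized linear stretching, not its exact
+formulation):
+
+ import numpy as np
+
+ def gamma_correct(img, gamma=0.7):
+     # img in [0, 1]; gamma < 1 brightens dark regions nonlinearly
+     return np.clip(img, 0.0, 1.0) ** gamma
+
+ def linear_stretch(img, low_pct=1.0, high_pct=99.0):
+     # map the [low, high] percentile range linearly onto [0, 1]
+     lo, hi = np.percentile(img, [low_pct, high_pct])
+     return np.clip((img - lo) / max(hi - lo, 1e-6), 0.0, 1.0)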
+
+
+
+
+ + ☆ Rig3DGS: Creating Controllable Portraits from Casual Monocular Videos + + +
+ Creating controllable 3D human portraits from casual smartphone videos is +highly desirable due to their immense value in AR/VR applications. The recent +development of 3D Gaussian Splatting (3DGS) has shown improvements in rendering +quality and training efficiency. However, it still remains a challenge to +accurately model and disentangle head movements and facial expressions from a +single-view capture to achieve high-quality renderings. In this paper, we +introduce Rig3DGS to address this challenge. We represent the entire scene, +including the dynamic subject, using a set of 3D Gaussians in a canonical +space. Using a set of control signals, such as head pose and expressions, we +transform them to the 3D space with learned deformations to generate the +desired rendering. Our key innovation is a carefully designed deformation +method which is guided by a learnable prior derived from a 3D morphable model. +This approach is highly efficient in training and effective in controlling +facial expressions, head positions, and view synthesis across various captures. +We demonstrate the effectiveness of our learned deformation through extensive +quantitative and qualitative experiments. The project page can be found at +http://shahrukhathar.github.io/2024/02/05/Rig3DGS.html + +
+
+
+
+
+ + ☆ Attention-based Shape and Gait Representations Learning for Video-based + Cloth-Changing Person Re-Identification + + +
+ Current state-of-the-art Video-based Person Re-Identification (Re-ID)
+primarily relies on appearance features extracted by deep learning models.
+These methods are not applicable for long-term analysis in real-world
+scenarios where persons have changed clothes, making appearance information
+unreliable. In this work, we deal with the practical problem of Video-based
+Cloth-Changing Person Re-ID (VCCRe-ID) by proposing "Attention-based Shape
+and Gait Representations Learning" (ASGL). Our ASGL framework improves Re-ID
+performance under clothing variations by learning clothing-invariant gait
+cues using a Spatial-Temporal Graph Attention Network (ST-GAT). Given the
+3D-skeleton-based spatial-temporal graph, our proposed ST-GAT comprises
+multi-head attention modules, which are able to enhance the robustness of
+gait embeddings under viewpoint changes and occlusions. The ST-GAT amplifies
+the important motion ranges and reduces the influence of noisy poses. The
+multi-head learning module then effectively preserves beneficial local
+temporal dynamics of movement. We also boost the discriminative power of
+person representations by learning body shape cues using a GAT. Experiments
+on two large-scale VCCRe-ID datasets demonstrate that our proposed framework
+outperforms state-of-the-art methods by 12.2% in rank-1 accuracy and 7.0% in
+mAP.
+
+
+
+
+
+
+ + ☆ SISP: A Benchmark Dataset for Fine-grained Ship Instance Segmentation in + Panchromatic Satellite Images + + +
+ Fine-grained ship instance segmentation in satellite images holds +considerable significance for monitoring maritime activities at sea. However, +existing datasets often suffer from the scarcity of fine-grained information or +pixel-wise localization annotations, as well as the insufficient image +diversity and variations, thus limiting the research of this task. To this end, +we propose a benchmark dataset for fine-grained Ship Instance Segmentation in +Panchromatic satellite images, namely SISP, which contains 56,693 +well-annotated ship instances with four fine-grained categories across 10,000 +sliced images, and all the images are collected from SuperView-1 satellite with +the resolution of 0.5m. Targets in the proposed SISP dataset have +characteristics that are consistent with real satellite scenes, such as high +class imbalance, various scenes, large variations in target densities and +scales, and high inter-class similarity and intra-class diversity, all of which +make the SISP dataset more suitable for real-world applications. In addition, +we introduce a Dynamic Feature Refinement-assist Instance segmentation network, +namely DFRInst, as the benchmark method for ship instance segmentation in +satellite images, which can fortify the explicit representation of crucial +features, thus improving the performance of ship instance segmentation. +Experiments and analysis are performed on the proposed SISP dataset to evaluate +the benchmark method and several state-of-the-art methods to establish +baselines for facilitating future research. The proposed dataset and source +codes will be available at: https://github.com/Justlovesmile/SISP. + +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ MMAUD: A Comprehensive Multi-Modal Anti-UAV Dataset for Modern Miniature + Drone Threats ICRA 2024 + + +
+ In response to the evolving challenges posed by small unmanned aerial
+vehicles (UAVs), which possess the potential to transport harmful payloads or
+independently cause damage, we introduce MMAUD: a comprehensive Multi-Modal
+Anti-UAV Dataset. MMAUD addresses a critical gap in contemporary threat
+detection methodologies by focusing on drone detection, UAV-type
+classification, and trajectory estimation. MMAUD stands out by combining
+diverse sensory inputs, including stereo vision, various Lidars, Radars, and
+audio arrays. It offers a unique overhead aerial detection perspective that is
+vital for addressing real-world scenarios with higher fidelity than datasets
+captured from fixed vantage points using thermal and RGB sensors.
+Additionally, MMAUD provides accurate Leica-generated ground truth data,
+enhancing credibility and enabling confident refinement of algorithms and
+models, which has not been seen in other datasets. Most existing works do not
+disclose their datasets, making MMAUD an invaluable resource for developing
+accurate and efficient solutions. Our proposed modalities are cost-effective
+and highly adaptable, allowing users to experiment with and implement new UAV
+threat detection tools. Our dataset closely simulates real-world scenarios by
+incorporating ambient heavy machinery sounds. This approach enhances the
+dataset's applicability, capturing the exact challenges faced during
+proximate vehicular operations. We expect MMAUD to play a pivotal role in
+advancing UAV threat detection, classification, trajectory estimation
+capabilities, and beyond. Our dataset, codes, and designs will be available at
+https://github.com/ntu-aris/MMAUD.
+
+
+
+ comment: Accepted by ICRA 2024 +
+
+
+
+
+ + ☆ FoolSDEdit: Deceptively Steering Your Edits Towards Targeted + Attribute-aware Distribution + + +
+ Guided image synthesis methods, like SDEdit based on the diffusion model,
+excel at creating realistic images from user inputs such as stroke paintings.
+However, existing efforts mainly focus on image quality, often overlooking a
+key point: the diffusion model represents a data distribution, not individual
+images. This introduces a low but critical chance of generating images that
+contradict user intentions, raising ethical concerns. For example, a user
+inputting a stroke painting with female characteristics might, with some
+probability, get male faces from SDEdit. To expose this potential
+vulnerability, we aim to build an adversarial attack forcing SDEdit to
+generate a specific data distribution aligned with a specified attribute
+(e.g., female), without changing the input's attribute characteristics. We
+propose the Targeted Attribute Generative Attack (TAGA), using an
+attribute-aware objective function and optimizing the adversarial noise added
+to the input stroke painting. Empirical studies reveal that traditional
+adversarial noise struggles with TAGA, while natural perturbations like
+exposure and motion blur easily alter generated images' attributes. To
+execute effective attacks, we introduce FoolSDEdit: we design a joint
+adversarial exposure and blur attack, adding exposure and motion blur to the
+stroke painting and optimizing them together. We optimize the execution
+strategy of the various perturbations, framing it as a network architecture
+search problem, and create SuperPert, a graph representing diverse execution
+strategies for the different perturbations. After training, we obtain the
+optimized execution strategy for an effective TAGA against SDEdit.
+Comprehensive experiments on two datasets show that our method compels
+SDEdit to generate a targeted attribute-aware data distribution,
+significantly outperforming baselines.
+
+
+
+
+
+
+ + ☆ Automatic Robotic Development through Collaborative Framework by Large + Language Models + + +
+ Despite the remarkable code generation abilities of large language models
+(LLMs), they still face challenges in complex task handling. Robot
+development, a highly intricate field, inherently demands human involvement
+in task allocation and collaborative teamwork. To enhance robot development,
+we propose an innovative automated collaboration framework inspired by
+real-world robot developers. This framework employs multiple LLMs in distinct
+roles: analysts, programmers, and testers. Analysts delve deep into user
+requirements, enabling programmers to produce precise code, while testers
+fine-tune the parameters based on user feedback for practical robot
+application. Each LLM tackles diverse, critical tasks within the development
+process. Clear collaboration rules emulate real-world teamwork among the
+LLMs: analysts, programmers, and testers form a cohesive team overseeing
+strategy, code, and parameter adjustments. Through this framework, we achieve
+complex robot development without requiring specialized knowledge, relying
+solely on the participation of non-experts.
+
+
+
+
+
+
+ + ☆ SHMC-Net: A Mask-guided Feature Fusion Network for Sperm Head Morphology + Classification + + +
+ Male infertility accounts for about one-third of global infertility cases. +Manual assessment of sperm abnormalities through head morphology analysis +encounters issues of observer variability and diagnostic discrepancies among +experts. Its alternative, Computer-Assisted Semen Analysis (CASA), suffers from +low-quality sperm images, small datasets, and noisy class labels. We propose a +new approach for sperm head morphology classification, called SHMC-Net, which +uses segmentation masks of sperm heads to guide the morphology classification +of sperm images. SHMC-Net generates reliable segmentation masks using image +priors, refines object boundaries with an efficient graph-based method, and +trains an image network with sperm head crops and a mask network with the +corresponding masks. In the intermediate stages of the networks, image and mask +features are fused with a fusion scheme to better learn morphological features. +To handle noisy class labels and regularize training on small datasets, +SHMC-Net applies Soft Mixup to combine mixup augmentation and a loss function. +We achieve state-of-the-art results on SCIAN and HuSHeM datasets, outperforming +methods that use additional pre-training or costly ensembling techniques. + +
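+ A minimal sketch of a mixup step with soft targets, in the spirit of the
+Soft Mixup mentioned above (hypothetical function name; the paper's exact
+loss combination is not reproduced):
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+
+ def soft_mixup_step(model, x, y, num_classes, alpha=0.4):
+     lam = float(np.random.beta(alpha, alpha))
+     perm = torch.randperm(x.size(0))
+     x_mix = lam * x + (1 - lam) * x[perm]            # blended images
+     y_soft = F.one_hot(y, num_classes).float()
+     y_mix = lam * y_soft + (1 - lam) * y_soft[perm]  # blended soft labels
+     return F.cross_entropy(model(x_mix), y_mix)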
+
+ comment: A shorter version is published on ISBI 2024 +
+
+
+
+
+ + ☆ ConUNETR: A Conditional Transformer Network for 3D Micro-CT Embryonic + Cartilage Segmentation + + +
+ Studying the morphological development of cartilaginous and osseous +structures is critical to the early detection of life-threatening skeletal +dysmorphology. Embryonic cartilage undergoes rapid structural changes within +hours, introducing biological variations and morphological shifts that limit +the generalization of deep learning-based segmentation models that infer across +multiple embryonic age groups. Obtaining individual models for each age group +is expensive and less effective, while direct transfer (predicting an age +unseen during training) suffers a potential performance drop due to +morphological shifts. We propose a novel Transformer-based segmentation model +with improved biological priors that better distills morphologically diverse +information through conditional mechanisms. This enables a single model to +accurately predict cartilage across multiple age groups. Experiments on the +mice cartilage dataset show the superiority of our new model compared to other +competitive segmentation models. Additional studies on a separate mice +cartilage dataset with a distinct mutation show that our model generalizes well +and effectively captures age-based cartilage morphology patterns. + +
+
+ comment: Published in ISBI 2024 +
+
+
+
+
+ + ☆ 3Doodle: Compact Abstraction of Objects with 3D Strokes + + +
+ While free-hand sketching has long served as an efficient representation to
+convey characteristics of an object, sketches are often subjective, deviating
+significantly from realistic representations. Moreover, sketches are not
+consistent across arbitrary viewpoints, making it hard to capture 3D shapes.
+We propose 3Doodle, which generates descriptive and view-consistent sketch
+images given multi-view images of the target object. Our method is based on
+the idea that a set of 3D strokes can efficiently represent 3D structural
+information and render view-consistent 2D sketches. We express 2D sketches as
+a union of view-independent and view-dependent components. 3D cubic Bézier
+curves indicate view-independent 3D feature lines, while contours of
+superquadrics express a smooth outline of the volume across varying
+viewpoints. Our pipeline directly optimizes the parameters of the 3D stroke
+primitives to minimize perceptual losses in a fully differentiable manner.
+The resulting sparse set of 3D strokes can be rendered as abstract sketches
+containing essential 3D characteristic shapes of various objects. We
+demonstrate that 3Doodle can faithfully express concepts of the original
+images compared with recent sketch generation approaches.
+
+
+
+
+
+
+ + ☆ QuEST: Low-bit Diffusion Model Quantization via Efficient Selective + Finetuning + + +
+ Diffusion models have achieved remarkable success in image generation tasks,
+yet their practical deployment is restrained by high memory and time
+consumption. While quantization paves a way for diffusion model compression
+and acceleration, existing methods fail entirely when the models are
+quantized to low bit-widths. In this paper, we unravel three properties of
+quantized diffusion models that compromise the efficacy of current methods:
+imbalanced activation distributions, imprecise temporal information, and
+vulnerability to perturbations of specific modules. To alleviate the
+intensified low-bit quantization difficulty stemming from the distribution
+imbalance, we propose finetuning the quantized model to better adapt to the
+activation distribution. Building on this idea, we identify two critical
+types of quantized layers: those holding vital temporal information and those
+sensitive to reduced bit-width, and finetune them to mitigate performance
+degradation with efficiency. We empirically verify that our approach modifies
+the activation distribution and provides meaningful temporal information,
+facilitating easier and more accurate quantization. Our method is evaluated
+over three high-resolution image generation tasks and achieves
+state-of-the-art performance under various bit-width settings, as well as
+being the first method to generate readable images on fully 4-bit (i.e.,
+W4A4) Stable Diffusion.
+
+
+
+
+
+
+ + ☆ Reviewing FID and SID Metrics on Generative Adversarial Networks + + +
+ The growth of generative adversarial network (GAN) models has increased the
+capability of image processing and provides numerous industries with the
+technology to produce realistic image transformations. However, as the field
+has only recently been established, new evaluation metrics can further this
+research. Previous research has shown the Fréchet Inception Distance (FID) to
+be an effective metric when testing these image-to-image GANs in real-world
+applications. Signed Inception Distance (SID), a metric introduced in 2023,
+expands on FID by allowing signed distances. This paper uses public datasets
+consisting of façades, cityscapes, and maps within Pix2Pix and CycleGAN
+models. After training, these models are evaluated on both inception distance
+metrics, which measure the generating performance of the trained models. Our
+findings indicate that the SID metric is an efficient and effective
+complement to, and can even exceed the capability of, the FID for
+image-to-image GANs.
+
+
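+ For reference, a minimal sketch of how FID is computed from Gaussian
+statistics of Inception features (standard formula; mu/sigma are the feature
+mean and covariance of each image set):
+
+ import numpy as np
+ from scipy import linalg
+
+ def frechet_inception_distance(mu1, sigma1, mu2, sigma2):
+     # FID = ||mu1 - mu2||^2 + Tr(S1 + S2 - 2 (S1 S2)^(1/2))
+     covmean = linalg.sqrtm(sigma1 @ sigma2)
+     if np.iscomplexobj(covmean):
+         covmean = covmean.real  # drop tiny imaginary numerical noise
+     diff = mu1 - mu2
+     return float(diff @ diff + np.trace(sigma1 + sigma2 - 2.0 * covmean))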
+
+ comment: 14 pages 9 figures 1 table Included in IOTBS, NLTM, AIMLA, DBDM - + 2024 Conference Proceedings Editor: David C. Wyld et al +
+
+
+
+
+ + ☆ BEAM: Beta Distribution Ray Denoising for Multi-view 3D Object Detection + + +
+ Multi-view 3D object detectors struggle with duplicate predictions due to the +lack of depth information, resulting in false positive detections. In this +study, we introduce BEAM, a novel Beta Distribution Ray Denoising approach that +can be applied to any DETR-style multi-view 3D detector to explicitly +incorporate structure prior knowledge of the scene. By generating rays from +cameras to objects and sampling spatial denoising queries from the Beta +distribution family along these rays, BEAM enhances the model's ability to +distinguish spatial hard negative samples arising from ambiguous depths. BEAM +is a plug-and-play technique that adds only marginal computational costs during +training, while impressively preserving the inference speed. Extensive +experiments and ablation studies on the NuScenes dataset demonstrate +significant improvements over strong baselines, outperforming the +state-of-the-art method StreamPETR by 1.9% mAP. The code will be available at +https://github.com/LiewFeng/BEAM. + +
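+ A minimal sketch of the core sampling idea, under stated assumptions
+(hypothetical function name; depths along each camera-to-object ray are drawn
+from a Beta distribution so that denoising queries concentrate near the
+object):
+
+ import torch
+
+ def beam_ray_queries(cam_centers, obj_centers, n_samples=8, a=2.0, b=2.0):
+     # cam_centers, obj_centers: (n, 3) matched camera/object positions
+     t = torch.distributions.Beta(a, b).sample(
+         (cam_centers.size(0), n_samples, 1))      # depths in (0, 1)
+     t = 2.0 * t                                   # allow points beyond the object
+     rays = (obj_centers - cam_centers).unsqueeze(1)
+     return cam_centers.unsqueeze(1) + t * rays    # (n, n_samples, 3) queries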
+
+
+
+
+ + ☆ CAT-SAM: Conditional Tuning Network for Few-Shot Adaptation of + Segmentation Anything Model + + +
+ The recent Segment Anything Model (SAM) has demonstrated remarkable
+zero-shot capability and flexible geometric prompting in general image
+segmentation. However, SAM often struggles when handling various
+unconventional images, such as aerial, medical, and non-RGB images. This
+paper presents CAT-SAM, a ConditionAl Tuning network that adapts SAM toward
+various unconventional target tasks with just a few target samples. CAT-SAM
+freezes the entire SAM and adapts its mask decoder and image encoder
+simultaneously with a small number of learnable parameters. The core design
+is a prompt bridge structure that enables decoder-conditioned joint tuning of
+the heavyweight image encoder and the lightweight mask decoder. The bridging
+maps the prompt token of the mask decoder to the image encoder, fostering
+synergic adaptation of the encoder and the decoder with mutual benefits. We
+develop two representative tuning strategies for the image encoder, which
+lead to two CAT-SAM variants: one injecting learnable prompt tokens in the
+input space and the other inserting lightweight adapter networks. Extensive
+experiments over 11 unconventional tasks show that both CAT-SAM variants
+achieve superior target segmentation performance consistently, even under the
+very challenging one-shot adaptation setup. Project page:
+https://xiaoaoran.github.io/projects/CAT-SAM
+
+
+
+ comment: Project page: https://xiaoaoran.github.io/projects/CAT-SAM +
+
+
+
+
+ + ☆ Improving Contextual Congruence Across Modalities for Effective + Multimodal Marketing using Knowledge-infused Learning + + +
+ The prevalence of smart devices with the ability to capture moments in
+multiple modalities has enabled users to experience multimodal information
+online. However, Large Language Models (LLMs) and Large Vision Models (LVMs)
+are still limited in capturing holistic meaning with cross-modal semantic
+relationships. Without explicit commonsense knowledge (e.g., a knowledge
+graph), Visual Language Models (VLMs) only learn implicit representations by
+capturing high-level patterns in vast corpora, missing essential contextual
+cross-modal cues. In this work, we design a framework to couple explicit
+commonsense knowledge in the form of knowledge graphs with large VLMs to
+improve the performance of a downstream task, predicting the effectiveness of
+multi-modal marketing campaigns. While the marketing application provides a
+compelling metric for assessing our methods, our approach enables the early
+detection of likely persuasive multi-modal campaigns and the assessment and
+augmentation of marketing theory.
+
+
+
+
+
+
+ + ☆ GRASP: GRAph-Structured Pyramidal Whole Slide Image Representation + + +
+ Cancer subtyping is one of the most challenging tasks in digital pathology,
+where Multiple Instance Learning (MIL) by processing gigapixel whole slide
+images (WSIs) has been in the spotlight of recent research. However, MIL
+approaches do not take advantage of inter- and intra-magnification
+information contained in WSIs. In this work, we present GRASP, a novel
+graph-structured multi-magnification framework for processing WSIs in digital
+pathology. Our approach is designed to dynamically emulate the pathologist's
+behavior in handling WSIs and benefits from the hierarchical structure of
+WSIs. GRASP, which introduces a convergence-based node aggregation instead of
+traditional pooling mechanisms, outperforms state-of-the-art methods over two
+distinct cancer datasets by a margin of up to 10% balanced accuracy, while
+being 7 times smaller than the closest-performing state-of-the-art model in
+terms of the number of parameters. Our results show that GRASP is dynamic in
+finding and consulting with different magnifications for subtyping cancers
+and is reliable and stable across different hyperparameters. The model's
+behavior has been evaluated by two expert pathologists, confirming the
+interpretability of the model's dynamics. We also provide a theoretical
+foundation, along with empirical evidence, for our work, explaining how GRASP
+interacts with different magnifications and nodes in the graph to make
+predictions. We believe that the strong characteristics yet simple structure
+of GRASP will encourage the development of interpretable, structure-based
+designs for WSI representation in digital pathology. Furthermore, we publish
+two large graph datasets of rare Ovarian and Bladder cancers to contribute to
+the field.
+
+
+
+ comment: Early version: To be updated +
+
+
+
+
+ + ☆ Dual-View Visual Contextualization for Web Navigation + + +
+ Automatic web navigation aims to build a web agent that can follow language +instructions to execute complex and diverse tasks on real-world websites. +Existing work primarily takes HTML documents as input, which define the +contents and action spaces (i.e., actionable elements and operations) of +webpages. Nevertheless, HTML documents may not provide a clear task-related +context for each element, making it hard to select the right (sequence of) +actions. In this paper, we propose to contextualize HTML elements through their +"dual views" in webpage screenshots: each HTML element has its corresponding +bounding box and visual content in the screenshot. We build upon the insight -- +web developers tend to arrange task-related elements nearby on webpages to +enhance user experiences -- and propose to contextualize each element with its +neighbor elements, using both textual and visual features. The resulting +representations of HTML elements are more informative for the agent to take +action. We validate our method on the recently released Mind2Web dataset, which +features diverse navigation domains and tasks on real-world websites. Our +method consistently outperforms the baseline in all the scenarios, including +cross-task, cross-website, and cross-domain ones. + +
+
+
+
+
+ + ☆ BAdaCost: Multi-class Boosting with Costs + + +
+ We present BAdaCost, a multi-class cost-sensitive classification algorithm.
+It combines a set of cost-sensitive multi-class weak learners to obtain a
+strong classification rule within the Boosting framework. To derive the
+algorithm we introduce CMEL, a Cost-sensitive Multi-class Exponential Loss
+that generalizes the losses optimized in various classification algorithms
+such as AdaBoost, SAMME, Cost-sensitive AdaBoost and PIBoost, hence unifying
+them under a common theoretical framework. In our experiments we show that
+BAdaCost achieves significant gains in performance when compared to previous
+multi-class cost-sensitive approaches. The advantages of the proposed
+algorithm in asymmetric multi-class classification are also evaluated in
+practical multi-view face and car detection problems.
+
+
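+ One plausible form of a cost-sensitive multi-class exponential loss, as a
+hedged sketch only (SAMME-style vector label coding weighted by the cost
+matrix; not necessarily the paper's exact CMEL definition):
+
+ import numpy as np
+
+ def cmel(f, y, C):
+     # f: (n, K) classifier scores; y: (n,) true labels;
+     # C: (K, K) cost matrix, C[i, j] = cost of predicting j when truth is i
+     n, K = f.shape
+     # SAMME vector code for class j: +1 at j, -1/(K-1) elsewhere
+     Y = np.full((K, K), -1.0 / (K - 1)) + np.eye(K) * (K / (K - 1))
+     margins = f @ Y.T / K                 # (n, K) margin toward each class
+     return float(np.mean(np.sum(C[y] * np.exp(margins), axis=1)))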
+
+
+
+
+ + ☆ Pushing the limits of cell segmentation models for imaging mass + cytometry + + +
+ Imaging mass cytometry (IMC) is a relatively new technique for imaging +biological tissue at subcellular resolution. In recent years, learning-based +segmentation methods have enabled precise quantification of cell type and +morphology, but typically rely on large datasets with fully annotated ground +truth (GT) labels. This paper explores the effects of imperfect labels on +learning-based segmentation models and evaluates the generalisability of these +models to different tissue types. Our results show that removing 50% of cell +annotations from GT masks only reduces the dice similarity coefficient (DSC) +score to 0.874 (from 0.889 achieved by a model trained on fully annotated GT +masks). This implies that annotation time can in fact be reduced by at least +half without detrimentally affecting performance. Furthermore, training our +single-tissue model on imperfect labels only decreases DSC by 0.031 on an +unseen tissue type compared to its multi-tissue counterpart, with negligible +qualitative differences in segmentation. Additionally, bootstrapping the +worst-performing model (with 5% of cell annotations) a total of ten times +improves its original DSC score of 0.720 to 0.829. These findings imply that +less time and work can be put into the process of producing comparable +segmentation models; this includes eliminating the need for multiple IMC tissue +types during training, whilst also providing the potential for models with very +few labels to improve on themselves. Source code is available on GitHub: +https://github.com/kimberley/ISBI2024. + +
+
+ comment: International Symposium on Biomedical Imaging (ISBI) 2024 Submission +
+
+
+
+
+ + ☆ Quantitative Metrics for Benchmarking Medical Image Harmonization + + +
+ Image harmonization is an important preprocessing strategy to address domain +shifts arising from data acquired using different machines and scanning +protocols in medical imaging. However, benchmarking the effectiveness of +harmonization techniques has been a challenge due to the lack of widely +available standardized datasets with ground truths. In this context, we propose +three metrics: two intensity harmonization metrics and one anatomy preservation +metric for medical images during harmonization, where no ground truths are +required. Through extensive studies on a dataset with available harmonization +ground truth, we demonstrate that our metrics are correlated with established +image quality assessment metrics. We show how these novel metrics may be +applied to real-world scenarios where no harmonization ground truth exists. +Additionally, we provide insights into different interpretations of the metric +values, shedding light on their significance in the context of the +harmonization process. As a result of our findings, we advocate for the +adoption of these quantitative harmonization metrics as a standard for +benchmarking the performance of image harmonization techniques. + +
+
+ comment: Accepted for presentation at the ISBI 2024 +
+
+
+
+
+ + ☆ A Data Centric Approach for Unsupervised Domain Generalization via + Retrieval from Web Scale Multimodal Data + + +
+ Domain generalization (DG) is an important problem that learns a model that +can generalize to unseen test domains leveraging one or more source domains, +under the assumption of shared label spaces. However, most DG methods assume +access to abundant source data in the target label space, a requirement that +proves overly stringent for numerous real-world applications, where acquiring +the same label space as the target task is prohibitively expensive. For this +setting, we tackle the multimodal version of the unsupervised domain +generalization (UDG) problem, which uses a large task-agnostic unlabeled source +dataset, such as LAION-2B during finetuning. Our framework does not explicitly +assume any relationship between the source dataset and target task. Instead, it +relies only on the premise that the source dataset can be efficiently searched +in a joint vision-language space. For this multimodal UDG setting, we propose a +novel method to build a small ($<$100K) subset of the source data in three +simple steps: (1) diversified retrieval using label names as queries, (2) rank +pseudo-labeling, and (3) clustering to find representative samples. To +demonstrate the value of studying the multimodal UDG problem, we compare our +results against state-of-the-art source-free DG and zero-shot (ZS) methods on +their respective benchmarks and show up to 10% improvement in accuracy on 20 +diverse target datasets. Additionally, our multi-stage dataset construction +method achieves 3% improvement on average over nearest neighbors retrieval. +Code is available: https://github.com/Chris210634/mudg + +
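+ A minimal sketch of the three-step subset construction, under stated
+assumptions (precomputed, L2-normalized CLIP-style embeddings; the simple
+stride subsample in step 3 stands in for the clustering step):
+
+ import numpy as np
+
+ def build_subset(text_embs, img_embs, k_retrieve=500, k_keep=50):
+     # text_embs: (n_labels, d), one row per target label name
+     # img_embs: (n_imgs, d) embeddings of the unlabeled source pool
+     sims = img_embs @ text_embs.T
+     subset = []
+     for c in range(text_embs.shape[0]):
+         # (1) diversified retrieval: images nearest to this label name
+         idx = np.argsort(-sims[:, c])[:k_retrieve]
+         # (2) rank pseudo-labeling: keep images whose top label is class c
+         idx = idx[sims[idx].argmax(axis=1) == c]
+         # (3) pick representative samples (stand-in for clustering)
+         subset.extend(idx[:: max(1, len(idx) // k_keep)].tolist())
+     return sorted(set(subset))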
+
+
+
+
+ + ☆ Detection Transformer for Teeth Detection, Segmentation, and Numbering + in Oral Rare Diseases: Focus on Data Augmentation and Inpainting Techniques + + +
+          In this work, we focused on deep learning image processing in the context of
+oral rare diseases, which pose challenges due to limited data availability. A
+crucial step involves teeth detection, segmentation and numbering in panoramic
+radiographs. To this end, we used a dataset consisting of 156 panoramic
+radiographs from individuals with rare oral diseases, labeled by experts. We
+trained the Detection Transformer (DETR) neural network for teeth detection,
+segmentation, and numbering across the 52 teeth classes. In addition, we used
+data augmentation techniques, including geometric transformations. Finally, we
+generated new panoramic images using inpainting techniques with stable
+diffusion, by removing teeth from a panoramic radiograph and integrating teeth
+into it. The results showed a mAP exceeding 0.69 for DETR without data
+augmentation. The mAP improved to 0.82 when data augmentation techniques were
+used. Furthermore, we observed promising performance when using new panoramic
+radiographs generated with the inpainting technique, with a mAP of 0.76.
+
+
+
+
+ + ☆ Breaking Data Silos: Cross-Domain Learning for Multi-Agent Perception + from Independent Private Sources ICRA + + +
+          The diverse agents in multi-agent perception systems may come from different
+companies, each of which might use an identical, classic neural-network-based
+encoder for feature extraction. However, the data used to train the various
+agents is independent and private to each company, leading to a Distribution
+Gap across the private data used to train distinct agents in the multi-agent
+perception system. The data silos created by this Distribution Gap can result
+in a significant performance decline in multi-agent perception. In this paper,
+we thoroughly examine the impact of the distribution gap on existing
+multi-agent perception systems. To break the data silos, we introduce the
+Feature Distribution-aware Aggregation (FDA) framework for cross-domain
+learning, which mitigates the above Distribution Gap in multi-agent perception.
+FDA comprises two key components: a Learnable Feature Compensation Module and a
+Distribution-aware Statistical Consistency Module, both aimed at enhancing
+intermediate features to minimize the distribution gap among multi-agent
+features. Intensive experiments on the public OPV2V and V2XSet datasets
+underscore FDA's effectiveness in point cloud-based 3D object detection,
+presenting it as an invaluable augmentation to existing multi-agent perception
+systems.
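+          A minimal sketch of a distribution-aware statistical-consistency term (our
+reading of the idea, not the released FDA code) penalises the per-channel
+mean/std gap between an ego agent's intermediate features and a collaborator's
+compensated features:
+
+import torch
+
+def stat_consistency_loss(ego_feat: torch.Tensor, other_feat: torch.Tensor) -> torch.Tensor:
+    # features: (B, C, H, W); statistics reduced over batch and spatial dims
+    dims = (0, 2, 3)
+    mean_gap = (ego_feat.mean(dim=dims) - other_feat.mean(dim=dims)).pow(2).mean()
+    std_gap = (ego_feat.std(dim=dims) - other_feat.std(dim=dims)).pow(2).mean()
+    return mean_gap + std_gap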
+
+ comment: Accepted by the 2024 IEEE International Conference on Robotics and + Automation (ICRA) +
+
+
+
+
+ + ☆ Bidirectional Autoregressive Diffusion Model for Dance Generation + + +
+          Dance serves as a powerful medium for expressing human emotions, but the
+lifelike generation of dance is still a considerable challenge. Recently,
+diffusion models have showcased remarkable generative abilities across various
+domains. They hold promise for human motion generation due to their adaptable
+many-to-many nature. Nonetheless, current diffusion-based motion generation
+models often create entire motion sequences directly and unidirectionally,
+without focusing on local, bidirectionally enhanced motion. When choreographing
+high-quality dance movements, people need to take into account not only the
+musical context but also the nearby music-aligned dance motions. To
+authentically capture human behavior, we propose a Bidirectional Autoregressive
+Diffusion Model (BADM) for music-to-dance generation, where a bidirectional
+encoder is built to enforce that the generated dance is harmonious in both the
+forward and backward directions. To make the generated dance motion smoother, a
+local information decoder is built for local motion enhancement. The proposed
+framework is able to generate new motions based on the input conditions and
+nearby motions, foreseeing individual motion slices iteratively and
+consolidating all predictions. To further refine the synchronicity between the
+generated dance and the beat, the beat information is incorporated as an input
+to generate better music-aligned dance movements. Experimental results
+demonstrate that the proposed model achieves state-of-the-art performance
+compared to existing unidirectional approaches on the prominent benchmark for
+music-to-dance generation.
+
+
+
+
+ + ☆ 3D printer-controlled syringe pumps for dual, active, regulable and + simultaneous dispensing of reagents. Manufacturing of immunochromatographic + test strips + + +
+          Lateral flow immunoassays (LFIA) are widely used worldwide for the detection
+of different analytes because they combine multiple advantages such as low
+production cost, simplicity, and portability, which allows biomarker detection
+without requiring infrastructure or highly trained personnel. Here we propose
+solutions for the laboratory-scale manufacturing of LFIA, particularly the
+controlled and active dispensing of the reagents in the form of the Test Lines
+(TL) and the Control Lines (CL). To accomplish this task, we adapted a 3D
+printer to also control Syringe Pumps (SP): the proposed adaptation is easy and
+free, and many laboratories already have a 3D printer in their infrastructure.
+In turn, the standard function of the 3D printer can be easily restored by
+disconnecting the SPs and reconnecting the extruder. Additionally, the unified
+control of the 3D printer enables dual, active, regulable and simultaneous
+dispensing, four features that are typically found only in certain high-cost
+commercial equipment. With the proposed setup, the challenge of simultaneously
+dispensing at least two lines (CL and TL) with SPs controlled by a 3D printer
+was addressed, including regulation of the width of dispensed lines within
+experimental limits. Also, the construction of an LFIA for the detection of
+leptospirosis is shown as a practical example of automated reagent dispensing.
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ☆ ConsistI2V: Enhancing Visual Consistency for Image-to-Video Generation + + +
+ Image-to-video (I2V) generation aims to use the initial frame (alongside a +text prompt) to create a video sequence. A grand challenge in I2V generation is +to maintain visual consistency throughout the video: existing methods often +struggle to preserve the integrity of the subject, background, and style from +the first frame, as well as ensure a fluid and logical progression within the +video narrative. To mitigate these issues, we propose ConsistI2V, a +diffusion-based method to enhance visual consistency for I2V generation. +Specifically, we introduce (1) spatiotemporal attention over the first frame to +maintain spatial and motion consistency, (2) noise initialization from the +low-frequency band of the first frame to enhance layout consistency. These two +approaches enable ConsistI2V to generate highly consistent videos. We also +extend the proposed approaches to show their potential to improve consistency +in auto-regressive long video generation and camera motion control. To verify +the effectiveness of our method, we propose I2V-Bench, a comprehensive +evaluation benchmark for I2V generation. Our automatic and human evaluation +results demonstrate the superiority of ConsistI2V over existing methods. + +
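+          The low-frequency noise initialisation can be sketched as below: keep the
+low-frequency FFT band of the first frame's latent and take the high-frequency
+band from fresh Gaussian noise. The cut-off radius and the hard mask are
+assumptions for illustration, not the paper's exact recipe:
+
+import torch
+
+def init_noise_from_first_frame(first_latent, shape, cutoff=0.25):
+    # first_latent: (C, H, W) latent of frame 1; shape: (T, C, H, W)
+    noise = torch.randn(shape)
+    fl = torch.fft.fftshift(torch.fft.fft2(first_latent), dim=(-2, -1))
+    fn = torch.fft.fftshift(torch.fft.fft2(noise), dim=(-2, -1))
+    _, H, W = first_latent.shape
+    yy, xx = torch.meshgrid(torch.linspace(-1, 1, H),
+                            torch.linspace(-1, 1, W), indexing="ij")
+    m = ((yy ** 2 + xx ** 2).sqrt() < cutoff).float()  # centred low-pass mask
+    mixed = fl.unsqueeze(0) * m + fn * (1.0 - m)       # low band from frame 1
+    return torch.fft.ifft2(torch.fft.ifftshift(mixed, dim=(-2, -1))).real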
+
+ comment: Project Page: https://tiger-ai-lab.github.io/ConsistI2V/ +
+
+
+
+
+ + ☆ Deep PCCT: Photon Counting Computed Tomography Deep Learning + Applications Review + + +
+          Medical imaging faces challenges such as limited spatial resolution,
+interference from electronic noise and poor contrast-to-noise ratios. Photon
+Counting Computed Tomography (PCCT) has emerged as a solution, addressing these
+issues with its innovative technology. This review delves into the recent
+developments and applications of PCCT in pre-clinical research, emphasizing its
+potential to overcome traditional imaging limitations. For example, PCCT has
+demonstrated remarkable efficacy in improving the detection of subtle
+abnormalities in breast tissue, providing a level of detail previously
+unattainable. Examining the current literature on PCCT, this review presents a
+comprehensive analysis of the technology, highlighting the main features of
+scanners and their varied applications. In addition, it explores the
+integration of deep learning into PCCT, along with the study of radiomic
+features, presenting successful applications in data processing. While
+acknowledging these advances, it also discusses the existing challenges in this
+field, paving the way for future research and improvements in medical imaging
+technologies. Despite the limited number of articles on this subject, owing to
+the recent introduction of PCCT at the clinical level, its potential benefits
+extend to various diagnostic applications.
+
+
+
+
+ + ☆ Road Surface Defect Detection -- From Image-based to Non-image-based: A + Survey + + +
+ Ensuring traffic safety is crucial, which necessitates the detection and +prevention of road surface defects. As a result, there has been a growing +interest in the literature on the subject, leading to the development of +various road surface defect detection methods. The methods for detecting road +defects can be categorised in various ways depending on the input data types or +training methodologies. The predominant approach involves image-based methods, +which analyse pixel intensities and surface textures to identify defects. +Despite their popularity, image-based methods share the distinct limitation of +vulnerability to weather and lighting changes. To address this issue, +researchers have explored the use of additional sensors, such as laser scanners +or LiDARs, providing explicit depth information to enable the detection of +defects in terms of scale and volume. However, the exploration of data beyond +images has not been sufficiently investigated. In this survey paper, we provide +a comprehensive review of road surface defect detection studies, categorising +them based on input data types and methodologies used. Additionally, we review +recently proposed non-image-based methods and discuss several challenges and +open problems associated with these techniques. + +
+
+ comment: Survey paper
+
+
+
+
+ + ♻ ☆ DirecT2V: Large Language Models are Frame-Level Directors for Zero-Shot + Text-to-Video Generation + + +
+ In the paradigm of AI-generated content (AIGC), there has been increasing +attention to transferring knowledge from pre-trained text-to-image (T2I) models +to text-to-video (T2V) generation. Despite their effectiveness, these +frameworks face challenges in maintaining consistent narratives and handling +shifts in scene composition or object placement from a single abstract user +prompt. Exploring the ability of large language models (LLMs) to generate +time-dependent, frame-by-frame prompts, this paper introduces a new framework, +dubbed DirecT2V. DirecT2V leverages instruction-tuned LLMs as directors, +enabling the inclusion of time-varying content and facilitating consistent +video generation. To maintain temporal consistency and prevent mapping the +value to a different object, we equip a diffusion model with a novel value +mapping method and dual-softmax filtering, which do not require any additional +training. The experimental results validate the effectiveness of our framework +in producing visually coherent and storyful videos from abstract user prompts, +successfully addressing the challenges of zero-shot video generation. + +
+
+ comment: The code and demo will be available at + https://github.com/KU-CVLAB/DirecT2V +
+
+
+
+
+ + ♻ ☆ We're Not Using Videos Effectively: An Updated Domain Adaptive Video + Segmentation Baseline + + +
+ There has been abundant work in unsupervised domain adaptation for semantic +segmentation (DAS) seeking to adapt a model trained on images from a labeled +source domain to an unlabeled target domain. While the vast majority of prior +work has studied this as a frame-level Image-DAS problem, a few Video-DAS works +have sought to additionally leverage the temporal signal present in adjacent +frames. However, Video-DAS works have historically studied a distinct set of +benchmarks from Image-DAS, with minimal cross-benchmarking. In this work, we +address this gap. Surprisingly, we find that (1) even after carefully +controlling for data and model architecture, state-of-the-art Image-DAS methods +(HRDA and HRDA+MIC) outperform Video-DAS methods on established Video-DAS +benchmarks (+14.5 mIoU on Viper$\rightarrow$CityscapesSeq, +19.0 mIoU on +Synthia$\rightarrow$CityscapesSeq), and (2) naive combinations of Image-DAS and +Video-DAS techniques only lead to marginal improvements across datasets. To +avoid siloed progress between Image-DAS and Video-DAS, we open-source our +codebase with support for a comprehensive set of Video-DAS and Image-DAS +methods on a common benchmark. Code available at +https://github.com/SimarKareer/UnifiedVideoDA + +
+
+ comment: TMLR 2024 +
+
+
+
+
+ + ♻ ☆ SMERF: Streamable Memory Efficient Radiance Fields for Real-Time + Large-Scene Exploration + + +
+ Recent techniques for real-time view synthesis have rapidly advanced in +fidelity and speed, and modern methods are capable of rendering +near-photorealistic scenes at interactive frame rates. At the same time, a +tension has arisen between explicit scene representations amenable to +rasterization and neural fields built on ray marching, with state-of-the-art +instances of the latter surpassing the former in quality while being +prohibitively expensive for real-time applications. In this work, we introduce +SMERF, a view synthesis approach that achieves state-of-the-art accuracy among +real-time methods on large scenes with footprints up to 300 m$^2$ at a +volumetric resolution of 3.5 mm$^3$. Our method is built upon two primary +contributions: a hierarchical model partitioning scheme, which increases model +capacity while constraining compute and memory consumption, and a distillation +training strategy that simultaneously yields high fidelity and internal +consistency. Our approach enables full six degrees of freedom (6DOF) navigation +within a web browser and renders in real-time on commodity smartphones and +laptops. Extensive experiments show that our method exceeds the current +state-of-the-art in real-time novel view synthesis by 0.78 dB on standard +benchmarks and 1.78 dB on large scenes, renders frames three orders of +magnitude faster than state-of-the-art radiance field models, and achieves +real-time performance across a wide variety of commodity devices, including +smartphones. We encourage readers to explore these models interactively at our +project website: https://smerf-3d.github.io. + +
+
+ comment: Added appendix. Changed LaTeX template. Project website: + https://smerf-3d.github.io +
+
+
+
+
+ + ♻ ☆ Loci-Segmented: Improving Scene Segmentation Learning + + +
+          Current slot-oriented approaches for compositional scene segmentation from
+images and videos rely on provided background information or slot assignments.
+We present a segmented location and identity tracking system, Loci-Segmented
+(Loci-s), which requires neither. It learns to dynamically segment scenes into
+interpretable background and slot-based object encodings, separating RGB, mask,
+location, and depth information for each. The results reveal largely superior
+video decomposition performance on the MOVi datasets and on another established
+dataset collection targeting scene segmentation. The system's
+well-interpretable, compositional latent encodings may serve as a foundation
+model for downstream tasks.
+
+
+
+
+ + ♻ ☆ CC-SGG: Corner Case Scenario Generation using Learned Scene Graphs + + +
+ Corner case scenarios are an essential tool for testing and validating the +safety of autonomous vehicles (AVs). As these scenarios are often +insufficiently present in naturalistic driving datasets, augmenting the data +with synthetic corner cases greatly enhances the safe operation of AVs in +unique situations. However, the generation of synthetic, yet realistic, corner +cases poses a significant challenge. In this work, we introduce a novel +approach based on Heterogeneous Graph Neural Networks (HGNNs) to transform +regular driving scenarios into corner cases. To achieve this, we first generate +concise representations of regular driving scenes as scene graphs, minimally +manipulating their structure and properties. Our model then learns to perturb +those graphs to generate corner cases using attention and triple embeddings. +The input and perturbed graphs are then imported back into the simulation to +generate corner case scenarios. Our model successfully learned to produce +corner cases from input scene graphs, achieving 89.9% prediction accuracy on +our testing dataset. We further validate the generated scenarios on baseline +autonomous driving methods, demonstrating our model's ability to effectively +create critical situations for the baselines. + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ♻ ☆ ViT-DD: Multi-Task Vision Transformer for Semi-Supervised Driver + Distraction Detection + + +
+ Ensuring traffic safety and mitigating accidents in modern driving is of +paramount importance, and computer vision technologies have the potential to +significantly contribute to this goal. This paper presents a multi-modal Vision +Transformer for Driver Distraction Detection (termed ViT-DD), which +incorporates inductive information from training signals related to both +distraction detection and driver emotion recognition. Additionally, a +self-learning algorithm is developed, allowing for the seamless integration of +driver data without emotion labels into the multi-task training process of +ViT-DD. Experimental results reveal that the proposed ViT-DD surpasses existing +state-of-the-art methods for driver distraction detection by 6.5% and 0.9% on +the SFDDD and AUCDD datasets, respectively. + +
+
+ comment: 7 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ MI-SegNet: Mutual Information-Based US Segmentation for Unseen Domain + Generalization + + +
+          The generalization capabilities of learning-based medical image segmentation
+across domains are currently limited by the performance degradation caused by
+domain shift, particularly for ultrasound (US) imaging. The quality of US
+images heavily relies on carefully tuned acoustic parameters, which vary across
+sonographers, machines, and settings. To improve generalizability on US images
+across domains, we propose MI-SegNet, a novel mutual information (MI) based
+framework to explicitly disentangle the anatomical and domain feature
+representations; therefore, robust domain-independent segmentation can be
+expected. Two encoders are employed to extract the relevant features for the
+disentanglement. The segmentation only uses the anatomical feature map for its
+prediction. In order to force the encoders to learn meaningful feature
+representations, a cross-reconstruction method is used during training.
+Transformations specific to either the domain or the anatomy are applied to
+guide the encoders in their respective feature extraction tasks. Additionally,
+any MI present in both feature maps is penalized to further promote separate
+feature spaces. We validate the generalizability of the proposed
+domain-independent segmentation approach on several datasets with varying
+parameters and machines. Furthermore, we demonstrate the effectiveness of the
+proposed MI-SegNet serving as a pre-trained model by comparing it with
+state-of-the-art networks.
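+          As a rough illustration of the disentanglement penalty (MI-SegNet uses a
+proper mutual-information estimate; the cheap proxy below only captures linear
+dependence and is our simplification), one can penalise the cross-correlation
+between pooled anatomy and domain features:
+
+import torch
+
+def decorrelation_penalty(anatomy_feat: torch.Tensor, domain_feat: torch.Tensor) -> torch.Tensor:
+    a = anatomy_feat.flatten(1)                  # (B, Da)
+    d = domain_feat.flatten(1)                   # (B, Dd)
+    a = (a - a.mean(0)) / (a.std(0) + 1e-6)      # standardise per dimension
+    d = (d - d.mean(0)) / (d.std(0) + 1e-6)
+    cross_corr = a.T @ d / a.shape[0]            # (Da, Dd)
+    return cross_corr.pow(2).mean()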
+
+
+
+
+ + ♻ ☆ Beyond Hallucinations: Enhancing LVLMs through Hallucination-Aware + Direct Preference Optimization + + +
+          Multimodal large language models have made significant advancements in recent
+years, yet they still suffer from a common issue known as the "hallucination
+problem", in which the models generate textual descriptions that inaccurately
+depict or entirely fabricate content from associated images. This paper
+introduces a novel solution, Hallucination-Aware Direct Preference Optimization
+(HA-DPO), which reframes the hallucination problem as a preference selection
+task. The model is trained to favor the non-hallucinating response when
+presented with two responses of the same image (one accurate and one
+hallucinatory). Furthermore, this paper proposes an efficient pipeline for
+constructing positive (non-hallucinatory) and negative (hallucinatory) sample
+pairs, ensuring a high-quality, style-consistent dataset for robust preference
+learning. When applied to three mainstream multimodal models, HA-DPO
+significantly reduced hallucination issues and amplified the models'
+generalization capabilities. Notably, the MiniGPT-4 model, when enhanced with
+HA-DPO, demonstrated a substantial improvement: POPE accuracy rose from 51.13%
+to 86.13% (an absolute improvement of 35%), and the MME score surged from
+932.00 to 1326.46 (a relative improvement of 42.32%). The codes, models, and
+datasets are made accessible at https://opendatalab.github.io/HA-DPO.
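+          HA-DPO builds on the standard DPO objective, applied to an (accurate,
+hallucinatory) response pair for the same image. A sketch of that objective,
+with token log-probabilities summed per response and an illustrative `beta`:
+
+import torch.nn.functional as F
+
+def dpo_loss(logp_pos, logp_neg, ref_logp_pos, ref_logp_neg, beta=0.1):
+    # preference margin of the policy over a frozen reference model
+    margin = beta * ((logp_pos - ref_logp_pos) - (logp_neg - ref_logp_neg))
+    return -F.logsigmoid(margin).mean()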
+
+ comment: Project Website: https://opendatalab.github.io/HA-DPO, Code: + https://github.com/opendatalab/HA-DPO +
+
+
+
+
+ + ♻ ☆ AnomalyCLIP: Object-agnostic Prompt Learning for Zero-shot Anomaly + Detection + + +
+          Zero-shot anomaly detection (ZSAD) requires detection models trained using
+auxiliary data to detect anomalies without any training sample in a target
+dataset. It is a crucial task when training data is not accessible due to
+various concerns, e.g., data privacy, yet it is challenging since the models
+need to generalize to anomalies across different domains where the appearance
+of foreground objects, abnormal regions, and background features, such as
+defects/tumors on different products/organs, can vary significantly. Recently,
+large pre-trained vision-language models (VLMs), such as CLIP, have
+demonstrated strong zero-shot recognition ability in various vision tasks,
+including anomaly detection. However, their ZSAD performance is weak since the
+VLMs focus more on modeling the class semantics of the foreground objects
+rather than the abnormality/normality in the images. In this paper, we
+introduce a novel approach, namely AnomalyCLIP, to adapt CLIP for accurate ZSAD
+across different domains. The key insight of AnomalyCLIP is to learn
+object-agnostic text prompts that capture generic normality and abnormality in
+an image regardless of its foreground objects. This allows our model to focus
+on the abnormal image regions rather than the object semantics, enabling
+generalized normality and abnormality recognition on diverse types of objects.
+Large-scale experiments on 17 real-world anomaly detection datasets show that
+AnomalyCLIP achieves superior zero-shot performance in detecting and segmenting
+anomalies in datasets of highly diverse class semantics from various defect
+inspection and medical imaging domains. Code will be made available at
+https://github.com/zqhang/AnomalyCLIP.
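+          How object-agnostic prompts can yield an anomaly map is sketched below: score
+each patch embedding against a learned "normality" and "abnormality" text
+embedding and take the softmax-normalised abnormal probability. Prompt learning
+itself and the CLIP specifics are omitted; `tau` is an assumed temperature:
+
+import torch
+
+def anomaly_map(patch_emb, t_normal, t_abnormal, tau=0.07):
+    # patch_emb: (H*W, d); text embeddings: (d,); all L2-normalised
+    logits = torch.stack([patch_emb @ t_normal, patch_emb @ t_abnormal], dim=-1) / tau
+    return logits.softmax(dim=-1)[..., 1]        # (H*W,) abnormality scores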
+
+
+
+
+ + ♻ ☆ GaMeS: Mesh-Based Adapting and Modification of Gaussian Splatting + + +
+          In recent years, a range of neural network-based methods for image rendering
+have been introduced. For instance, widely-researched neural radiance fields
+(NeRF) rely on a neural network to represent 3D scenes, allowing for realistic
+view synthesis from a small number of 2D images. However, most NeRF models are
+constrained by long training and inference times. In comparison, Gaussian
+Splatting (GS) is a novel, state-of-the-art technique for rendering points in a
+3D scene by approximating their contribution to image pixels through Gaussian
+distributions, enabling fast training and swift, real-time rendering. A
+drawback of GS is the absence of a well-defined approach for its conditioning,
+due to the necessity of conditioning several hundred thousand Gaussian
+components. To solve this, we introduce the Gaussian Mesh Splatting (GaMeS)
+model, a hybrid of a mesh and Gaussian distributions, which pins all Gaussian
+splats to the object surface (mesh). The unique contribution of our method is
+defining Gaussian splats solely based on their location on the mesh, allowing
+for automatic adjustments in position, scale, and rotation during animation. As
+a result, we obtain high-quality views rendered in real time. Furthermore, we
+demonstrate that in the absence of a predefined mesh, it is possible to
+fine-tune the initial mesh during the learning process.
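+          The core parameterisation, as we read it, can be sketched as follows: each
+splat stores a face index and barycentric coordinates, so its centre is
+recomputed from the (possibly animated) mesh vertices; scales and rotations
+derived from the face geometry are omitted here:
+
+import torch
+
+def splat_centers(vertices, faces, face_idx, barycentric):
+    # vertices: (V, 3); faces: (F, 3) long; face_idx: (N,) long;
+    # barycentric: (N, 3) with rows summing to 1
+    tri = vertices[faces[face_idx]]                       # (N, 3, 3) triangle corners
+    return (barycentric.unsqueeze(-1) * tri).sum(dim=1)   # (N, 3) splat means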
+
+
+
+
+ + ♻ ☆ Deep Spectral Improvement for Unsupervised Image Instance Segmentation + + +
+          Deep spectral methods reframe the image decomposition process as a graph
+partitioning task by extracting features using self-supervised learning and
+utilizing the Laplacian of the affinity matrix to obtain eigensegments.
+However, instance segmentation has received less attention compared to other
+tasks within the context of deep spectral methods. This paper addresses the
+fact that not all channels of the feature map extracted from a self-supervised
+backbone contain sufficient information for instance segmentation purposes. In
+fact, some channels are noisy and hinder the accuracy of the task. To overcome
+this issue, this paper proposes two channel reduction modules: Noise Channel
+Reduction (NCR) and Deviation-based Channel Reduction (DCR). NCR retains
+channels with lower entropy, as they are less likely to be noisy, while DCR
+prunes channels with low standard deviation, as they lack sufficient
+information for effective instance segmentation. Furthermore, the paper
+demonstrates that the dot product, commonly used in deep spectral methods, is
+not suitable for instance segmentation due to its sensitivity to feature map
+values, potentially leading to incorrect instance segments. A new similarity
+metric called Bray-Curtis over Chebyshev (BoC) is proposed to address this
+issue. It takes into account the distribution of features in addition to their
+values, providing a more robust similarity measure for instance segmentation.
+Quantitative and qualitative results on the Youtube-VIS2019 dataset highlight
+the improvements achieved by the proposed channel reduction methods and the use
+of BoC instead of the conventional dot product for creating the affinity
+matrix. These improvements are observed in terms of mean Intersection over
+Union and extracted instance segments, demonstrating enhanced instance
+segmentation performance. The code is available at:
+https://github.com/farnooshar/SpecUnIIS
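+          Under our reading, the two channel-reduction rules and the BoC metric can be
+sketched as below; the histogram-based entropy, the keep ratios, and the exact
+combination of Bray-Curtis and Chebyshev are illustrative assumptions:
+
+import numpy as np
+
+def ncr_dcr(feat, keep_entropy=0.75, keep_std=0.75):
+    # feat: (C, H, W) feature map from a self-supervised backbone
+    C = feat.shape[0]
+    ent = np.empty(C)
+    for c in range(C):
+        hist, _ = np.histogram(feat[c], bins=64)
+        p = hist[hist > 0] / hist.sum()
+        ent[c] = -(p * np.log(p)).sum()
+    keep = np.argsort(ent)[: int(keep_entropy * C)]              # NCR: low entropy
+    stds = feat[keep].std(axis=(1, 2))
+    keep = keep[np.argsort(-stds)[: int(keep_std * len(keep))]]  # DCR: high std
+    return feat[keep]
+
+def boc_similarity(x, y, eps=1e-8):
+    # "Bray-Curtis over Chebyshev" as we interpret the name; assumes
+    # non-negative feature vectors (e.g. post-ReLU)
+    bray_curtis = np.abs(x - y).sum() / (np.abs(x + y).sum() + eps)
+    chebyshev = np.abs(x - y).max()
+    return (1.0 - bray_curtis) / (1.0 + chebyshev)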
+
+ comment: 11 pages, 13 figures and 5 tables +
+
+
+
+
+ + ♻ ☆ Local Conditional Controlling for Text-to-Image Diffusion Models + + +
+          Diffusion models have exhibited impressive prowess in the text-to-image task.
+Recent methods add image-level controls, e.g., edge and depth maps, to
+manipulate the generation process together with text prompts to obtain desired
+images. This controlling process is globally operated on the entire image,
+which limits the flexibility of control regions. In this paper, we introduce a
+new, simple yet practical task setting: local control. It focuses on
+controlling specific local areas according to user-defined image conditions,
+while the remaining areas are conditioned only on the original text prompt.
+This allows users to flexibly control the image generation in a fine-grained
+way. However, it is non-trivial to achieve this goal. The naive manner of
+directly adding local conditions may lead to the local control dominance
+problem. To mitigate this problem, we propose a training-free method that
+leverages the updates of noised latents and parameters in the cross-attention
+map during the denoising process to promote concept generation in non-control
+areas. Moreover, we use feature mask constraints to mitigate the degradation of
+synthesized image quality caused by information differences inside and outside
+the local control area. Extensive experiments demonstrate that our method can
+synthesize high-quality images to the prompt under local control conditions.
+Code is available at https://github.com/YibooZhao/Local-Control.
+
+
+
+
+ + ♻ ☆ Self-supervised visual learning for analyzing firearms trafficking + activities on the Web + + +
+          Automated visual firearms classification from RGB images is an important
+real-world task with applications in public space security, intelligence
+gathering and law enforcement investigations. When applied to images massively
+crawled from the World Wide Web (including social media and dark Web sites), it
+can serve as an important component of systems that attempt to identify
+criminal firearms trafficking networks, by analyzing Big Data from open-source
+intelligence. Deep Neural Networks (DNN) are the state-of-the-art methodology
+for achieving this, with Convolutional Neural Networks (CNN) being typically
+employed. The common transfer learning approach consists of pretraining on a
+large-scale, generic annotated dataset for whole-image classification, such as
+ImageNet-1k, and then finetuning the DNN on a smaller, annotated,
+task-specific, downstream dataset for visual firearms classification. Neither
+Visual Transformer (ViT) neural architectures nor Self-Supervised Learning
+(SSL) approaches have so far been evaluated on this critical task.
+
+
+
+
+ + ♻ ☆ ORCHNet: A Robust Global Feature Aggregation approach for 3D LiDAR-based + Place recognition in Orchards + + +
+          Robust and reliable place recognition and loop closure detection in
+agricultural environments is still an open problem. In particular, orchards are
+a difficult case study due to structural similarity across the entire field. In
+this work, we address the place recognition problem in orchards resorting to 3D
+LiDAR data, which is considered a key modality for robustness. Hence, we
+propose ORCHNet, a deep-learning-based approach that maps 3D-LiDAR scans to
+global descriptors. Specifically, this work proposes a new global feature
+aggregation approach, which fuses multiple aggregation methods into a robust
+global descriptor. ORCHNet is evaluated on real-world data collected in
+orchards, comprising data from the summer and autumn seasons. To assess the
+robustness, we compare ORCHNet with state-of-the-art aggregation approaches on
+data from the same season and across seasons. Moreover, we additionally
+evaluate the proposed approach as part of a localization framework, where
+ORCHNet is used as a loop closure detector. The empirical results indicate
+that, on the place recognition task, ORCHNet outperforms the remaining
+approaches, and is also more robust across seasons. As for the localization,
+the edge cases where the path goes through the trees are solved when
+integrating ORCHNet as a loop detector, showing the potential applicability of
+the proposed approach in this task. The code will be publicly available at:
+https://github.com/Cybonic/ORCHNet.git
+
+ comment: This is a Technical Report +
+
+
+
+
+ + ♻ ☆ GPT-4V as Traffic Assistant: An In-depth Look at Vision Language Model + on Complex Traffic Events + + +
+ The recognition and understanding of traffic incidents, particularly traffic +accidents, is a topic of paramount importance in the realm of intelligent +transportation systems and intelligent vehicles. This area has continually +captured the extensive focus of both the academic and industrial sectors. +Identifying and comprehending complex traffic events is highly challenging, +primarily due to the intricate nature of traffic environments, diverse +observational perspectives, and the multifaceted causes of accidents. These +factors have persistently impeded the development of effective solutions. The +advent of large vision-language models (VLMs) such as GPT-4V, has introduced +innovative approaches to addressing this issue. In this paper, we explore the +ability of GPT-4V with a set of representative traffic incident videos and +delve into the model's capacity of understanding these complex traffic +situations. We observe that GPT-4V demonstrates remarkable cognitive, +reasoning, and decision-making ability in certain classic traffic events. +Concurrently, we also identify certain limitations of GPT-4V, which constrain +its understanding in more intricate scenarios. These limitations merit further +exploration and resolution. + +
+
+
+
+
+ + ♻ ☆ LocPoseNet: Robust Location Prior for Unseen Object Pose Estimation 3DV2024 + + +
+ Object location prior is critical for the standard 6D object pose estimation +setting. The prior can be used to initialize the 3D object translation and +facilitate 3D object rotation estimation. Unfortunately, the object detectors +that are used for this purpose do not generalize to unseen objects. Therefore, +existing 6D pose estimation methods for unseen objects either assume the +ground-truth object location to be known or yield inaccurate results when it is +unavailable. In this paper, we address this problem by developing a method, +LocPoseNet, able to robustly learn location prior for unseen objects. Our +method builds upon a template matching strategy, where we propose to distribute +the reference kernels and convolve them with a query to efficiently compute +multi-scale correlations. We then introduce a novel translation estimator, +which decouples scale-aware and scale-robust features to predict different +object location parameters. Our method outperforms existing works by a large +margin on LINEMOD and GenMOP. We further construct a challenging synthetic +dataset, which allows us to highlight the better robustness of our method to +various noise sources. Our project website is at: +https://sailor-z.github.io/projects/3DV2024_LocPoseNet.html. + +
+
+ comment: Accepted by 3DV2024 +
+
+
+
+
+ + ♻ ☆ Smooth, exact rotational symmetrization for deep learning on point + clouds + + +
+ Point clouds are versatile representations of 3D objects and have found +widespread application in science and engineering. Many successful +deep-learning models have been proposed that use them as input. The domain of +chemical and materials modeling is especially challenging because exact +compliance with physical constraints is highly desirable for a model to be +usable in practice. These constraints include smoothness and invariance with +respect to translations, rotations, and permutations of identical atoms. If +these requirements are not rigorously fulfilled, atomistic simulations might +lead to absurd outcomes even if the model has excellent accuracy. Consequently, +dedicated architectures, which achieve invariance by restricting their design +space, have been developed. General-purpose point-cloud models are more varied +but often disregard rotational symmetry. We propose a general symmetrization +method that adds rotational equivariance to any given model while preserving +all the other requirements. Our approach simplifies the development of better +atomic-scale machine-learning schemes by relaxing the constraints on the design +space and making it possible to incorporate ideas that proved effective in +other domains. We demonstrate this idea by introducing the Point Edge +Transformer (PET) architecture, which is not intrinsically equivariant but +achieves state-of-the-art performance on several benchmark datasets of +molecules and solids. A-posteriori application of our general protocol makes +PET exactly equivariant, with minimal changes to its accuracy. + +
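+          The paper's protocol achieves exact equivariance; as a simpler illustration
+of the same idea, the approximate variant below averages an invariant-target
+model's predictions over randomly sampled rotations of the input point cloud:
+
+import torch
+
+def random_rotation() -> torch.Tensor:
+    q = torch.randn(4)
+    w, x, y, z = (q / q.norm()).tolist()         # uniform unit quaternion
+    return torch.tensor([
+        [1 - 2*(y*y + z*z), 2*(x*y - w*z),     2*(x*z + w*y)],
+        [2*(x*y + w*z),     1 - 2*(x*x + z*z), 2*(y*z - w*x)],
+        [2*(x*z - w*y),     2*(y*z + w*x),     1 - 2*(x*x + y*y)],
+    ])
+
+def symmetrized(model, points, n_rot=32):
+    # points: (N, 3); model returns a rotation-invariant target (e.g. energy)
+    preds = [model(points @ random_rotation().T) for _ in range(n_rot)]
+    return torch.stack(preds).mean(0)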
+
+ comment: Enhancing figures; minor polishing +
+
+
+
+
+ + ♻ ☆ RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object + Detection Systems + + +
+          In autonomous driving, LiDAR and radar are crucial for environmental
+perception. LiDAR offers precise 3D spatial sensing information but struggles
+in adverse weather like fog. Conversely, radar signals can penetrate rain or
+mist due to their specific wavelength but are prone to noise disturbances.
+Recent state-of-the-art works reveal that the fusion of radar and LiDAR can
+lead to robust detection in adverse weather. The existing works adopt
+convolutional neural network architecture to extract features from each sensor
+data, then align and aggregate the two branch features to predict object
+detection results. However, these methods have low accuracy of predicted
+bounding boxes due to a simple design of label assignment and fusion
+strategies. In this paper, we propose a bird's-eye view fusion learning-based
+anchor box-free object detection system, which fuses the feature derived from
+the radar range-azimuth heatmap and the LiDAR point cloud to estimate possible
+objects. Different label assignment strategies have been designed to facilitate
+the consistency between the classification of foreground or background anchor
+points and the corresponding bounding box regressions. Furthermore, the
+performance of the proposed object detector is further enhanced by employing a
+novel interactive transformer module. The superior performance of the methods
+proposed in this paper has been demonstrated using the recently published
+Oxford Radar RobotCar dataset. Our system's average precision significantly
+outperforms the state-of-the-art method by 13.1% and 19.0% at an Intersection
+over Union (IoU) of 0.8 under 'Clear+Foggy' training conditions for 'Clear' and
+'Foggy' testing, respectively.
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Diffusion Models, Image Super-Resolution And Everything: A Survey + + +
+ Diffusion Models (DMs) have disrupted the image Super-Resolution (SR) field +and further closed the gap between image quality and human perceptual +preferences. They are easy to train and can produce very high-quality samples +that exceed the realism of those produced by previous generative methods. +Despite their promising results, they also come with new challenges that need +further research: high computational demands, comparability, lack of +explainability, color shifts, and more. Unfortunately, entry into this field is +overwhelming because of the abundance of publications. To address this, we +provide a unified recount of the theoretical foundations underlying DMs applied +to image SR and offer a detailed analysis that underscores the unique +characteristics and methodologies within this domain, distinct from broader +existing reviews in the field. This survey articulates a cohesive understanding +of DM principles and explores current research avenues, including alternative +input domains, conditioning techniques, guidance mechanisms, corruption spaces, +and zero-shot learning approaches. By offering a detailed examination of the +evolution and current trends in image SR through the lens of DMs, this survey +sheds light on the existing challenges and charts potential future directions, +aiming to inspire further innovation in this rapidly advancing area. + +
+
+
+
+
+ + ♻ ☆ One-Shot Action Recognition via Multi-Scale Spatial-Temporal Skeleton + Matching + + +
+ One-shot skeleton action recognition, which aims to learn a skeleton action +recognition model with a single training sample, has attracted increasing +interest due to the challenge of collecting and annotating large-scale skeleton +action data. However, most existing studies match skeleton sequences by +comparing their feature vectors directly which neglects spatial structures and +temporal orders of skeleton data. This paper presents a novel one-shot skeleton +action recognition technique that handles skeleton action recognition via +multi-scale spatial-temporal feature matching. We represent skeleton data at +multiple spatial and temporal scales and achieve optimal feature matching from +two perspectives. The first is multi-scale matching which captures the +scale-wise semantic relevance of skeleton data at multiple spatial and temporal +scales simultaneously. The second is cross-scale matching which handles +different motion magnitudes and speeds by capturing sample-wise relevance +across multiple scales. Extensive experiments over three large-scale datasets +(NTU RGB+D, NTU RGB+D 120, and PKU-MMD) show that our method achieves superior +one-shot skeleton action recognition, and it outperforms the state-of-the-art +consistently by large margins. + +
+
+ comment: 8 pages, 4 figures, 6 tables. Accepted by IEEE Transactions on + Pattern Analysis and Machine Intelligence +
+
+
+
+
+ + ♻ ☆ SmoothVideo: Smooth Video Synthesis with Noise Constraints on Diffusion + Models for One-shot Video Tuning + + +
+          Recent one-shot video tuning methods, which fine-tune the network on a
+specific video based on pre-trained text-to-image models (e.g., Stable
+Diffusion), are popular in the community because of the flexibility. However,
+these methods often produce videos marred by incoherence and inconsistency. To
+address these limitations, this paper introduces a simple yet effective noise
+constraint across video frames. This constraint aims to regulate noise
+predictions across their temporal neighbors, resulting in smooth latents. It
+can be simply included as a loss term during the training phase. By applying
+the loss to existing one-shot video tuning methods, we significantly improve
+the overall consistency and smoothness of the generated videos. Furthermore, we
+argue that current video evaluation metrics inadequately capture smoothness. To
+address this, we introduce a novel metric that considers detailed features and
+their temporal dynamics. Experimental results validate the effectiveness of our
+approach in producing smoother videos on various one-shot video tuning
+baselines. The source codes and video demos are available at
+https://github.com/SPengLiang/SmoothVideo.
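+          The noise constraint itself is compact enough to state directly: encourage
+the diffusion model's noise predictions for temporally adjacent frames to
+agree. A sketch, with `lam` as an assumed weighting hyperparameter:
+
+import torch
+
+def smoothness_loss(eps_pred: torch.Tensor, lam: float = 1.0) -> torch.Tensor:
+    # eps_pred: (B, T, C, H, W) predicted noise per frame
+    return lam * (eps_pred[:, 1:] - eps_pred[:, :-1]).pow(2).mean()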
+
+
+
+
+ + ♻ ☆ Bootstrapping Audio-Visual Segmentation by Strengthening Audio Cues + + +
+ How to effectively interact audio with vision has garnered considerable +interest within the multi-modality research field. Recently, a novel +audio-visual segmentation (AVS) task has been proposed, aiming to segment the +sounding objects in video frames under the guidance of audio cues. However, +most existing AVS methods are hindered by a modality imbalance where the visual +features tend to dominate those of the audio modality, due to a unidirectional +and insufficient integration of audio cues. This imbalance skews the feature +representation towards the visual aspect, impeding the learning of joint +audio-visual representations and potentially causing segmentation inaccuracies. +To address this issue, we propose AVSAC. Our approach features a Bidirectional +Audio-Visual Decoder (BAVD) with integrated bidirectional bridges, enhancing +audio cues and fostering continuous interplay between audio and visual +modalities. This bidirectional interaction narrows the modality imbalance, +facilitating more effective learning of integrated audio-visual +representations. Additionally, we present a strategy for audio-visual +frame-wise synchrony as fine-grained guidance of BAVD. This strategy enhances +the share of auditory components in visual features, contributing to a more +balanced audio-visual representation learning. Extensive experiments show that +our method attains new benchmarks in AVS performance. + +
+
+
+
+
+ + ♻ ☆ Weakly Supervised 3D Object Detection with Multi-Stage Generalization + + +
+          With the rapid development of large models, the need for data has become
+increasingly crucial. Especially in 3D object detection, costly manual
+annotations have hindered further advancements. To reduce the burden of
+annotation, we study the problem of achieving 3D object detection solely based
+on 2D annotations. Thanks to advanced 3D reconstruction techniques, it is now
+feasible to reconstruct the overall static 3D scene. However, extracting
+precise object-level annotations from the entire scene and generalizing these
+limited annotations to the entire scene remain challenges. In this paper, we
+introduce a novel paradigm called BA$^2$-Det, encompassing pseudo label
+generation and multi-stage generalization. We devise the DoubleClustering
+algorithm to obtain object clusters from reconstructed scene-level points, and
+further enhance the model's detection capabilities by developing three stages
+of generalization: progressing from complete to partial, static to dynamic, and
+close to distant. Experiments conducted on the large-scale Waymo Open Dataset
+show that the performance of BA$^2$-Det is on par with fully-supervised methods
+using 10% of the annotations. Additionally, using large raw videos for
+pretraining, BA$^2$-Det can achieve a 20% relative improvement on the KITTI
+dataset. The method also has great potential for detecting open-set 3D objects
+in complex scenes. Project page: https://ba2det.site.
+
+ comment: Project page: https://ba2det.site +
+
+
+
+
+ + ♻ ☆ Semantic2Graph: Graph-based Multi-modal Feature Fusion for Action + Segmentation in Videos + + +
+          Video action segmentation has been widely applied in many fields. Most
+previous studies employed video-based vision models for this purpose. However,
+they often rely on a large receptive field, LSTMs, or Transformers to capture
+long-term dependencies within videos, leading to significant computational
+resource requirements. To address this challenge, graph-based models were
+proposed. However, previous graph-based models are less accurate. Hence, this
+study introduces a graph-structured approach named Semantic2Graph, to model
+long-term dependencies in videos, thereby reducing computational costs and
+raising accuracy. We construct a graph structure of video at the frame level.
+Temporal edges are utilized to model the temporal relations and action order
+within videos. Additionally, we have designed positive and negative semantic
+edges, accompanied by corresponding edge weights, to capture both long-term and
+short-term semantic relationships in video actions. Node attributes encompass a
+rich set of multi-modal features extracted from video content, graph
+structures, and label text, encompassing visual, structural, and semantic cues.
+To synthesize this multi-modal information effectively, we employ a graph
+neural network (GNN) model to fuse multi-modal features for node action label
+classification. Experimental results demonstrate that Semantic2Graph
+outperforms state-of-the-art methods in terms of performance, particularly on
+benchmark datasets such as GTEA and 50Salads. Multiple ablation experiments
+further validate the effectiveness of semantic features in enhancing model
+performance. Notably, the inclusion of semantic edges in Semantic2Graph allows
+for the cost-effective capture of long-term dependencies, affirming its utility
+in addressing the challenges posed by computational resource constraints in
+video-based vision models.
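+          A toy sketch of the frame-level graph construction: temporal edges link
+consecutive frames, while positive/negative semantic edges link frame pairs
+whose feature similarity lies above/below assumed thresholds. The thresholds
+and edge weights below are placeholders, not the paper's learned values:
+
+import torch
+
+def build_graph(frame_feat, pos_th=0.8, neg_th=0.2):
+    T = frame_feat.shape[0]
+    f = torch.nn.functional.normalize(frame_feat, dim=1)
+    sim = f @ f.T
+    edges, weights = [], []
+    for i in range(T - 1):                       # temporal edges
+        edges.append((i, i + 1))
+        weights.append(1.0)
+    idx = torch.triu_indices(T, T, offset=2)
+    for i, j in zip(*idx):                       # semantic edges
+        s = sim[i, j].item()
+        if s > pos_th:
+            edges.append((i.item(), j.item())); weights.append(s)
+        elif s < neg_th:
+            edges.append((i.item(), j.item())); weights.append(-abs(s))
+    return torch.tensor(edges).T, torch.tensor(weights)  # (2, E), (E,)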
+
+ comment: 13 pages, 3 figures, 9 tables. Published on Applied Intelligence +
+
+
+
+
+ + ♻ ☆ Memory-Assisted Sub-Prototype Mining for Universal Domain Adaptation ICLR + + +
+ Universal domain adaptation aims to align the classes and reduce the feature +gap between the same category of the source and target domains. The target +private category is set as the unknown class during the adaptation process, as +it is not included in the source domain. However, most existing methods +overlook the intra-class structure within a category, especially in cases where +there exists significant concept shift between the samples belonging to the +same category. When samples with large concept shift are forced to be pushed +together, it may negatively affect the adaptation performance. Moreover, from +the interpretability aspect, it is unreasonable to align visual features with +significant differences, such as fighter jets and civil aircraft, into the same +category. Unfortunately, due to such semantic ambiguity and annotation cost, +categories are not always classified in detail, making it difficult for the +model to perform precise adaptation. To address these issues, we propose a +novel Memory-Assisted Sub-Prototype Mining (MemSPM) method that can learn the +differences between samples belonging to the same category and mine sub-classes +when there exists significant concept shift between them. By doing so, our +model learns a more reasonable feature space that enhances the transferability +and reflects the inherent differences among samples annotated as the same +category. We evaluate the effectiveness of our MemSPM method over multiple +scenarios, including UniDA, OSDA, and PDA. Our method achieves state-of-the-art +performance on four benchmarks in most cases. + +
+
+ comment: Accepted by The International Conference on Learning Representations + (ICLR) 2024 +
+
+
+
+
+ + ♻ ☆ Learnable Graph Matching: A Practical Paradigm for Data Association + + +
+          Data association is at the core of many computer vision tasks, e.g., multiple
+object tracking, image matching, and point cloud registration. However, current
+data association solutions have some defects: they mostly ignore intra-view
+context information; besides, they either train deep association models in an
+end-to-end way and hardly utilize the advantage of optimization-based
+assignment methods, or only use an off-the-shelf neural network to extract
+features. In this paper, we propose a general learnable graph matching method
+to address these issues. Specifically, we model the intra-view relationships as
+an undirected graph. Then data association turns into a general graph matching
+problem between graphs. Furthermore, to make the optimization end-to-end
+differentiable, we relax the original graph matching problem into continuous
+quadratic programming and then incorporate training into a deep graph neural
+network with KKT conditions and the implicit function theorem. On the MOT task,
+our method achieves state-of-the-art performance on several MOT datasets. For
+image matching, our method outperforms state-of-the-art methods on a popular
+indoor dataset, ScanNet. For point cloud registration, we also achieve
+competitive results. Code will be available at
+https://github.com/jiaweihe1996/GMTracker.
+
+ comment: Accepted by TPAMI 2024. arXiv admin note: substantial text overlap + with arXiv:2103.16178 +
+
+
+
+
+ + ♻ ☆ AGILE: Approach-based Grasp Inference Learned from Element Decomposition + + +
+          Humans, experts in grasp detection, take hand-object positioning information
+into account when grasping objects. This work proposes a method to enable a
+robot manipulator to learn the same skill: grasping objects optimally according
+to how the gripper has approached the object. Built on deep learning, the
+proposed method consists of two main stages. In order to generalize the network
+to unseen objects, the proposed Approach-based Grasping Inference involves an
+element decomposition stage to split an object into its main parts, each with
+one or more annotated grasps for a particular approach of the gripper.
+Subsequently, a grasp detection network utilizes the elements decomposed by
+Mask R-CNN and the information on the approach of the gripper in order to
+detect the element the gripper has approached and the most optimal grasp. In
+order to train the networks, the study introduces a robotic grasping dataset
+collected in the CoppeliaSim simulation environment. The dataset involves 10
+different objects with annotated element decomposition masks and grasp
+rectangles. The proposed method achieves a 90% grasp success rate on seen
+objects and 78% on unseen objects in the CoppeliaSim simulation environment.
+Lastly, simulation-to-reality domain adaptation is performed by applying
+transformations to the training set collected in simulation and augmenting the
+dataset, which results in a 70% physical grasp success rate using a Delta
+parallel robot and a 2-fingered gripper.
+
+ comment: Conference Paper, ICROM 2023, 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Generative Modeling through the Semi-dual Formulation of Unbalanced + Optimal Transport + + +
+          The Optimal Transport (OT) problem investigates a transport map that bridges
+two distributions while minimizing a given cost function. In this regard, OT
+between a tractable prior distribution and data has been utilized for
+generative modeling tasks. However, OT-based methods are susceptible to
+outliers and face optimization challenges during training. In this paper, we
+propose a novel generative model based on the semi-dual formulation of
+Unbalanced Optimal Transport (UOT). Unlike OT, UOT relaxes the hard constraint
+on distribution matching. This approach provides better robustness against
+outliers, stability during training, and faster convergence. We validate these
+properties empirically through experiments. Moreover, we study the theoretical
+upper bound of divergence between distributions in UOT. Our model outperforms
+existing OT-based generative models, achieving FID scores of 2.97 on CIFAR-10
+and 6.36 on CelebA-HQ-256. The code is available at
+https://github.com/Jae-Moo/UOTM.
+
+ comment: 23 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Linear Alignment of Vision-language Models for Image Captioning + + +
+ Recently, vision-language models like CLIP have advanced the state of the art +in a variety of multi-modal tasks including image captioning and caption +evaluation. Many approaches adapt CLIP-style models to a downstream task by +training a mapping network between CLIP and a language model. This is costly as +it usually involves calculating gradients for large models. We propose a more +efficient training protocol that fits a linear mapping between image and text +embeddings of CLIP via a closed-form solution. This bypasses the need for +gradient computation and results in a lightweight captioning method called +ReCap, which can be trained up to 1000 times faster than existing lightweight +methods. Moreover, we propose two new learning-based image-captioning metrics +that build on CLIP score along with our linear mapping. Furthermore, we combine +ReCap with our new metrics to design an iterative datastore-augmentation loop +(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k, +VizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art +lightweight methods on established metrics while outperforming them on our new +metrics, which are better aligned with human ratings on Flickr8k-Expert and +Flickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to +other domains and that our DAL leads to a performance boost. + +
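+          The closed-form alignment admits a very small sketch: fit a ridge-regression
+map W from CLIP image embeddings to text embeddings over paired data,
+W = (X^T X + lam*I)^{-1} X^T Y. Whether ReCap uses exactly this regulariser is
+an assumption on our part:
+
+import numpy as np
+
+def fit_linear_map(img_emb, txt_emb, lam=1e-3):
+    # img_emb: (N, d_i); txt_emb: (N, d_t) for paired image-caption data
+    d = img_emb.shape[1]
+    W = np.linalg.solve(img_emb.T @ img_emb + lam * np.eye(d), img_emb.T @ txt_emb)
+    return W    # map a new image embedding via img_emb_new @ W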
+
+ comment: 8 pages (+ references and appendix) +
+
+
+
+
+ + ♻ ☆ Exploiting Low-level Representations for Ultra-Fast Road Segmentation + + +
+          Achieving real-time performance and accuracy on embedded platforms has always
+been the pursuit of road segmentation methods. To this end, many lightweight
+networks have been proposed. However, they ignore the fact that roads are
+"stuff" (background or environmental elements) rather than "things" (specific
+identifiable objects), which inspires us to explore the feasibility of
+representing roads with low-level instead of high-level features. Surprisingly,
+we find that the primary stage of mainstream network models is sufficient to
+represent most pixels of the road for segmentation. Motivated by this, we
+propose a Low-level Feature Dominated Road Segmentation network (LFD-RoadSeg).
+Specifically, LFD-RoadSeg employs a bilateral structure. The spatial detail
+branch is first designed to extract a low-level feature representation for the
+road using the first stage of ResNet-18. To suppress texture-less regions
+mistaken for road in the low-level features, the context semantic branch is
+then designed to extract context features in a fast manner. To this end, in the
+second branch, we asymmetrically downsample the input image and design an
+aggregation module to achieve receptive fields comparable to the third stage of
+ResNet-18 but with less time consumption. Finally, to segment the road from the
+low-level features, a selective fusion module is proposed to calculate
+pixel-wise attention between the low-level representation and the context
+features, and to suppress non-road low-level responses using this attention. On
+KITTI-Road, LFD-RoadSeg achieves a maximum F1-measure (MaxF) of 95.21% and an
+average precision of 93.71%, while reaching 238 FPS on a single TITAN Xp and 54
+FPS on a Jetson TX2, all with a compact model size of just 936k parameters. The
+source code is available at https://github.com/zhouhuan-hust/LFD-RoadSeg.
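+          The selective fusion step can be sketched as a small gating module (our
+simplified reading, not the released code): pixel-wise attention computed from
+the concatenated branches re-weights the low-level representation against the
+context features:
+
+import torch
+import torch.nn as nn
+
+class SelectiveFusion(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.att = nn.Sequential(nn.Conv2d(2 * channels, channels, 1), nn.Sigmoid())
+
+    def forward(self, low, ctx):
+        # low, ctx: (B, C, H, W); attention suppresses non-road low-level response
+        a = self.att(torch.cat([low, ctx], dim=1))
+        return a * low + (1.0 - a) * ctx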
+
+ comment: 11 pages, 7 figures, IEEE TITS +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Continual Learning: Theory, Method and + Application + + +
+ To cope with real-world dynamics, an intelligent system needs to +incrementally acquire, update, accumulate, and exploit knowledge throughout its +lifetime. This ability, known as continual learning, provides a foundation for +AI systems to develop themselves adaptively. In a general sense, continual +learning is explicitly limited by catastrophic forgetting, where learning a new +task usually results in a dramatic performance degradation of the old tasks. +Beyond this, increasingly numerous advances have emerged in recent years that +largely extend the understanding and application of continual learning. The +growing and widespread interest in this direction demonstrates its realistic +significance as well as complexity. In this work, we present a comprehensive +survey of continual learning, seeking to bridge the basic settings, theoretical +foundations, representative methods, and practical applications. Based on +existing theoretical and empirical results, we summarize the general objectives +of continual learning as ensuring a proper stability-plasticity trade-off and +an adequate intra/inter-task generalizability in the context of resource +efficiency. Then we provide a state-of-the-art and elaborated taxonomy, +extensively analyzing how representative methods address continual learning, +and how they are adapted to particular challenges in realistic applications. +Through an in-depth discussion of promising directions, we believe that such a +holistic perspective can greatly facilitate subsequent exploration in this +field and beyond. + +
+
+ comment: The concise version is in IEEE Transactions on Pattern Analysis and + Machine Intelligence (TPAMI) +
+
+
+
+
+ + ♻ ☆ Matrix Information Theory for Self-Supervised Learning + + +
+ The maximum entropy encoding framework provides a unified perspective for
+many non-contrastive learning methods like SimSiam, Barlow Twins, and MEC.
+Inspired by this framework, we introduce Matrix-SSL, a novel approach that
+leverages matrix information theory to interpret the maximum entropy encoding
+loss as a matrix uniformity loss. Furthermore, Matrix-SSL enhances the maximum
+entropy encoding method by seamlessly incorporating a matrix alignment loss,
+directly aligning covariance matrices in different branches. Experimental
+results reveal that Matrix-SSL outperforms state-of-the-art methods on the
+ImageNet dataset under linear evaluation settings and on MS-COCO for transfer
+learning tasks. Specifically, on MS-COCO transfer learning tasks, our method
+outperforms previous SOTA methods such as MoCo v2 and BYOL by up to 3.3% with
+only 400 epochs of pre-training, compared to their 800 epochs. We also explore
+introducing representation learning into the language modeling regime,
+achieving 72.3% on the GSM8K dataset by fine-tuning a 7B model using a matrix
+cross-entropy loss, a margin of 3.1% over the standard cross-entropy loss.
+Code available at https://github.com/yifanzhang-pro/Matrix-SSL.
+
+
+
+
+
+ + ♻ ☆ Meta-Learning 3D Shape Segmentation Functions + + +
+ Learning robust 3D shape segmentation functions with deep neural networks
+has emerged as a powerful paradigm, offering promising performance in
+producing a consistent part segmentation of each 3D shape. Generalizing across
+3D shape segmentation functions requires robust learning of priors over the
+respective function space and enables consistent part segmentation of shapes
+in the presence of significant 3D structure variations. Existing
+generalization methods rely on extensive training of 3D shape segmentation
+functions on large-scale labeled datasets. In this paper, we propose to
+formalize the learning of a 3D shape segmentation function space as a
+meta-learning problem, aiming to predict a 3D segmentation model that can be
+quickly adapted to new shapes with no or limited training data. More
+specifically, we define each task as the unsupervised learning of a
+shape-conditioned 3D segmentation function which takes as input points in 3D
+space and predicts the part-segment labels. The 3D segmentation function is
+trained by a self-supervised 3D shape reconstruction loss without the need for
+part labels. We also introduce an auxiliary deep neural network as a
+meta-learner which takes as input a 3D shape and predicts the prior over the
+respective 3D segmentation function space. We show in experiments that our
+meta-learning approach, denoted as Meta-3DSeg, leads to improvements in
+unsupervised 3D shape segmentation over conventional designs of deep neural
+networks for 3D shape segmentation functions.
+
+
+
+
+
+ + ♻ ☆ Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D + Generation + + +
+ Text-to-3D generation has shown rapid progress recently with the advent of
+score distillation, a methodology that uses pretrained text-to-2D diffusion
+models to optimize a neural radiance field (NeRF) in the zero-shot setting.
+However, the lack of 3D awareness in 2D diffusion models destabilizes score
+distillation-based methods and prevents them from reconstructing a plausible
+3D scene. To address this issue, we propose 3DFuse, a novel framework that
+incorporates 3D awareness into pretrained 2D diffusion models, enhancing the
+robustness and 3D consistency of score distillation-based methods. We realize
+this by first constructing a coarse 3D structure for a given text prompt and
+then utilizing a projected, view-specific depth map as a condition for the
+diffusion model. Additionally, we introduce a training strategy that enables
+the 2D diffusion model to learn to handle the errors and sparsity within the
+coarse 3D structure for robust generation, as well as a method for ensuring
+semantic consistency throughout all viewpoints of the scene. Our framework
+surpasses the limitations of prior art and has significant implications for
+3D-consistent generation with 2D diffusion models.
+
+
+ comment: Project page https://ku-cvlab.github.io/3DFuse/ +
+
+
+
+
+ + ♻ ☆ Video-LaVIT: Unified Video-Language Pre-training with Decoupled + Visual-Motional Tokenization + + +
+ In light of recent advances in multimodal Large Language Models (LLMs), there +is increasing attention to scaling them from image-text data to more +informative real-world videos. Compared to static images, video poses unique +challenges for effective large-scale pre-training due to the modeling of its +spatiotemporal dynamics. In this paper, we address such limitations in +video-language pre-training with an efficient video decomposition that +represents each video as keyframes and temporal motions. These are then adapted +to an LLM using well-designed tokenizers that discretize visual and temporal +information as a few tokens, thus enabling unified generative pre-training of +videos, images, and text. At inference, the generated tokens from the LLM are +carefully recovered to the original continuous pixel space to create various +video content. Our proposed framework is both capable of comprehending and +generating image and video content, as demonstrated by its competitive +performance across 13 multimodal benchmarks in image and video understanding +and generation. Our code and models will be available at +https://video-lavit.github.io. + +
+
+
+
+
+ + ♻ ☆ Taming Uncertainty in Sparse-view Generalizable NeRF via Indirect + Diffusion Guidance + + +
+ Neural Radiance Fields (NeRF) have demonstrated effectiveness in synthesizing +novel views. However, their reliance on dense inputs and scene-specific +optimization has limited their broader applicability. Generalizable NeRFs +(Gen-NeRF), while intended to address this, often produce blurring artifacts in +unobserved regions with sparse inputs, which are full of uncertainty. In this +paper, we aim to diminish the uncertainty in Gen-NeRF for plausible renderings. +We assume that NeRF's inability to effectively mitigate this uncertainty stems +from its inherent lack of generative capacity. Therefore, we innovatively +propose an Indirect Diffusion-guided NeRF framework, termed ID-NeRF, to address +this uncertainty from a generative perspective by leveraging a distilled +diffusion prior as guidance. Specifically, to avoid model confusion caused by +directly regularizing with inconsistent samplings as in previous methods, our +approach introduces a strategy to indirectly inject the inherently missing +imagination into the learned implicit function through a diffusion-guided +latent space. Empirical evaluation across various benchmarks demonstrates the +superior performance of our approach in handling uncertainty with sparse +inputs. + +
+
+
+
+
+ + ♻ ☆ Out-of-Domain Robustness via Targeted Augmentations + + +
+ Models trained on one set of domains often suffer performance drops on
+unseen domains, e.g., when wildlife monitoring models are deployed in new
+camera locations. In this work, we study principles for designing data
+augmentations for out-of-domain (OOD) generalization. In particular, we focus
+on real-world scenarios in which some domain-dependent features are robust,
+i.e., some features that vary across domains are predictive OOD. For example,
+in the wildlife monitoring application above, image backgrounds vary across
+camera locations but indicate habitat type, which helps predict the species of
+photographed animals. Motivated by theoretical analysis in a linear setting,
+we propose targeted augmentations, which selectively randomize spurious
+domain-dependent features while preserving robust ones. We prove that targeted
+augmentations improve OOD performance, allowing models to generalize better
+with fewer domains. In contrast, existing approaches such as generic
+augmentations, which fail to randomize domain-dependent features, and
+domain-invariant augmentations, which randomize all domain-dependent features,
+both perform poorly OOD. In experiments on three real-world datasets, we show
+that targeted augmentations set a new state of the art for OOD performance,
+improving it by 3.2-15.2 percentage points.
+
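+ To make the distinction between augmentation types concrete, here is a toy
+sketch on synthetic tabular data, where the indices of the spurious
+domain-dependent features are assumed known; a targeted augmentation
+randomizes only those, leaving the robust, predictive features intact.
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def targeted_augment(X, spurious_idx, n_copies=4):
+    """Randomize only spurious domain-dependent features, keeping robust ones.
+
+    A generic augmentation would leave the spurious features untouched; a
+    domain-invariant augmentation would also randomize the robust,
+    predictive domain-dependent features (e.g., habitat-like cues).
+    """
+    out = []
+    for _ in range(n_copies):
+        Xa = X.copy()
+        # Permute spurious columns across examples, which destroys their
+        # correlation with the label while preserving their marginal stats.
+        for j in spurious_idx:
+            Xa[:, j] = rng.permutation(Xa[:, j])
+        out.append(Xa)
+    return np.concatenate(out, axis=0)
+
+X = rng.normal(size=(100, 5))
+X_aug = targeted_augment(X, spurious_idx=[3, 4])
+print(X_aug.shape)  # (400, 5): robust features preserved, spurious ones shuffled
+```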
+
+
+
+
+ + ♻ ☆ Multi-Body Neural Scene Flow 3DV'2024 + + +
+ The test-time optimization of scene flow - using a coordinate network as a +neural prior - has gained popularity due to its simplicity, lack of dataset +bias, and state-of-the-art performance. We observe, however, that although +coordinate networks capture general motions by implicitly regularizing the +scene flow predictions to be spatially smooth, the neural prior by itself is +unable to identify the underlying multi-body rigid motions present in +real-world data. To address this, we show that multi-body rigidity can be +achieved without the cumbersome and brittle strategy of constraining the +$SE(3)$ parameters of each rigid body as done in previous works. This is +achieved by regularizing the scene flow optimization to encourage isometry in +flow predictions for rigid bodies. This strategy enables multi-body rigidity in +scene flow while maintaining a continuous flow field, hence allowing dense +long-term scene flow integration across a sequence of point clouds. We conduct +extensive experiments on real-world datasets and demonstrate that our approach +outperforms the state-of-the-art in 3D scene flow and long-term point-wise 4D +trajectory prediction. The code is available at: +https://github.com/kavisha725/MBNSF. + +
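+ A small sketch of the kind of isometry regularizer the abstract describes:
+it penalizes changes in pairwise distances within local neighborhoods after
+the flow is applied, encouraging locally rigid motion without parameterizing
+per-body SE(3) transforms. The neighborhood size and uniform weighting here
+are assumptions, not the paper's exact formulation.
+```python
+import numpy as np
+
+def isometry_loss(points, flow, k=8):
+    """Penalize distortion of k-nearest-neighbor distances under the flow.
+
+    A rigid body preserves ||x_i - x_j|| after motion, so inside each
+    (approximately rigid) neighborhood, distances before and after adding
+    the flow should match.
+    """
+    warped = points + flow
+    # Pairwise distances (toy O(n^2) version; use a KD-tree for real data).
+    d_before = np.linalg.norm(points[:, None] - points[None, :], axis=-1)
+    d_after = np.linalg.norm(warped[:, None] - warped[None, :], axis=-1)
+    nn = np.argsort(d_before, axis=1)[:, 1:k + 1]   # k nearest neighbors
+    rows = np.arange(len(points))[:, None]
+    return np.abs(d_before[rows, nn] - d_after[rows, nn]).mean()
+
+pts = np.random.default_rng(0).normal(size=(200, 3))
+rigid_flow = np.tile([0.5, 0.0, 0.0], (200, 1))     # pure translation
+print(isometry_loss(pts, rigid_flow))                # ~0.0: rigid motion is free
+```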
+
+ comment: Accepted for 3DV'2024 (oral) +
+
+
+
+
+ + ♻ ☆ An Efficient Convex Hull-based Vehicle Pose Estimation Method for 3D + LiDAR + + +
+ Vehicle pose estimation with LiDAR is essential in the perception technology +of autonomous driving. However, due to incomplete observation measurements and +sparsity of the LiDAR point cloud, it is challenging to achieve satisfactory +pose extraction based on 3D LiDAR with the existing pose estimation methods. In +addition, the demand for real-time performance further increases the difficulty +of the pose estimation task. In this paper, we propose a novel vehicle pose +estimation method based on the convex hull. The extracted 3D cluster is reduced +to the convex hull, reducing the subsequent computation burden while preserving +essential contour information. Subsequently, a novel criterion based on the +minimum occlusion area is developed for the search-based algorithm, enabling +accurate pose estimation. Additionally, this criterion renders the proposed +algorithm particularly well-suited for obstacle avoidance. The proposed +algorithm is validated on the KITTI dataset and a manually labeled dataset +acquired at an industrial park. The results demonstrate that our proposed +method can achieve better accuracy than the classical pose estimation method +while maintaining real-time speed. + +
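+ The sketch below illustrates the general shape of this pipeline: reduce a
+2-D point cluster to its convex hull, then search headings for the
+best-fitting bounding rectangle. It uses rectangle area as a stand-in scoring
+criterion; the paper's minimum-occlusion-area criterion is more elaborate.
+```python
+import numpy as np
+from scipy.spatial import ConvexHull
+
+def fit_pose(cluster_xy, n_angles=90):
+    """Search-based rectangle fit on the convex hull of a 2-D point cluster."""
+    hull_pts = cluster_xy[ConvexHull(cluster_xy).vertices]  # few points remain
+    best = (np.inf, 0.0)
+    for theta in np.linspace(0, np.pi / 2, n_angles, endpoint=False):
+        c, s = np.cos(theta), np.sin(theta)
+        R = np.array([[c, s], [-s, c]])          # rotate by -theta
+        p = hull_pts @ R.T
+        extent = p.max(axis=0) - p.min(axis=0)
+        score = extent[0] * extent[1]            # stand-in criterion: box area
+        if score < best[0]:
+            best = (score, theta)
+    return best[1]                               # estimated heading in radians
+
+rng = np.random.default_rng(0)
+# L-shaped partial observation of a box, rotated by 0.3 rad plus sensor noise.
+pts = np.concatenate([np.c_[np.linspace(0, 4, 60), np.zeros(60)],
+                      np.c_[np.zeros(40), np.linspace(0, 2, 40)]])
+angle = 0.3
+R = np.array([[np.cos(angle), -np.sin(angle)], [np.sin(angle), np.cos(angle)]])
+print(fit_pose(pts @ R.T + rng.normal(scale=0.02, size=(100, 2))))  # ~0.3
+```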
+
+
+
+
+ + ♻ ☆ VoroNav: Voronoi-based Zero-shot Object Navigation with Large Language + Model + + +
+ In the realm of household robotics, the Zero-Shot Object Navigation (ZSON)
+task empowers agents to adeptly traverse unfamiliar environments and locate
+objects from novel categories without prior explicit training. This paper
+introduces VoroNav, a novel semantic exploration framework that proposes the
+Reduced Voronoi Graph to extract exploratory paths and planning nodes from a
+semantic map constructed in real time. By harnessing topological and semantic
+information, VoroNav designs text-based descriptions of paths and images that
+are readily interpretable by a large language model (LLM). In particular, our
+approach presents a synergy of path and farsight descriptions to represent the
+environmental context, enabling the LLM to apply commonsense reasoning to
+ascertain waypoints for navigation. Extensive evaluation on HM3D and HSSD
+validates that VoroNav surpasses existing benchmarks in both success rate and
+exploration efficiency (absolute improvement: +2.8% Success and +3.7% SPL on
+HM3D, +2.6% Success and +3.8% SPL on HSSD). Additional metrics that evaluate
+obstacle avoidance proficiency and perceptual efficiency further corroborate
+the enhancements achieved by our method in ZSON planning. Project page:
+https://voro-nav.github.io
+
+
+ comment: 18 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Skip \n: A simple method to reduce hallucination in Large + Vision-Language Models + + +
+ Recent advancements in large vision-language models (LVLMs) have
+demonstrated impressive capability in understanding visual information with
+human language. Despite these advances, LVLMs still face challenges with
+multimodal hallucination, such as generating text descriptions of objects
+that are not present in the visual information. However, the underlying
+fundamental reasons for multimodal hallucination remain poorly explored. In
+this paper, we propose a new perspective, suggesting that the inherent biases
+in LVLMs might be a key factor in hallucination. Specifically, we
+systematically identify a semantic shift bias related to paragraph breaks
+(\n\n), where the content before and after '\n\n' in the training data
+frequently exhibits significant semantic changes. This pattern leads the model
+to infer that the content following '\n\n' should be clearly different from
+the preceding content, thereby increasing the probability of hallucinatory
+descriptions after the '\n\n'. We have validated this hypothesis on multiple
+publicly available LVLMs. Besides, we find that deliberately inserting '\n\n'
+into the generated description can induce more hallucinations. Finally, we
+propose a simple method that effectively mitigates LVLM hallucination by
+skipping the output of '\n'.
+
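+ A minimal sketch of this kind of mitigation at decoding time: suppress the
+paragraph-break token so the description never crosses a '\n\n' boundary. The
+greedy decode loop, vocabulary, and token ids below are generic stand-ins,
+not a specific LVLM's API.
+```python
+import numpy as np
+
+def greedy_decode(step_logits_fn, banned_token_ids, max_len=64, eos_id=2):
+    """Greedy decoding that masks out banned tokens (e.g., '\n' / '\n\n').
+
+    step_logits_fn(prefix) -> logits over the vocabulary for the next token.
+    Banning newline tokens prevents the semantic-shift pattern associated
+    with content generated after a paragraph break.
+    """
+    prefix = []
+    for _ in range(max_len):
+        logits = step_logits_fn(prefix)
+        logits[banned_token_ids] = -np.inf   # skip '\n'-style tokens entirely
+        tok = int(np.argmax(logits))
+        if tok == eos_id:
+            break
+        prefix.append(tok)
+    return prefix
+
+# Toy stand-in "model": always prefers token 7 (pretend it is '\n\n').
+def toy_logits(prefix, vocab=16):
+    logits = np.zeros(vocab)
+    logits[7] = 5.0    # would normally dominate generation
+    logits[3] = 4.0
+    return logits
+
+print(greedy_decode(toy_logits, banned_token_ids=[7], max_len=5))  # [3, 3, 3, 3, 3]
+```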
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ An Open-source Benchmark of Deep Learning Models for Audio-visual + Apparent and Self-reported Personality Recognition + + +
+ Personality determines a wide variety of human daily and working behaviours
+and is crucial for understanding human internal and external states. In recent
+years, a large number of automatic personality computing approaches have been
+developed to predict either the apparent personality or the self-reported
+personality of a subject based on non-verbal audio-visual behaviours. However,
+the majority of them suffer from complex and dataset-specific pre-processing
+steps and model training tricks. In the absence of a standardized benchmark
+with consistent experimental settings, it is not only impossible to fairly
+compare the real performance of these personality computing models, but it is
+also difficult to reproduce them. In this paper, we present the first
+reproducible audio-visual benchmarking framework to provide a fair and
+consistent evaluation of eight existing personality computing models (audio,
+visual, and audio-visual) and seven standard deep learning models on both
+self-reported and apparent personality recognition tasks. Building on this set
+of benchmarked models, we also investigate the impact of two previously used
+long-term modelling strategies for summarising short-term/frame-level
+predictions on personality computing results. The results show that: (i)
+apparent personality traits, inferred from facial behaviours by most
+benchmarked deep learning models, are more reliable than self-reported ones;
+(ii) visual models frequently achieve superior performance to audio models on
+personality recognition; (iii) non-verbal behaviours contribute differently to
+the prediction of different personality traits; and (iv) our reproduced
+personality computing models generally achieve worse performance than their
+originally reported results. Our benchmark is publicly available at
+\url{https://github.com/liaorongfan/DeepPersonality}.
+
+
+ comment: Accepted by IEEE Transactions on Affective Computing +
+
+
+
+
+ + ♻ ☆ Orientation-Aware Leg Movement Learning for Action-Driven Human Motion + Prediction + + +
+ The task of action-driven human motion prediction aims to forecast future
+human motion based on the observed sequence while respecting the given action
+label. It requires modeling not only the stochasticity within human motion but
+also the smooth yet realistic transition between multiple action labels.
+However, the fact that most datasets do not contain such transition data
+complicates this task. Existing work tackles this issue by learning a
+smoothness prior that simply promotes smooth transitions, yet doing so can
+result in unnatural transitions, especially when the history and predicted
+motions differ significantly in orientation. In this paper, we argue that
+valid human motion transitions should incorporate realistic leg movements to
+handle orientation changes, and we cast this as an action-conditioned
+in-betweening (ACB) learning task to encourage transition naturalness. Because
+modeling all possible transitions is virtually infeasible, our ACB is only
+performed on a few selected action classes with active gait motions, such as
+Walk or Run. Specifically, we follow a two-stage forecasting strategy: we
+first employ a motion diffusion model to generate the target motion with a
+specified future action, and then produce the in-betweening that smoothly
+connects the observation and prediction to eventually address motion
+prediction. Our method is completely free from labeled motion transition data
+during training. To show the robustness of our approach, we generalize our
+in-betweening model trained on one dataset to two unseen large-scale motion
+datasets to produce natural transitions. Extensive experimental evaluations on
+three benchmark datasets demonstrate that our method yields state-of-the-art
+performance in terms of visual quality, prediction accuracy, and action
+faithfulness.
+
+
+
+
+
+ + ♻ ☆ HumanReg: Self-supervised Non-rigid Registration of Human Point Cloud + + +
+ In this paper, we present a novel registration framework, HumanReg, that
+learns a non-rigid transformation between two human point clouds end-to-end.
+We introduce a body prior into the registration process to efficiently handle
+this type of point cloud. Unlike most existing supervised registration
+techniques that require expensive point-wise flow annotations, HumanReg can be
+trained in a self-supervised manner, benefiting from a set of novel loss
+functions. To make our model converge better on real-world data, we also
+propose a pretraining strategy and a synthetic dataset (HumanSyn4D) consisting
+of dynamic, sparse human point clouds and their auto-generated ground-truth
+annotations. Our experiments show that HumanReg achieves state-of-the-art
+performance on the CAPE-512 dataset and strong qualitative results on another,
+more challenging real-world dataset. Furthermore, our ablation studies
+demonstrate the effectiveness of our synthetic dataset and novel loss
+functions. Our code and synthetic dataset are available at
+https://github.com/chenyifanthu/HumanReg.
+
+
+
+
+
+ + ♻ ☆ WS-SfMLearner: Self-supervised Monocular Depth and Ego-motion Estimation + on Surgical Videos with Unknown Camera Parameters SP + + +
+ Depth estimation in surgical video plays a crucial role in many image-guided
+surgery procedures. However, it is difficult and time-consuming to create
+ground-truth depth map datasets for surgical videos, due in part to
+inconsistent brightness and noise in the surgical scene. Therefore, building
+an accurate and robust self-supervised depth and camera ego-motion estimation
+system is gaining attention from the computer vision community. Although
+several self-supervised methods alleviate the need for ground-truth depth maps
+and poses, they still require known camera intrinsic parameters, which are
+often missing or not recorded. Moreover, the camera intrinsic prediction
+methods in existing works depend heavily on the quality of the datasets. In
+this work, we aim to build a self-supervised depth and ego-motion estimation
+system which can predict not only accurate depth maps and camera poses, but
+also camera intrinsic parameters. We propose a cost-volume-based supervision
+scheme to give the system auxiliary supervision for camera parameter
+prediction. Experimental results show that the proposed method improves the
+accuracy of the estimated camera parameters, ego-motion, and depth estimation.
+
+
+ comment: Accepted by SPIE 2024 +
+
+
+
+
+ + ♻ ☆ SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene + Reconstruction by Neural Radiance Field (NeRF) SP + + +
+ The accurate reconstruction of surgical scenes from surgical videos is
+critical for various applications, including intraoperative navigation and
+image-guided robotic surgery automation. However, previous approaches, mainly
+relying on depth estimation, have limited effectiveness in reconstructing
+surgical scenes with moving surgical tools. To address this limitation and
+provide accurate 3D position prediction for surgical tools in all frames, we
+propose a novel approach called SAMSNeRF that combines Segment Anything Model
+(SAM) and Neural Radiance Field (NeRF) techniques. Our approach generates
+accurate segmentation masks of surgical tools using SAM, which guide the
+refinement of the dynamic surgical scene reconstruction by NeRF. Our
+experimental results on public endoscopy surgical videos demonstrate that our
+approach successfully reconstructs high-fidelity dynamic surgical scenes and
+accurately reflects the spatial information of surgical tools. Our proposed
+approach can significantly enhance surgical navigation and automation by
+providing surgeons with accurate 3D position information of surgical tools
+during surgery. The source code will be released soon.
+
+
+ comment: Accepted by SPIE 2024 +
+
+
+
+
+ + ♻ ☆ An Examination of the Robustness of Reference-Free Image Captioning + Evaluation Metrics + + +
+ Recently, reference-free metrics such as CLIPScore (Hessel et al., 2021), +UMIC (Lee et al., 2021), and PAC-S (Sarto et al., 2023) have been proposed for +automatic reference-free evaluation of image captions. Our focus lies in +evaluating the robustness of these metrics in scenarios that require +distinguishing between two captions with high lexical overlap but very +different meanings. Our findings reveal that despite their high correlation +with human judgments, CLIPScore, UMIC, and PAC-S struggle to identify +fine-grained errors. While all metrics exhibit strong sensitivity to visual +grounding errors, their sensitivity to caption implausibility errors is +limited. Furthermore, we found that all metrics are sensitive to variations in +the size of image-relevant objects mentioned in the caption, while CLIPScore +and PAC-S are also sensitive to the number of mentions of image-relevant +objects in the caption. Regarding linguistic aspects of a caption, all metrics +show weak comprehension of negation, and CLIPScore and PAC-S are insensitive to +the structure of the caption to a great extent. We hope our findings will guide +further improvements in reference-free evaluation of image captioning. + +
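+ For readers unfamiliar with the metrics under study, the sketch below
+computes a CLIPScore-style reference-free score from precomputed,
+L2-normalized CLIP embeddings (Hessel et al. scale the clipped cosine by 2.5);
+re-scoring a meaning-changing perturbed caption is exactly the kind of
+robustness probe the paper applies. The random embeddings are stand-ins.
+```python
+import numpy as np
+
+def clip_score(image_emb, caption_emb, w=2.5):
+    """Reference-free caption score: w * max(cosine(image, caption), 0).
+
+    Assumes both embeddings are already L2-normalized CLIP features.
+    """
+    return w * max(float(image_emb @ caption_emb), 0.0)
+
+def robustness_gap(image_emb, caption_emb, perturbed_emb):
+    """A robust metric should score a meaning-changing perturbation lower,
+    even when the perturbed caption has high lexical overlap."""
+    return clip_score(image_emb, caption_emb) - clip_score(image_emb, perturbed_emb)
+
+rng = np.random.default_rng(0)
+img = rng.normal(size=512); img /= np.linalg.norm(img)
+cap = img + 0.3 * rng.normal(size=512); cap /= np.linalg.norm(cap)  # faithful
+bad = img + 1.5 * rng.normal(size=512); bad /= np.linalg.norm(bad)  # implausible
+print(robustness_gap(img, cap, bad))   # positive gap = metric detects the error
+```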
+
+
+
+
+ + ♻ ☆ Reversing Skin Cancer Adversarial Examples by Multiscale Diffusive and + Denoising Aggregation Mechanism + + +
+ Reliable skin cancer diagnosis models play an essential role in early
+screening and medical intervention. Prevailing computer-aided skin cancer
+classification systems employ deep learning approaches. However, recent
+studies reveal their extreme vulnerability to adversarial attacks --
+often-imperceptible perturbations that significantly reduce the performance of
+skin cancer diagnosis models. To mitigate these threats, this work presents a
+simple, effective, and resource-efficient defense framework that reverse
+engineers adversarial perturbations in skin cancer images. Specifically, a
+multiscale image pyramid is first established to better preserve
+discriminative structures in the medical imaging domain. To neutralize
+adversarial effects, skin images at different scales are then progressively
+diffused by injecting isotropic Gaussian noise to move the adversarial
+examples towards the clean image manifold. Crucially, to further reverse
+adversarial noise and suppress redundant injected noise, a novel multiscale
+denoising mechanism is carefully designed to aggregate image information from
+neighboring scales. We evaluated the defensive effectiveness of our method on
+ISIC 2019, the largest skin cancer multiclass classification dataset.
+Experimental results demonstrate that the proposed method can successfully
+reverse adversarial perturbations from different attacks and significantly
+outperform some state-of-the-art methods in defending skin cancer diagnosis
+models.
+
+
+ comment: Accepted by Computers in Biology and Medicine +
+
+
+
+
+ + ♻ ☆ Learning Calibrated Uncertainties for Domain Shift: A Distributionally + Robust Learning Approach IJCAI 2023 + + +
+ We propose a framework for learning calibrated uncertainties under domain +shifts, where the source (training) distribution differs from the target (test) +distribution. We detect such domain shifts via a differentiable density ratio +estimator and train it together with the task network, composing an adjusted +softmax predictive form concerning domain shift. In particular, the density +ratio estimation reflects the closeness of a target (test) sample to the source +(training) distribution. We employ it to adjust the uncertainty of prediction +in the task network. This idea of using the density ratio is based on the +distributionally robust learning (DRL) framework, which accounts for the domain +shift by adversarial risk minimization. We show that our proposed method +generates calibrated uncertainties that benefit downstream tasks, such as +unsupervised domain adaptation (UDA) and semi-supervised learning (SSL). On +these tasks, methods like self-training and FixMatch use uncertainties to +select confident pseudo-labels for re-training. Our experiments show that the +introduction of DRL leads to significant improvements in cross-domain +performance. We also show that the estimated density ratios align with human +selection frequencies, suggesting a positive correlation with a proxy of human +perceived uncertainties. + +
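+ A simplified sketch of the density-ratio idea: a logistic discriminator
+between source and target features yields r(x) = p_T(x)/p_S(x), which can then
+temper the task network's softmax so predictions far from the source
+distribution become less confident. The tempering rule below is an
+illustrative choice, not the paper's exact adjusted softmax form.
+```python
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def fit_density_ratio(src_feats, tgt_feats):
+    """Discriminative density-ratio estimation: r(x) = p_tgt(x) / p_src(x)."""
+    X = np.vstack([src_feats, tgt_feats])
+    y = np.r_[np.zeros(len(src_feats)), np.ones(len(tgt_feats))]
+    clf = LogisticRegression(max_iter=1000).fit(X, y)
+    p = clf.predict_proba  # column 1 is p(target | x)
+    return lambda x: p(x)[:, 1] / np.clip(p(x)[:, 0], 1e-6, None)
+
+def tempered_softmax(logits, ratio):
+    """Flatten (de-confidence) predictions for samples far from the source."""
+    T = 1.0 + np.log1p(ratio)[:, None]          # illustrative temperature rule
+    z = logits / T
+    z -= z.max(axis=1, keepdims=True)
+    e = np.exp(z)
+    return e / e.sum(axis=1, keepdims=True)
+
+rng = np.random.default_rng(0)
+src = rng.normal(0.0, 1, size=(500, 8))
+tgt = rng.normal(1.5, 1, size=(500, 8))         # shifted target domain
+ratio_fn = fit_density_ratio(src, tgt)
+logits = rng.normal(size=(500, 3)) * 4
+print(tempered_softmax(logits, ratio_fn(tgt)).max(axis=1).mean())  # calibrated down
+```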
+
+ comment: IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ Mastering Text-to-Image Diffusion: Recaptioning, Planning, and + Generating with Multimodal LLMs + + +
+ Diffusion models have exhibited exceptional performance in text-to-image
+generation and editing. However, existing methods often face challenges when
+handling complex text prompts that involve multiple objects with multiple
+attributes and relationships. In this paper, we propose a new training-free
+text-to-image generation/editing framework, namely Recaption, Plan and
+Generate (RPG), harnessing the powerful chain-of-thought reasoning ability of
+multimodal LLMs to enhance the compositionality of text-to-image diffusion
+models. Our approach employs the MLLM as a global planner to decompose the
+process of generating complex images into multiple simpler generation tasks
+within subregions. We propose complementary regional diffusion to enable
+region-wise compositional generation. Furthermore, we integrate text-guided
+image generation and editing within the proposed RPG in a closed-loop fashion,
+thereby enhancing generalization ability. Extensive experiments demonstrate
+that our RPG outperforms state-of-the-art text-to-image diffusion models,
+including DALL-E 3 and SDXL, particularly in multi-category object composition
+and text-image semantic alignment. Notably, our RPG framework exhibits wide
+compatibility with various MLLM architectures (e.g., MiniGPT-4) and diffusion
+backbones (e.g., ControlNet). Our code is available at:
+https://github.com/YangLing0818/RPG-DiffusionMaster
+
+
+ comment: Project: https://github.com/YangLing0818/RPG-DiffusionMaster +
+
+
+
+
+ + ♻ ☆ EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense + Prediction ICCV 2023 + + +
+ High-resolution dense prediction enables many appealing real-world
+applications, such as computational photography, autonomous driving, etc.
+However, the vast computational cost makes deploying state-of-the-art
+high-resolution dense prediction models on hardware devices difficult. This
+work presents EfficientViT, a new family of high-resolution vision models with
+novel multi-scale linear attention. Unlike prior high-resolution dense
+prediction models that rely on heavy softmax attention, hardware-inefficient
+large-kernel convolution, or complicated topology structures to obtain good
+performance, our multi-scale linear attention achieves a global receptive
+field and multi-scale learning (two desirable features for high-resolution
+dense prediction) with only lightweight and hardware-efficient operations. As
+such, EfficientViT delivers remarkable performance gains over previous
+state-of-the-art models with significant speedup on diverse hardware
+platforms, including mobile CPU, edge GPU, and cloud GPU. Without performance
+loss on Cityscapes, our EfficientViT provides up to 13.9$\times$ and
+6.2$\times$ GPU latency reduction over SegFormer and SegNeXt, respectively.
+For super-resolution, EfficientViT delivers up to 6.4$\times$ speedup over
+Restormer while providing a 0.11 dB gain in PSNR. For Segment Anything,
+EfficientViT delivers 48.9$\times$ higher throughput on an A100 GPU while
+achieving slightly better zero-shot instance segmentation performance on COCO.
+
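+ To see why linear attention is hardware-friendly at high resolution, here is
+a minimal ReLU-kernel linear attention in NumPy: associativity lets the small
+(d x d) summary K'^T V be computed first, giving O(N d^2) cost instead of
+softmax attention's O(N^2 d). This is the generic formulation only;
+EfficientViT's multi-scale aggregation around it is not reproduced.
+```python
+import numpy as np
+
+def relu_linear_attention(Q, K, V, eps=1e-6):
+    """Linear attention with a ReLU feature map phi(x) = max(x, 0).
+
+    out_i = phi(q_i) (sum_j phi(k_j) v_j^T) / (phi(q_i) . sum_j phi(k_j)),
+    computed via a (d, d) summary so the cost is linear in sequence length N
+    and no N x N attention matrix is ever formed.
+    """
+    phi = lambda x: np.maximum(x, 0.0)
+    Qp, Kp = phi(Q), phi(K)                    # (N, d) each
+    kv = Kp.T @ V                              # (d, d) summary, computed once
+    z = Kp.sum(axis=0)                         # (d,) normalizer
+    return (Qp @ kv) / ((Qp @ z + eps)[:, None])
+
+N, d = 4096, 64                                # e.g., tokens of a high-res feature map
+rng = np.random.default_rng(0)
+Q, K, V = (rng.normal(size=(N, d)) for _ in range(3))
+print(relu_linear_attention(Q, K, V).shape)    # (4096, 64)
+```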
+
+ comment: ICCV 2023; Update EfficientViT-SAM results +
+
+
+
+
+ + ♻ ☆ DeCoF: Generated Video Detection via Frame Consistency + + +
+ The escalating quality of video generated by advanced video generation
+methods leads to new security challenges in society, which makes generated
+video detection an urgent research priority. To foster collaborative research
+in this area, we construct the first open-source dataset explicitly for
+generated video detection, providing a valuable resource for the community to
+benchmark and improve detection methodologies. Through a series of carefully
+designed probe experiments, our study explores the significance of temporal
+and spatial artifacts in developing general and robust detectors for generated
+video. Based on the principle of video frame consistency, we introduce a
+simple yet effective detection model (DeCoF) that eliminates the impact of
+spatial artifacts when learning generalizable features. Our extensive
+experiments demonstrate the efficacy of DeCoF in detecting videos produced by
+unseen video generation models and confirm its powerful generalization
+capabilities across several commercial proprietary models.
+
+
+
+
+
+ + ♻ ☆ Trojan Model Detection Using Activation Optimization + + +
+ Training machine learning models can be very expensive or even unaffordable. +This may be, for example, due to data limitations (unavailability or being too +large), or computational power limitations. Therefore, it is a common practice +to rely on open-source pre-trained models whenever possible. However, this +practice is alarming from a security perspective. Pre-trained models can be +infected with Trojan attacks, in which the attacker embeds a trigger in the +model such that the model's behavior can be controlled by the attacker when the +trigger is present in the input. In this paper, we present a novel method for +detecting Trojan models. Our method creates a signature for a model based on +activation optimization. A classifier is then trained to detect a Trojan model +given its signature. We call our method TRIGS for TRojan Identification from +Gradient-based Signatures. TRIGS achieves state-of-the-art performance on two +public datasets of convolutional models. Additionally, we introduce a new +challenging dataset of ImageNet models based on the vision transformer +architecture. TRIGS delivers the best performance on the new dataset, +surpassing the baseline methods by a large margin. Our experiments also show +that TRIGS requires only a small amount of clean samples to achieve good +performance, and works reasonably well even if the defender does not have prior +knowledge about the attacker's model architecture. Our dataset will be released +soon. + +
+
+
+
+
+ + ♻ ☆ StyleGAN3: Generative Networks for Improving the Equivariance of + Translation and Rotation + + +
+ StyleGAN uses style to control facial pose and identity features, and noise
+to control details such as hair, wrinkles, and skin color. The outcomes of
+this image processing vary slightly between different versions of StyleGAN.
+As a result, the main focus of this study is a comparison of the performance
+differences between StyleGAN2 and two modified versions of StyleGAN3. We used
+FFHQ as the dataset, and FID, EQ-T, and EQ-R as the evaluation metrics. In the
+end, we found that StyleGAN3 is the better generative network for improving
+equivariance. Our findings have a positive impact on the creation of
+animations and videos.
+
+
+ comment: We feel we have not yet fully studied this work and have since
+ found significant new results. After careful consideration, we are going to
+ rework this manuscript and aim to present a more accurate model
+
+
+
+
+
+ + ♻ ☆ TiMix: Text-aware Image Mixing for Effective Vision-Language + Pre-training AAAI2024 + + +
+ Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances
+modern Vision-Language Pre-training (VLP) models by aligning visual and
+linguistic modalities. Due to noise in web-harvested text-image pairs,
+however, scaling up the training data volume in SMCL presents considerable
+obstacles in terms of computational cost and data inefficiency. To improve
+data efficiency in VLP, we propose Text-aware Image Mixing (TiMix), which
+integrates mix-based data augmentation techniques into SMCL, yielding
+significant performance improvements without significantly increasing
+computational overhead. We provide a theoretical analysis of TiMix from a
+mutual information (MI) perspective, showing that mixed data samples for
+cross-modal contrastive learning implicitly serve as a regularizer for the
+contrastive loss. The experimental results demonstrate that TiMix exhibits
+comparable performance on downstream tasks, even with a reduced amount of
+training data and shorter training time, when benchmarked against existing
+methods. This work empirically and theoretically demonstrates the potential of
+data mixing for data-efficient and computationally viable VLP, benefiting
+broader VLP model adoption in practical scenarios.
+
+
+ comment: Accepted on AAAI2024 +
+
+
+
+
+ + ♻ ☆ Spatially Covariant Image Registration with Text Prompts + + +
+ Medical images are often characterized by their structured anatomical
+representations and spatially inhomogeneous contrasts. Leveraging anatomical
+priors in neural networks can greatly enhance their utility in
+resource-constrained clinical settings. Prior research has harnessed such
+information for image segmentation, yet progress in deformable image
+registration has been modest. Our work introduces textSCF, a novel method that
+integrates spatially covariant filters and textual anatomical prompts encoded
+by visual-language models, to fill this gap. This approach optimizes an
+implicit function that correlates text embeddings of anatomical regions to
+filter weights, relaxing the typical translation-invariance constraint of
+convolutional operations. TextSCF not only boosts computational efficiency but
+can also retain or improve registration accuracy. By capturing the contextual
+interplay between anatomical regions, it offers impressive inter-regional
+transferability and the ability to preserve structural discontinuities during
+registration. TextSCF's performance has been rigorously tested on
+inter-subject brain MRI and abdominal CT registration tasks, outperforming
+existing state-of-the-art models in the MICCAI Learn2Reg 2021 challenge and
+leading the leaderboard. In abdominal registrations, textSCF's larger model
+variant improved the Dice score by 11.3% over the second-best model, while its
+smaller variant maintained similar accuracy but with an 89.13% reduction in
+network parameters and a 98.34% decrease in computational operations.
+
+
+ comment: 13 pages, 8 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Representation Synthesis by Probabilistic Many-Valued Logic Operation in + Self-Supervised Learning + + +
+ In this paper, we propose a new self-supervised learning (SSL) method for +representations that enable logic operations. Representation learning has been +applied to various tasks, such as image generation and retrieval. The logical +controllability of representations is important for these tasks. Although some +methods have been shown to enable the intuitive control of representations +using natural languages as the inputs, representation control via logic +operations between representations has not been demonstrated. Some SSL methods +using representation synthesis (e.g., elementwise mean and maximum operations) +have been proposed, but the operations performed in these methods do not +incorporate logic operations. In this work, we propose a logic-operable +self-supervised representation learning method by replacing the existing +representation synthesis with the OR operation on the probabilistic extension +of many-valued logic. The representations comprise a set of feature-possession +degrees, which are truth values indicating the presence or absence of each +feature in the image, and realize the logic operations (e.g., OR and AND). Our +method can generate a representation that has the features of both +representations or only those features common to both representations. In +addition, the expression of the ambiguous presence of a feature is realized by +indicating the feature-possession degree by the probability distribution of +truth values of the many-valued logic. We showed that our method performs +competitively in single and multi-label classification tasks compared with +prior SSL methods using synthetic representations. Moreover, experiments on +image retrieval using MNIST and PascalVOC showed that the representations of +our method can be operated by OR and AND operations. + +
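+ The logic operations themselves reduce to simple probability algebra over
+feature-possession degrees; the sketch below shows the independent-events
+version (OR: a + b - ab, AND: ab) on toy degree vectors. The paper's richer
+formulation with probability distributions over truth values generalizes this.
+```python
+import numpy as np
+
+def prob_or(a, b):
+    """Probabilistic OR of feature-possession degrees (independence assumed)."""
+    return a + b - a * b
+
+def prob_and(a, b):
+    """Probabilistic AND: features present in both representations."""
+    return a * b
+
+# Toy "representations": degree of possessing each of 6 features, in [0, 1].
+img1 = np.array([0.9, 0.8, 0.1, 0.0, 0.7, 0.2])
+img2 = np.array([0.1, 0.9, 0.8, 0.0, 0.6, 0.3])
+
+print(prob_or(img1, img2))   # features of either image: high where either is high
+print(prob_and(img1, img2))  # only common features: high where both are high
+```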
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Unsupervised Deep Learning Image Verification Method + + +
+ Although deep learning is commonly employed for image recognition, it
+usually requires a huge amount of labeled training data, which may not always
+be readily available. This leads to a noticeable performance disparity
+compared to state-of-the-art unsupervised face verification techniques. In
+this work, we propose a method to narrow this gap by leveraging an autoencoder
+to convert the face image vector into a novel representation. Notably, the
+autoencoder is trained to reconstruct neighboring face image vectors rather
+than the original input image vectors. These neighbor face image vectors are
+chosen through an unsupervised process based on the highest cosine scores with
+the training face image vectors. The proposed method achieves a relative
+improvement of 56% in terms of EER over the baseline system on the Labeled
+Faces in the Wild (LFW) dataset, successfully narrowing the performance gap
+between cosine and PLDA scoring systems.
+
+
+
+
+
+ + ♻ ☆ Transcending Domains through Text-to-Image Diffusion: A Source-Free + Approach to Domain Adaptation + + +
+ Domain Adaptation (DA) is a method for enhancing a model's performance on a
+target domain with inadequate annotated data by applying the information the
+model has acquired from a related source domain with sufficient labeled data.
+The escalating enforcement of data-privacy regulations like HIPAA, COPPA, and
+FERPA has sparked a heightened interest in adapting models to novel domains
+while circumventing the need for direct access to the source data, a problem
+known as Source-Free Domain Adaptation (SFDA). In this paper, we propose a
+novel framework for SFDA that generates source data using a text-to-image
+diffusion model trained on the target domain samples. Our method starts by
+training a text-to-image diffusion model on the labeled target domain samples,
+which is then fine-tuned using the pre-trained source model to generate
+samples close to the source data. Finally, we use Domain Adaptation techniques
+to align the artificially generated source data with the target domain data,
+resulting in significant performance improvements of the model on the target
+domain. Through extensive comparison against several baselines on the standard
+Office-31, Office-Home, and VisDA benchmarks, we demonstrate the effectiveness
+of our approach for the SFDA task.
+
+
+ comment: Revamped the whole paper; new version will be re-submitted +
+
+
+
+
+ + ♻ ☆ Semi-supervised learning for generalizable intracranial hemorrhage + detection and segmentation + + +
+ Purpose: To develop and evaluate a semi-supervised learning model for
+intracranial hemorrhage detection and segmentation on an out-of-distribution
+head CT evaluation set.
+ Materials and Methods: This retrospective study used semi-supervised
+learning to bootstrap performance. An initial "teacher" deep learning model
+was trained on 457 pixel-labeled head CT scans collected from one US
+institution from 2010-2017 and used to generate pseudo-labels on a separate
+unlabeled corpus of 25,000 examinations from the RSNA and ASNR. A second
+"student" model was trained on this combined pixel- and pseudo-labeled
+dataset. Hyperparameter tuning was performed on a validation set of 93 scans.
+Testing for both classification (n=481 examinations) and segmentation (n=23
+examinations, or 529 images) was performed on CQ500, a dataset of 481 scans
+performed in India, to evaluate out-of-distribution generalizability. The
+semi-supervised model was compared with a baseline model trained on only
+labeled data using area under the receiver operating characteristic curve
+(AUC), Dice similarity coefficient (DSC), and average precision (AP) metrics.
+ Results: The semi-supervised model achieved statistically significantly
+higher examination AUC on CQ500 compared with the baseline (0.939 [0.938,
+0.940] vs. 0.907 [0.906, 0.908]) (p=0.009). It also achieved a higher DSC
+(0.829 [0.825, 0.833] vs. 0.809 [0.803, 0.812]) (p=0.012) and pixel AP (0.848
+[0.843, 0.853] vs. 0.828 [0.817, 0.828]) compared with the baseline.
+ Conclusion: The addition of unlabeled data in a semi-supervised learning
+framework demonstrates stronger generalizability potential for intracranial
+hemorrhage detection and segmentation compared with a supervised baseline.
+
+
+
+
+
+ + ♻ ☆ RenderDiffusion: Image Diffusion for 3D Reconstruction, Inpainting and + Generation CVPR 2023 + + +
+ Diffusion models currently achieve state-of-the-art performance for both +conditional and unconditional image generation. However, so far, image +diffusion models do not support tasks required for 3D understanding, such as +view-consistent 3D generation or single-view object reconstruction. In this +paper, we present RenderDiffusion, the first diffusion model for 3D generation +and inference, trained using only monocular 2D supervision. Central to our +method is a novel image denoising architecture that generates and renders an +intermediate three-dimensional representation of a scene in each denoising +step. This enforces a strong inductive structure within the diffusion process, +providing a 3D consistent representation while only requiring 2D supervision. +The resulting 3D representation can be rendered from any view. We evaluate +RenderDiffusion on FFHQ, AFHQ, ShapeNet and CLEVR datasets, showing competitive +performance for generation of 3D scenes and inference of 3D scenes from 2D +images. Additionally, our diffusion-based approach allows us to use 2D +inpainting to edit 3D scenes. + +
+
+ comment: Accepted at CVPR 2023. Project page: + https://github.com/Anciukevicius/RenderDiffusion +
+
+
+
+
+ + ♻ ☆ LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and + 200+ FPS + + +
+ Recent advancements in real-time neural rendering using point-based
+techniques have paved the way for the widespread adoption of 3D
+representations. However, foundational approaches like 3D Gaussian Splatting
+come with a substantial storage overhead, as the SfM points grow to millions,
+often demanding gigabyte-level disk space for a single unbounded scene. This
+poses significant scalability challenges and hinders splatting efficiency.
+ To address this challenge, we introduce LightGaussian, a novel method
+designed to transform 3D Gaussians into a more efficient and compact format.
+Drawing inspiration from the concept of network pruning, LightGaussian
+identifies Gaussians that contribute insignificantly to scene reconstruction
+and adopts a pruning and recovery process, effectively reducing redundancy in
+the Gaussian count while preserving visual effects. Additionally,
+LightGaussian employs distillation and pseudo-view augmentation to distill
+spherical harmonics to a lower degree, allowing knowledge transfer to more
+compact representations while maintaining reflectance. Furthermore, we propose
+a hybrid scheme, VecTree Quantization, to quantize all attributes, resulting
+in lower-bitwidth representations with minimal accuracy loss.
+ In summary, LightGaussian achieves an average compression rate of over 15x
+while boosting the FPS from 139 to 215, enabling an efficient representation
+of complex scenes on the Mip-NeRF 360 and Tanks and Temples datasets.
+ Project website: https://lightgaussian.github.io/
+
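+ A schematic sketch of the pruning step: score each Gaussian by a
+global-significance proxy and drop the lowest-scoring fraction. The score used
+here (opacity times a volume term) is an assumption standing in for
+LightGaussian's actual significance measure, and the recovery fine-tuning that
+follows pruning is not shown.
+```python
+import numpy as np
+
+def prune_gaussians(opacity, scales, keep_ratio=0.34):
+    """Keep only Gaussians judged significant for scene reconstruction.
+
+    significance ~ opacity * volume (product of per-axis scales); dropping
+    the rest is what yields the bulk of the compression before quantization.
+    """
+    significance = opacity * np.prod(scales, axis=1)
+    n_keep = int(len(opacity) * keep_ratio)
+    keep_idx = np.argsort(significance)[-n_keep:]
+    return np.sort(keep_idx)
+
+rng = np.random.default_rng(0)
+n = 300_000                                     # SfM-grown Gaussian count
+opacity = rng.uniform(0, 1, n)
+scales = rng.uniform(1e-4, 1e-2, size=(n, 3))   # per-axis extents
+kept = prune_gaussians(opacity, scales)
+print(f"kept {len(kept)} of {n} Gaussians ({len(kept) / n:.0%})")
+```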
+
+ comment: 16 pages, 8 figures
+
+
+
+
+
+ + ♻ ☆ Evaluating Robustness of Visual Representations for Object Assembly Task + Requiring Spatio-Geometrical Reasoning + + +
+ This paper primarily focuses on evaluating and benchmarking the robustness of +visual representations in the context of object assembly tasks. Specifically, +it investigates the alignment and insertion of objects with geometrical +extrusions and intrusions, commonly referred to as a peg-in-hole task. The +accuracy required to detect and orient the peg and the hole geometry in SE(3) +space for successful assembly poses significant challenges. Addressing this, we +employ a general framework in visuomotor policy learning that utilizes visual +pretraining models as vision encoders. Our study investigates the robustness of +this framework when applied to a dual-arm manipulation setup, specifically to +the grasp variations. Our quantitative analysis shows that existing pretrained +models fail to capture the essential visual features necessary for this task. +However, a visual encoder trained from scratch consistently outperforms the +frozen pretrained models. Moreover, we discuss rotation representations and +associated loss functions that substantially improve policy learning. We +present a novel task scenario designed to evaluate the progress in visuomotor +policy learning, with a specific focus on improving the robustness of intricate +assembly tasks that require both geometrical and spatial reasoning. Videos, +additional experiments, dataset, and code are available at +https://bit.ly/geometric-peg-in-hole . + +
+
+
+
+
+ + ♻ ☆ VALUED -- Vision and Logical Understanding Evaluation Dataset + + +
+ Starting with early successes in computer vision tasks, deep learning based
+techniques have since overtaken state-of-the-art approaches in a multitude of
+domains. However, it has been demonstrated time and again that these
+techniques fail to capture semantic context and logical constraints, instead
+often relying on spurious correlations to arrive at the answer. Since the
+application of deep learning techniques to critical scenarios depends on
+adherence to domain-specific constraints, several attempts have been made to
+address this issue. One limitation holding back a thorough exploration of this
+area is a lack of suitable datasets which feature a rich set of rules. In
+order to address this, we present the VALUED (Vision And Logical Understanding
+Evaluation Dataset), consisting of 200,000$+$ annotated images and an
+associated rule set, based on the popular board game chess. The curated rule
+set considerably constrains the set of allowable predictions and is designed
+to probe key semantic abilities like localization and enumeration. Alongside
+standard metrics, additional metrics to measure performance with regard to
+logical consistency are presented. We analyze several popular and
+state-of-the-art vision models on this task and show that, although their
+performance on standard metrics is laudable, they produce a plethora of
+incoherent results, indicating that this dataset presents a significant
+challenge for future works.
+
+
+
+
+
+ + ♻ ☆ Organic or Diffused: Can We Distinguish Human Art from AI-generated + Images? + + +
+ The advent of generative AI images has completely disrupted the art world.
+Distinguishing AI-generated images from human art is a challenging problem
+whose impact is growing over time. A failure to address this problem allows
+bad actors to defraud individuals paying a premium for human art and companies
+whose stated policies forbid AI imagery. It is also critical for content
+owners to establish copyright, and for model trainers interested in curating
+training data in order to avoid potential model collapse.
+ There are several different approaches to distinguishing human art from AI
+images, including classifiers trained by supervised learning, research tools
+targeting diffusion models, and identification by professional artists using
+their knowledge of artistic techniques. In this paper, we seek to understand
+how well these approaches can perform against today's modern generative models
+in both benign and adversarial settings. We curate real human art across 7
+styles, generate matching images from 5 generative models, and apply 8
+detectors (5 automated detectors and 3 different human groups, including 180
+crowdworkers, 4000+ professional artists, and 13 expert artists experienced at
+detecting AI). Both Hive and expert artists do very well but make mistakes in
+different ways (Hive is weaker against adversarial perturbations, while expert
+artists produce more false positives). We believe these weaknesses will remain
+as models continue to evolve, and we use our data to demonstrate why a
+combined team of human and automated detectors provides the best combination
+of accuracy and robustness.
+
+
+
+
+
+
+
+
+ + Information Retrieval 19 + +
+
+
+ + ☆ Can Large Language Models Detect Rumors on Social Media? + + +
+ In this work, we investigate the use of Large Language Models (LLMs) for
+rumor detection on social media. However, it is challenging for LLMs to reason
+over the entire propagation information on social media, which contains news
+contents and numerous comments, because LLMs may not concentrate on key clues
+in the complex propagation information and have trouble reasoning when facing
+massive and redundant information. Accordingly, we propose an LLM-empowered
+Rumor Detection (LeRuD) approach, in which we design prompts to teach LLMs to
+reason over important clues in news and comments, and divide the entire
+propagation information into a Chain-of-Propagation to reduce the LLMs'
+burden. We conduct extensive experiments on the Twitter and Weibo datasets,
+and LeRuD outperforms several state-of-the-art rumor detection models by 2.4%
+to 7.6%. Meanwhile, by applying LLMs, LeRuD requires no data for training and
+thus shows more promising rumor detection ability in few-shot or zero-shot
+scenarios.
+
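+ A schematic of the Chain-of-Propagation idea as described: split the comment
+stream into ordered chunks and let the LLM update its judgment one chunk at a
+time instead of reasoning over everything at once. The prompt wording and the
+`ask_llm` callable are placeholders, not the paper's exact prompts.
+```python
+def chain_of_propagation(news, comments, ask_llm, chunk_size=20):
+    """Feed propagation information to an LLM in ordered chunks.
+
+    ask_llm(prompt: str) -> str is any chat-completion wrapper; each step
+    carries forward only the running judgment, keeping the context small.
+    """
+    judgment = "No evidence yet."
+    chunks = [comments[i:i + chunk_size] for i in range(0, len(comments), chunk_size)]
+    for i, chunk in enumerate(chunks):
+        prompt = (
+            f"News: {news}\n"
+            f"Current judgment: {judgment}\n"
+            f"New comments (batch {i + 1}/{len(chunks)}):\n"
+            + "\n".join(f"- {c}" for c in chunk)
+            + "\nFocus on key clues (denials, evidence, source credibility). "
+              "Update the judgment: is the news a rumor? Answer briefly."
+        )
+        judgment = ask_llm(prompt)
+    return judgment
+
+# Usage: plug in any LLM client in place of this canned stand-in response.
+fake_llm = lambda prompt: "Likely a rumor: several comments cite an official denial."
+print(chain_of_propagation("Miracle cure found!",
+                           ["Fake!", "See the official denial"] * 30, fake_llm))
+```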
+
+
+
+
+ + ☆ Learning Metrics that Maximise Power for Accelerated A/B-Tests + + +
+ Online controlled experiments are a crucial tool to allow for confident +decision-making in technology companies. A North Star metric is defined (such +as long-term revenue or user retention), and system variants that statistically +significantly improve on this metric in an A/B-test can be considered superior. +North Star metrics are typically delayed and insensitive. As a result, the cost +of experimentation is high: experiments need to run for a long time, and even +then, type-II errors (i.e. false negatives) are prevalent. + We propose to tackle this by learning metrics from short-term signals that +directly maximise the statistical power they harness with respect to the North +Star. We show that existing approaches are prone to overfitting, in that higher +average metric sensitivity does not imply improved type-II errors, and propose +to instead minimise the $p$-values a metric would have produced on a log of +past experiments. We collect such datasets from two social media applications +with over 160 million Monthly Active Users each, totalling over 153 A/B-pairs. +Empirical results show that we are able to increase statistical power by up to +78% when using our learnt metrics stand-alone, and by up to 210% when used in +tandem with the North Star. Alternatively, we can obtain constant statistical +power at a sample size that is down to 12% of what the North Star requires, +significantly reducing the cost of experimentation. + +
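+ A toy sketch of the proposed objective: given a log of past A/B pairs with
+per-arm short-term signal statistics, choose metric weights that minimize the
+log p-values those experiments would have produced, rather than average
+sensitivity. The data shapes, the normal-approximation z-test, and the
+optimizer choice are all assumptions.
+```python
+import numpy as np
+from scipy.optimize import minimize
+from scipy.stats import norm
+
+def learn_metric(delta, cov, n):
+    """Learn weights w over short-term signals minimizing past p-values.
+
+    delta: (E, d) treatment-minus-control means per experiment,
+    cov:   (E, d, d) per-experiment covariance of the signals,
+    n:     (E,) sample sizes. z_e = w.delta_e / sqrt(w' Sigma_e w / n_e).
+    """
+    def sum_log_p(w):
+        num = delta @ w
+        se = np.sqrt(np.einsum("i,eij,j->e", w, cov, w) / n)
+        z = np.abs(num) / np.clip(se, 1e-12, None)
+        p = 2 * norm.sf(z)                       # two-sided p-values
+        return np.log(np.clip(p, 1e-300, None)).sum()
+    d = delta.shape[1]
+    res = minimize(sum_log_p, x0=np.ones(d) / d, method="Nelder-Mead")
+    return res.x
+
+rng = np.random.default_rng(0)
+E, d = 50, 4
+true_w = np.array([1.0, 0.5, 0.0, 0.0])          # only two signals carry effect
+delta = rng.normal(size=(E, d)) * 0.01 + 0.02 * true_w
+cov = np.tile(np.eye(d), (E, 1, 1))
+print(learn_metric(delta, cov, n=np.full(E, 1e4)).round(2))  # weight on signals 0-1
+```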
+
+
+
+
+ + ☆ On Practical Diversified Recommendation with Controllable Category + Diversity Framework + + +
+ Recommender systems have made significant strides in various industries,
+primarily driven by extensive efforts to enhance recommendation accuracy.
+However, this pursuit of accuracy has inadvertently given rise to echo
+chamber/filter bubble effects. Especially in industry, these effects can
+impair users' experiences and prevent users from accessing a wider range of
+items. One solution is to take diversity into account. However, most existing
+works focus on users' explicit preferences, while rarely exploring users'
+non-interaction preferences. These neglected non-interaction preferences are
+especially important for broadening users' interests and alleviating echo
+chamber/filter bubble effects. Therefore, in this paper, we first define
+diversity via two distinct notions, i.e., user-explicit diversity
+(U-diversity) and user-item non-interaction diversity (N-diversity), based on
+user historical behaviors. Then, we propose a succinct and effective method,
+named Controllable Category Diversity Framework (CCDF), to achieve both high
+U-diversity and N-diversity simultaneously. Specifically, CCDF consists of two
+stages, User-Category Matching and Constrained Item Matching. The
+User-Category Matching stage utilizes the DeepU2C model and a combined loss to
+capture the user's preferences over categories, and then selects the top-$K$
+categories with a controllable parameter $K$. These top-$K$ categories are
+used as trigger information in Constrained Item Matching. Offline experimental
+results show that our proposed DeepU2C outperforms state-of-the-art
+diversity-oriented methods, especially on the N-diversity task. The whole
+framework is validated in a real-world production environment through online
+A/B testing.
+
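+ A minimal sketch of the two-stage flow under stated assumptions: stage one
+scores categories per user (a dot-product stand-in for DeepU2C) and keeps a
+controllable top-K, and stage two retrieves items constrained to those
+trigger categories. Raising K directly trades accuracy for N-diversity.
+```python
+import numpy as np
+
+def user_category_matching(user_vec, category_embs, K=3):
+    """Stage 1: select top-K categories (K is the controllable diversity knob)."""
+    scores = category_embs @ user_vec            # stand-in for DeepU2C scores
+    return np.argsort(scores)[-K:][::-1]
+
+def constrained_item_matching(user_vec, item_embs, item_cats, trigger_cats, n=10):
+    """Stage 2: retrieve items only from the triggered categories."""
+    idx = np.flatnonzero(np.isin(item_cats, trigger_cats))
+    scores = item_embs[idx] @ user_vec
+    return idx[np.argsort(scores)[-n:][::-1]]
+
+rng = np.random.default_rng(0)
+user = rng.normal(size=16)
+cats = rng.normal(size=(50, 16))                 # 50 category embeddings
+items = rng.normal(size=(5000, 16))
+item_cats = rng.integers(0, 50, size=5000)
+trigger = user_category_matching(user, cats, K=3)
+recs = constrained_item_matching(user, items, item_cats, trigger)
+print(trigger, recs[:5])
+```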
+
+ comment: A Two-stage Controllable Category Diversity Framework for + Recommendation +
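+ The two-stage retrieve-then-constrain idea above can be sketched in a few
+lines. The dot-product scorers and random data are illustrative
+assumptions; DeepU2C and the combined loss are not reproduced here:
+
+# a minimal sketch: pick the top-K categories for a user, then match
+# items only within those categories, so K controls category diversity
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_categories, n_items, dim, K = 50, 1000, 16, 5
+user_vec = rng.normal(size=dim)
+category_vecs = rng.normal(size=(n_categories, dim))
+item_vecs = rng.normal(size=(n_items, dim))
+item_category = rng.integers(0, n_categories, size=n_items)
+
+# Stage 1: User-Category Matching with a controllable K
+top_k_cats = np.argsort(-(category_vecs @ user_vec))[:K]
+
+# Stage 2: Constrained Item Matching inside the selected categories only
+mask = np.isin(item_category, top_k_cats)
+item_scores = np.where(mask, item_vecs @ user_vec, -np.inf)
+recommended = np.argsort(-item_scores)[:20]
+print(recommended[:5], item_category[recommended[:5]])
+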
+
+
+
+
+ + ☆ Retrieval Augmented Cross-Modal Tag Recommendation in Software Q&A Sites + + +
+ Posts in software Q&A sites often consist of three main parts: title,
+description and code, which are interconnected and jointly describe the
+question. Existing tag recommendation methods often treat different
+modalities as a whole or inadequately consider the interaction between
+modalities. Additionally, they focus on extracting information directly from
+the post itself, neglecting information from external knowledge sources.
+Therefore, we propose a Retrieval Augmented Cross-Modal (RACM) tag
+recommendation model for software Q&A sites. Specifically, we first use the
+input post as a query and enhance the representation of the different
+modalities by retrieving information from external knowledge sources. For
+the retrieval-augmented representations, we employ a cross-modal
+context-aware attention mechanism that leverages the main modality
+(description) for targeted feature extraction from the submodalities (title
+and code). In the fusion process, a gate mechanism performs fine-grained
+feature selection, controlling the amount of information extracted from the
+submodalities. Finally, the fused information is used for tag
+recommendation. Experimental results on three real-world datasets
+demonstrate that our model outperforms state-of-the-art counterparts.
+
+
+
+
+
+ + ☆ Leveraging Large Language Models for Hybrid Workplace Decision Support + + +
+ Large Language Models (LLMs) hold the potential to perform a variety of
+text processing tasks and provide textual explanations for proposed actions
+or decisions. In the era of hybrid work, LLMs can provide intelligent
+decision support for workers who are designing their hybrid work plans. In
+particular, they can offer suggestions and explanations to workers balancing
+numerous decision factors, thereby enhancing their work experience. In this
+paper, we present a decision support model for workspaces in hybrid work
+environments, leveraging the reasoning skill of LLMs. We first examine the
+LLM's capability of making suitable workspace suggestions. We find that its
+reasoning extends beyond the guidelines in the prompt and that the LLM can
+manage the trade-off among the available resources in the workspaces. We
+conduct an extensive user study to understand workers' decision processes
+for workspace choices and to evaluate the effectiveness of the system. We
+observe that a worker's decision can be influenced by the LLM's suggestions
+and explanations. The participants in our study find the system convenient,
+regardless of whether reasons are provided. Our results show that employees
+can benefit from the LLM-empowered system for workspace selection in hybrid
+workplaces.
+
+
+
+
+
+ + ☆ Understanding and Counteracting Feature-Level Bias in Click-Through Rate + Prediction + + +
+ Common click-through rate (CTR) prediction recommender models tend to
+exhibit feature-level bias, which leads to unfair recommendations among item
+groups and inaccurate recommendations for users. While existing methods
+address this issue by adjusting the learning of CTR models, such as through
+additional optimization objectives, they fail to consider how the bias is
+caused within these models. To address this research gap, our study performs
+a top-down analysis on representative CTR models. By blocking different
+components of a trained CTR model one by one, we identify the key
+contribution of the linear component to feature-level bias. We conduct a
+theoretical analysis of the learning process for the weights in the linear
+component, revealing how group-wise properties of the training data
+influence them. Our experimental and statistical analyses demonstrate a
+strong correlation between imbalanced positive sample ratios across item
+groups and feature-level bias. Based on this understanding, we propose a
+minimally invasive yet effective strategy to counteract feature-level bias
+in CTR models by removing the biased linear weights from trained models.
+Additionally, we present a linear weight adjusting strategy that requires
+fewer random exposure records than related debiasing methods. The
+superiority of our proposed strategies is validated through extensive
+experiments on three real-world datasets.
+
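+ The removal strategy is simple enough to sketch. The tiny wide-and-deep
+model below is an illustrative assumption rather than the paper's
+architecture; the point is only that the linear component's weight for a
+biased feature can be zeroed after training:
+
+# a minimal sketch: counteract feature-level bias by removing a biased
+# linear weight from an already-trained CTR model
+import torch
+import torch.nn as nn
+
+class TinyCTR(nn.Module):
+    """Toy CTR scorer with an explicit linear component plus an MLP."""
+    def __init__(self, n_features):
+        super().__init__()
+        self.linear = nn.Linear(n_features, 1)   # the linear component
+        self.mlp = nn.Sequential(nn.Linear(n_features, 32), nn.ReLU(),
+                                 nn.Linear(32, 1))
+    def forward(self, x):
+        return torch.sigmoid(self.linear(x) + self.mlp(x)).squeeze(-1)
+
+model = TinyCTR(n_features=8)
+# ... assume the model has been trained on click data here ...
+biased_feature = 3                    # e.g., an item-group indicator
+with torch.no_grad():
+    model.linear.weight[0, biased_feature] = 0.0   # remove the bias path
+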
+
+
+
+
+ + ☆ Identifying Reasons for Contraceptive Switching from Real-World Data + Using Large Language Models + + +
+ Prescription contraceptives play a critical role in supporting women's
+reproductive health. With nearly 50 million women in the United States using
+contraceptives, understanding the factors that drive contraceptive selection
+and switching is of significant interest. However, many factors related to
+medication switching are often only captured in unstructured clinical notes
+and can be difficult to extract. Here, we evaluate the zero-shot abilities of
+a recently developed large language model, GPT-4 (via a HIPAA-compliant
+Microsoft Azure API), to identify reasons for switching between classes of
+contraceptives from the UCSF Information Commons clinical notes dataset. We
+demonstrate that GPT-4 can accurately extract reasons for contraceptive
+switching, outperforming baseline BERT-based models with micro-F1 scores of
+0.849 and 0.881 for contraceptive start and stop extraction, respectively.
+Human evaluation of GPT-4-extracted reasons for switching showed 91.4%
+accuracy, with minimal hallucinations. Using the extracted reasons, we
+identified patient preference, adverse events, and insurance as key reasons
+for switching using unsupervised topic modeling approaches. Notably, we also
+showed with our approach that "weight gain/mood change" and "insurance
+coverage" are disproportionately found as reasons for contraceptive
+switching in specific demographic populations. Our code and supplemental
+data are available at https://github.com/BMiao10/contraceptive-switching.
+
+
+
+
+
+ + ☆ Reliability quality measures for recommender systems + + +
+ Users want to know the reliability of the recommendations they receive;
+they do not accept high predictions if there is no evidence of reliability.
+Recommender systems should therefore provide reliability values associated
+with their predictions. Research into reliability measures requires simple,
+plausible and universal reliability quality measures. Research into
+recommender system quality measures has focused on accuracy; novelty,
+serendipity and diversity have also been studied. Nevertheless, there is a
+notable lack of research into reliability/confidence quality measures.
+ This paper proposes a reliability quality prediction measure (RPI) and a
+reliability quality recommendation measure (RRI). Both quality measures are
+based on the hypothesis that the more suitable a reliability measure is, the
+better the accuracy results it will provide when applied. These reliability
+quality measures show accuracy improvements when appropriate reliability
+values are associated with the predictions (i.e. high reliability values
+associated with correct predictions or low reliability values associated
+with incorrect predictions).
+ The proposed reliability quality metrics will lead to the design of brand
+new recommender system reliability measures. These measures could be applied
+to different matrix factorization techniques and to content-based,
+context-aware and social recommendation approaches. The reliability measures
+so designed could be tested, compared and improved using the proposed
+reliability quality metrics.
+
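+ The hypothesis behind RPI/RRI is easy to demonstrate numerically: if a
+reliability measure is informative, restricting evaluation to the most
+reliable predictions should lower the error. The synthetic ratings and the
+idealised reliability score below are assumptions for illustration; the
+exact RPI/RRI formulas are not reproduced:
+
+# a minimal sketch: a suitable reliability measure yields lower MAE as we
+# keep only the most reliable predictions
+import numpy as np
+
+rng = np.random.default_rng(0)
+true = rng.uniform(1, 5, 10_000)                # hypothetical ratings
+noise = rng.normal(0, 1, 10_000)
+pred = np.clip(true + noise, 1, 5)              # noisy predictions
+reliability = 1 / (1 + np.abs(noise))           # idealised reliability
+
+for q in (0.0, 0.5, 0.9):                       # keep top slice only
+    keep = reliability >= np.quantile(reliability, q)
+    mae = np.mean(np.abs(true[keep] - pred[keep]))
+    print(f"keep top {100 * (1 - q):.0f}% reliable -> MAE {mae:.3f}")
+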
+
+
+
+
+ + ☆ The Potential of AutoML for Recommender Systems + + +
+ Automated Machine Learning (AutoML) has greatly advanced applications of
+Machine Learning (ML) including model compression, machine translation, and
+computer vision. Recommender Systems (RecSys) can be seen as an application
+of ML. Yet, AutoML has found little attention in the RecSys community; nor
+has RecSys found notable attention in the AutoML community. Only a few
+relatively simple Automated Recommender Systems (AutoRecSys) libraries exist
+that adopt AutoML techniques. However, these libraries are based on student
+projects and do not offer the features and thorough development of AutoML
+libraries. We set out to determine how AutoML libraries perform in the
+scenario of an inexperienced user who wants to implement a recommender
+system. We compared the predictive performance of 60 AutoML, AutoRecSys, ML,
+and RecSys algorithms from 15 libraries, including a mean predictor
+baseline, on 14 explicit feedback RecSys datasets. To simulate the
+perspective of an inexperienced user, the algorithms were evaluated with
+default hyperparameters. We found that AutoML and AutoRecSys libraries
+performed best. AutoML libraries performed best for six of the 14 datasets
+(43%), but it was not always the same AutoML library performing best. The
+single best library was the AutoRecSys library Auto-Surprise, which
+performed best on five datasets (36%). On three datasets (21%), AutoML
+libraries performed poorly, and RecSys libraries with default parameters
+performed best. Although RecSys algorithms obtained 50% of all top-five
+placements per dataset, they fall behind AutoML on average. ML algorithms
+generally performed the worst.
+
+
+
+
+
+ + ☆ Building Retrieval Systems for the ClueWeb22-B Corpus + + +
+ The ClueWeb22 dataset containing nearly 10 billion documents was released in +2022 to support academic and industry research. The goal of this project was to +build retrieval baselines for the English section of the "super head" part +(category B) of this dataset. These baselines can then be used by the research +community to compare their systems and also to generate data to train/evaluate +new retrieval and ranking algorithms. The report covers sparse and dense first +stage retrievals as well as neural rerankers that were implemented for this +dataset. These systems are available as a service on a Carnegie Mellon +University cluster. + +
+
+
+
+
+ + ♻ ☆ K-PERM: Personalized Response Generation Using Dynamic Knowledge + Retrieval and Persona-Adaptive Queries AAAI 2024 + + +
+ Personalizing conversational agents can enhance the quality of
+conversations and increase user engagement. However, such agents often lack
+external knowledge with which to appropriately tend to a user's persona.
+This is particularly crucial for practical applications like mental health
+support, nutrition planning, culturally sensitive conversations, or reducing
+toxic behavior in conversational agents. To enhance the relevance and
+comprehensiveness of personalized responses, we propose a two-step approach
+that involves (1) selectively integrating user personas and (2)
+contextualizing the response with supplementary information from a
+background knowledge source. We develop K-PERM (Knowledge-guided
+PErsonalization with Reward Modulation), a dynamic conversational agent that
+combines these elements. K-PERM achieves state-of-the-art performance on the
+popular FoCus dataset, containing real-world personalized conversations
+concerning global landmarks. We show that using responses from K-PERM can
+improve the performance of state-of-the-art LLMs (GPT-3.5) by 10.5%,
+highlighting the impact of K-PERM for personalizing chatbots.
+
+
+ comment: Accepted at AAAI 2024 Spring Symposium Series +
+
+
+
+
+ + ♻ ☆ Online Recommendations for Agents with Discounted Adaptive Preferences ALT 2024 + + +
+ We consider a bandit recommendations problem in which an agent's preferences +(representing selection probabilities over recommended items) evolve as a +function of past selections, according to an unknown $\textit{preference +model}$. In each round, we show a menu of $k$ items (out of $n$ total) to the +agent, who then chooses a single item, and we aim to minimize regret with +respect to some $\textit{target set}$ (a subset of the item simplex) for +adversarial losses over the agent's choices. Extending the setting from Agarwal +and Brown (2022), where uniform-memory agents were considered, here we allow +for non-uniform memory in which a discount factor is applied to the agent's +memory vector at each subsequent round. In the "long-term memory" regime (when +the effective memory horizon scales with $T$ sublinearly), we show that +efficient sublinear regret is obtainable with respect to the set of +$\textit{everywhere instantaneously realizable distributions}$ (the "EIRD set", +as formulated in prior work) for any $\textit{smooth}$ preference model. +Further, for preferences which are bounded above and below by linear functions +of memory weight (we call these "scale-bounded" preferences) we give an +algorithm which obtains efficient sublinear regret with respect to nearly the +$\textit{entire}$ item simplex. We show an NP-hardness result for expanding to +targets beyond EIRD in general. In the "short-term memory" regime (when the +memory horizon is constant), we show that scale-bounded preferences again +enable efficient sublinear regret for nearly the entire simplex even without +smoothness if losses do not change too frequently, yet we show an +information-theoretic barrier for competing against the EIRD set under +arbitrary smooth preference models even when losses are constant. + +
+
+ comment: Updates for camera-ready version (ALT 2024) +
+
+
+
+
+ + ♻ ☆ Source Code Clone Detection Using Unsupervised Similarity Measures + + +
+ Assessing similarity in source code has gained significant attention in
+recent years due to its importance in software engineering tasks such as
+clone detection and code search and recommendation. This work presents a
+comparative analysis of unsupervised similarity measures for source code
+clone detection. The goal is to give an overview of the current
+state-of-the-art techniques, their strengths, and their weaknesses. To that
+end, we compile the existing unsupervised strategies and evaluate their
+performance on a benchmark dataset to guide software engineers in selecting
+appropriate methods for their specific use cases. The source code of this
+study is available at https://github.com/jorge-martinez-gil/codesim
+
+
+ comment: Accepted for publication as Full Paper in the Software Quality Days + 2024, Vienna, Austria +
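+ One concrete instance of an unsupervised similarity measure of the kind
+the survey compares is character n-gram TF-IDF with cosine similarity; the
+snippets below are hypothetical and this is only one plausible baseline,
+not the paper's full benchmark:
+
+# a minimal sketch: flag likely clone pairs via TF-IDF cosine similarity
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+snippets = [
+    "def add(a, b):\n    return a + b",
+    "def sum_two(x, y):\n    return x + y",          # near-clone of add
+    "def read_file(p):\n    return open(p).read()",  # unrelated
+]
+vec = TfidfVectorizer(analyzer="char", ngram_range=(2, 4))
+sim = cosine_similarity(vec.fit_transform(snippets))
+print(sim.round(2))   # a high off-diagonal entry flags the clone pair
+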
+
+
+
+
+ + ♻ ☆ Re3val: Reinforced and Reranked Generative Retrieval EACL 2023 + + +
+ Generative retrieval models encode pointers to information in a corpus as
+an index within the model's parameters. These models serve as part of a
+larger pipeline, where retrieved information conditions generation for
+knowledge-intensive NLP tasks. However, we identify two limitations: first,
+generative retrieval does not account for contextual information; second,
+the retrieval cannot be tuned for the downstream readers, as decoding the
+page title is a non-differentiable operation. This paper introduces Re3val,
+trained with generative reranking and reinforcement learning using limited
+data. Re3val leverages context acquired via Dense Passage Retrieval to
+rerank the retrieved page titles and utilizes REINFORCE to maximize rewards
+generated by constrained decoding. Additionally, we generate questions from
+our pre-training dataset to mitigate epistemic uncertainty and bridge the
+domain gap between the pre-training and fine-tuning datasets. Subsequently,
+we extract and rerank contexts from the KILT database using the reranked
+page titles. Upon grounding the top five reranked contexts, Re3val achieves
+the top-1 KILT scores among all generative retrieval models across five KILT
+datasets.
+
+
+ comment: 17 pages, 4 figures, Findings of the Association for Computational + Linguistics: EACL 2023 +
+
+
+
+
+ + ♻ ☆ Whole Page Unbiased Learning to Rank + + +
+ Page presentation biases in information retrieval systems, especially in
+click behavior, are a well-known challenge that hinders improving ranking
+models' performance with implicit user feedback. Unbiased Learning to Rank
+(ULTR) algorithms have been proposed to learn an unbiased ranking model from
+biased click data. However, most existing algorithms are specifically
+designed to mitigate position-related bias, e.g., trust bias, without
+considering biases induced by other features in search result page
+presentation (SERP), e.g., the attractive bias induced by multimedia
+content. Unfortunately, those biases widely exist in industrial systems and
+may lead to an unsatisfactory search experience. Therefore, we introduce a
+new problem, i.e., whole-page Unbiased Learning to Rank (WP-ULTR), which
+aims to handle biases induced by whole-page SERP features simultaneously. It
+presents tremendous challenges: (1) a suitable user behavior model (user
+behavior hypothesis) can be hard to find; and (2) complex biases cannot be
+handled by existing algorithms. To address these challenges, we propose a
+Bias Agnostic whole-page unbiased Learning to rank algorithm, named BAL, to
+automatically find the user behavior model with causal discovery and to
+mitigate the biases induced by multiple SERP features with no bias-specific
+design. Experimental results on a real-world dataset verify the
+effectiveness of BAL.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ An In-depth Investigation of User Response Simulation for Conversational + Search + + +
+ Conversational search has seen increased recent attention in both the IR
+and NLP communities. It seeks to clarify and satisfy users' search needs
+through multi-turn natural language interactions. However, most existing
+systems are trained and demonstrated with recorded or artificial
+conversation logs. Eventually, conversational search systems should be
+trained, evaluated, and deployed in an open-ended setting with unseen
+conversation trajectories. A key challenge is that training and evaluating
+such systems both require a human-in-the-loop, which is expensive and does
+not scale. One strategy is to simulate users, thereby reducing the scaling
+costs. However, current user simulators are either limited to only
+responding to yes-no questions from the conversational search system or
+unable to produce high-quality responses in general.
+ In this paper, we show that existing user simulation systems can be
+significantly improved by a smaller, finetuned natural language generation
+model. However, rather than merely reporting it as the new state-of-the-art,
+we consider it a strong baseline and present an in-depth investigation of
+simulating user responses for conversational search. Our goal is to
+supplement existing work with an insightful hand-analysis of the challenges
+left unsolved by this baseline and to propose our solutions. The challenges
+we identified include (1) a blind spot that is difficult to learn, and (2) a
+specific type of misevaluation in the standard setup. We propose a new
+generation system to effectively cover the training blind spot and suggest a
+new evaluation setup to avoid misevaluation. Our proposed system leads to
+significant improvements over existing systems and large language models
+such as GPT-4. Additionally, our analysis provides insights into the nature
+of user simulation to facilitate future work.
+
+
+ comment: To appear in The Web Conference 2024, 8 pages with Appendices +
+
+
+
+
+ + ♻ ☆ Language is All a Graph Needs EACL 2024 + + +
+ The emergence of large-scale pre-trained language models has
+revolutionized various AI research domains. Transformer-based Large
+Language Models (LLMs) have gradually replaced CNNs and RNNs to unify the
+fields of computer vision and natural language processing. Compared with
+independent data samples such as images, videos or texts, graphs usually
+contain rich structural and relational information. Meanwhile, language,
+especially natural language, being one of the most expressive mediums,
+excels at describing complex structures. However, existing work on
+incorporating graph problems into the generative language modeling
+framework remains very limited. Considering the rising prominence of LLMs,
+it becomes essential to explore whether LLMs can also replace GNNs as the
+foundation model for graphs. In this paper, we propose InstructGLM
+(Instruction-finetuned Graph Language Model) with highly scalable prompts
+based on natural language instructions. We use natural language to describe
+the multi-scale geometric structure of the graph and then instruction-tune
+an LLM to perform graph tasks, enabling generative graph learning. Our
+method surpasses all GNN baselines on the ogbn-arxiv, Cora and PubMed
+datasets, underscoring its effectiveness and shedding light on generative
+LLMs as a new foundation model for graph machine learning. Our code is
+open-sourced at https://github.com/agiresearch/InstructGLM.
+
+
+ comment: In EACL 2024 +
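+ The core step of describing a graph's structure in natural language can
+be sketched directly; the prompt template below is an illustrative
+assumption, not InstructGLM's actual instruction set:
+
+# a minimal sketch: render a node-classification query as a prompt
+def graph_to_instruction(adj, features, target):
+    """Describe edges and node features, then ask about one node."""
+    lines = [f"Node {u} connects to nodes {sorted(vs)}."
+             for u, vs in adj.items()]
+    lines += [f"Node {u} has features: {feat}."
+              for u, feat in features.items()]
+    lines.append(f"Question: what is the category of node {target}?")
+    return "\n".join(lines)
+
+adj = {0: {1, 2}, 1: {0}, 2: {0, 1}}
+features = {0: "title='GNN survey'", 1: "title='LLM survey'",
+            2: "title='graph attention'"}
+print(graph_to_instruction(adj, features, target=2))
+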
+
+
+
+
+ + ♻ ☆ EasyInstruct: An Easy-to-use Instruction Processing Framework for Large + Language Models + + +
+ In recent years, instruction tuning has gained increasing attention and +emerged as a crucial technique to enhance the capabilities of Large Language +Models (LLMs). To construct high-quality instruction datasets, many instruction +processing approaches have been proposed, aiming to achieve a delicate balance +between data quantity and data quality. Nevertheless, due to inconsistencies +that persist among various instruction processing methods, there is no standard +open-source instruction processing implementation framework available for the +community, which hinders practitioners from further developing and advancing. +To facilitate instruction processing research and development, we present +EasyInstruct, an easy-to-use instruction processing framework for LLMs, which +modularizes instruction generation, selection, and prompting, while also +considering their combination and interaction. EasyInstruct is publicly +released and actively maintained at https://github.com/zjunlp/EasyInstruct, +along with a running demo App at +https://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for +broader research centered on instruction data. + +
+
+ comment: Ongoing work; the project website is at + https://zjunlp.github.io/project/EasyInstruct, code is at + https://github.com/zjunlp/EasyInstruct, demo is at + https://huggingface.co/spaces/zjunlp/EasyInstruct +
+
+
+
+
+ + ♻ ☆ RimiRec: Modeling Refined Multi-interest in Hierarchical Structure for + Recommendation + + +
+ Industrial recommender systems usually consist of a retrieval stage and a
+ranking stage, in order to handle billions of users and items. The
+retrieval stage retrieves candidate items relevant to user interests for
+recommendation and has attracted much attention. A user frequently shows
+refined multi-interests in a hierarchical structure. For example, a user
+likes Conan and Kuroba Kaito, which are roles in the hierarchical structure
+"Animation, Japanese Animation, Detective Conan". However, most existing
+methods ignore this hierarchical nature and simply average the fine-grained
+interest information. Therefore, we propose a novel two-stage approach to
+explicitly model refined multi-interest in a hierarchical structure for
+recommendation. In the first stage, hierarchical multi-interest mining,
+hierarchical clustering and a transformer-based model adaptively generate
+the circles or sub-circles that users are interested in. In the second
+stage, the partition of the retrieval space allows the embedding-based
+retrieval (EBR) models to deal only with items within each circle and to
+accurately capture users' refined interests. Experimental results show that
+the proposed approach achieves state-of-the-art performance. Our framework
+has also been deployed at Lofter.
+
+
+ comment: 4 pages, 4 figures +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ HarmBench: A Standardized Evaluation Framework for Automated Red Teaming + and Robust Refusal + + +
+ Automated red teaming holds substantial promise for uncovering and mitigating +the risks associated with the malicious use of large language models (LLMs), +yet the field lacks a standardized evaluation framework to rigorously assess +new methods. To address this issue, we introduce HarmBench, a standardized +evaluation framework for automated red teaming. We identify several desirable +properties previously unaccounted for in red teaming evaluations and +systematically design HarmBench to meet these criteria. Using HarmBench, we +conduct a large-scale comparison of 18 red teaming methods and 33 target LLMs +and defenses, yielding novel insights. We also introduce a highly efficient +adversarial training method that greatly enhances LLM robustness across a wide +range of attacks, demonstrating how HarmBench enables codevelopment of attacks +and defenses. We open source HarmBench at +https://github.com/centerforaisafety/HarmBench. + +
+
+ comment: Website: https://www.harmbench.org +
+
+
+
+
+ + ☆ Can Mamba Learn How to Learn? A Comparative Study on In-Context Learning + Tasks + + +
+ State-space models (SSMs), such as Mamba (Gu & Dao, 2023), have been
+proposed as alternatives to Transformer networks in language modeling,
+incorporating gating, convolutions, and input-dependent token selection to
+mitigate the quadratic cost of multi-head attention. Although SSMs exhibit
+competitive performance, their in-context learning (ICL) capabilities, a
+remarkable emergent property of modern language models that enables task
+execution without parameter optimization, remain underexplored compared to
+Transformers. In this study, we evaluate the ICL performance of SSMs,
+focusing on Mamba, against Transformer models across various tasks. Our
+results show that SSMs perform comparably to Transformers in standard
+regression ICL tasks while outperforming them in tasks such as sparse
+parity learning. However, SSMs fall short in tasks involving non-standard
+retrieval functionality. To address these limitations, we introduce a
+hybrid model, MambaFormer, that combines Mamba with attention blocks and
+surpasses the individual models in tasks where they struggle independently.
+Our findings suggest that hybrid architectures offer promising avenues for
+enhancing ICL in language models.
+
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ☆ Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science + + +
+ Intelligent agents powered by large language models (LLMs) have demonstrated +substantial promise in autonomously conducting experiments and facilitating +scientific discoveries across various disciplines. While their capabilities are +promising, they also introduce novel vulnerabilities that demand careful +consideration for safety. However, there exists a notable gap in the +literature, as there has been no comprehensive exploration of these +vulnerabilities. This position paper fills this gap by conducting a thorough +examination of vulnerabilities in LLM-based agents within scientific domains, +shedding light on potential risks associated with their misuse and emphasizing +the need for safety measures. We begin by providing a comprehensive overview of +the potential risks inherent to scientific LLM agents, taking into account user +intent, the specific scientific domain, and their potential impact on the +external environment. Then, we delve into the origins of these vulnerabilities +and provide a scoping review of the limited existing works. Based on our +analysis, we propose a triadic framework involving human regulation, agent +alignment, and an understanding of environmental feedback (agent regulation) to +mitigate these identified risks. Furthermore, we highlight the limitations and +challenges associated with safeguarding scientific agents and advocate for the +development of improved models, robust benchmarks, and comprehensive +regulations to address these issues effectively. + +
+
+
+
+
+ + ☆ CAST: Clustering Self-Attention using Surrogate Tokens for Efficient + Transformers + + +
+ The Transformer architecture has been shown to be a powerful tool for a
+wide range of tasks. It is based on the self-attention mechanism, an
+inherently expensive operation with quadratic computational complexity:
+memory usage and compute time increase quadratically with the length of the
+input sequence, limiting the application of Transformers. In this work, we
+propose a novel Clustering self-Attention mechanism using Surrogate Tokens
+(CAST) to optimize the attention computation and achieve efficient
+transformers. CAST utilizes learnable surrogate tokens to construct a
+cluster affinity matrix, which is used to cluster the input sequence and
+generate novel cluster summaries. The self-attention from within each
+cluster is then combined with the cluster summaries of other clusters,
+enabling information flow across the entire input sequence. CAST improves
+efficiency by reducing the complexity from $O(N^2)$ to $O(\alpha N)$, where
+$N$ is the sequence length and $\alpha$ is a constant determined by the
+number of clusters and the number of samples per cluster. We show that CAST
+performs better than or comparably to baseline Transformers on long-range
+sequence modeling tasks, while also achieving better time and memory
+efficiency than other efficient transformers.
+
+
+
+
+
+ + ☆ MusicRL: Aligning Music Generation to Human Preferences + + +
+ We propose MusicRL, the first music generation system finetuned from human
+feedback. Appreciation of text-to-music models is particularly subjective,
+since the concept of musicality as well as the specific intention behind a
+caption are user-dependent (e.g. a caption such as "upbeat work-out music"
+can map to a retro guitar solo or a techno pop beat). Not only does this
+make supervised training of such models challenging, it also calls for
+integrating continuous human feedback into their post-deployment
+finetuning. MusicRL is a pretrained autoregressive MusicLM (Agostinelli et
+al., 2023) model of discrete audio tokens finetuned with reinforcement
+learning to maximise sequence-level rewards. We design reward functions
+related specifically to text adherence and audio quality with the help of
+selected raters, and use those to finetune MusicLM into MusicRL-R. We
+deploy MusicLM to users and collect a substantial dataset comprising
+300,000 pairwise preferences. Using Reinforcement Learning from Human
+Feedback (RLHF), we train MusicRL-U, the first text-to-music model that
+incorporates human feedback at scale. Human evaluations show that both
+MusicRL-R and MusicRL-U are preferred to the baseline. Ultimately,
+MusicRL-RU combines the two approaches and results in the best model
+according to human raters. Ablation studies shed light on the musical
+attributes influencing human preferences, indicating that text adherence
+and quality account for only part of it. This underscores the prevalence of
+subjectivity in musical appreciation and calls for further involvement of
+human listeners in the finetuning of music generation models.
+
+
+
+
+
+ + ☆ Resource-Aware Hierarchical Federated Learning in Wireless Video Caching + Networks + + +
+ Backhaul traffic congestion caused by the video traffic of a few popular
+files can be alleviated by storing to-be-requested content at various
+levels in wireless video caching networks. Typically, content service
+providers (CSPs) own the content, and users request their preferred content
+from the CSPs through their (wireless) internet service providers (ISPs).
+As these parties do not reveal their private information and business
+secrets, traditional techniques may not be readily usable to predict the
+dynamic changes in users' future demands. Motivated by this, we propose a
+novel resource-aware hierarchical federated learning (RawHFL) solution for
+predicting users' future content requests. A practical data acquisition
+technique is used that allows a user to update its local training dataset
+based on its requested content. Moreover, since networking and other
+computational resources are limited, and only a subset of the users
+participate in model training, we derive the convergence bound of the
+proposed algorithm. Based on this bound, we minimize a weighted utility
+function to jointly configure the controllable parameters so as to train
+RawHFL energy-efficiently under practical resource constraints. Our
+extensive simulation results validate the proposed algorithm's superiority,
+in terms of test accuracy and energy cost, over existing baselines.
+
+
+ comment: Under review for possible publication in IEEE TWC +
+
+
+
+
+ + ☆ Variational Shapley Network: A Probabilistic Approach to Self-Explaining + Shapley values with Uncertainty Quantification + + +
+ Shapley values have emerged as a foundational tool in machine learning
+(ML) for elucidating model decision-making processes. Despite their
+widespread adoption and unique ability to satisfy essential explainability
+axioms, computational challenges persist in their estimation when ($i$)
+evaluating a model over all possible subsets of input features, ($ii$)
+estimating model marginals, and ($iii$) addressing variability in
+explanations. We introduce a novel, self-explaining method that simplifies
+the computation of Shapley values significantly, requiring only a single
+forward pass. Recognizing the deterministic treatment of Shapley values as
+a limitation, we explore incorporating a probabilistic framework to capture
+the inherent uncertainty in explanations. Unlike alternatives, our
+technique does not rely directly on the observed data space to estimate
+marginals; instead, it uses adaptable baseline values derived from a
+latent, feature-specific embedding space, generated by a novel masked
+neural network architecture. Evaluations on simulated and real datasets
+underscore our technique's robust predictive and explanatory performance.
+
+
+
+
+
+ + ☆ Acute kidney injury prediction for non-critical care patients: a + retrospective external and internal validation study + + +
+ Background: Acute kidney injury (AKI), the decline of kidney excretory
+function, occurs in up to 18% of hospitalized admissions. Progression of
+AKI may lead to irreversible kidney damage. Methods: This retrospective
+cohort study includes adult patients admitted to a non-intensive care unit
+at the University of Pittsburgh Medical Center (UPMC) (n = 46,815) and
+University of Florida Health (UFH) (n = 127,202). We developed and compared
+deep learning and conventional machine learning models to predict
+progression to Stage 2 or higher AKI within the next 48 hours. We trained
+local models for each site (UFH Model trained on UFH, UPMC Model trained on
+UPMC) and a separate model with a development cohort of patients from both
+sites (UFH-UPMC Model). We internally and externally validated the models
+on each site and performed subgroup analyses across sex and race. Results:
+Stage 2 or higher AKI occurred in 3% (n = 3,257) and 8% (n = 2,296) of UFH
+and UPMC patients, respectively. Area under the receiver operating
+characteristic curve (AUROC) values for the UFH test cohort ranged between
+0.77 (UPMC Model) and 0.81 (UFH Model), while AUROC values ranged between
+0.79 (UFH Model) and 0.83 (UPMC Model) for the UPMC test cohort. The
+UFH-UPMC Model achieved an AUROC of 0.81 (95% confidence interval [CI]
+[0.80, 0.83]) for the UFH and 0.82 (95% CI [0.81, 0.84]) for the UPMC test
+cohorts, and area under the precision-recall curve (AUPRC) values of 0.06
+(95% CI [0.05, 0.06]) for the UFH and 0.13 (95% CI [0.11, 0.15]) for the
+UPMC test cohorts. Kinetic estimated glomerular filtration rate,
+nephrotoxic drug burden and blood urea nitrogen remained the top three
+features with the highest influence across the models and health centers.
+Conclusion: Locally developed models displayed marginally reduced
+discrimination when tested on another institution, while the top set of
+influencing features remained the same across the models and sites.
+
+
+
+
+
+ + ☆ Gradient Coding in Decentralized Learning for Evading Stragglers + + +
+ In this paper, we consider a decentralized learning problem in the
+presence of stragglers. Although gradient coding techniques have been
+developed for distributed learning to evade stragglers, where devices send
+encoded gradients with redundant training data, it is difficult to apply
+those techniques directly to decentralized learning scenarios. To deal with
+this problem, we propose a new gossip-based decentralized learning method
+with gradient coding (GOCO). In the proposed method, to avoid the negative
+impact of stragglers, the parameter vectors are updated locally using
+encoded gradients based on the framework of stochastic gradient coding and
+then averaged in a gossip-based manner. We analyze the convergence of GOCO
+for strongly convex loss functions and provide simulation results
+demonstrating the superiority of the proposed method, in terms of learning
+performance, over baseline methods.
+
+
+
+
+
+ + ☆ Reinforcement Learning with Ensemble Model Predictive Safety + Certification AAMAS 2024 + + +
+ Reinforcement learning algorithms need exploration to learn. However,
+unsupervised exploration prevents the use of such algorithms on
+safety-critical tasks and limits real-world deployment. In this paper, we
+propose a new algorithm called Ensemble Model Predictive Safety
+Certification that combines model-based deep reinforcement learning with
+tube-based model predictive control to correct the actions taken by a
+learning agent, keeping safety constraint violations at a minimum through
+planning. Our approach aims to reduce the amount of prior knowledge about
+the actual system by requiring only offline data generated by a safe
+controller. Our results show that we can achieve significantly fewer
+constraint violations than comparable reinforcement learning methods.
+
+
+ comment: Published in: Proc. of the 23rd International Conference on + Autonomous Agents and Multiagent Systems (AAMAS 2024) +
+
+
+
+
+ + ☆ Scaling Laws for Downstream Task Performance of Large Language Models + + +
+ Scaling laws provide important insights that can guide the design of large +language models (LLMs). Existing work has primarily focused on studying scaling +laws for pretraining (upstream) loss. However, in transfer learning settings, +in which LLMs are pretrained on an unsupervised dataset and then finetuned on a +downstream task, we often also care about the downstream performance. In this +work, we study the scaling behavior in a transfer learning setting, where LLMs +are finetuned for machine translation tasks. Specifically, we investigate how +the choice of the pretraining data and its size affect downstream performance +(translation quality) as judged by two metrics: downstream cross-entropy and +BLEU score. Our experiments indicate that the size of the finetuning dataset +and the distribution alignment between the pretraining and downstream data +significantly influence the scaling behavior. With sufficient alignment, both +downstream cross-entropy and BLEU score improve monotonically with more +pretraining data. In such cases, we show that it is possible to predict the +downstream BLEU score with good accuracy using a log-law. However, there are +also cases where moderate misalignment causes the BLEU score to fluctuate or +get worse with more pretraining, whereas downstream cross-entropy monotonically +improves. By analyzing these observations, we provide new practical insights +for choosing appropriate pretraining data. + +
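+ The practical upshot of predicting BLEU with a log-law, in the
+well-aligned regime, can be sketched as a simple curve fit. The functional
+form BLEU ~ a + b * log(D) and the numbers below are illustrative
+assumptions, not the paper's fitted law:
+
+# a minimal sketch: fit a log-law to BLEU vs pretraining-data size and
+# extrapolate to a larger pretraining budget
+import numpy as np
+
+pretrain_tokens = np.array([1e8, 3e8, 1e9, 3e9, 1e10])  # hypothetical
+bleu = np.array([18.2, 21.0, 23.9, 26.1, 28.8])         # hypothetical
+
+b, a = np.polyfit(np.log(pretrain_tokens), bleu, deg=1) # least squares
+predict = lambda d: a + b * np.log(d)
+print(f"predicted BLEU at 3e10 tokens: {predict(3e10):.1f}")
+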
+
+
+
+
+ + ☆ Informed Reinforcement Learning for Situation-Aware Traffic Rule + Exceptions ICRA 2024 + + +
+ Reinforcement Learning is a highly active research field with promising
+advancements. In the field of autonomous driving, however, often only very
+simple scenarios are examined. Common approaches use non-interpretable
+control commands as the action space and reward designs that lack
+structure. In this work, we introduce Informed Reinforcement Learning,
+where a structured rulebook is integrated as a knowledge source. We learn
+trajectories and assess them with a situation-aware reward design, leading
+to a dynamic reward that allows the agent to learn situations requiring
+controlled traffic-rule exceptions. Our method is applicable to arbitrary
+RL models. We successfully demonstrate high completion rates of complex
+scenarios with recent model-based agents.
+
+
+ comment: Daniel Bogdoll and Jing Qin contributed equally. Accepted for + publication at ICRA 2024 +
+
+
+
+
+ + ☆ Tempered Calculus for ML: Application to Hyperbolic Model Embedding + + +
+ Most mathematical distortions used in ML are fundamentally integral in
+nature: $f$-divergences, Bregman divergences, (regularized) optimal
+transport distances, integral probability metrics, geodesic distances, etc.
+In this paper, we unveil a grounded theory and tools which can help improve
+these distortions to better cope with ML requirements. We start with a
+generalization of Riemann integration that also encapsulates functions that
+are not strictly additive but are, more generally, $t$-additive, as in
+nonextensive statistical mechanics. Notably, this recovers Volterra's
+product integral as a special case. We then generalize the Fundamental
+Theorem of calculus using an extension of the (Euclidean) derivative. This,
+along with a series of more specific theorems, serves as a basis for
+results showing how one can specifically design, alter, or change
+fundamental properties of distortion measures in a simple way, with special
+emphasis on geometric and ML-related properties: metricity, hyperbolicity,
+and encoding. We show how to apply this to a problem that has recently
+gained traction in ML: hyperbolic embeddings with a "cheap" and accurate
+encoding along the hyperbolic vs Euclidean scale. We unveil a new
+application for which the Poincar\'e disk model has very appealing
+features, and where our theory comes in handy: \textit{model} embeddings
+for boosted combinations of decision trees, trained using the log-loss
+(trees) and logistic loss (combinations).
+
+
+
+
+
+ + ☆ Attention with Markov: A Framework for Principled Analysis of + Transformers via Markov Chains + + +
+ In recent years, attention-based transformers have achieved tremendous +success across a variety of disciplines including natural languages. A key +ingredient behind their success is the generative pretraining procedure, during +which these models are trained on a large text corpus in an auto-regressive +manner. To shed light on this phenomenon, we propose a new framework that +allows both theory and systematic experiments to study the sequential modeling +capabilities of transformers through the lens of Markov chains. Inspired by the +Markovianity of natural languages, we model the data as a Markovian source and +utilize this framework to systematically study the interplay between the +data-distributional properties, the transformer architecture, the learnt +distribution, and the final model performance. In particular, we theoretically +characterize the loss landscape of single-layer transformers and show the +existence of global minima and bad local minima contingent upon the specific +data characteristics and the transformer architecture. Backed by experiments, +we demonstrate that our theoretical findings are in congruence with the +empirical results. We further investigate these findings in the broader context +of higher order Markov chains and deeper architectures, and outline open +problems in this arena. Code is available at +\url{https://github.com/Bond1995/Markov}. + +
+
+
+
+
+ + ☆ Read to Play (R2-Play): Decision Transformer with Multimodal Game + Instruction + + +
+ Developing a generalist agent is a longstanding objective in artificial
+intelligence. Previous efforts utilizing extensive offline datasets from
+various tasks demonstrate remarkable performance in multitasking scenarios
+within Reinforcement Learning. However, these works encounter challenges in
+extending their capabilities to new tasks. Recent approaches integrate
+textual guidance or visual trajectory into decision networks to provide
+task-specific contextual cues, representing a promising direction. However,
+it is observed that relying solely on textual guidance or visual trajectory
+is insufficient for accurately conveying the contextual information of
+tasks. This paper explores enhanced forms of task guidance for agents,
+enabling them to comprehend gameplay instructions, thereby facilitating a
+"read-to-play" capability. Drawing inspiration from the success of
+multimodal instruction tuning in visual tasks, we treat the visual-based RL
+task as a long-horizon vision task and construct a set of multimodal game
+instructions to incorporate instruction tuning into a decision transformer.
+Experimental results demonstrate that incorporating multimodal game
+instructions significantly enhances the decision transformer's multitasking
+and generalization capabilities.
+
+
+
+
+
+ + ☆ Interpretable Multi-Source Data Fusion Through Latent Variable Gaussian + Process + + +
+ With the advent of artificial intelligence (AI) and machine learning
+(ML), various science and engineering communities have leveraged
+data-driven surrogates to model complex systems from numerous sources of
+information (data). This proliferation has led to significant reductions in
+the cost and time involved in developing superior systems designed to
+perform specific functionalities. A high proportion of such surrogates are
+built by extensively fusing multiple sources of data, be it published
+papers, patents, open repositories, or other resources. However, not much
+attention has been paid to the differences in quality and comprehensiveness
+of the known and unknown underlying physical parameters of the information
+sources, which could have downstream implications during system
+optimization. Towards resolving this issue, a multi-source data fusion
+framework based on the Latent Variable Gaussian Process (LVGP) is proposed.
+The individual data sources are tagged with a characteristic categorical
+variable that is mapped into a physically interpretable latent space,
+allowing the development of source-aware data fusion modeling.
+Additionally, a dissimilarity metric based on the latent variables of the
+LVGP is introduced to study and understand the differences between the
+sources of data. The proposed approach is demonstrated on and analyzed
+through two mathematical case studies (a representative parabola problem
+and the 2D Ackley function) and two materials science case studies (design
+of FeCrAl and SmCoFe alloys). From the case studies, it is observed that,
+compared to using single-source and source-unaware ML models, the proposed
+multi-source data fusion framework provides better predictions for
+sparse-data problems, interpretability regarding the sources, and enhanced
+modeling capabilities by taking advantage of the correlations and
+relationships among the different sources.
+
+
+ comment: 27 Pages,9 Figures, 3 Supplementary Figures, 2 Supplementary Tables +
+
+
+
+
+ + ☆ OVOR: OnePrompt with Virtual Outlier Regularization for Rehearsal-Free + Class-Incremental Learning ICLR 2024 + + +
+ Recent works have shown that by using large pre-trained models along with
+learnable prompts, rehearsal-free methods for class-incremental learning
+(CIL) settings can achieve superior performance to prominent
+rehearsal-based ones. Rehearsal-free CIL methods struggle with
+distinguishing classes from different tasks, as those are not trained
+together. In this work we propose a regularization method based on virtual
+outliers to tighten the decision boundaries of the classifier, such that
+confusion of classes among different tasks is mitigated. Recent
+prompt-based methods often require a pool of task-specific prompts, in
+order to prevent overwriting knowledge of previous tasks with that of the
+new task, leading to extra computation in querying and composing an
+appropriate prompt from the pool. This additional cost can be eliminated
+without sacrificing accuracy, as we reveal in the paper. We illustrate that
+a simplified prompt-based method can achieve results comparable to previous
+state-of-the-art (SOTA) methods equipped with a prompt pool, using far
+fewer learnable parameters and lower inference cost. Our regularization
+method has demonstrated its compatibility with different prompt-based
+methods, boosting the accuracy of previous SOTA rehearsal-free CIL methods
+on the ImageNet-R and CIFAR-100 benchmarks. Our source code is available at
+https://github.com/jpmorganchase/ovor.
+
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Scientific Language Modeling: A Quantitative Review of Large Language + Models in Molecular Science + + +
+ Efficient molecular modeling and design are crucial for the discovery and +exploration of novel molecules, and the incorporation of deep learning methods +has revolutionized this field. In particular, large language models (LLMs) +offer a fresh approach to tackle scientific problems from a natural language +processing (NLP) perspective, introducing a research paradigm called scientific +language modeling (SLM). However, two key issues remain: how to quantify the +match between model and data modalities and how to identify the +knowledge-learning preferences of models. To address these challenges, we +propose a multi-modal benchmark, named ChEBI-20-MM, and perform 1263 +experiments to assess the model's compatibility with data modalities and +knowledge acquisition. Through the modal transition probability matrix, we +provide insights into the most suitable modalities for tasks. Furthermore, we +introduce a statistically interpretable approach to discover context-specific +knowledge mapping by localized feature filtering. Our pioneering analysis +offers an exploration of the learning mechanism and paves the way for advancing +SLM in molecular science. + +
+
+
+
+
+ + ☆ SCAFFLSA: Quantifying and Eliminating Heterogeneity Bias in Federated + Linear Stochastic Approximation and Temporal Difference Learning + + +
+ In this paper, we perform a non-asymptotic analysis of the federated linear +stochastic approximation (FedLSA) algorithm. We explicitly quantify the bias +introduced by local training with heterogeneous agents, and investigate the +sample complexity of the algorithm. We show that the communication complexity +of FedLSA scales polynomially with the desired precision $\epsilon$, which +limits the benefits of federation. To overcome this, we propose SCAFFLSA, a +novel variant of FedLSA, that uses control variates to correct the bias of +local training, and prove its convergence without assumptions on statistical +heterogeneity. We apply the proposed methodology to federated temporal +difference learning with linear function approximation, and analyze the +corresponding complexity improvements. + +
+
+
+
+
+ + ☆ Hierarchical Delay Attribution Classification using Unstructured Text in + Train Management Systems + + +
+ EU directives stipulate a systematic follow-up of train delays. In
+Sweden, the Swedish Transport Administration registers each delay and
+assigns an appropriate delay attribution code. However, this code is
+assigned manually, which is a complex task. In this paper, machine
+learning-based decision support for assigning delay attribution codes from
+event descriptions is investigated. The text is transformed using TF-IDF,
+and two models, Random Forest and Support Vector Machine, are evaluated
+against a random uniform classifier and the classification performance of
+the Swedish Transport Administration. Further, the problem is modeled with
+both a hierarchical and a flat approach. The results indicate that the
+hierarchical approach performs better than the flat one. Both approaches
+perform better than the random uniform classifier but worse than the
+manual classification.
+
+
+ comment: 22 pages, 7 figures +
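+ The flat approach maps directly onto a standard text-classification
+pipeline. The event descriptions and codes below are hypothetical, not the
+Swedish Transport Administration's data:
+
+# a minimal sketch: TF-IDF features feeding a Random Forest classifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import make_pipeline
+
+texts = ["signal failure at junction", "overhead wire damaged",
+         "animal on the track", "signal box power outage"]
+codes = ["INFRA-SIGNAL", "INFRA-POWER", "EXTERNAL", "INFRA-POWER"]
+
+clf = make_pipeline(TfidfVectorizer(),
+                    RandomForestClassifier(random_state=0))
+clf.fit(texts, codes)
+print(clf.predict(["power failure near the signal box"]))
+
+A hierarchical variant would first predict a top-level code family and
+then dispatch to a per-family classifier.
+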
+
+
+
+
+ + ☆ An Exploration of Clustering Algorithms for Customer Segmentation in the + UK Retail Market + + +
+ Recently, people's awareness of online purchases has significantly risen.
+This has given rise to online retail platforms and the need for a better
+understanding of customer purchasing behaviour. Retail companies are
+pressed with the need to deal with a high volume of customer purchases,
+which requires sophisticated approaches to perform more accurate and
+efficient customer segmentation. Customer segmentation is a marketing
+analytical tool that aids customer-centric service and thus enhances
+profitability. In this paper, we aim to develop a customer segmentation
+model to improve decision-making processes in the retail market industry.
+To achieve this, we employed a UK-based online retail dataset obtained from
+the UCI machine learning repository. The retail dataset consists of 541,909
+customer records and eight features. Our study adopted the RFM (recency,
+frequency, and monetary) framework to quantify customer values. Thereafter,
+we compared several state-of-the-art (SOTA) clustering algorithms, namely
+K-means clustering, the Gaussian mixture model (GMM), density-based spatial
+clustering of applications with noise (DBSCAN), agglomerative clustering,
+and balanced iterative reducing and clustering using hierarchies (BIRCH).
+The results showed that the GMM outperformed the other approaches, with a
+silhouette score of 0.80.
+
+
+ comment: 15 pages, Journal of Analytics +
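+ The RFM-plus-clustering pipeline can be sketched with scikit-learn; the
+synthetic transactions below stand in for the UCI retail data, and the
+cluster count of four is an illustrative assumption:
+
+# a minimal sketch: standardise RFM features, then compare clusterers by
+# silhouette score as in the study
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+from sklearn.mixture import GaussianMixture
+from sklearn.preprocessing import StandardScaler
+
+rng = np.random.default_rng(0)
+# columns: recency (days), frequency (orders), monetary (spend)
+rfm = np.column_stack([rng.exponential(30, 2000),
+                       rng.poisson(5, 2000) + 1,
+                       rng.lognormal(3, 1, 2000)])
+X = StandardScaler().fit_transform(rfm)
+
+for name, model in [("KMeans", KMeans(4, n_init=10, random_state=0)),
+                    ("GMM", GaussianMixture(4, random_state=0))]:
+    labels = model.fit_predict(X)
+    print(name, "silhouette:", round(silhouette_score(X, labels), 3))
+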
+
+
+
+
+ + ☆ The Use of a Large Language Model for Cyberbullying Detection + + +
+ The dominance of social media has added to the channels of bullying
+available to perpetrators. Unfortunately, cyberbullying (CB) is the most
+prevalent phenomenon in today's cyber world and is a severe threat to the
+mental and physical health of citizens. This creates the need to develop a
+robust system to prevent bullying content on online forums, blogs, and
+social media platforms in order to manage its impact on society. Several
+machine learning (ML) algorithms have been proposed for this purpose.
+However, their performance is not consistent due to high class imbalance
+and generalisation issues. In recent years, large language models (LLMs)
+like BERT and RoBERTa have achieved state-of-the-art (SOTA) results in
+several natural language processing (NLP) tasks. Unfortunately, LLMs have
+not been applied extensively to CB detection. In our paper, we explore the
+use of these models for cyberbullying detection. We prepared a new dataset
+(D2) from existing studies (Formspring and Twitter). Our experimental
+results for datasets D1 and D2 showed that RoBERTa outperformed the other
+models.
+
+
+ comment: 14 pages, Journal of Analytics +
+
+
+
+
+ + ☆ A Hard-to-Beat Baseline for Training-free CLIP-based Adaptation ICLR 2024 + + +
+ Contrastive Language-Image Pretraining (CLIP) has gained popularity for its +remarkable zero-shot capacity. Recent research has focused on developing +efficient fine-tuning methods, such as prompt learning and adapter, to enhance +CLIP's performance in downstream tasks. However, these methods still require +additional training time and computational resources, which is undesirable for +devices with limited resources. In this paper, we revisit a classical +algorithm, Gaussian Discriminant Analysis (GDA), and apply it to the downstream +classification of CLIP. Typically, GDA assumes that features of each class +follow Gaussian distributions with identical covariance. By leveraging Bayes' +formula, the classifier can be expressed in terms of the class means and +covariance, which can be estimated from the data without the need for training. +To integrate knowledge from both visual and textual modalities, we ensemble it +with the original zero-shot classifier within CLIP. Extensive results on 17 +datasets validate that our method surpasses or achieves comparable results with +state-of-the-art methods on few-shot classification, imbalanced learning, and +out-of-distribution generalization. In addition, we extend our method to +base-to-new generalization and unsupervised learning, once again demonstrating +its superiority over competing approaches. Our code is publicly available at +\url{https://github.com/mrflogs/ICLR24}. + +
+
+ comment: Accepted by ICLR 2024 +
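+ The training-free classifier at the heart of the method can be sketched
+from the GDA equations: class means with a shared covariance give a linear
+classifier via Bayes' rule. The random stand-in features, equal class
+priors, and the unit ensembling weight with the zero-shot logits are
+illustrative assumptions:
+
+# a minimal sketch: GDA head on frozen features, no gradient training
+import numpy as np
+
+def gda_classifier(feats, labels, n_classes):
+    mu = np.stack([feats[labels == c].mean(0) for c in range(n_classes)])
+    centered = feats - mu[labels]
+    prec = np.linalg.pinv(centered.T @ centered / len(feats))
+    W = mu @ prec                                  # w_c = Sigma^-1 mu_c
+    b = -0.5 * np.einsum("cd,cd->c", W, mu)        # assumes equal priors
+    return W, b
+
+rng = np.random.default_rng(0)
+feats = rng.normal(size=(160, 32))                 # stand-in for CLIP
+labels = rng.integers(0, 4, 160)
+W, b = gda_classifier(feats, labels, n_classes=4)
+zero_shot_logits = rng.normal(size=(160, 4))       # stand-in zero-shot
+logits = feats @ W.T + b + 1.0 * zero_shot_logits  # simple ensemble
+print("train accuracy:", (logits.argmax(1) == labels).mean())
+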
+
+
+
+
+ + ☆ Provably learning a multi-head attention layer + + +
+ The multi-head attention layer is one of the key components of the +transformer architecture that sets it apart from traditional feed-forward +models. Given a sequence length $k$, attention matrices +$\mathbf{\Theta}_1,\ldots,\mathbf{\Theta}_m\in\mathbb{R}^{d\times d}$, and +projection matrices $\mathbf{W}_1,\ldots,\mathbf{W}_m\in\mathbb{R}^{d\times +d}$, the corresponding multi-head attention layer $F: \mathbb{R}^{k\times d}\to +\mathbb{R}^{k\times d}$ transforms length-$k$ sequences of $d$-dimensional +tokens $\mathbf{X}\in\mathbb{R}^{k\times d}$ via $F(\mathbf{X}) \triangleq +\sum^m_{i=1} +\mathrm{softmax}(\mathbf{X}\mathbf{\Theta}_i\mathbf{X}^\top)\mathbf{X}\mathbf{W}_i$. +In this work, we initiate the study of provably learning a multi-head attention +layer from random examples and give the first nontrivial upper and lower bounds +for this problem: + - Provided $\{\mathbf{W}_i, \mathbf{\Theta}_i\}$ satisfy certain +non-degeneracy conditions, we give a $(dk)^{O(m^3)}$-time algorithm that learns +$F$ to small error given random labeled examples drawn uniformly from $\{\pm +1\}^{k\times d}$. + - We prove computational lower bounds showing that in the worst case, +exponential dependence on $m$ is unavoidable. + We focus on Boolean $\mathbf{X}$ to mimic the discrete nature of tokens in +large language models, though our techniques naturally extend to standard +continuous settings, e.g. Gaussian. Our algorithm, which is centered around +using examples to sculpt a convex body containing the unknown parameters, is a +significant departure from existing provable algorithms for learning +feedforward networks, which predominantly exploit algebraic and rotation +invariance properties of the Gaussian distribution. In contrast, our analysis +is more flexible as it primarily relies on various upper and lower tail bounds +for the input distribution and "slices" thereof. + +
+
+ comment: 105 pages, comments welcome +
+
+
+
+
+ + ☆ An Optimal House Price Prediction Algorithm: XGBoost + + +
+ An accurate prediction of house prices is a fundamental requirement for +various sectors including real estate and mortgage lending. It is widely +recognized that a property's value is not solely determined by its physical +attributes but is significantly influenced by its surrounding neighbourhood. +Meeting the diverse housing needs of individuals while balancing budget +constraints is a primary concern for real estate developers. To this end, we +addressed the house price prediction problem as a regression task and thus +employed various machine learning techniques capable of expressing the +significance of independent variables. We made use of the housing dataset of +Ames City in Iowa, USA to compare support vector regressor, random forest +regressor, XGBoost, multilayer perceptron and multiple linear regression +algorithms for house price prediction. Afterwards, we identified the key +factors that influence housing costs. Our results show that XGBoost is the +best-performing model for house price prediction. + +
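+ A minimal sketch of such a regression benchmark with scikit-learn and the xgboost package, assuming X and y hold numeric Ames features and sale prices; the hyperparameters here are placeholders rather than the paper's settings.
+
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.linear_model import LinearRegression
+ from sklearn.model_selection import cross_val_score
+ from sklearn.neural_network import MLPRegressor
+ from sklearn.svm import SVR
+ from xgboost import XGBRegressor
+
+ def benchmark(X, y):
+     models = {
+         "xgboost": XGBRegressor(n_estimators=500, learning_rate=0.05),
+         "random_forest": RandomForestRegressor(n_estimators=500),
+         "svr": SVR(kernel="rbf"),
+         "mlp": MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=1000),
+         "linear": LinearRegression(),
+     }
+     for name, model in models.items():
+         # Cross-validated RMSE; scikit-learn reports it negated.
+         rmse = -cross_val_score(model, X, y, cv=5,
+                                 scoring="neg_root_mean_squared_error")
+         print(f"{name}: mean RMSE = {rmse.mean():.0f}")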
+
+ comment: 16 pages, Journal of Analytics +
+
+
+
+
+ + ☆ Improved Generalization of Weight Space Networks via Augmentations + + +
+ Learning in deep weight spaces (DWS), where neural networks process the +weights of other neural networks, is an emerging research direction, with +applications to 2D and 3D neural fields (INRs, NeRFs), as well as making +inferences about other types of neural networks. Unfortunately, weight space +models tend to suffer from substantial overfitting. We empirically analyze the +reasons for this overfitting and find that a key reason is the lack of +diversity in DWS datasets. While a given object can be represented by many +different weight configurations, typical INR training sets fail to capture +variability across INRs that represent the same object. To address this, we +explore strategies for data augmentation in weight spaces and propose a MixUp +method adapted for weight spaces. We demonstrate the effectiveness of these +methods in two setups. In classification, they improve performance similarly to +having up to 10 times more data. In self-supervised contrastive learning, they +yield substantial 5-10% gains in downstream classification. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Entropy-regularized Diffusion Policy with Q-Ensembles for Offline + Reinforcement Learning + + +
+ This paper presents advanced techniques for training diffusion policies for +offline reinforcement learning (RL). At the core is a mean-reverting stochastic +differential equation (SDE) that transforms a complex action distribution into a +standard Gaussian and then samples actions conditioned on the environment state +with a corresponding reverse-time SDE, like a typical diffusion policy. We show +that such an SDE has a solution that we can use to calculate the log +probability of the policy, yielding an entropy regularizer that improves the +exploration of offline datasets. To mitigate the impact of inaccurate value +functions from out-of-distribution data points, we further propose to learn the +lower confidence bound of Q-ensembles for more robust policy improvement. By +combining the entropy-regularized diffusion policy with Q-ensembles in offline +RL, our method achieves state-of-the-art performance on most tasks in D4RL +benchmarks. Code is available at +\href{https://github.com/ruoqizzz/Entropy-Regularized-Diffusion-Policy-with-QEnsemble}{https://github.com/ruoqizzz/Entropy-Regularized-Diffusion-Policy-with-QEnsemble}. + +
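+ The lower-confidence-bound target over a Q-ensemble is the most self-contained ingredient here. A hedged PyTorch sketch: the pessimistic target is the ensemble mean minus a multiple of its standard deviation; the coefficient and exact estimator used in the paper may differ.
+
+ import torch
+
+ def q_ensemble_lcb(q_values, beta=2.0):
+     # q_values: (n_ensemble, batch) estimates for the same state-action pairs.
+     # Penalizing by the ensemble spread guards against overestimated
+     # out-of-distribution actions.
+     return q_values.mean(dim=0) - beta * q_values.std(dim=0)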
+
+
+
+
+ + ☆ Retrieve to Explain: Evidence-driven Predictions with Language Models + + +
+ Machine learning models, particularly language models, are notoriously +difficult to introspect. Black-box models can mask both issues in model +training and harmful biases. For human-in-the-loop processes, opaque +predictions can drive a lack of trust, limiting a model's impact even when it +performs effectively. To address these issues, we introduce Retrieve to Explain +(R2E). R2E is a retrieval-based language model that prioritizes amongst a +pre-defined set of possible answers to a research question based on the +evidence in a document corpus, using Shapley values to identify the relative +importance of pieces of evidence to the final prediction. R2E can adapt to new +evidence without retraining, and incorporate structured data through templating +into natural language. We assess R2E on the use case of drug target +identification from published scientific literature, where we show that the +model outperforms an industry-standard genetics-based approach on predicting +clinical trial outcomes. + +
+
+
+
+
+ + ☆ Link Prediction with Relational Hypergraphs + + +
+ Link prediction with knowledge graphs has been thoroughly studied in graph +machine learning, leading to a rich landscape of graph neural network +architectures with successful applications. Nonetheless, it remains challenging +to transfer the success of these architectures to link prediction with +relational hypergraphs. The presence of relational hyperedges makes link +prediction a task between $k$ nodes for varying choices of $k$, which is +substantially harder than link prediction with knowledge graphs, where every +relation is binary ($k=2$). In this paper, we propose two frameworks for link +prediction with relational hypergraphs and conduct a thorough analysis of the +expressive power of the resulting model architectures via corresponding +relational Weisfeiler-Leman algorithms, and also via some natural logical +formalisms. Through extensive empirical analysis, we validate the power of the +proposed model architectures on various relational hypergraph benchmarks. The +resulting model architectures substantially outperform every baseline for +inductive link prediction, and lead to state-of-the-art results for +transductive link prediction. Our study therefore unlocks applications of graph +neural networks to fully relational structures. + +
+
+
+
+
+ + ☆ TopoNav: Topological Navigation for Efficient Exploration in Sparse + Reward Environments + + +
+ Autonomous robots exploring unknown areas face a significant challenge -- +navigating effectively without prior maps and with limited external feedback. +This challenge intensifies in sparse reward environments, where traditional +exploration techniques often fail. In this paper, we introduce TopoNav, a novel +framework that empowers robots to overcome these constraints and achieve +efficient, adaptable, and goal-oriented exploration. TopoNav's fundamental +building blocks are active topological mapping, intrinsic reward mechanisms, +and hierarchical objective prioritization. Throughout its exploration, TopoNav +constructs a dynamic topological map that captures key locations and pathways. +It utilizes intrinsic rewards to guide the robot towards designated sub-goals +within this map, fostering structured exploration even in sparse reward +settings. To ensure efficient navigation, TopoNav employs the Hierarchical +Objective-Driven Active Topologies framework, enabling the robot to prioritize +immediate tasks like obstacle avoidance while maintaining focus on the overall +goal. We demonstrate TopoNav's effectiveness in simulated environments that +replicate real-world conditions. Our results reveal significant improvements in +exploration efficiency, navigational accuracy, and adaptability to unforeseen +obstacles, showcasing its potential to revolutionize autonomous exploration in +a wide range of applications, including search and rescue, environmental +monitoring, and planetary exploration. + +
+
+ comment: Paper under review +
+
+
+
+
+ + ☆ Deep Learning for Multivariate Time Series Imputation: A Survey + + +
+ Missing values are ubiquitous in multivariate time series data, leaving it only +partially observed, destroying the integrity of the time series and hindering +effective time series data analysis. Recently, deep learning imputation methods +have demonstrated remarkable success in elevating the quality of corrupted time +series data, subsequently enhancing performance in downstream tasks. In this +paper, we conduct a comprehensive survey of recently proposed deep learning +imputation methods. First, we propose a taxonomy for the reviewed methods, and +then provide a structured review of these methods by highlighting their +strengths and limitations. We also conduct empirical experiments to study +different methods and compare the enhancement they bring to downstream tasks. +Finally, open issues for future research on multivariate time series imputation +are pointed out. All code and configurations of this work, including a regularly +maintained multivariate time series imputation paper list, can be found in the +GitHub repository~\url{https://github.com/WenjieDu/Awesome\_Imputation}. + +
+
+ comment: 9 pages, 1 figure, 5 tables, 58 referred papers +
+
+
+
+
+ + ☆ More Flexible PAC-Bayesian Meta-Learning by Learning Learning Algorithms + + +
+ We introduce a new framework for studying meta-learning methods using +PAC-Bayesian theory. Its main advantage over previous work is that it allows +for more flexibility in how the transfer of knowledge between tasks is +realized. For previous approaches, this could only happen indirectly, by means +of learning prior distributions over models. In contrast, the new +generalization bounds that we prove express the process of meta-learning much +more directly as learning the learning algorithm that should be used for future +tasks. The flexibility of our framework makes it suitable to analyze a wide +range of meta-learning mechanisms and even design new mechanisms. Other than +our theoretical contributions we also show empirically that our framework +improves the prediction quality in practical meta-learning mechanisms. + +
+
+
+
+
+ + ☆ Analysis of Linear Mode Connectivity via Permutation-Based Weight + Matching + + +
+ Recently, Ainsworth et al. showed that using weight matching (WM) to minimize +the $L_2$ distance in a permutation search of model parameters effectively +identifies permutations that satisfy linear mode connectivity (LMC), in which +the loss along a linear path between two independently trained models with +different seeds remains nearly constant. This paper provides a theoretical +analysis of LMC using WM, which is crucial for understanding stochastic +gradient descent's effectiveness and its application in areas like model +merging. We first experimentally and theoretically show that permutations found +by WM do not significantly reduce the $L_2$ distance between two models and the +occurrence of LMC is not merely due to distance reduction by WM in itself. We +then provide theoretical insights showing that permutations can change the +directions of the singular vectors, but not the singular values, of the weight +matrices in each layer. This finding shows that permutations found by WM mainly +align the directions of singular vectors associated with large singular values +across models. This alignment brings the singular vectors with large singular +values, which determine the model functionality, closer between pre-merged and +post-merged models, so that the post-merged model retains functionality similar +to the pre-merged models, making it easy to satisfy LMC. Finally, we analyze +the difference between WM and straight-through estimator (STE), a +dataset-dependent permutation search method, and show that WM outperforms STE, +especially when merging three or more models. + +
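+ For a single layer, the WM step reduces to an assignment problem. A sketch of that building block with SciPy; the full method coordinates permutations across all layers (e.g. by coordinate descent), which this deliberately omits, and the names are illustrative.
+
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+ def match_units(W_a, W_b):
+     # Permutation of model B's hidden units minimizing ||W_a - P W_b||_2,
+     # equivalently maximizing the inner product <W_a, P W_b>.
+     row, col = linear_sum_assignment(-(W_a @ W_b.T))
+     perm = np.empty_like(col)
+     perm[row] = col
+     # Apply as W_b[perm]; the next layer's input columns must be permuted
+     # accordingly so that model B's function is unchanged.
+     return perm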
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Connecting the Dots: Collaborative Fine-tuning for Black-Box + Vision-Language Models + + +
+ With the emergence of pretrained vision-language models (VLMs), considerable +efforts have been devoted to fine-tuning them for downstream tasks. Despite the +progress made in designing efficient fine-tuning methods, such methods require +access to the model's parameters, which can be challenging as model owners +often opt to provide their models as a black box to safeguard model ownership. +This paper proposes a \textbf{C}ollabo\textbf{ra}tive +\textbf{F}ine-\textbf{T}uning (\textbf{CraFT}) approach for fine-tuning +black-box VLMs to downstream tasks, where one only has access to the input +prompts and the output predictions of the model. CraFT comprises two modules, a +prompt generation module for learning text prompts and a prediction refinement +module for enhancing output predictions in residual style. Additionally, we +introduce an auxiliary prediction-consistent loss to promote consistent +optimization across these modules. These modules are optimized by a novel +collaborative training algorithm. Extensive experiments on few-shot +classification over 15 datasets demonstrate the superiority of CraFT. The +results show that CraFT achieves a decent gain of about 12\% with 16-shot +datasets and only 8,000 queries. Moreover, CraFT trains faster and uses only +about 1/80 of the memory footprint for deployment, while sacrificing only +1.62\% compared to the white-box method. + +
+
+
+
+
+ + ☆ Generative Modeling of Graphs via Joint Diffusion of Node and Edge + Attributes + + +
+ Graph generation is integral to various engineering and scientific +disciplines. Nevertheless, existing methodologies tend to overlook the +generation of edge attributes. However, we identify critical applications where +edge attributes are essential, making prior methods potentially unsuitable in +such contexts. Moreover, while trivial adaptations are available, empirical +investigations reveal their limited efficacy as they do not properly model the +interplay among graph components. To address this, we propose a joint +score-based model of nodes and edges for graph generation that considers all +graph components. Our approach offers two key novelties: (i) node and edge +attributes are combined in an attention module that generates samples based on +the two ingredients; and (ii) node, edge and adjacency information are mutually +dependent during the graph diffusion process. We evaluate our method on +challenging benchmarks involving real-world and synthetic datasets in which +edge features are crucial. Additionally, we introduce a new synthetic dataset +that incorporates edge values. Furthermore, we propose a novel application that +greatly benefits from the method due to its nature: the generation of traffic +scenes represented as graphs. Our method outperforms other graph generation +methods, demonstrating a significant advantage in edge-related measures. + +
+
+
+
+
+ + ☆ PAC-Bayesian Adversarially Robust Generalization Bounds for Graph Neural + Network + + +
+ Graph neural networks (GNNs) have gained popularity for various graph-related +tasks. However, similar to deep neural networks, GNNs are also vulnerable to +adversarial attacks. Empirical studies have shown that adversarially robust +generalization has a pivotal role in establishing effective defense algorithms +against adversarial attacks. In this paper, we contribute by providing +adversarially robust generalization bounds for two kinds of popular GNNs, graph +convolutional network (GCN) and message passing graph neural network, using the +PAC-Bayesian framework. Our result reveals that the spectral norm of the +diffusion matrix on the graph, the spectral norm of the weights, and the +perturbation factor govern the robust generalization bounds of both models. Our +bounds are nontrivial generalizations of the results developed in (Liao et al., +2020) from the standard setting to the adversarial setting while avoiding +exponential dependence on the maximum node degree. As corollaries, we derive +better PAC-Bayesian robust generalization bounds for GCN in the standard +setting, which improve the bounds in (Liao et al., 2020) by avoiding +exponential dependence on the maximum node degree. + +
+
+ comment: 32 pages +
+
+
+
+
+ + ☆ On provable privacy vulnerabilities of graph representations + + +
+ Graph representation learning (GRL) is critical for extracting insights from +complex network structures, but it also raises security concerns due to +potential privacy vulnerabilities in these representations. This paper +investigates the structural vulnerabilities in graph neural models where +sensitive topological information can be inferred through edge reconstruction +attacks. Our research primarily addresses the theoretical underpinnings of +cosine-similarity-based edge reconstruction attacks (COSERA), providing +theoretical and empirical evidence that such attacks can perfectly reconstruct +sparse Erdos Renyi graphs with independent random features as graph size +increases. Conversely, we establish that sparsity is a critical factor for +COSERA's effectiveness, as demonstrated through analysis and experiments on +stochastic block models. Finally, we explore the resilience of (provably) +private graph representations produced via noisy aggregation (NAG) mechanism +against COSERA. We empirically delineate instances wherein COSERA demonstrates +both efficacy and deficiency in its capacity to function as an instrument for +elucidating the trade-off between privacy and utility. + +
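+ The attack family under analysis is simple to state. A minimal sketch of cosine-similarity edge reconstruction from released node embeddings; the threshold is the attacker's choice, and this is an illustration rather than the paper's evaluation code.
+
+ import numpy as np
+
+ def reconstruct_edges(Z, threshold=0.9):
+     # Z: (n, d) node embeddings. Predict an edge wherever the cosine
+     # similarity of two nodes' embeddings exceeds the threshold.
+     Zn = Z / np.linalg.norm(Z, axis=1, keepdims=True)
+     S = Zn @ Zn.T
+     np.fill_diagonal(S, -np.inf)   # ignore self-loops
+     return S > threshold           # boolean adjacency estimate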
+
+
+
+
+ + ☆ Polyp-DDPM: Diffusion-Based Semantic Polyp Synthesis for Enhanced + Segmentation + + +
+ This study introduces Polyp-DDPM, a diffusion-based method for generating +realistic images of polyps conditioned on masks, aimed at enhancing the +segmentation of gastrointestinal (GI) tract polyps. Our approach addresses the +challenges of data limitations, high annotation costs, and privacy concerns +associated with medical images. By conditioning the diffusion model on +segmentation masks (binary masks that represent abnormal areas), Polyp-DDPM +outperforms state-of-the-art methods in terms of image quality (achieving a +Frechet Inception Distance (FID) score of 78.47, compared to scores above +83.79) and segmentation performance (achieving an Intersection over Union (IoU) +of 0.7156, versus less than 0.6694 for synthetic images from baseline models +and 0.7067 for real data). Our method generates a high-quality, diverse +synthetic dataset for training, allowing polyp segmentation models trained on +it to perform comparably to models trained on real images, while offering +greater data augmentation capabilities. The source code and pretrained +weights for Polyp-DDPM are made publicly available at +https://github.com/mobaidoctor/polyp-ddpm. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Reducing the Cost of Quantum Chemical Data By Backpropagating Through + Density Functional Theory + + +
+ Density Functional Theory (DFT) accurately predicts the quantum chemical +properties of molecules, but scales as $O(N_{\text{electrons}}^3)$. Sch\"utt et +al. (2019) successfully approximate DFT 1000x faster with Neural Networks (NN). +Arguably, the biggest problem one faces when scaling to larger molecules is the +cost of DFT labels. For example, it took years to create the PCQ dataset +(Nakata & Shimazaki, 2017) on which subsequent NNs are trained within a week. +DFT labels molecules by minimizing energy $E(\cdot )$ as a "loss function." We +bypass dataset creation by directly training NNs with $E(\cdot )$ as a loss +function. For comparison, Sch\"utt et al. (2019) spent 626 hours creating a +dataset on which they trained their NN for 160h, for a total of 786h; our +method achieves comparable performance within 31h. + +
+
+
+
+
+ + ☆ Positive concave deep equilibrium models + + +
+ Deep equilibrium (DEQ) models are widely recognized as a memory efficient +alternative to standard neural networks, achieving state-of-the-art performance +in language modeling and computer vision tasks. These models solve a fixed +point equation instead of explicitly computing the output, which sets them +apart from standard neural networks. However, existing DEQ models often lack +formal guarantees of the existence and uniqueness of the fixed point, and the +convergence of the numerical scheme used for computing the fixed point is not +formally established. As a result, DEQ models are potentially unstable in +practice. To address these drawbacks, we introduce a novel class of DEQ models +called positive concave deep equilibrium (pcDEQ) models. Our approach, which is +based on nonlinear Perron-Frobenius theory, enforces nonnegative weights and +activation functions that are concave on the positive orthant. By imposing +these constraints, we can easily ensure the existence and uniqueness of the +fixed point without relying on additional complex assumptions commonly found in +the DEQ literature, such as those based on monotone operator theory in convex +analysis. Furthermore, the fixed point can be computed with the standard fixed +point algorithm, and we provide theoretical guarantees of geometric +convergence, which, in particular, simplifies the training process. Experiments +demonstrate the competitiveness of our pcDEQ models against other implicit +models. + +
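+ A toy sketch of the construction: nonnegative weights, a nonnegative input, and an activation that is concave on the positive orthant (tanh restricted to nonnegative inputs qualifies), solved by the standard fixed-point iteration. Illustrative only; the paper's conditions are what guarantee existence, uniqueness, and geometric convergence.
+
+ import numpy as np
+
+ def pcdeq_forward(W, U, x, n_iters=100):
+     # Solve z = tanh(W z + U x) with W, U, x >= 0, so every pre-activation
+     # stays in the region where tanh is concave and nonnegative.
+     W, U, x = np.abs(W), np.abs(U), np.abs(x)   # enforce nonnegativity
+     z = np.zeros(W.shape[0])
+     for _ in range(n_iters):                    # plain fixed-point scheme
+         z = np.tanh(W @ z + U @ x)
+     return z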
+
+
+
+
+ + ☆ AlbNews: A Corpus of Headlines for Topic Modeling in Albanian + + +
+ The scarcity of available text corpora for low-resource languages like +Albanian is a serious hurdle for research in natural language processing tasks. +This paper introduces AlbNews, a collection of 600 topically labeled news +headlines and 2600 unlabeled ones in Albanian. The data can be freely used for +conducting topic modeling research. We report the initial classification scores +of some traditional machine learning classifiers trained with the AlbNews +samples. These results show that basic models outperform the ensemble learning +ones and can serve as a baseline for future experiments. + +
+
+
+
+
+ + ☆ A General Theory for Kernel Packets: from state space model to compactly + supported basis + + +
+ It is well known that the state space (SS) model formulation of a Gaussian +process (GP) can lower both its training and prediction time to O(n) for n data +points. We prove that an $m$-dimensional SS model formulation of a GP is +equivalent to a concept we introduce as the general right Kernel Packet (KP): a +transformation of the GP covariance function $K$ such that +$\sum_{i=0}^{m}a_iD_t^{(j)}K(t,t_i)=0$ holds for any $t \leq t_1$, $0 \leq j +\leq m-1$, and $m+1$ consecutive points $t_i$, where ${D}_t^{(j)}f(t)$ denotes +the $j$-th order derivative acting on $t$. We extend this idea to the backward SS +model formulation of the GP, leading to the concept of the left KP for the next +$m$ consecutive points: $\sum_{i=0}^{m}b_i{D}_t^{(j)}K(t,t_{m+i})=0$ for any +$t\geq t_{2m}$. By combining both left and right KPs, we can prove that a +suitable linear combination of these covariance functions yields $m$ compactly +supported KP functions: $\phi^{(j)}(t)=0$ for any $t\not\in(t_0,t_{2m})$ and +$j=0,\cdots,m-1$. KPs further reduce the prediction time of a GP to O(log n) or +even O(1) and can be applied to more general problems involving derivatives of +GPs. + +
+
+
+
+
+ + ☆ Efficient Availability Attacks against Supervised and Contrastive + Learning Simultaneously + + +
+ Availability attacks can prevent the unauthorized use of private data and +commercial datasets by generating imperceptible noise and making unlearnable +examples before release. Ideally, the obtained unlearnability prevents +algorithms from training usable models. When supervised learning (SL) +algorithms have failed, a malicious data collector possibly resorts to +contrastive learning (CL) algorithms to bypass the protection. Through +evaluation, we have found that most of the existing methods are unable to +achieve both supervised and contrastive unlearnability, which poses risks to +data protection. Different from recent methods based on contrastive error +minimization, we employ contrastive-like data augmentations in supervised error +minimization or maximization frameworks to obtain attacks effective for both SL +and CL. Our proposed AUE and AAP attacks achieve state-of-the-art worst-case +unlearnability across SL and CL algorithms with less computation consumption, +showcasing prospects in real-world applications. + +
+
+
+
+
+ + ☆ Bayesian Uncertainty for Gradient Aggregation in Multi-Task Learning + + +
+ As machine learning becomes more prominent, there is a growing demand to +perform several inference tasks in parallel. Running a dedicated model for each +task is computationally expensive and therefore there is great interest in +multi-task learning (MTL). MTL aims at learning a single model that solves +several tasks efficiently. Optimizing MTL models is often achieved by computing +a single gradient per task and aggregating them to obtain a combined update +direction. However, these approaches do not consider an important aspect, the +sensitivity in the gradient dimensions. Here, we introduce a novel gradient +aggregation approach using Bayesian inference. We place a probability +distribution over the task-specific parameters, which in turn induces a +distribution over the gradients of the tasks. This additional valuable +information allows us to quantify the uncertainty in each of the gradient +dimensions, which can then be factored in when aggregating them. We empirically +demonstrate the benefits of our approach on a variety of datasets, achieving +state-of-the-art performance. + +
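+ One way to make the idea concrete is precision weighting: down-weight the gradient dimensions a task is uncertain about. The sketch below is a hedged simplification of the Bayesian aggregation described above, not the paper's exact estimator.
+
+ import numpy as np
+
+ def precision_weighted_update(grad_means, grad_vars, eps=1e-8):
+     # grad_means, grad_vars: per-task lists of (d,) arrays holding the
+     # mean and variance of each task's gradient distribution.
+     prec = 1.0 / (np.stack(grad_vars) + eps)   # (tasks, d) precisions
+     weights = prec / prec.sum(axis=0)          # normalize per dimension
+     return (weights * np.stack(grad_means)).sum(axis=0)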
+
+
+
+
+ + ☆ Understanding the Effect of Noise in LLM Training Data with Algorithmic + Chains of Thought + + +
+ During both pretraining and fine-tuning, Large Language Models +(\textbf{LLMs}) are trained on trillions of tokens of text of widely varying +quality. Both phases of training typically involve heuristically filtering out +``low-quality'' or \textit{noisy} training samples, yet little is known +quantitatively about how the type or intensity of noise affects downstream +performance. In this work, we study how noise in chain of thought +(\textbf{CoT}) impacts task performance in the highly-controlled setting of +algorithmically solvable tasks. First, we develop the Traced Integer +(\textbf{TInt}) framework to generate highly customizable noised execution +traces for any arithmetic function on lists of integers. We then define two +types of noise: \textit{static} noise, a local form of noise which is applied +after the CoT trace is computed, and \textit{dynamic} noise, a global form of +noise which propagates errors in the trace as it is computed. We then evaluate +the test performance of pretrained models both prompted and fine-tuned on +noised datasets with varying levels of dataset contamination and intensity. We +find fine-tuned models are extremely robust to high levels of static noise but +struggle significantly more with lower levels of dynamic noise. In contrast, +few-shot prompted models appear more sensitive to even static noise. We +conclude with a discussion of how our findings impact noise filtering +best-practices, in particular emphasizing the importance of removing samples +containing destructive dynamic noise with global errors. + +
+
+
+
+
+ + ☆ Gradient Sketches for Training Data Attribution and Studying the Loss + Landscape + + +
+ Random projections or sketches of gradients and Hessian vector products play +an essential role in applications where one needs to store many such vectors +while retaining accurate information about their relative geometry. Two +important scenarios are training data attribution (tracing a model's behavior +to the training data), where one needs to store a gradient for each training +example, and the study of the spectrum of the Hessian (to analyze the training +dynamics), where one needs to store multiple Hessian vector products. While +sketches that use dense matrices are easy to implement, they are memory bound +and cannot be scaled to modern neural networks. Motivated by work on the +intrinsic dimension of neural networks, we propose and study a design space for +scalable sketching algorithms. We demonstrate the efficacy of our approach in +three applications: training data attribution, the analysis of the Hessian +spectrum and the computation of the intrinsic dimension when fine-tuning +pre-trained language models. + +
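+ The memory obstacle and one way around it are easy to illustrate: regenerate the projection matrix from a seed in column blocks instead of storing a dense (sketch_dim x d) matrix. A sketch of this idea under those assumptions; the paper studies a richer design space of scalable sketches.
+
+ import numpy as np
+
+ def sketch_vector(v, sketch_dim=4096, seed=0, block=8192):
+     # Project a flattened gradient (or Hessian-vector product) without
+     # materializing the full projection matrix: column blocks are
+     # regenerated deterministically from the seed as needed.
+     rng = np.random.default_rng(seed)
+     out = np.zeros(sketch_dim)
+     for start in range(0, v.shape[0], block):
+         chunk = v[start:start + block]
+         R = rng.standard_normal((sketch_dim, chunk.shape[0]))
+         out += R @ chunk
+     return out / np.sqrt(sketch_dim)   # preserves geometry in expectation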
+
+
+
+
+ + ☆ Space Group Constrained Crystal Generation ICLR 2024 + + +
+ Crystals are the foundation of numerous scientific and industrial +applications. While various learning-based approaches have been proposed for +crystal generation, existing methods seldom consider the space group constraint +which is crucial in describing the geometry of crystals and closely relevant to +many desirable properties. However, considering space group constraint is +challenging owing to its diverse and nontrivial forms. In this paper, we reduce +the space group constraint into an equivalent formulation that is more +tractable to be handcrafted into the generation process. In particular, we +translate the space group constraint into two parts: the basis constraint of +the invariant logarithmic space of the lattice matrix and the Wyckoff position +constraint of the fractional coordinates. Upon the derived constraints, we then +propose DiffCSP++, a novel diffusion model that has enhanced a previous work +DiffCSP by further taking space group constraint into account. Experiments on +several popular datasets verify the benefit of the involvement of the space +group constraint, and show that our DiffCSP++ achieves promising performance on +crystal structure prediction, ab initio crystal generation and controllable +generation with customized space groups. + +
+
+ comment: ICLR 2024 poster +
+
+
+
+
+ + ☆ Neural Rank Collapse: Weight Decay and Small Within-Class Variability + Yield Low-Rank Bias + + +
+ Recent work in deep learning has shown strong empirical and theoretical +evidence of an implicit low-rank bias: weight matrices in deep networks tend to +be approximately low-rank and removing relatively small singular values during +training or from available trained models may significantly reduce model size +while maintaining or even improving model performance. However, the majority of +the theoretical investigations around low-rank bias in neural networks deal +with oversimplified deep linear networks. In this work, we consider general +networks with nonlinear activations and the weight decay parameter, and we show +the presence of an intriguing neural rank collapse phenomenon, connecting the +low-rank bias of trained networks with networks' neural collapse properties: as +the weight decay parameter grows, the rank of each layer in the network +decreases proportionally to the within-class variability of the hidden-space +embeddings of the previous layers. Our theoretical findings are supported by a +range of experimental evaluations illustrating the phenomenon. + +
+
+
+
+
+ + ☆ Subsampling is not Magic: Why Large Batch Sizes Work for Differentially + Private Stochastic Optimisation + + +
+ We study the effect of the batch size on the total gradient variance in +differentially private stochastic gradient descent (DP-SGD), seeking a +theoretical explanation for the usefulness of large batch sizes. As DP-SGD is +the basis of modern DP deep learning, its properties have been widely studied, +and recent works have empirically found large batch sizes to be beneficial. +However, theoretical explanations of this benefit are currently heuristic at +best. We first observe that the total gradient variance in DP-SGD can be +decomposed into subsampling-induced and noise-induced variances. We then prove +that in the limit of an infinite number of iterations, the effective +noise-induced variance is invariant to the batch size. The remaining +subsampling-induced variance decreases with larger batch sizes, so large +batches reduce the effective total gradient variance. We confirm numerically +that the asymptotic regime is relevant in practical settings when the batch +size is not small, and find that outside the asymptotic regime, the total +gradient variance decreases even more with large batch sizes. We also find a +sufficient condition that implies that large batch sizes similarly reduce +effective DP noise variance for one iteration of DP-SGD. + +
+
+
+
+
+ + ☆ A Bias-Variance Decomposition for Ensembles over Multiple Synthetic + Datasets + + +
+ Recent studies have highlighted the benefits of generating multiple synthetic +datasets for supervised learning, from increased accuracy to more effective +model selection and uncertainty estimation. These benefits have clear empirical +support, but the theoretical understanding of them is currently very limited. We +seek to increase the theoretical understanding by deriving bias-variance +decompositions for several settings of using multiple synthetic datasets. Our +theory predicts multiple synthetic datasets to be especially beneficial for +high-variance downstream predictors, and yields a simple rule of thumb to +select the appropriate number of synthetic datasets in the case of mean-squared +error and Brier score. We investigate how our theory works in practice by +evaluating the performance of an ensemble over many synthetic datasets for +several real datasets and downstream predictors. The results follow our theory, +showing that our insights are also practically relevant. + +
+
+
+
+
+ + ☆ On Convergence of Adam for Stochastic Optimization under Relaxed + Assumptions + + +
+ The Adaptive Momentum Estimation (Adam) algorithm is highly effective in +training various deep learning tasks. Despite this, there is limited theoretical +understanding of Adam, especially when focusing on its vanilla form in +non-convex smooth scenarios with potentially unbounded gradients and affine +variance noise. In this paper, we study vanilla Adam under these challenging +conditions. We introduce a comprehensive noise model which governs affine +variance noise, bounded noise and sub-Gaussian noise. We show that Adam can +find a stationary point at a $\mathcal{O}(\text{poly}(\log T)/\sqrt{T})$ rate +with high probability under this general noise model, where $T$ denotes the +total number of iterations, matching the lower bound for stochastic first-order +algorithms up to logarithmic factors. More importantly, we reveal that Adam +requires no tuning of step-sizes based on any problem parameters, yielding a +better adaptation property than Stochastic Gradient Descent under the same +conditions. We also provide a probabilistic convergence result for Adam under a +generalized smooth condition which allows unbounded smoothness parameters and +has been illustrated empirically to capture the smoothness of many practical +objective functions more accurately. + +
+
+
+
+
+ + ☆ Cross Entropy versus Label Smoothing: A Neural Collapse Perspective + + +
+ Label smoothing loss is a widely adopted technique to mitigate overfitting in +deep neural networks. This paper studies label smoothing from the perspective +of Neural Collapse (NC), a powerful empirical and theoretical framework which +characterizes model behavior during the terminal phase of training. We first +show empirically that models trained with label smoothing converge faster to +neural collapse solutions and attain a stronger level of neural collapse. +Additionally, we show that at the same level of NC1, models under label +smoothing loss exhibit intensified NC2. These findings provide valuable +insights into the performance benefits and enhanced model calibration under +label smoothing loss. We then leverage the unconstrained feature model to +derive closed-form solutions for the global minimizers of both loss functions +and further demonstrate that models under label smoothing have a lower +condition number and, therefore, theoretically converge faster. Our study, +combining empirical evidence and theoretical results, not only provides nuanced +insights into the differences between label smoothing and cross-entropy losses, +but also serves as an example of how the powerful neural collapse framework can +be used to improve our understanding of DNNs. + +
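+ For reference, the loss under study is cross-entropy against targets mixed with the uniform distribution. A minimal NumPy version; eps is the smoothing parameter, and eps = 0 recovers plain cross-entropy.
+
+ import numpy as np
+
+ def label_smoothing_loss(logits, labels, eps=0.1):
+     # Smoothed target: (1 - eps) * one_hot + eps / K over K classes.
+     n, K = logits.shape
+     logp = logits - logits.max(axis=1, keepdims=True)
+     logp -= np.log(np.exp(logp).sum(axis=1, keepdims=True))
+     target = np.full((n, K), eps / K)
+     target[np.arange(n), labels] += 1.0 - eps
+     return -(target * logp).sum(axis=1).mean()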
+
+
+
+
+ + ☆ Humans Beat Deep Networks at Recognizing Objects in Unusual Poses, Given + Enough Time + + +
+ Deep learning is closing the gap with humans on several object recognition +benchmarks. Here we investigate this gap in the context of challenging images +where objects are seen from unusual viewpoints. We find that humans excel at +recognizing objects in unusual poses, in contrast with state-of-the-art +pretrained networks (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) which are +systematically brittle in this condition. Remarkably, as we limit image +exposure time, human performance degrades to the level of deep networks, +suggesting that additional mental processes (requiring additional time) take +place when humans identify objects in unusual poses. Finally, our analysis of +error patterns of humans vs. networks reveals that even time-limited humans are +dissimilar to feed-forward deep networks. We conclude that more work is needed +to bring computer vision systems to the level of robustness of the human visual +system. Understanding the nature of the mental processes taking place during +extra viewing time may be key to attain such robustness. + +
+
+
+
+
+ + ☆ Tabular Data: Is Attention All You Need? + + +
+ Deep Learning has revolutionized the field of AI and led to remarkable +achievements in applications involving image and text data. Unfortunately, +there is inconclusive evidence on the merits of neural networks for structured +tabular data. In this paper, we introduce a large-scale empirical study +comparing not only neural networks against gradient-boosted decision trees on +tabular data, but also transformer-based architectures against traditional +multi-layer perceptrons (MLP) with residual connections. In contrast to prior +work, our empirical findings indicate that neural networks are competitive +against decision trees. Furthermore, we find that transformer-based +architectures do not outperform simpler variants of traditional MLP +architectures on tabular datasets. As a result, this paper helps the research +and practitioner communities make informed choices on deploying neural networks +in future tabular data applications. + +
+
+
+
+
+ + ☆ In-context learning agents are asymmetric belief updaters + + +
+ We study the in-context learning dynamics of large language models (LLMs) +using three instrumental learning tasks adapted from cognitive psychology. We +find that LLMs update their beliefs in an asymmetric manner and learn more from +better-than-expected outcomes than from worse-than-expected ones. Furthermore, +we show that this effect reverses when learning about counterfactual feedback +and disappears when no agency is implied. We corroborate these findings by +investigating idealized in-context learning agents derived through +meta-reinforcement learning, where we observe similar patterns. Taken together, +our results contribute to our understanding of how in-context learning works by +highlighting that the framing of a problem significantly influences how +learning occurs, a phenomenon also observed in human cognition. + +
+
+
+
+
+ + ☆ On dimensionality of feature vectors in MPNNs + + +
+ We revisit the classical result of Morris et al.~(AAAI'19) that +message-passing graph neural networks (MPNNs) are equal in their +distinguishing power to the Weisfeiler--Leman (WL) isomorphism test. + Morris et al.~show their simulation result with ReLU activation function and +$O(n)$-dimensional feature vectors, where $n$ is the number of nodes of the +graph. Recently, by introducing randomness into the architecture, Aamand et +al.~(NeurIPS'22) were able to improve this bound to $O(\log n)$-dimensional +feature vectors, although at the expense of guaranteeing perfect simulation +only with high probability. + In all these constructions, to guarantee equivalence to the WL test, the +dimension of feature vectors in the MPNN has to increase with the size of the +graphs. However, architectures used in practice have feature vectors of +constant dimension. Thus, there is a gap between the guarantees provided by +these results and the actual characteristics of architectures used in practice. +In this paper, we close this gap by showing that, for \emph{any} non-polynomial +analytic (like the sigmoid) activation function, to guarantee that MPNNs are +equivalent to the WL test, feature vectors of dimension $d=1$ are all we need, +independently of the size of the graphs. + Our main technical insight is that for simulating multi-sets in the WL test, +it is enough to use linear independence of feature vectors over the rationals +instead of the reals. Countability of the set of rationals together with nice +properties of analytic functions allows us to carry out the simulation, +invariant over the iterations of the WL test, without increasing the dimension +of the feature vectors. + +
+
+ comment: 15 pages, 2 figures +
+
+
+
+
+ + ☆ Discovery of the Hidden World with Large Language Models + + +
+ Science originates in the discovery of new causal knowledge from a combination +of known facts and observations. Traditional causal discovery approaches mainly +rely on high-quality measured variables, usually given by human experts, to +find causal relations. However, the causal variables are usually unavailable in +a wide range of real-world applications. The rise of large language models +(LLMs), which are trained to learn rich knowledge from massive observations of +the world, provides a new opportunity to assist with discovering high-level +hidden variables from raw observational data. Therefore, we introduce COAT: +Causal representatiOn AssistanT. COAT incorporates LLMs as a factor proposer +that extracts the potential causal factors from unstructured data. Moreover, +LLMs can also be instructed to provide additional information used to collect +data values (e.g., annotation criteria) and to further parse the raw +unstructured data into structured data. The annotated data will be fed to a +causal learning module (e.g., the FCI algorithm) that provides both rigorous +explanations of the data, as well as useful feedback to further improve the +extraction of causal factors by LLMs. We verify the effectiveness of COAT in +uncovering the underlying causal system with two case studies of review rating +analysis and neuropathic diagnosis. + +
+
+ comment: Preliminary version of an ongoing project; Chenxi and Yongqiang + contributed equally; 26 pages, 41 figures; Project page: + https://causalcoat.github.io/ +
+
+
+
+
+ + ☆ Fully autonomous tuning of a spin qubit + + +
+ Spanning over two decades, the study of qubits in semiconductors for quantum +computing has yielded significant breakthroughs. However, the development of +large-scale semiconductor quantum circuits is still limited by challenges in +efficiently tuning and operating these circuits. Identifying optimal operating +conditions for these qubits is complex, involving the exploration of vast +parameter spaces. This presents a real 'needle in the haystack' problem, which, +until now, has resisted complete automation due to device variability and +fabrication imperfections. In this study, we present the first fully autonomous +tuning of a semiconductor qubit, from a grounded device to Rabi oscillations, a +clear indication of successful qubit operation. We demonstrate this automation, +achieved without human intervention, in a Ge/Si core/shell nanowire device. Our +approach integrates deep learning, Bayesian optimization, and computer vision +techniques. We expect this automation algorithm to apply to a wide range of +semiconductor qubit devices, allowing for statistical studies of qubit quality +metrics. As a demonstration of the potential of full automation, we +characterise how the Rabi frequency and g-factor depend on barrier gate +voltages for one of the qubits found by the algorithm. Twenty years after the +initial demonstrations of spin qubit operation, this significant advancement is +poised to finally catalyze the operation of large, previously unexplored +quantum circuits. + +
+
+
+
+
+ + ☆ Return-Aligned Decision Transformer + + +
+ Traditional approaches in offline reinforcement learning aim to learn the +optimal policy that maximizes the cumulative reward, also known as return. +However, as applications broaden, it becomes increasingly crucial to train +agents that not only maximize the returns, but also align the actual return +with a specified target return, giving control over the agent's performance. +Decision Transformer (DT) optimizes a policy that generates actions conditioned +on the target return through supervised learning and is equipped with a +mechanism to control the agent using the target return. Although DT is designed +to align the actual return with the target return, we have empirically +identified a discrepancy between the actual return and the target return in DT. +In this paper, we propose Return-Aligned Decision Transformer (RADT), designed +to effectively align the actual return with the target return. Our model +decouples returns from the conventional input sequence, which typically +consists of returns, states, and actions, to enhance the relationships between +returns and states, as well as returns and actions. Extensive experiments show +that RADT reduces the discrepancies between the actual return and the target +return of DT-based methods. + +
+
+
+
+
+ + ☆ Large Language Models to Enhance Bayesian Optimization ICLR2024 + + +
+ Bayesian optimization (BO) is a powerful approach for optimizing complex and +expensive-to-evaluate black-box functions. Its importance is underscored in +many applications, notably including hyperparameter tuning, but its efficacy +depends on efficiently balancing exploration and exploitation. While there has +been substantial progress in BO methods, striking this balance still remains a +delicate process. In this light, we present \texttt{LLAMBO}, a novel approach +that integrates the capabilities of large language models (LLM) within BO. At a +high level, we frame the BO problem in natural language terms, enabling LLMs to +iteratively propose promising solutions conditioned on historical evaluations. +More specifically, we explore how combining contextual understanding, few-shot +learning proficiency, and domain knowledge of LLMs can enhance various +components of model-based BO. Our findings illustrate that \texttt{LLAMBO} is +effective at zero-shot warmstarting, and improves surrogate modeling and +candidate sampling, especially in the early stages of search when observations +are sparse. Our approach is performed in context and does not require LLM +finetuning. Additionally, it is modular by design, allowing individual +components to be integrated into existing BO frameworks, or function cohesively +as an end-to-end method. We empirically validate \texttt{LLAMBO}'s efficacy on +the problem of hyperparameter tuning, highlighting strong empirical performance +across a range of diverse benchmarks, proprietary, and synthetic tasks. + +
+
+ comment: Accepted as Poster at ICLR2024 +
+
+
+
+
+ + ☆ Elastic Feature Consolidation for Cold Start Exemplar-free Incremental + Learning ICLR 2024 + + +
+ Exemplar-Free Class Incremental Learning (EFCIL) aims to learn from a +sequence of tasks without having access to previous task data. In this paper, +we consider the challenging Cold Start scenario in which insufficient data is +available in the first task to learn a high-quality backbone. This is +especially challenging for EFCIL since it requires high plasticity, which +results in feature drift that is difficult to compensate for in the +exemplar-free setting. To address this problem, we propose a simple and +effective approach that consolidates feature representations by regularizing +drift in directions highly relevant to previous tasks and employs prototypes to +reduce task-recency bias. Our method, called Elastic Feature Consolidation +(EFC), exploits a tractable second-order approximation of feature drift based +on an Empirical Feature Matrix (EFM). The EFM induces a pseudo-metric in +feature space which we use to regularize feature drift in important directions +and to update Gaussian prototypes used in a novel asymmetric cross entropy loss +which effectively balances prototype rehearsal with data from new tasks. +Experimental results on CIFAR-100, Tiny-ImageNet, ImageNet-Subset and +ImageNet-1K demonstrate that Elastic Feature Consolidation is better able to +learn new tasks by maintaining model plasticity and significantly outperforms +the state-of-the-art. + +
+
+ comment: Accepted at Twelfth International Conference on Learning + Representations (ICLR 2024) +
+
+
+
+
+ + ☆ Learning Metrics that Maximise Power for Accelerated A/B-Tests + + +
+ Online controlled experiments are a crucial tool to allow for confident +decision-making in technology companies. A North Star metric is defined (such +as long-term revenue or user retention), and system variants that statistically +significantly improve on this metric in an A/B-test can be considered superior. +North Star metrics are typically delayed and insensitive. As a result, the cost +of experimentation is high: experiments need to run for a long time, and even +then, type-II errors (i.e. false negatives) are prevalent. + We propose to tackle this by learning metrics from short-term signals that +directly maximise the statistical power they harness with respect to the North +Star. We show that existing approaches are prone to overfitting, in that higher +average metric sensitivity does not imply improved type-II errors, and propose +to instead minimise the $p$-values a metric would have produced on a log of +past experiments. We collect such datasets from two social media applications +with over 160 million Monthly Active Users each, totalling over 153 A/B-pairs. +Empirical results show that we are able to increase statistical power by up to +78% when using our learnt metrics stand-alone, and by up to 210% when used in +tandem with the North Star. Alternatively, we can obtain constant statistical +power at a sample size that is down to 12% of what the North Star requires, +significantly reducing the cost of experimentation. + +
+
+
+
+
+ + ☆ Employee Turnover Analysis Using Machine Learning Algorithms + + +
+ Employees' knowledge is an organizational asset. Turnover may impose apparent +and hidden costs and irreparable damage. To mitigate this risk, employees' +conditions should be monitored. Given the high complexity of analyzing +well-being features, predicting employee turnover can be delegated to machine +learning techniques. In this paper, we discuss the employee attrition rate. +Three different supervised learning algorithms, AdaBoost, SVM and RandomForest, +are used to benchmark employee attrition prediction accuracy. The resulting +models can help establish predictive analytics. + +
+
+ comment: 6 pages, 11 figures, 2 tables +
+
+
+
+
+ + ☆ Compound Returns Reduce Variance in Reinforcement Learning + + +
+ Multistep returns, such as $n$-step returns and $\lambda$-returns, are +commonly used to improve the sample efficiency of reinforcement learning (RL) +methods. The variance of the multistep returns becomes the limiting factor in +their length; looking too far into the future increases variance and reverses +the benefits of multistep learning. In our work, we demonstrate the ability of +compound returns -- weighted averages of $n$-step returns -- to reduce +variance. We prove for the first time that any compound return with the same +contraction modulus as a given $n$-step return has strictly lower variance. We +additionally prove that this variance-reduction property improves the +finite-sample complexity of temporal-difference learning under linear function +approximation. Because general compound returns can be expensive to implement, +we introduce two-bootstrap returns which reduce variance while remaining +efficient, even when using minibatched experience replay. We conduct +experiments showing that two-bootstrap returns can improve the sample +efficiency of $n$-step deep RL agents, with little additional computational +cost. + +
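+ The objects involved are short to write down. A sketch of n-step and compound returns computed from a trajectory of rewards and bootstrapped values; a "two-bootstrap" return is the special case with exactly two nonzero weights.
+
+ def n_step_return(rewards, values, t, n, gamma):
+     # G_t^(n) = sum_{i<n} gamma^i * r_{t+i} + gamma^n * V(s_{t+n})
+     G = sum(gamma ** i * rewards[t + i] for i in range(n))
+     return G + gamma ** n * values[t + n]
+
+ def compound_return(rewards, values, t, gamma, weights):
+     # Weighted average of n-step returns; the weights must sum to 1.
+     return sum(w * n_step_return(rewards, values, t, n, gamma)
+                for n, w in weights.items())
+
+ # e.g. an equal mix of the 4-step and 16-step returns:
+ # G = compound_return(r, v, t=0, gamma=0.99, weights={4: 0.5, 16: 0.5})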
+
+ comment: Preprint. 8 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ A phase transition between positional and semantic learning in a + solvable model of dot-product attention + + +
+ We investigate how a dot-product attention layer learns a positional +attention matrix (with tokens attending to each other based on their respective +positions) and a semantic attention matrix (with tokens attending to each other +based on their meaning). For an algorithmic task, we experimentally show how +the same simple architecture can learn to implement a solution using either the +positional or semantic mechanism. On the theoretical side, we study the +learning of a non-linear self-attention layer with trainable tied and low-rank +query and key matrices. In the asymptotic limit of high-dimensional data and a +comparably large number of training samples, we provide a closed-form +characterization of the global minimum of the non-convex empirical loss +landscape. We show that this minimum corresponds to either a positional or a +semantic mechanism and evidence an emergent phase transition from the former to +the latter with increasing sample complexity. Finally, we compare the +dot-product attention layer to a linear positional baseline, and show that it +outperforms the latter using the semantic mechanism, provided it has access to +sufficient data. + +
+
+
+
+
+ + ☆ Batch Universal Prediction + + +
+ Large language models (LLMs) have recently gained much popularity due to +their surprising ability to generate human-like English sentences. LLMs are +essentially predictors, estimating the probability of a sequence of words given +the past. Therefore, it is natural to evaluate their performance from a +universal prediction perspective. In order to do that fairly, we introduce the +notion of batch regret as a modification of the classical average regret, and +we study its asymptotic value for add-constant predictors, in the case of +memoryless sources and first-order Markov sources. + +
+
+
+
+
+ + ☆ DistiLLM: Towards Streamlined Distillation for Large Language Models + + +
+ Knowledge distillation (KD) is widely used for compressing a teacher model to +a smaller student model, reducing its inference cost and memory footprint while +preserving model capabilities. However, current KD methods for auto-regressive +sequence models (e.g., large language models) suffer from the lack of a +standardized objective function. Moreover, the recent use of student-generated +outputs to address training-inference mismatches has significantly escalated +computational costs. To tackle these issues, we introduce DistiLLM, a more +effective and efficient KD framework for auto-regressive language models. +DistiLLM comprises two components: (1) a novel skew Kullback-Leibler divergence +loss, where we unveil and leverage its theoretical properties, and (2) an +adaptive off-policy approach designed to enhance the efficiency in utilizing +student-generated outputs. Extensive experiments, including +instruction-following tasks, demonstrate the effectiveness of DistiLLM in +building high-performing student models while achieving up to 4.3$\times$ +speedup compared to recent KD methods. + +
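+ A hedged sketch of a skew Kullback-Leibler divergence between teacher and student token distributions, following the generic definition of a skew divergence (KL against a mixture); the exact parameterization and direction used by DistiLLM are specified in the paper and its repository.
+
+ import torch
+ import torch.nn.functional as F
+
+ def skew_kl(teacher_logits, student_logits, alpha=0.1, eps=1e-10):
+     # KL(p || (1 - alpha) p + alpha q): mixing the reference with p keeps
+     # the divergence finite where the student assigns near-zero mass.
+     p = F.softmax(teacher_logits, dim=-1)
+     q = F.softmax(student_logits, dim=-1)
+     mix = (1.0 - alpha) * p + alpha * q
+     return (p * (p.add(eps).log() - mix.add(eps).log())).sum(-1).mean()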
+
+ comment: Code is available at https://github.com/jongwooko/distillm +
+
+
+
+
+ + ☆ Prediction Horizon Requirements for Automated Driving: Optimizing + Safety, Comfort, and Efficiency + + +
+ Predicting the movement of other road users is beneficial for improving +automated vehicle (AV) performance. However, the relationship between the time +horizon associated with these predictions and AV performance remains unclear. +Despite the existence of numerous trajectory prediction algorithms, no studies +have been conducted on how varying prediction lengths affect AV safety and +other vehicle performance metrics, resulting in undefined horizon requirements +for prediction methods. Our study addresses this gap by examining the effects +of different prediction horizons on AV performance, focusing on safety, +comfort, and efficiency. Through multiple experiments using a state-of-the-art, +risk-based predictive trajectory planner, we simulated predictions with +horizons up to 20 seconds. Based on our simulations, we propose a framework for +specifying the minimum required and optimal prediction horizons based on +specific AV performance criteria and application needs. Our results indicate +that a horizon of 1.6 seconds is required to prevent collisions with crossing +pedestrians, horizons of 7-8 seconds yield the best efficiency, and horizons up +to 15 seconds improve passenger comfort. We conclude that prediction horizon +requirements are application-dependent, and recommend aiming for a prediction +horizon of 11.8 seconds as a general guideline for applications involving +crossing pedestrians. + +
+
+ comment: Submitted to IEEE Intelligent Vehicles Symposium. 9 pages. 10 + figures. 6 tables +
+
+
+
+
+ + ☆ MOMENT: A Family of Open Time-series Foundation Models + + +
+ We introduce MOMENT, a family of open-source foundation models for +general-purpose time-series analysis. Pre-training large models on time-series +data is challenging due to (1) the absence of a large and cohesive public +time-series repository, and (2) diverse time-series characteristics which make +multi-dataset training onerous. Additionally, (3) experimental benchmarks to +evaluate these models, especially in scenarios with limited resources, time, +and supervision, are still in their nascent stages. To address these +challenges, we compile a large and diverse collection of public time-series, +called the Time-series Pile, and systematically tackle time-series-specific +challenges to unlock large-scale multi-dataset pre-training. Finally, we build +on recent work to design a benchmark to evaluate time-series foundation models +on diverse tasks and datasets in limited supervision settings. Experiments on +this benchmark demonstrate the effectiveness of our pre-trained models with +minimal data and task-specific fine-tuning. We also present several +interesting empirical observations about large pre-trained time-series models. +Our code is available anonymously at anonymous.4open.science/r/BETT-773F/. + +
+
+
+
+
+ + ☆ A Framework for Bilevel Optimization on Riemannian Manifolds + + +
+ Bilevel optimization has seen an increasing presence in various domains of +applications. In this work, we propose a framework for solving bilevel +optimization problems where variables of both lower and upper level problems +are constrained on Riemannian manifolds. We provide several hypergradient +estimation strategies on manifolds and study their estimation error. We provide +convergence and complexity analysis for the proposed hypergradient descent +algorithm on manifolds. We also extend the developments to stochastic bilevel +optimization and to the use of general retraction. We showcase the utility of +the proposed framework on various applications. + +
+
+
+
+
+ + ☆ Geometric quantum machine learning of BQP$^A$ protocols and latent graph + classifiers + + +
+ Geometric quantum machine learning (GQML) aims to embed problem symmetries +for learning efficient solving protocols. However, the question remains whether +(G)QML can be routinely used for constructing protocols with an exponential +separation from classical analogs. In this Letter we consider Simon's problem +for learning properties of Boolean functions, and show that this can be related +to an unsupervised circuit classification problem. Using the workflow of +geometric QML, we learn from first principles Simon's algorithm, thus +discovering an example of BQP$^A\neq$BPP protocol with respect to some dataset +(oracle $A$). Our key findings include the development of an equivariant +feature map for embedding Boolean functions, based on twirling with respect to +identified bitflip and permutational symmetries, and measurement based on +invariant observables with a sampling advantage. The proposed workflow points +to the importance of data embeddings and classical post-processing, while +keeping the variational circuit as a trivial identity operator. Next, +developing the intuition for the function learning, we visualize instances as +directed computational hypergraphs, and observe that the GQML protocol can +access their global topological features for distinguishing bijective and +surjective functions. Finally, we discuss the prospects for learning other +BQP$^A$-type protocols, and conjecture that this depends on the ability to +simplify embedding-based oracles $A$ applied as a linear combination of +unitaries. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ The Challenges of the Nonlinear Regime for Physics-Informed Neural + Networks + + +
+ The Neural Tangent Kernel (NTK) viewpoint represents a valuable approach to +examine the training dynamics of Physics-Informed Neural Networks (PINNs) in +the infinite width limit. We leverage this perspective and focus on the case of +nonlinear Partial Differential Equations (PDEs) solved by PINNs. We provide +theoretical results on the different behaviors of the NTK depending on the +linearity of the differential operator. Moreover, inspired by our theoretical +results, we emphasize the advantage of employing second-order methods for +training PINNs. Additionally, we explore the convergence capabilities of +second-order methods and address the challenges of spectral bias and slow +convergence. Every theoretical result is supported by numerical examples with +both linear and nonlinear PDEs, and we validate our training method on +benchmark test cases. + +
+
+ comment: 8 pages, 10 figures, appendix of 10 additional pages +
+
+
+
+
+ + ☆ Position Paper: Toward New Frameworks for Studying Model Representations + + +
+ Mechanistic interpretability (MI) aims to understand AI models by +reverse-engineering the exact algorithms neural networks learn. Most works in +MI so far have studied behaviors and capabilities that are trivial and +token-aligned. However, most capabilities are not that trivial, which calls +for the study of hidden representations inside these networks as the unit of +analysis. We conduct a literature review, formalize representations for features and +behaviors, highlight their importance and evaluation, and perform some basic +exploration in the mechanistic interpretability of representations. With +discussion and exploratory results, we justify our position that studying +representations is an important and under-studied field, and that currently +established methods in MI are not sufficient to understand representations, +thus pushing for the research community to work toward new frameworks for +studying representations. + +
+
+
+
+
+ + ☆ Efficient Generation of Hidden Outliers for Improved Outlier Detection + + +
+ Outlier generation is a popular technique used for solving important outlier +detection tasks. Generating outliers with realistic behavior is challenging. +Popular existing methods tend to disregard the 'multiple views' property of +outliers in high-dimensional spaces. The only existing method accounting for +this property falls short in efficiency and effectiveness. We propose BISECT, a +new outlier generation method that creates realistic outliers mimicking said +property. To do so, BISECT employs a novel proposition introduced in this +article stating how to efficiently generate said realistic outliers. Our method +has better guarantees and complexity than the current methodology for +recreating 'multiple views'. We use the synthetic outliers generated by BISECT +to effectively enhance outlier detection in diverse datasets, for multiple use +cases. For instance, oversampling with BISECT reduced the error by up to 3 +times when compared with the baselines. + +
+
+
+
+
+ + ☆ On gauge freedom, conservativity and intrinsic dimensionality estimation + in diffusion models + + +
+ Diffusion models are generative models that have recently demonstrated +impressive performance in terms of sampling quality and density estimation in +high dimensions. They rely on a forward continuous diffusion process and a +backward continuous denoising process, which can be described by a +time-dependent vector field and is used as a generative model. In the original +formulation of the diffusion model, this vector field is assumed to be the +score function (i.e. it is the gradient of the log-probability at a given time +in the diffusion process). Curiously, on the practical side, most studies on +diffusion models implement this vector field as a neural network function and +do not constrain it to be the gradient of some energy function (that is, most +studies do not constrain the vector field to be conservative). Even though some +studies investigated empirically whether such a constraint will lead to a +performance gain, they led to contradictory results and failed to provide +analytical results. Here, we provide three analytical results regarding the +extent of the modeling freedom of this vector field. Firstly, we propose a +novel decomposition of vector fields into a conservative component and an +orthogonal component which satisfies a given (gauge) freedom. Secondly, from +this orthogonal decomposition, we show that exact density estimation and exact +sampling are achieved when the conservative component is exactly equal to the +true score, and therefore conservativity is neither necessary nor sufficient to +obtain exact density estimation and exact sampling. Finally, we show that when +it comes to inferring local information of the data manifold, constraining the +vector field to be conservative is desirable. + +
+
+
+
+
+ + ☆ Gaussian process regression with Sliced Wasserstein Weisfeiler-Lehman + graph kernels + + +
+ Supervised learning has recently garnered significant attention in the field +of computational physics due to its ability to effectively extract complex +patterns for tasks like solving partial differential equations, or predicting +material properties. Traditionally, such datasets consist of inputs given as +meshes with a large number of nodes representing the problem geometry (seen as +graphs), and corresponding outputs obtained with a numerical solver. This means +the supervised learning model must be able to handle large and sparse graphs +with continuous node attributes. In this work, we focus on Gaussian process +regression, for which we introduce the Sliced Wasserstein Weisfeiler-Lehman +(SWWL) graph kernel. In contrast to existing graph kernels, the proposed SWWL +kernel enjoys positive definiteness and a drastic complexity reduction, which +makes it possible to process datasets that were previously impossible to +handle. The new kernel is first validated on graph classification for molecular +datasets, where the input graphs have a few tens of nodes. The efficiency of +the SWWL kernel is then illustrated on graph regression in computational fluid +dynamics and solid mechanics, where the input graphs are made up of tens of +thousands of nodes. + +
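A minimal sketch of the general recipe the abstract describes, assuming node embeddings have already been produced by some WL-style propagation: a sliced Wasserstein distance computed via random 1D projections, plugged into a Gaussian kernel. The projection count, quantile grid, and bandwidth are assumptions, not the paper's values.

```python
# Sliced Wasserstein kernel between two graphs' node-embedding clouds.
import numpy as np

rng = np.random.default_rng(0)

def sliced_w2(X, Y, n_proj=50, n_quant=100):
    """Monte-Carlo sliced Wasserstein-2 between point clouds X and Y."""
    qs = np.linspace(0.0, 1.0, n_quant)
    total = 0.0
    for _ in range(n_proj):
        theta = rng.normal(size=X.shape[1])
        theta /= np.linalg.norm(theta)
        # quantile matching handles different node counts per graph
        total += np.mean((np.quantile(X @ theta, qs)
                          - np.quantile(Y @ theta, qs)) ** 2)
    return np.sqrt(total / n_proj)

def sw_kernel(X, Y, sigma=1.0):
    return np.exp(-sliced_w2(X, Y) ** 2 / (2 * sigma ** 2))

G1 = rng.normal(size=(1200, 8))          # embeddings of a large graph
G2 = rng.normal(size=(900, 8)) + 0.5     # a second graph, shifted
print(sw_kernel(G1, G2))
```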
+
+
+
+
+ + ☆ Estimating Barycenters of Distributions with Neural Optimal Transport + + +
+ Given a collection of probability measures, a practitioner sometimes needs to +find an "average" distribution which adequately aggregates reference +distributions. A theoretically appealing notion of such an average is the +Wasserstein barycenter, which is the primal focus of our work. By building upon +the dual formulation of Optimal Transport (OT), we propose a new scalable +approach for solving the Wasserstein barycenter problem. Our methodology is +based on the recent Neural OT solver: it has a bi-level adversarial learning +objective and works for general cost functions. These are key advantages of our +method, since typical adversarial algorithms for barycenter tasks +utilize tri-level optimization and focus mostly on quadratic cost. We also +establish theoretical error bounds for our proposed approach and showcase its +applicability and effectiveness on illustrative scenarios and image data +setups. + +
+
+
+
+
+ + ☆ RevOrder: A Novel Method for Enhanced Arithmetic in Language Models + + +
+ This paper presents RevOrder, a novel technique aimed at improving arithmetic +operations in large language models (LLMs) by reversing the output digits in +addition, subtraction, and n-digit by 1-digit (nD by 1D) multiplication tasks. +Our method significantly reduces the Count of Sequential Intermediate Digits +(CSID) to $\mathcal{O}(1)$, a new metric we introduce to assess equation +complexity. Through comprehensive testing, RevOrder not only achieves perfect +accuracy in basic arithmetic operations but also substantially boosts LLM +performance in division tasks, particularly with large numbers where +traditional models struggle. Implementation of RevOrder is cost-effective for +both training and inference phases. Moreover, applying RevOrder to fine-tune +the LLaMA2-7B model on the GSM8K math task results in a considerable +improvement, reducing equation calculation errors by 46% and increasing overall +scores from 41.6 to 44.4. + +
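A minimal sketch of the digit-reversal idea: the answer is emitted least-significant digit first, so each generated digit depends only on digits already produced during decoding. The 'R' marker and formatting are illustrative, not the paper's exact templates.

```python
# Emitting a sum least-significant digit first, so each output digit
# depends only on digits already generated during decoding.
def revorder_target(a: int, b: int) -> str:
    reversed_digits = str(a + b)[::-1]       # 734 -> "437"
    return f"{a}+{b}=R{reversed_digits}"     # 'R' marks a reversed answer

print(revorder_target(58, 676))   # "58+676=R437", i.e. 734 reversed
```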
+
+
+
+
+ + ☆ Theoretical and experimental study of SMOTE: limitations and comparisons + of rebalancing strategies + + +
+ Synthetic Minority Oversampling Technique (SMOTE) is a common rebalancing +strategy for handling imbalanced data sets. Asymptotically, we prove that SMOTE +(with default parameter) regenerates the original distribution by simply +copying the original minority samples. We also prove that SMOTE density +vanishes near the boundary of the support of the minority distribution, +therefore justifying the common BorderLine SMOTE strategy. Then we introduce +two new SMOTE-related strategies, and compare them with state-of-the-art +rebalancing procedures. We show that rebalancing strategies are only required +when the data set is highly imbalanced. For such data sets, SMOTE, our +proposals, or undersampling procedures are the best strategies. + +
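For concreteness, a minimal sketch of vanilla SMOTE's interpolation step with its default k=5 neighbors; real implementations (e.g., imbalanced-learn) add bookkeeping omitted here.

```python
# One vanilla-SMOTE synthetic sample: interpolate between a minority point
# and a random one of its k nearest minority neighbors.
import numpy as np

rng = np.random.default_rng(0)

def smote_sample(X_min, k=5):
    i = rng.integers(len(X_min))
    dists = np.linalg.norm(X_min - X_min[i], axis=1)
    neighbors = np.argsort(dists)[1:k + 1]        # k nearest, excluding self
    j = rng.choice(neighbors)
    lam = rng.random()                            # uniform weight in [0, 1)
    return X_min[i] + lam * (X_min[j] - X_min[i])

X_minority = rng.normal(size=(30, 2))
synthetic = np.array([smote_sample(X_minority) for _ in range(100)])
print(synthetic.shape)    # (100, 2) new minority-class points
```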
+
+
+
+
+ + ☆ Asymptotic generalization error of a single-layer graph convolutional + network + + +
+ While graph convolutional networks show great practical promise, the +theoretical understanding of their generalization properties as a function of +the number of samples is still in its infancy compared to the more broadly +studied case of supervised fully connected neural networks. In this article, we +predict the performance of a single-layer graph convolutional network (GCN) +trained on data produced by attributed stochastic block models (SBMs) in the +high-dimensional limit. Previously, only ridge regression on contextual-SBM +(CSBM) has been considered in Shi et al. 2022; we generalize the analysis to +arbitrary convex loss and regularization for the CSBM and add the analysis for +another data model, the neural-prior SBM. We also study the high +signal-to-noise ratio limit, detail the convergence rates of the GCN and show +that, while consistent, it does not reach the Bayes-optimal rate for any of the +considered cases. + +
+
+
+
+
+ + ☆ Expediting In-Network Federated Learning by Voting-Based Consensus Model + Compression + + +
+ Recently, federated learning (FL) has gained momentum because of its +capability in preserving data privacy. To conduct model training by FL, +multiple clients exchange model updates with a parameter server via the Internet. +To accelerate communication, prior work has explored deploying a +programmable switch (PS) in lieu of the parameter server to coordinate clients. +The challenge in deploying the PS in FL lies in its scarce memory space, +which prohibits running memory-consuming aggregation algorithms on the PS. To +overcome this challenge, we propose the Federated Learning in-network Aggregation +with Compression (FediAC) algorithm, consisting of two phases: client voting +and model aggregating. In the former phase, clients report their significant +model update indices to the PS to estimate global significant model updates. In +the latter phase, clients upload global significant model updates to the PS for +aggregation. FediAC consumes much less memory space and communication traffic +than existing works because the first phase can guarantee consensus compression +across clients. The PS easily aligns model update indices to swiftly complete +aggregation in the second phase. Finally, we conduct extensive experiments +using public datasets to demonstrate that FediAC remarkably surpasses the +state-of-the-art baselines in terms of model accuracy and communication +traffic. + +
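A minimal numpy sketch (assumed details, not FediAC's protocol) of the two-phase idea: clients vote on their top-k update coordinates, coordinates backed by at least half the clients form the consensus set, and only those coordinates are uploaded and averaged.

```python
# Two-phase consensus sketch: vote on top-k indices, then aggregate only
# the coordinates backed by at least half the clients.
import numpy as np

rng = np.random.default_rng(0)
n_clients, dim, k = 8, 1000, 50
updates = rng.normal(size=(n_clients, dim))      # local model updates

# Phase 1: each client votes for its k largest-magnitude coordinates.
votes = np.zeros(dim)
for u in updates:
    votes[np.argsort(-np.abs(u))[:k]] += 1
consensus = np.flatnonzero(votes >= n_clients / 2)   # half-or-more vote

# Phase 2: clients upload only consensus coordinates; these are averaged.
aggregate = updates[:, consensus].mean(axis=0)
print(len(consensus), aggregate.shape)
```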
+
+ comment: To appear in 2024 IEEE International Conference on Computer + Communications (INFOCOM 2024) +
+
+
+
+
+ + ☆ NK Hybrid Genetic Algorithm for Clustering + + +
+ The NK hybrid genetic algorithm for clustering is proposed in this paper. In +order to evaluate the solutions, the hybrid algorithm uses the NK clustering +validation criterion 2 (NKCV2). NKCV2 uses information about the disposition of +$N$ small groups of objects. Each group is composed of $K+1$ objects of the +dataset. Experimental results show that density-based regions can be identified +by using NKCV2 with fixed small $K$. In NKCV2, the relationship between +decision variables is known, which in turn allows us to apply gray box +optimization. Mutation operators, a partition crossover, and a local search +strategy are proposed, all using information about the relationship between +decision variables. In partition crossover, the evaluation function is +decomposed into $q$ independent components; partition crossover then +deterministically returns the best among $2^q$ possible offspring with +computational complexity $O(N)$. The NK hybrid genetic algorithm allows the +detection of clusters with arbitrary shapes and the automatic estimation of the +number of clusters. In the experiments, the NK hybrid genetic algorithm +produced very good results when compared to another genetic algorithm approach +and to state-of-the-art clustering algorithms. + +
+
+
+
+
+ + ☆ Masked Graph Autoencoder with Non-discrete Bandwidths WWW 2024 + + +
+ Masked graph autoencoders have emerged as a powerful graph self-supervised +learning method that has yet to be fully explored. In this paper, we unveil +that the existing discrete edge masking and binary link reconstruction +strategies are insufficient to learn topologically informative representations, +from the perspective of message propagation on graph neural networks. These +limitations include blocking message flows, vulnerability to over-smoothness, +and suboptimal neighborhood discriminability. Inspired by these understandings, +we explore non-discrete edge masks, which are sampled from a continuous and +dispersive probability distribution instead of the discrete Bernoulli +distribution. These masks restrict the amount of output messages for each edge, +referred to as "bandwidths". We propose a novel, informative, and effective +topological masked graph autoencoder using bandwidth masking and a layer-wise +bandwidth prediction objective. We demonstrate its powerful graph topological +learning ability both theoretically and empirically. Our proposed framework +outperforms representative baselines in both self-supervised link prediction +(improving the discrete edge reconstructors by at most 20%) and node +classification on numerous datasets, solely with a structure-learning pretext. +Our implementation is available at https://github.com/Newiz430/Bandana. + +
+
+ comment: Full version (17 pages, 8 figures, 12 tables), accepted by TheWebConf + 2024 (WWW 2024) +
+
+
+
+
+ + ☆ SDEMG: Score-based Diffusion Model for Surface Electromyographic Signal + Denoising + + +
+ Surface electromyography (sEMG) recordings can be influenced by +electrocardiogram (ECG) signals when the muscle being monitored is close to the +heart. Several existing methods use signal-processing-based approaches, such as +high-pass filter and template subtraction, while some derive mapping functions +to restore clean sEMG signals from noisy sEMG (sEMG with ECG interference). +Recently, the score-based diffusion model, a renowned generative model, has +been introduced to generate high-quality and accurate samples with noisy input +data. In this study, we propose a novel approach, termed SDEMG, as a +score-based diffusion model for sEMG signal denoising. To evaluate the proposed +SDEMG approach, we conduct experiments to reduce noise in sEMG signals, +employing data from an openly accessible source, the Non-Invasive Adaptive +Prosthetics database, along with ECG signals from the MIT-BIH Normal Sinus +Rhythm Database. The experimental results indicate that SDEMG outperformed +comparative methods and produced high-quality sEMG samples. The source code of +the SDEMG framework is available at: https://github.com/tonyliu0910/SDEMG + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ SEABO: A Simple Search-Based Method for Offline Imitation Learning ICLR2024 + + +
+ Offline reinforcement learning (RL) has attracted much attention due to its +ability to learn from static offline datasets and eliminate the need to +interact with the environment. Nevertheless, the success of offline RL +relies heavily on the offline transitions annotated with reward labels. In +practice, we often need to hand-craft the reward function, which is sometimes +difficult, labor-intensive, or inefficient. To tackle this challenge, we set +our focus on the offline imitation learning (IL) setting, and aim at obtaining a +reward function based on the expert data and unlabeled data. To that end, we +propose a simple yet effective search-based offline IL method, tagged SEABO. +SEABO allocates a larger reward to the transition that is close to its closest +neighbor in the expert demonstration, and a smaller reward otherwise, all in an +unsupervised learning manner. Experimental results on a variety of D4RL +datasets indicate that SEABO can achieve competitive performance to offline RL +algorithms with ground-truth rewards, given only a single expert trajectory, +and can outperform prior reward learning and offline IL methods across many +tasks. Moreover, we demonstrate that SEABO also works well if the expert +demonstrations contain only observations. Our code is publicly available at +https://github.com/dmksjfl/SEABO. + +
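A minimal sketch of the idea: label each unlabeled transition with a reward that decays with its distance to the nearest expert transition. The KD-tree query matches the search-based flavor, while the exponential squashing and its coefficient are illustrative assumptions, not the paper's values.

```python
# Nearest-expert-neighbor reward labeling for unlabeled transitions.
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(0)
expert = rng.normal(size=(500, 6))    # expert transition features
batch = rng.normal(size=(64, 6))      # unlabeled offline transitions

tree = cKDTree(expert)                # search structure over expert data
dist, _ = tree.query(batch, k=1)      # distance to closest expert neighbor
rewards = np.exp(-5.0 * dist)         # closer to expert => larger reward
print(rewards.min(), rewards.max())
```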
+
+ comment: To appear in ICLR2024 +
+
+
+
+
+ + ☆ Explainable Automated Machine Learning for Credit Decisions: Enhancing + Human Artificial Intelligence Collaboration in Financial Engineering + + +
+ This paper explores the integration of Explainable Automated Machine Learning +(AutoML) in the realm of financial engineering, specifically focusing on its +application in credit decision-making. The rapid evolution of Artificial +Intelligence (AI) in finance has necessitated a balance between sophisticated +algorithmic decision-making and the need for transparency in these systems. The +focus is on how AutoML can streamline the development of robust machine +learning models for credit scoring, while Explainable AI (XAI) methods, +particularly SHapley Additive exPlanations (SHAP), provide insights into the +models' decision-making processes. This study demonstrates how the combination +of AutoML and XAI not only enhances the efficiency and accuracy of credit +decisions but also fosters trust and collaboration between humans and AI +systems. The findings underscore the potential of explainable AutoML in +improving the transparency and accountability of AI-driven financial decisions, +aligning with regulatory requirements and ethical considerations. + +
+
+
+
+
+ + ☆ ReLU$^2$ Wins: Discovering Efficient Activation Functions for Sparse + LLMs + + +
+ Sparse computation offers a compelling solution for the inference of Large +Language Models (LLMs) in low-resource scenarios by dynamically skipping the +computation of inactive neurons. While traditional approaches focus on +ReLU-based LLMs, leveraging zeros in activation values, we broaden the scope of +sparse LLMs beyond zero activation values. We introduce a general method that +defines neuron activation through neuron output magnitudes and a tailored +magnitude threshold, demonstrating that non-ReLU LLMs also exhibit sparse +activation. To find the most efficient activation function for sparse +computation, we propose a systematic framework to examine the sparsity of LLMs +from three aspects: the trade-off between sparsity and performance, the +predictivity of sparsity, and the hardware affinity. We conduct thorough +experiments on LLMs utilizing different activation functions, including ReLU, +SwiGLU, ReGLU, and ReLU$^2$. The results indicate that models employing +ReLU$^2$ excel across all three evaluation aspects, highlighting its potential +as an efficient activation function for sparse LLMs. We will release the code +to facilitate future research. + +
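A minimal sketch of the magnitude-based notion of activation used here: a neuron counts as inactive when its output magnitude falls below a tailored threshold, a definition that extends beyond the exact zeros of ReLU. The threshold value is an assumption.

```python
# Magnitude-threshold activation sparsity, applicable beyond ReLU zeros.
import torch
import torch.nn.functional as F

def relu2(x):
    return torch.relu(x) ** 2               # the ReLU^2 activation

def sparsity(outputs, tau=1e-2):
    """Fraction of neuron outputs whose magnitude is at most tau."""
    return (outputs.abs() <= tau).float().mean().item()

h = torch.randn(16, 4096)                   # a batch of pre-activations
print("ReLU^2 sparsity:", sparsity(relu2(h)))
print("GELU sparsity:  ", sparsity(F.gelu(h)))
```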
+
+
+
+
+ + ♻ ☆ Building a Safer Maritime Environment Through Multi-Path Long-Term + Vessel Trajectory Forecasting + + +
+ Maritime transportation is paramount in achieving global economic growth, +entailing concurrent ecological obligations in sustainability and safeguarding +endangered marine species, most notably preserving large whale populations. In +this regard, the Automatic Identification System (AIS) data plays a significant +role by offering real-time streaming data on vessel movement, allowing enhanced +traffic monitoring. This study explores using AIS data to prevent +vessel-to-whale collisions by forecasting long-term vessel trajectories from +engineered AIS data sequences. For such a task, we have developed an +encoder-decoder model architecture using Bidirectional Long Short-Term Memory +Networks (Bi-LSTM) to predict the next 12 hours of vessel trajectories using 1 +to 3 hours of AIS data as input. We feed the model with probabilistic features +engineered from historical AIS data that refer to each trajectory's potential +route and destination. The model then predicts the vessel's trajectory, +considering these additional features by leveraging convolutional layers for +spatial feature learning and a position-aware attention mechanism that +increases the importance of recent timesteps of a sequence during temporal +feature learning. The probabilistic features have an F1 Score of approximately +85% and 75% for each feature type, respectively, demonstrating their +effectiveness in augmenting information to the neural network. We test our +model on the Gulf of St. Lawrence, a region known to be the habitat of North +Atlantic Right Whales (NARW). Our model achieved a high R2 score of over 98% +using various techniques and features. It stands out among other approaches as +it can make complex decisions during turnings and path selection. Our study +highlights the potential of data engineering and trajectory forecasting models +for marine life species preservation. + +
+
+
+
+
+ + ♻ ☆ Extreme Compression of Large Language Models via Additive Quantization + + +
+ The emergence of accurate open large language models (LLMs) has led to a race +towards quantization techniques for such models enabling execution on end-user +devices. In this paper, we revisit the problem of "extreme" LLM +compression--defined as targeting extremely low bit counts, such as 2 to 3 bits +per parameter, from the point of view of classic methods in Multi-Codebook +Quantization (MCQ). Our work builds on top of Additive Quantization, a classic +algorithm from the MCQ family, and adapts it to the quantization of language +models. The resulting algorithm advances the state-of-the-art in LLM +compression, outperforming all recently-proposed techniques in terms of +accuracy at a given compression budget. For instance, when compressing Llama 2 +models to 2 bits per parameter, our algorithm quantizes the 7B model to 6.93 +perplexity (a 1.29 improvement relative to the best prior work, and 1.81 points +from FP16), the 13B model to 5.70 perplexity (a .36 improvement) and the 70B +model to 3.94 perplexity (a .22 improvement) on WikiText2. We release our +implementation of Additive Quantization for Language Models AQLM as a baseline +to facilitate future research in LLM quantization. + +
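A minimal sketch of multi-codebook quantization in the additive style this work builds on: each group of weights is approximated by a sum of codewords, one per codebook. The greedy residual encoding below is a simplification; AQLM jointly optimizes codes and codebooks.

```python
# Greedy additive (multi-codebook) encoding of one weight group.
import numpy as np

rng = np.random.default_rng(0)
M, K, d = 2, 256, 8                        # codebooks, codewords, group size
codebooks = rng.normal(size=(M, K, d))     # learned offline in practice

def encode(w):
    residual, codes = w.copy(), []
    for m in range(M):                     # pick one codeword per codebook
        idx = int(np.argmin(((codebooks[m] - residual) ** 2).sum(axis=1)))
        codes.append(idx)
        residual -= codebooks[m][idx]
    return codes                           # M * log2(K) = 16 bits per group

w = rng.normal(size=d)
codes = encode(w)
w_hat = sum(codebooks[m][c] for m, c in enumerate(codes))
print(codes, np.linalg.norm(w - w_hat))    # 16 bits for 8 weights = 2 bits/param
```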
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open + Language Models + + +
+ Mathematical reasoning poses a significant challenge for language models due +to its complex and structured nature. In this paper, we introduce DeepSeekMath +7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B +math-related tokens sourced from Common Crawl, together with natural language +and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the +competition-level MATH benchmark without relying on external toolkits and +voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. +Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. +The mathematical reasoning capability of DeepSeekMath is attributed to two key +factors: First, we harness the significant potential of publicly available web +data through a meticulously engineered data selection pipeline. Second, we +introduce Group Relative Policy Optimization (GRPO), a variant of Proximal +Policy Optimization (PPO), that enhances mathematical reasoning abilities while +concurrently optimizing the memory usage of PPO. + +
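A minimal sketch of the group-relative idea behind GRPO as described: rewards for a group of sampled responses to the same prompt are normalized by the group's mean and standard deviation to form advantages, avoiding a separately learned value baseline (the small stabilizing constant is an assumption).

```python
# Group-relative advantages: normalize each sampled response's reward by
# its group's statistics instead of a learned value baseline.
import numpy as np

def group_relative_advantages(group_rewards):
    r = np.asarray(group_rewards, dtype=float)
    return (r - r.mean()) / (r.std() + 1e-8)

# e.g. four sampled solutions to one problem, graded 1.0 if correct
print(group_relative_advantages([1.0, 0.0, 0.0, 1.0]))  # [ 1. -1. -1.  1.]
```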
+
+
+
+
+ + ♻ ☆ Clustered Switchback Experiments: Near-Optimal Rates Under + Spatiotemporal Interference + + +
+ We consider experimentation in the presence of non-stationarity, inter-unit +(spatial) interference, and carry-over effects (temporal interference), where +we wish to estimate the global average treatment effect (GATE), the difference +between average outcomes having exposed all units at all times to treatment or +to control. We suppose spatial interference is described by a graph, where a +unit's outcome depends on its neighborhood's treatment assignments, and that +temporal interference is described by a hidden Markov decision process, where +the transition kernel under either treatment (action) satisfies a rapid mixing +condition. We propose a clustered switchback design, where units are grouped +into clusters and time steps are grouped into blocks and each whole +cluster-block combination is assigned a single random treatment. Under this +design, we show that for graphs that admit good clustering, a truncated +exposure-mapping Horvitz-Thompson estimator achieves $\tilde O(1/NT)$ +mean-squared error (MSE), matching an $\Omega(1/NT)$ lower bound up to +logarithmic terms. Our results simultaneously generalize the $N=1$ setting of +Hu, Wager 2022 (and improve on the MSE bound shown therein for +difference-in-means estimators) as well as the $T=1$ settings of Ugander et al +2013 and Leung 2022. Simulation studies validate the favorable performance of +our approach. + +
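A minimal sketch of the design itself (estimation is omitted): units are grouped into clusters, time into blocks, and each cluster-block cell receives one independent Bernoulli(1/2) treatment. The cluster count and block length below are assumptions.

```python
# Clustered switchback assignment: one Bernoulli(1/2) coin per cluster-block.
import numpy as np

rng = np.random.default_rng(0)
n_units, T, n_clusters, block_len = 100, 48, 10, 8
cluster_of = rng.integers(0, n_clusters, size=n_units)   # spatial clustering

n_blocks = T // block_len
cell = rng.random((n_clusters, n_blocks)) < 0.5          # cluster-block coins
W = cell[cluster_of][:, np.repeat(np.arange(n_blocks), block_len)]
print(W.shape)   # (100, 48): treatment for every unit at every time step
```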
+
+
+
+
+ + ♻ ☆ On Sample-Efficient Offline Reinforcement Learning: Data Diversity, + Posterior Sampling, and Beyond NeurIPS'23 + + +
+ We seek to understand what facilitates sample-efficient learning from +historical datasets for sequential decision-making, a problem that is popularly +known as offline reinforcement learning (RL). Further, we are interested in +algorithms that enjoy sample efficiency while leveraging (value) function +approximation. In this paper, we address these fundamental questions by (i) +proposing a notion of data diversity that subsumes the previous notions of +coverage measures in offline RL and (ii) using this notion to unify three +distinct classes of offline RL algorithms based on version spaces (VS), +regularized optimization (RO), and posterior sampling (PS). We establish that +VS-based, RO-based, and PS-based algorithms, under standard assumptions, +achieve comparable sample efficiency, which recovers the +state-of-the-art sub-optimality bounds for finite and linear model classes with +the standard assumptions. This result is surprising, given that the prior work +suggested an unfavorable sample complexity of the RO-based algorithm compared +to the VS-based algorithm, whereas posterior sampling is rarely considered in +offline RL due to its explorative nature. Notably, our proposed model-free +PS-based algorithm for offline RL is novel, with sub-optimality bounds that +are frequentist (i.e., worst-case) in nature. + +
+
+ comment: NeurIPS'23; Arxiv is the authors' preferred version; v2: add a + missing related work +
+
+
+
+
+ + ♻ ☆ High-dimensional and Permutation Invariant Anomaly Detection + + +
+ Methods for anomaly detection of new physics processes are often limited to +low-dimensional spaces due to the difficulty of learning high-dimensional +probability densities. Particularly at the constituent level, incorporating +desirable properties such as permutation invariance and variable-length inputs +becomes difficult within popular density estimation methods. In this work, we +introduce a permutation-invariant density estimator for particle physics data +based on diffusion models, specifically designed to handle variable-length +inputs. We demonstrate the efficacy of our methodology by utilizing the learned +density as a permutation-invariant anomaly detection score, effectively +identifying jets with low likelihood under the background-only hypothesis. To +validate our density estimation method, we investigate the ratio of learned +densities and compare to those obtained by a supervised classification +algorithm. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction + + +
+ Efforts to align Large Language Models (LLMs) are mainly conducted via +Reinforcement Learning from Human Feedback (RLHF) methods. However, RLHF +encounters major challenges including training reward models, actor-critic +engineering, and importantly, it requires access to LLM parameters. Here we +introduce Aligner, a new efficient alignment paradigm that bypasses the whole +RLHF process by learning the correctional residuals between the aligned and the +unaligned answers. Our Aligner offers several key advantages. Firstly, it is an +autoregressive seq2seq model that is trained on the query-answer-correction +dataset via supervised learning; this offers a parameter-efficient alignment +solution with minimal resources. Secondly, the Aligner facilitates +weak-to-strong generalization; finetuning large pretrained models with Aligner's +supervisory signals demonstrates a strong performance boost. Thirdly, Aligner +functions as a model-agnostic plug-and-play module, allowing for its direct +application on different open-source and API-based models. Remarkably, +Aligner-7B improves 11 different LLMs by 21.9% in helpfulness and 23.8% in +harmlessness on average (GPT-4 by 17.5% and 26.9%). When finetuning (strong) +Llama2-70B with (weak) Aligner-13B's supervision, we can improve Llama2 by 8.2% +in helpfulness and 61.6% in harmlessness. See our dataset and code at +https://aligner2024.github.io + +
+
+ comment: 34 pages +
+
+
+
+
+ + ♻ ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting +capabilities in large language models (LLMs) beyond those offered by paradigms +such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary +advantage of GoT is the ability to model the information generated by an LLM as +an arbitrary graph, where units of information ("LLM thoughts") are vertices, +and edges correspond to dependencies between these vertices. This approach +enables combining arbitrary LLM thoughts into synergistic outcomes, distilling +the essence of whole networks of thoughts, or enhancing thoughts using feedback +loops. We illustrate that GoT offers advantages over the state of the art on +different tasks, for example increasing the quality of sorting by 62% over ToT, +while simultaneously reducing costs by >31%. We ensure that GoT is extensible +with new thought transformations and thus can be used to spearhead new +prompting schemes. This work brings LLM reasoning closer to human thinking +or brain mechanisms such as recurrence, both of which form complex networks. + +
+
+
+
+
+ + ♻ ☆ Provably Efficient UCB-type Algorithms For Learning Predictive State + Representations ICLR 2024 + + +
+ The general sequential decision-making problem, which includes Markov +decision processes (MDPs) and partially observable MDPs (POMDPs) as special +cases, aims at maximizing a cumulative reward by making a sequence of decisions +based on a history of observations and actions over time. Recent studies have +shown that the sequential decision-making problem is statistically learnable if +it admits a low-rank structure modeled by predictive state representations +(PSRs). Despite these advancements, existing approaches typically involve +oracles or steps that are computationally intractable. On the other hand, the +upper confidence bound (UCB) based approaches, which have served successfully +as computationally efficient methods in bandits and MDPs, have not been +investigated for more general PSRs, due to the difficulty of optimistic bonus +design in these more challenging settings. This paper proposes the first known +UCB-type approach for PSRs, featuring a novel bonus term that upper bounds the +total variation distance between the estimated and true models. We further +characterize the sample complexity bounds for our designed UCB-type algorithms +for both online and offline PSRs. In contrast to existing approaches for PSRs, +our UCB-type algorithms enjoy computational tractability, last-iterate +guaranteed near-optimal policy, and guaranteed model accuracy. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ TopoX: A Suite of Python Packages for Machine Learning on Topological + Domains + + +
+ We introduce topox, a Python software suite that provides reliable and +user-friendly building blocks for computing and machine learning on topological +domains that extend graphs: hypergraphs, simplicial, cellular, path and +combinatorial complexes. topox consists of three packages: toponetx facilitates +constructing and computing on these domains, including working with nodes, +edges and higher-order cells; topoembedx provides methods to embed topological +domains into vector spaces, akin to popular graph-based embedding algorithms +such as node2vec; topomodelx is built on top of PyTorch and offers a +comprehensive toolbox of higher-order message passing functions for neural +networks on topological domains. The extensively documented and unit-tested +source code of topox is available under MIT license at +https://github.com/pyt-team. + +
+
+
+
+
+ + ♻ ☆ CC-SGG: Corner Case Scenario Generation using Learned Scene Graphs + + +
+ Corner case scenarios are an essential tool for testing and validating the +safety of autonomous vehicles (AVs). As these scenarios are often +insufficiently present in naturalistic driving datasets, augmenting the data +with synthetic corner cases greatly enhances the safe operation of AVs in +unique situations. However, the generation of synthetic, yet realistic, corner +cases poses a significant challenge. In this work, we introduce a novel +approach based on Heterogeneous Graph Neural Networks (HGNNs) to transform +regular driving scenarios into corner cases. To achieve this, we first generate +concise representations of regular driving scenes as scene graphs, minimally +manipulating their structure and properties. Our model then learns to perturb +those graphs to generate corner cases using attention and triple embeddings. +The input and perturbed graphs are then imported back into the simulation to +generate corner case scenarios. Our model successfully learned to produce +corner cases from input scene graphs, achieving 89.9% prediction accuracy on +our testing dataset. We further validate the generated scenarios on baseline +autonomous driving methods, demonstrating our model's ability to effectively +create critical situations for the baselines. + +
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Contrastive Diffuser: Planning Towards High Return States via + Contrastive Learning + + +
+ Applying diffusion models in reinforcement learning for long-term planning +has gained much attention recently. Several diffusion-based methods have +successfully leveraged the modeling capabilities of diffusion for arbitrary +distributions. These methods generate subsequent trajectories for planning and +have demonstrated significant improvement. However, these methods are limited +by their plain base distributions and by overlooking the diversity of +samples, in which different states have different returns. They simply leverage +diffusion to learn the distribution of the offline dataset and generate +trajectories whose states share the same distribution as the offline dataset. +As a result, the probability of these models reaching the high-return states is +largely dependent on the dataset distribution. Even when equipped with a guidance +model, the performance is still suppressed. To address these limitations, in +this paper, we propose a novel method called CDiffuser, which devises a return +contrast mechanism to pull the states in generated trajectories towards +high-return states while pushing them away from low-return states to improve +the base distribution. Experiments on 14 commonly used D4RL benchmarks +demonstrate the effectiveness of our proposed method. + +
+
+ comment: 13 pages with appendix and references, 10 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Position Paper: Bayesian Deep Learning in the Age of Large-Scale AI + + +
+ In the current landscape of deep learning research, there is a predominant +emphasis on achieving high predictive accuracy in supervised tasks involving +large image and language datasets. However, a broader perspective reveals a +multitude of overlooked metrics, tasks, and data types, such as uncertainty, +active and continual learning, and scientific data, that demand attention. +Bayesian deep learning (BDL) constitutes a promising avenue, offering +advantages across these diverse settings. This paper posits that BDL can +elevate the capabilities of deep learning. It revisits the strengths of BDL, +acknowledges existing challenges, and highlights some exciting research avenues +aimed at addressing these obstacles. Looking ahead, the discussion focuses on +possible ways to combine large-scale foundation models with BDL to unlock their +full potential. + +
+
+
+
+
+ + ♻ ☆ Interplay between depth and width for interpolation in neural ODEs + + +
+ Neural ordinary differential equations (neural ODEs) have emerged as a +natural tool for supervised learning from a control perspective, yet a complete +understanding of their optimal architecture remains elusive. In this work, we +examine the interplay between their width $p$ and number of layer transitions +$L$ (effectively the depth $L+1$). Specifically, we assess the model +expressivity in terms of its capacity to interpolate either a finite dataset +$D$ comprising $N$ pairs of points or two probability measures in +$\mathbb{R}^d$ within a Wasserstein error margin $\varepsilon>0$. Our findings +reveal a balancing trade-off between $p$ and $L$, with $L$ scaling as +$O(1+N/p)$ for dataset interpolation, and +$L=O\left(1+(p\varepsilon^d)^{-1}\right)$ for measure interpolation. + In the autonomous case, where $L=0$, a separate study is required, which we +undertake focusing on dataset interpolation. We address the relaxed problem of +$\varepsilon$-approximate controllability and establish an error decay of +$\varepsilon\sim O(\log(p)p^{-1/d})$. This decay rate is a consequence of +applying a universal approximation theorem to a custom-built Lipschitz vector +field that interpolates $D$. In the high-dimensional setting, we further +demonstrate that $p=O(N)$ neurons are likely sufficient to achieve exact +control. + +
+
+ comment: 16 pages, 10 figures, double column +
+
+
+
+
+ + ♻ ☆ OceanGPT: A Large Language Model for Ocean Science Tasks + + +
+ Ocean science, which delves into the oceans that are reservoirs of life and +biodiversity, is of great significance given that oceans cover over 70% of our +planet's surface. Recently, advances in Large Language Models (LLMs) have +transformed the paradigm in science. Despite the success in other domains, +current LLMs often fall short in catering to the needs of domain experts like +oceanographers, and the potential of LLMs for ocean science is under-explored. +The intrinsic reason may be the immense and intricate nature of ocean data as +well as the necessity for higher granularity and richness in knowledge. To +alleviate these issues, we introduce OceanGPT, the first-ever LLM in the ocean +domain, which is an expert in various ocean science tasks. We propose DoInstruct, +a novel framework to automatically obtain a large volume of ocean domain +instruction data, which generates instructions based on multi-agent +collaboration. Additionally, we construct the first oceanography benchmark, +OceanBench, to evaluate the capabilities of LLMs in the ocean domain. Through +comprehensive experiments, OceanGPT not only shows a higher level of knowledge +expertise for ocean science tasks but also gains preliminary embodied +intelligence capabilities in ocean technology. Codes, data and checkpoints will +soon be available at https://github.com/zjunlp/KnowLM. + +
+
+ comment: Work in progress. Project Website: + https://zjunlp.github.io/project/OceanGPT/ +
+
+
+
+
+ + ♻ ☆ Better Batch for Deep Probabilistic Time Series Forecasting AISTATS 2024 + + +
+ Deep probabilistic time series forecasting has gained attention for its +superior performance in nonlinear approximation and its capability to offer +valuable uncertainty quantification for decision-making. However, existing +models often oversimplify the problem by assuming a time-independent error +process, overlooking serial correlation. To overcome this limitation, we +propose an innovative training method that incorporates error autocorrelation +to enhance probabilistic forecasting accuracy. Our method constructs a +mini-batch as a collection of $D$ consecutive time series segments for model +training. It explicitly learns a time-varying covariance matrix over each +mini-batch, encoding error correlation among adjacent time steps. The learned +covariance matrix can be used to improve prediction accuracy and enhance +uncertainty quantification. We evaluate our method on two different neural +forecasting models and multiple public datasets. Experimental results confirm +the effectiveness of the proposed approach in improving the performance of both +models across a range of datasets, resulting in notable improvements in +predictive accuracy. + +
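A minimal sketch of the batch construction described, assuming windows shifted by one step: a mini-batch collects D consecutive segments of one series so that adjacent segments expose the error autocorrelation the method models.

```python
# A mini-batch of D consecutive windows from one series, so adjacent
# windows expose the serial correlation of forecast errors.
import numpy as np

rng = np.random.default_rng(0)
series = rng.normal(size=1000).cumsum()      # a toy nonstationary series
L, D = 24, 4                                 # window length, windows per batch

def consecutive_batch(x, start):
    return np.stack([x[start + i: start + i + L] for i in range(D)])

batch = consecutive_batch(series, start=100)
print(batch.shape)    # (4, 24): D adjacent, one-step-shifted segments
```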
+
+ comment: 10 pages, 3 figures, modified peer-review version, accepted to The + 27th International Conference on Artificial Intelligence and Statistics + (AISTATS 2024) +
+
+
+
+
+ + ♻ ☆ Towards Principled Graph Transformers + + +
+ Graph learning architectures based on the k-dimensional Weisfeiler-Leman +(k-WL) hierarchy offer a theoretically well-understood expressive power. +However, such architectures often fail to deliver solid predictive performance +on real-world tasks, limiting their practical impact. In contrast, global +attention-based models such as graph transformers demonstrate strong +performance in practice, but comparing their expressive power with the k-WL +hierarchy remains challenging, particularly since these architectures rely on +positional or structural encodings for their expressivity and predictive +performance. To address this, we show that the recently proposed Edge +Transformer, a global attention model operating on node pairs instead of nodes, +has at least 3-WL expressive power. Empirically, we demonstrate that the Edge +Transformer surpasses other theoretically aligned architectures regarding +predictive performance while not relying on positional or structural encodings. + +
+
+
+
+
+ + ♻ ☆ Language Model Training Paradigms for Clinical Feature Embeddings NeurIPS 2023 + + +
+ In research areas with scarce data, representation learning plays a +significant role. This work aims to enhance representation learning for +clinical time series by deriving universal embeddings for clinical features, +such as heart rate and blood pressure. We use self-supervised training +paradigms for language models to learn high-quality clinical feature +embeddings, achieving a finer granularity than existing time-step and +patient-level representation learning. We visualize the learnt embeddings via +unsupervised dimension reduction techniques and observe a high degree of +consistency with prior clinical knowledge. We also evaluate the model +performance on the MIMIC-III benchmark and demonstrate the effectiveness of +using clinical feature embeddings. We publish our code online for replication. + +
+
+ comment: Poster at "NeurIPS 2023 Workshop: Self-Supervised Learning - Theory + and Practice" +
+
+
+
+
+ + ♻ ☆ Critical Data Size of Language Models from a Grokking Perspective + + +
+ We explore the critical data size in language models, a threshold that marks +a fundamental shift from quick memorization to slow generalization. We +formalize the phase transition under the grokking configuration into the Data +Efficiency Hypothesis and identify data insufficiency, sufficiency, and surplus +regimes in language model training dynamics. We develop a grokking +configuration to reproduce grokking on simplistic language models stably by +rescaling initialization and weight decay. We show that generalization occurs +only when language models reach a critical size. We analyze grokking both +sample-wise and model-wise, verifying the proposed data efficiency hypothesis. +Our experiments reveal smoother phase transitions occurring at the critical +dataset size for language datasets. As the model size increases, this critical +point also becomes larger, indicating that larger models require more data. Our +results deepen the understanding of language model training, offering a novel +perspective on the role of data in the learning mechanism of language models. + +
+
+
+
+
+ + ♻ ☆ Online Recommendations for Agents with Discounted Adaptive Preferences ALT 2024 + + +
+ We consider a bandit recommendations problem in which an agent's preferences +(representing selection probabilities over recommended items) evolve as a +function of past selections, according to an unknown $\textit{preference +model}$. In each round, we show a menu of $k$ items (out of $n$ total) to the +agent, who then chooses a single item, and we aim to minimize regret with +respect to some $\textit{target set}$ (a subset of the item simplex) for +adversarial losses over the agent's choices. Extending the setting from Agarwal +and Brown (2022), where uniform-memory agents were considered, here we allow +for non-uniform memory in which a discount factor is applied to the agent's +memory vector at each subsequent round. In the "long-term memory" regime (when +the effective memory horizon scales with $T$ sublinearly), we show that +efficient sublinear regret is obtainable with respect to the set of +$\textit{everywhere instantaneously realizable distributions}$ (the "EIRD set", +as formulated in prior work) for any $\textit{smooth}$ preference model. +Further, for preferences which are bounded above and below by linear functions +of memory weight (we call these "scale-bounded" preferences) we give an +algorithm which obtains efficient sublinear regret with respect to nearly the +$\textit{entire}$ item simplex. We show an NP-hardness result for expanding to +targets beyond EIRD in general. In the "short-term memory" regime (when the +memory horizon is constant), we show that scale-bounded preferences again +enable efficient sublinear regret for nearly the entire simplex even without +smoothness if losses do not change too frequently, yet we show an +information-theoretic barrier for competing against the EIRD set under +arbitrary smooth preference models even when losses are constant. + +
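A minimal sketch of the discounted-memory dynamics in this setup: the agent's memory vector over the n items decays by a factor gamma each round and accumulates the chosen item. The softmax preference model and menu size below are illustrative stand-ins for the unknown preference model.

```python
# Discounted-memory preference dynamics over n items.
import numpy as np

rng = np.random.default_rng(0)
n, k, gamma, T = 5, 3, 0.9, 100
memory = np.zeros(n)

for t in range(T):
    prefs = np.exp(memory) / np.exp(memory).sum()   # toy preference model
    menu = rng.choice(n, size=k, replace=False)     # show k of n items
    p = prefs[menu] / prefs[menu].sum()             # restrict to the menu
    choice = rng.choice(menu, p=p)                  # agent picks one item
    memory = gamma * memory                         # discount past memory
    memory[choice] += 1.0                           # accumulate the pick
print(memory)
```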
+
+ comment: Updates for camera-ready version (ALT 2024) +
+
+
+
+
+ + ♻ ☆ OHQ: On-chip Hardware-aware Quantization + + +
+ Quantization emerges as one of the most promising approaches for deploying +advanced deep models on resource-constrained hardware. Mixed-precision +quantization leverages multiple bit-width architectures to unleash the accuracy +and efficiency potential of quantized models. However, existing mixed-precision +quantization suffers from an exhaustive search space that causes immense computational +overhead. The quantization process thus relies on separate high-performance +devices rather than running locally, which also leads to a significant gap between the +considered hardware metrics and the real deployment. In this paper, we propose +an On-chip Hardware-aware Quantization (OHQ) framework that performs +hardware-aware mixed-precision quantization without accessing online devices. +First, we construct the On-chip Quantization Awareness (OQA) pipeline, enabling +it to perceive the actual efficiency metrics of the quantization operator on the +hardware. Second, we propose the Mask-guided Quantization Estimation (MQE) technique +to efficiently estimate the accuracy metrics of operators under the constraints +of on-chip-level computing power. By synthesizing network and hardware insights +through linear programming, we obtain optimized bit-width configurations. +Notably, the quantization process occurs on-chip entirely without any +additional computing devices and data access. We demonstrate accelerated +inference after quantization for various architectures and compression ratios, +achieving 70% and 73% accuracy for ResNet-18 and MobileNetV3, respectively. OHQ +improves latency by 15~30% compared to INT8 on deployment. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ The Landscape and Challenges of HPC Research and LLMs + + +
+ Recently, language models (LMs), especially large language models (LLMs), +have revolutionized the field of deep learning. Both encoder-decoder models and +prompt-based techniques have shown immense potential for natural language +processing and code-based tasks. Over the past several years, many research +labs and institutions have invested heavily in high-performance computing, +approaching or breaching exascale performance levels. In this paper, we posit +that adapting and utilizing such language model-based techniques for tasks in +high-performance computing (HPC) would be very beneficial. This study presents +our reasoning behind the aforementioned position and highlights how existing +ideas can be improved and adapted for HPC tasks. + +
+
+
+
+
+ + ♻ ☆ DECODE: Data-driven Energy Consumption Prediction leveraging Historical + Data and Environmental Factors in Buildings + + +
+ Energy prediction in buildings plays a crucial role in effective energy +management. Precise predictions are essential for achieving optimal energy +consumption and distribution within the grid. This paper introduces a Long +Short-Term Memory (LSTM) model designed to forecast building energy consumption +using historical energy data, occupancy patterns, and weather conditions. The +LSTM model provides accurate short, medium, and long-term energy predictions +for residential and commercial buildings compared to existing prediction +models. We compare our LSTM model with established prediction methods, +including linear regression, decision trees, and random forest. Encouragingly, +the proposed LSTM model emerges as the superior performer across all metrics. +It demonstrates exceptional prediction accuracy, boasting the highest R2 score +of 0.97 and the most favorable mean absolute error (MAE) of 0.007. An +additional advantage of our developed model is its capacity to achieve +efficient energy consumption forecasts even when trained on a limited dataset. +We address concerns about overfitting (variance) and underfitting (bias) +through rigorous training and evaluation on real-world data. In summary, our +research contributes to energy prediction by offering a robust LSTM model that +outperforms alternative methods and operates with remarkable efficiency, +generalizability, and reliability. + +
+
+ comment: 10 pages, 7 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Learners for Correction of AI Errors with Provable + Performance Guarantees + + +
+ We present a new methodology for handling AI errors by introducing weakly +supervised AI error correctors with a priori performance guarantees. These AI +correctors are auxiliary maps whose role is to moderate the decisions of some +previously constructed underlying classifier by either approving or rejecting +its decisions. The rejection of a decision can be used as a signal to suggest +abstaining from making a decision. A key technical focus of the work is in +providing performance guarantees for these new AI correctors through bounds on +the probabilities of incorrect decisions. These bounds are distribution +agnostic and do not rely on assumptions on the data dimension. Our empirical +example illustrates how the framework can be applied to improve the performance +of an image classifier in a challenging real-world task where training data are +scarce. + +
+
+
+
+
+ + ♻ ☆ LASER: Linear Compression in Wireless Distributed Optimization + + +
+ Data-parallel SGD is the de facto algorithm for distributed optimization, +especially for large scale machine learning. Despite its merits, communication +bottleneck is one of its persistent issues. Most compression schemes to +alleviate this either assume noiseless communication links, or fail to achieve +good performance on practical tasks. In this paper, we close this gap and +introduce LASER: LineAr CompreSsion in WirEless DistRibuted Optimization. LASER +capitalizes on the inherent low-rank structure of gradients and transmits them +efficiently over the noisy channels. Whilst enjoying theoretical guarantees +similar to those of the classical SGD, LASER shows consistent gains over +baselines on a variety of practical benchmarks. In particular, it outperforms +the state-of-the-art compression schemes on challenging computer vision and GPT +language modeling tasks. On the latter, we obtain $50$-$64 \%$ improvement in +perplexity over our baselines for noisy channels. + +
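A minimal sketch of the low-rank gradient compression LASER capitalizes on, per the abstract: the sender transmits a rank-r factorization of the gradient matrix instead of the full matrix. The SVD-based factorization, the rank choice, and the noiseless channel here are simplifying assumptions.

```python
# Rank-r gradient factorization: send P (m x r) and Q (r x n), not G (m x n).
import numpy as np

rng = np.random.default_rng(0)

def compress(G, r):
    U, s, Vt = np.linalg.svd(G, full_matrices=False)
    return U[:, :r] * s[:r], Vt[:r]          # P and Q factors

G = rng.normal(size=(256, 8)) @ rng.normal(size=(8, 128))  # low-rank gradient
P, Q = compress(G, r=8)
G_hat = P @ Q                                # reconstructed at the receiver
ratio = (P.size + Q.size) / G.size
err = np.linalg.norm(G - G_hat) / np.linalg.norm(G)
print(f"compression ratio {ratio:.2f}, relative error {err:.2e}")
```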
+
+
+
+
+ + ♻ ☆ Feudal Graph Reinforcement Learning + + +
+ Graph-based representations and weight-sharing modular policies constitute +prominent approaches to tackling composable control problems in Reinforcement +Learning (RL). However, as shown by recent graph deep learning literature, +message-passing operators can create bottlenecks in information propagation and +hinder global coordination. The issue becomes dramatic in tasks where +high-level planning is needed. In this work, we propose a novel methodology, +named Feudal Graph Reinforcement Learning (FGRL), that addresses such +challenges by relying on hierarchical RL and a pyramidal message-passing +architecture. In particular, FGRL defines a hierarchy of policies where +high-level commands are propagated from the top of the hierarchy down through a +layered graph structure. The bottom layers mimic the morphology of the physical +system, while the upper layers capture more abstract sub-modules. The resulting +agents are then characterized by a committee of policies where actions at a +certain level set goals for the level below, thus implementing a hierarchical +decision-making structure that encompasses task decomposition. We evaluate the +proposed framework on locomotion tasks on benchmark MuJoCo environments and +show that FGRL compares favorably against relevant baselines. Furthermore, an +in-depth analysis of the command propagation mechanism provides evidence that +the introduced message-passing scheme favors the learning of hierarchical +decision-making policies. + +
+
+
+
+
+ + ♻ ☆ Large Language Model Agent for Hyper-Parameter Optimization + + +
+ Hyperparameter optimization is critical in modern machine learning, requiring +expert knowledge, numerous trials, and high computational and human resources. +Despite the advancements in Automated Machine Learning (AutoML), challenges in +terms of trial efficiency, setup complexity, and interoperability still +persist. To address these issues, we introduce a novel paradigm leveraging +Large Language Models (LLMs) to automate hyperparameter optimization across +diverse machine learning tasks, which is named AgentHPO (short for LLM +Agent-based Hyperparameter Optimization). Specifically, AgentHPO processes the +task information autonomously, conducts experiments with specific +hyperparameters (HPs), and iteratively optimizes them based on historical +trials. This human-like optimization process largely reduces the number of +required trials, simplifies the setup process, and enhances interpretability +and user trust, compared to traditional AutoML methods. Extensive empirical +experiments conducted on 12 representative machine-learning tasks indicate that +AgentHPO not only matches but also often surpasses the best human trials in +terms of performance while simultaneously providing explainable results. +Further analysis sheds light on the strategies employed by the LLM in +optimizing these tasks, highlighting its effectiveness and adaptability in +various scenarios. + +
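+ The trial loop described above might look like the following sketch, where
+`ask_llm` is a hypothetical helper standing in for an LLM call and the prompt
+format is an assumption rather than the paper's actual template:
+
+import json
+
+def ask_llm(prompt: str) -> str:
+    raise NotImplementedError  # plug in an LLM client here
+
+def agent_hpo(train_and_eval, search_space, budget=10):
+    history = []
+    for _ in range(budget):
+        prompt = (f"Task: tune hyperparameters within {json.dumps(search_space)}.\n"
+                  f"Past trials: {json.dumps(history)}.\n"
+                  "Reply with a JSON object giving the next configuration.")
+        config = json.loads(ask_llm(prompt))
+        score = train_and_eval(config)            # run one experiment
+        history.append({"config": config, "score": score})
+    return max(history, key=lambda t: t["score"])
+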
+
+
+
+
+ + ♻ ☆ Dual-stage optimizer for systematic overestimation adjustment applied to + multi-objective genetic algorithms for biomarker selection + + +
+ The challenge in biomarker discovery using machine learning from omics data
+lies in the abundance of molecular features but scarcity of samples. Most
+feature selection methods in machine learning require evaluating various sets
+of features (models) to determine the most effective combination. This process,
+typically conducted using a validation dataset, involves testing different
+feature sets to optimize the model's performance. Evaluations carry a
+performance estimation error, and when the selection involves many models, the
+best ones are almost certainly overestimated. Biomarker identification with
+feature selection methods can be addressed as a multi-objective problem with
+trade-offs between predictive ability and parsimony in the number of features.
+Genetic algorithms are a popular tool for multi-objective optimization, but
+they evolve numerous solutions and are thus prone to overestimation. Methods
+have been proposed to reduce the overestimation after a model has already been
+selected in single-objective problems, but no existing algorithm reduces the
+overestimation during the optimization itself, improves model selection, or
+applies to the more general multi-objective domain. We propose DOSA-MO, a novel
+multi-objective optimization wrapper algorithm that learns how the original
+estimation, its variance, and the feature set size of the solutions predict the
+overestimation. DOSA-MO adjusts the expectation of the performance during the
+optimization, improving the composition of the solution set. We verify that
+DOSA-MO improves the performance of a state-of-the-art genetic algorithm on
+left-out or external sample sets, when predicting cancer subtypes and/or
+patient overall survival, using three transcriptomics datasets for kidney and
+breast cancer.
+
+
+ comment: Added a picture with the algorithm steps and a supplementary section + with disambiguation of the technical terms. Moved sections in the + supplementary to shorten the main text. Fixed typos +
+
+
+
+
+ + ♻ ☆ Deep Nonnegative Matrix Factorization with Beta Divergences + + +
+ Deep Nonnegative Matrix Factorization (deep NMF) has recently emerged as a +valuable technique for extracting multiple layers of features across different +scales. However, all existing deep NMF models and algorithms have primarily +centered their evaluation on the least squares error, which may not be the most +appropriate metric for assessing the quality of approximations on diverse +datasets. For instance, when dealing with data types such as audio signals and +documents, it is widely acknowledged that $\beta$-divergences offer a more +suitable alternative. In this paper, we develop new models and algorithms for +deep NMF using some $\beta$-divergences, with a focus on the Kullback-Leibler +divergence. Subsequently, we apply these techniques to the extraction of facial +features, the identification of topics within document collections, and the +identification of materials within hyperspectral images. + +
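+ For a single layer, the classical multiplicative updates for NMF under the
+Kullback-Leibler divergence (beta = 1) read as below; a deep variant stacks
+such factorizations, which this sketch does not attempt:
+
+import numpy as np
+
+def kl_nmf(V, r, iters=200, eps=1e-9):
+    m, n = V.shape
+    rng = np.random.default_rng(0)
+    W = rng.random((m, r)) + eps
+    H = rng.random((r, n)) + eps
+    ones = np.ones((m, n))
+    for _ in range(iters):
+        H *= (W.T @ (V / (W @ H + eps))) / (W.T @ ones + eps)
+        W *= ((V / (W @ H + eps)) @ H.T) / (ones @ H.T + eps)
+    return W, H
+
+V = np.abs(np.random.randn(50, 40))   # a nonnegative data matrix
+W, H = kl_nmf(V, r=5)
+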
+
+ comment: 32 pages. We have improved the presentation of the paper, and added + numerical experiments for beta=3/2 with 4 layers on the CBCL data set +
+
+
+
+
+ + ♻ ☆ The Lipschitz-Variance-Margin Tradeoff for Enhanced Randomized Smoothing + + +
+ Real-life applications of deep neural networks are hindered by their unsteady
+predictions when faced with noisy inputs and adversarial attacks. The certified
+radius is in this context a crucial indicator of the robustness of models.
+However, how can one design an efficient classifier with an associated
+certified radius? Randomized smoothing provides a promising framework by
+relying on noise injection into the inputs to obtain a smoothed and robust
+classifier. In this paper, we first show that the variance introduced by
+Monte-Carlo sampling in the randomized smoothing estimate closely interacts
+with two other important properties of the classifier, \textit{i.e.}, its
+Lipschitz constant and margin. More precisely, our work emphasizes the dual
+impact of the Lipschitz constant of the base classifier, on both the smoothed
+classifier and the empirical variance. Moreover, to increase the certified
+robust radius, we introduce a different way to convert logits to probability
+vectors for the base classifier to leverage the variance-margin trade-off. We
+leverage Bernstein's concentration inequality along with enhanced Lipschitz
+bounds for randomized smoothing. Experimental results show a significant
+improvement in certified accuracy compared to current state-of-the-art methods.
+Our novel certification procedure allows us to use pre-trained models with
+randomized smoothing, effectively improving the current certification radius in
+a zero-shot manner.
+
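+ The Monte-Carlo estimate at the heart of randomized smoothing is simple to
+write down; its variance is exactly the quantity the paper ties to the base
+classifier's Lipschitz constant and margin. Sizes and the soft-voting choice
+below are illustrative:
+
+import torch
+
+def smoothed_predict(f, x, sigma=0.25, n=1000):
+    """f maps a batch of inputs to logits; returns the class with the
+    largest average probability under Gaussian input noise."""
+    noise = sigma * torch.randn((n,) + x.shape)
+    probs = torch.softmax(f(x.unsqueeze(0) + noise), dim=-1)
+    return probs.mean(dim=0).argmax().item()
+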
+
+
+
+
+ + ♻ ☆ Sampling in Unit Time with Kernel Fisher-Rao Flow + + +
+ We introduce a new mean-field ODE and corresponding interacting particle
+systems (IPS) for sampling from an unnormalized target density. The IPS are
+gradient-free, available in closed form, and only require the ability to sample
+from a reference density and compute the (unnormalized) target-to-reference
+density ratio. The mean-field ODE is obtained by solving a Poisson equation for
+a velocity field that transports samples along the geometric mixture of the two
+densities, which is the path of a particular Fisher-Rao gradient flow. We
+employ an RKHS ansatz for the velocity field, which makes the Poisson equation
+tractable and enables discretization of the resulting mean-field ODE over
+finite samples. The mean-field ODE can additionally be derived from a
+discrete-time perspective as the limit of successive linearizations of the
+Monge-Amp\`ere equations within a framework known as sample-driven optimal
+transport. We introduce a stochastic variant of our approach and demonstrate
+empirically that our IPS can produce high-quality samples from varied target
+distributions, outperforming comparable gradient-free particle systems and
+remaining competitive with gradient-based alternatives.
+
+
+ comment: Updated with additional numerical examples and a stochastic variant + of the approach +
+
+
+
+
+ + ♻ ☆ Theoretical Error Analysis of Entropy Approximation for Gaussian Mixture + + +
+ Gaussian mixture distributions are commonly employed to represent general
+probability distributions. Despite the importance of using Gaussian mixtures
+for uncertainty estimation, the entropy of a Gaussian mixture cannot be
+analytically calculated. Notably, Gal and Ghahramani [2016] proposed an
+approximate entropy given by the sum of the entropies of the unimodal Gaussian
+components. This approximation is easy to calculate analytically regardless of
+dimension, but it lacks theoretical guarantees. In this paper, we theoretically
+analyze the approximation error between the true entropy and the approximate
+one to reveal when this approximation works effectively. This error is
+controlled by how far apart the Gaussian components of the mixture are. To
+measure such separation, we introduce the ratios of the distances between the
+component means to the sums of the component variances, and we reveal that the
+error converges to zero as the ratios tend to infinity. This situation is more
+likely to occur in higher-dimensional spaces. Therefore, our results provide a
+guarantee that the approximation works well in high-dimensional problems,
+particularly in scenarios such as neural networks that involve a large number
+of weights.
+
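+ Concretely, for a mixture $p(x)=\sum_k \pi_k\,\mathcal{N}(x;\mu_k,\Sigma_k)$,
+the approximation described above replaces the intractable entropy with a
+(weighted) sum of component entropies, each available in closed form:
+
+$$\hat{H}(p) = \sum_k \pi_k\, H\big(\mathcal{N}(\mu_k,\Sigma_k)\big), \qquad
+H\big(\mathcal{N}(\mu,\Sigma)\big) = \tfrac{1}{2}\log\big((2\pi e)^d \det\Sigma\big).$$
+
+The separation ratios then take a form like
+$\|\mu_i-\mu_j\| / (\sigma_i^2+\sigma_j^2)$; the exact normalization is the
+paper's, and the error vanishes as these ratios tend to infinity.
+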
+
+ comment: 34 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Approaching an unknown communication system by latent space exploration + and causal inference + + +
+ This paper proposes a methodology for discovering meaningful properties in +data by exploring the latent space of unsupervised deep generative models. We +combine manipulation of individual latent variables to extreme values with +methods inspired by causal inference into an approach we call causal +disentanglement with extreme values (CDEV) and show that this method yields +insights for model interpretability. With this, we can test for what properties +of unknown data the model encodes as meaningful, using it to glean insight into +the communication system of sperm whales (Physeter macrocephalus), one of the +most intriguing and understudied animal communication systems. The network +architecture used has been shown to learn meaningful representations of speech; +here, it is used as a learning mechanism to decipher the properties of another +vocal communication system in which case we have no ground truth. The proposed +methodology suggests that sperm whales encode information using the number of +clicks in a sequence, the regularity of their timing, and audio properties such +as the spectral mean and the acoustic regularity of the sequences. Some of +these findings are consistent with existing hypotheses, while others are +proposed for the first time. We also argue that our models uncover rules that +govern the structure of units in the communication system and apply them while +generating innovative data not shown during training. This paper suggests that +an interpretation of the outputs of deep neural networks with causal inference +methodology can be a viable strategy for approaching data about which little is +known and presents another case of how deep learning can limit the hypothesis +space. Finally, the proposed approach can be extended to other architectures +and datasets. + +
+
+ comment: 25 pages, 23 figures; new format and section layout (moved some + sections to the appendix), added replication experiments, updated references: + to a subsequent experimental validation of the work, as well as to related + methodological work +
+
+
+
+
+ + ♻ ☆ HarmonyDream: Task Harmonization Inside World Models + + +
+ Model-based reinforcement learning (MBRL) holds the promise of +sample-efficient learning by utilizing a world model, which models how the +environment works and typically encompasses components for two tasks: +observation modeling and reward modeling. In this paper, through a dedicated +empirical investigation, we gain a deeper understanding of the role each task +plays in world models and uncover the overlooked potential of sample-efficient +MBRL by mitigating the domination of either observation or reward modeling. Our +key insight is that while prevalent approaches of explicit MBRL attempt to +restore abundant details of the environment via observation models, it is +difficult due to the environment's complexity and limited model capacity. On +the other hand, reward models, while dominating implicit MBRL and adept at +learning compact task-centric dynamics, are inadequate for sample-efficient +learning without richer learning signals. Motivated by these insights and +discoveries, we propose a simple yet effective approach, HarmonyDream, which +automatically adjusts loss coefficients to maintain task harmonization, i.e. a +dynamic equilibrium between the two tasks in world model learning. Our +experiments show that the base MBRL method equipped with HarmonyDream gains +10%-69% absolute performance boosts on visual robotic tasks and sets a new +state-of-the-art result on the Atari 100K benchmark. + +
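+ A sketch of harmonized loss weighting with learnable coefficients is below;
+it uses the classic uncertainty-weighting form, and HarmonyDream's exact
+parameterization of the harmonizers may differ:
+
+import torch
+
+log_sigma = torch.zeros(2, requires_grad=True)   # one coefficient per task
+
+def harmonized_loss(loss_obs, loss_rew):
+    losses = torch.stack([loss_obs, loss_rew])
+    # scale each task loss by a learnable 1/sigma, regularize with log sigma
+    return (losses / log_sigma.exp() + log_sigma).sum()
+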
+
+
+
+
+ + ♻ ☆ IM-META: Influence Maximization Using Node Metadata in Networks With + Unknown Topology + + +
+ Since the structure of complex networks is often unknown, we may identify the
+most influential seed nodes by exploring only a part of the underlying network,
+given a small budget for node queries. We propose IM-META, a solution to
+influence maximization (IM) in networks with unknown topology by retrieving
+information from queries and node metadata. Since using such metadata is not
+without risk, due to its noisy nature and the uncertainties in connectivity
+inference, we formulate a new IM problem that aims to find both seed nodes and
+queried nodes. In IM-META, we develop an effective method that iteratively
+performs three steps: 1) we learn the relationship between collected metadata
+and edges via a Siamese neural network, 2) we select a number of inferred
+confident edges to construct a reinforced graph, and 3) we identify the next
+node to query by maximizing the inferred influence spread using our
+topology-aware ranking strategy. Through experimental evaluation of IM-META on
+four real-world datasets, we demonstrate a) the speed of network exploration
+via node queries, b) the effectiveness of each module, c) the superiority over
+benchmark methods, d) the robustness to more difficult settings, e) the
+hyperparameter sensitivity, and f) the scalability.
+
+
+ comment: 14 pages, 11 figures, 4 tables, to appear in the IEEE Transactions on + Network Science and Engineering (Please cite our journal version that will + appear in an upcoming issue.) +
+
+
+
+
+ + ♻ ☆ Variational Representations of Annealing Paths: Bregman Information + under Monotonic Embedding + + +
+ Markov Chain Monte Carlo methods for sampling from complex distributions and +estimating normalization constants often simulate samples from a sequence of +intermediate distributions along an annealing path, which bridges between a +tractable initial distribution and a target density of interest. Prior works +have constructed annealing paths using quasi-arithmetic means, and interpreted +the resulting intermediate densities as minimizing an expected divergence to +the endpoints. To analyze these variational representations of annealing paths, +we extend known results showing that the arithmetic mean over arguments +minimizes the expected Bregman divergence to a single representative point. In +particular, we obtain an analogous result for quasi-arithmetic means, when the +inputs to the Bregman divergence are transformed under a monotonic embedding +function. Our analysis highlights the interplay between quasi-arithmetic means, +parametric families, and divergence functionals using the rho-tau +representational Bregman divergence framework, and associates common divergence +functionals with intermediate densities along an annealing path. + +
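+ For reference, a quasi-arithmetic mean of the endpoint densities
+$\pi_0,\pi_1$ under a monotonic embedding $f$ defines the intermediate
+densities
+
+$$\pi_\beta(x) \propto f^{-1}\big((1-\beta)\, f(\pi_0(x)) + \beta\, f(\pi_1(x))\big),
+\qquad \beta\in[0,1],$$
+
+which recovers the familiar geometric annealing path
+$\pi_\beta \propto \pi_0^{1-\beta}\pi_1^{\beta}$ when $f=\log$.
+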
+
+ comment: Published in Information Geometry (Info. Geo. 2024) +
+
+
+
+
+ + ♻ ☆ Inverse Approximation Theory for Nonlinear Recurrent Neural Networks + + +
+ We prove an inverse approximation theorem for the approximation of nonlinear
+sequence-to-sequence relationships using recurrent neural networks (RNNs). This
+is a so-called Bernstein-type result in approximation theory, which deduces
+properties of a target function under the assumption that it can be effectively
+approximated by a hypothesis space. In particular, we show that nonlinear
+sequence relationships that can be stably approximated by nonlinear RNNs must
+have an exponentially decaying memory structure - a notion that can be made
+precise. This extends the previously identified curse of memory in linear RNNs
+to the general nonlinear setting, and quantifies the essential limitations of
+the RNN architecture for learning sequential relationships with long-term
+memory. Based on the analysis, we propose a principled reparameterization
+method to overcome the limitations. Our theoretical results are confirmed by
+numerical experiments. The code is available at
+https://github.com/radarFudan/Curse-of-memory
+
+
+
+
+
+ + ♻ ☆ Smooth, exact rotational symmetrization for deep learning on point + clouds + + +
+ Point clouds are versatile representations of 3D objects and have found +widespread application in science and engineering. Many successful +deep-learning models have been proposed that use them as input. The domain of +chemical and materials modeling is especially challenging because exact +compliance with physical constraints is highly desirable for a model to be +usable in practice. These constraints include smoothness and invariance with +respect to translations, rotations, and permutations of identical atoms. If +these requirements are not rigorously fulfilled, atomistic simulations might +lead to absurd outcomes even if the model has excellent accuracy. Consequently, +dedicated architectures, which achieve invariance by restricting their design +space, have been developed. General-purpose point-cloud models are more varied +but often disregard rotational symmetry. We propose a general symmetrization +method that adds rotational equivariance to any given model while preserving +all the other requirements. Our approach simplifies the development of better +atomic-scale machine-learning schemes by relaxing the constraints on the design +space and making it possible to incorporate ideas that proved effective in +other domains. We demonstrate this idea by introducing the Point Edge +Transformer (PET) architecture, which is not intrinsically equivariant but +achieves state-of-the-art performance on several benchmark datasets of +molecules and solids. A-posteriori application of our general protocol makes +PET exactly equivariant, with minimal changes to its accuracy. + +
+
+ comment: Enhancing figures; minor polishing +
+
+
+
+
+ + ♻ ☆ Like an Open Book? Read Neural Network Architecture with Simple Power + Analysis on 32-bit Microcontrollers + + +
+ Model extraction is a growing concern for the security of AI systems. For
+deep neural network models, the architecture is the most important information
+an adversary aims to recover. Being a sequence of repeated computation blocks,
+neural network models deployed on edge devices will generate distinctive
+side-channel leakages. The latter can be exploited to extract critical
+information when targeted platforms are physically accessible. By combining
+theoretical knowledge about deep learning practices and analysis of a
+widespread implementation library (ARM CMSIS-NN), our purpose is to answer this
+critical question: how far can we extract architecture information by simply
+examining an EM side-channel trace? For the first time, we propose an
+extraction methodology for traditional MLP and CNN models running on a high-end
+32-bit microcontroller (Cortex-M7) that relies only on simple pattern
+recognition analysis. Despite a few challenging cases, we claim that, contrary
+to parameter extraction, the complexity of the attack is relatively low, and we
+highlight the urgent need for practicable protections that could fit the strong
+memory and latency requirements of such platforms.
+
+
+ comment: Accepted CARDIS 2023; ANR PICTURE PROJECT (ANR-20-CE39-0013) +
+
+
+
+
+ + ♻ ☆ Data-induced multiscale losses and efficient multirate gradient descent + schemes + + +
+ This paper investigates the impact of multiscale data on machine learning +algorithms, particularly in the context of deep learning. A dataset is +multiscale if its distribution shows large variations in scale across different +directions. This paper reveals multiscale structures in the loss landscape, +including its gradients and Hessians inherited from the data. Correspondingly, +it introduces a novel gradient descent approach, drawing inspiration from +multiscale algorithms used in scientific computing. This approach seeks to +transcend empirical learning rate selection, offering a more systematic, +data-informed strategy to enhance training efficiency, especially in the later +stages. + +
+
+ comment: 28 pages, 4 figures, submitted under review +
+
+
+
+
+ + ♻ ☆ RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object + Detection Systems + + +
+ In autonomous driving, LiDAR and radar are crucial for environmental
+perception. LiDAR offers precise 3D spatial sensing information but struggles
+in adverse weather like fog. Conversely, radar signals can penetrate rain or
+mist due to their specific wavelength but are prone to noise disturbances.
+Recent state-of-the-art works reveal that the fusion of radar and LiDAR can
+lead to robust detection in adverse weather. The existing works adopt
+convolutional neural network architectures to extract features from each
+sensor's data, then align and aggregate the two branch features to predict
+object detection results. However, these methods yield low accuracy in the
+predicted bounding boxes due to simple label assignment and fusion strategies.
+In this paper, we propose a bird's-eye view fusion learning-based anchor
+box-free object detection system, which fuses the feature derived from the
+radar range-azimuth heatmap and the LiDAR point cloud to estimate possible
+objects. Different label assignment strategies have been designed to facilitate
+the consistency between the classification of foreground or background anchor
+points and the corresponding bounding box regressions. Furthermore, the
+performance of the proposed object detector is further enhanced by employing a
+novel interactive transformer module. The superior performance of the methods
+proposed in this paper has been demonstrated using the recently published
+Oxford Radar RobotCar dataset. Our system's average precision significantly
+outperforms the state-of-the-art method by 13.1% and 19.0% at Intersection
+over Union (IoU) of 0.8 under 'Clear+Foggy' training conditions for 'Clear' and
+'Foggy' testing, respectively.
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Momentum Particle Maximum Likelihood + + +
+ Maximum likelihood estimation (MLE) of latent variable models is often recast +as an optimization problem over the extended space of parameters and +probability distributions. For example, the Expectation Maximization (EM) +algorithm can be interpreted as coordinate descent applied to a suitable free +energy functional over this space. Recently, this perspective has been combined +with insights from optimal transport and Wasserstein gradient flows to develop +particle-based algorithms applicable to wider classes of models than standard +EM. + Drawing inspiration from prior works which interpret `momentum-enriched' +optimisation algorithms as discretizations of ordinary differential equations, +we propose an analogous dynamical systems-inspired approach to minimizing the +free energy functional over the extended space of parameters and probability +distributions. The result is a dynamic system that blends elements of +Nesterov's Accelerated Gradient method, the underdamped Langevin diffusion, and +particle methods. + Under suitable assumptions, we establish quantitative convergence of the +proposed system to the unique minimiser of the functional in continuous time. +We then propose a numerical discretization of this system which enables its +application to parameter estimation in latent variable models. Through +numerical experiments, we demonstrate that the resulting algorithm converges +faster than existing methods and compares favourably with other (approximate) +MLE algorithms. + +
+
+
+
+
+ + ♻ ☆ Diffusion Models, Image Super-Resolution And Everything: A Survey + + +
+ Diffusion Models (DMs) have disrupted the image Super-Resolution (SR) field +and further closed the gap between image quality and human perceptual +preferences. They are easy to train and can produce very high-quality samples +that exceed the realism of those produced by previous generative methods. +Despite their promising results, they also come with new challenges that need +further research: high computational demands, comparability, lack of +explainability, color shifts, and more. Unfortunately, entry into this field is +overwhelming because of the abundance of publications. To address this, we +provide a unified recount of the theoretical foundations underlying DMs applied +to image SR and offer a detailed analysis that underscores the unique +characteristics and methodologies within this domain, distinct from broader +existing reviews in the field. This survey articulates a cohesive understanding +of DM principles and explores current research avenues, including alternative +input domains, conditioning techniques, guidance mechanisms, corruption spaces, +and zero-shot learning approaches. By offering a detailed examination of the +evolution and current trends in image SR through the lens of DMs, this survey +sheds light on the existing challenges and charts potential future directions, +aiming to inspire further innovation in this rapidly advancing area. + +
+
+
+
+
+ + ♻ ☆ Grounding Foundation Models through Federated Transfer Learning: A + General Framework + + +
+ Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and +powerful emergent abilities have achieved remarkable success in various natural +language processing and computer vision tasks. Grounding FMs by adapting them +to domain-specific tasks or augmenting them with domain-specific knowledge +enables us to exploit the full potential of FMs. However, grounding FMs faces +several challenges, stemming primarily from constrained computing resources, +data privacy, model heterogeneity, and model ownership. Federated Transfer +Learning (FTL), the combination of federated learning and transfer learning, +provides promising solutions to address these challenges. In recent years, the +need for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in +both academia and industry. Motivated by the strong growth in FTL-FM research +and the potential impact of FTL-FM on industrial applications, we propose an +FTL-FM framework that formulates problems of grounding FMs in the federated +learning setting, construct a detailed taxonomy based on the FTL-FM framework +to categorize state-of-the-art FTL-FM works, and comprehensively overview +FTL-FM works based on the proposed taxonomy. We also establish correspondences +between FTL-FM and conventional phases of adapting FM so that FM practitioners +can align their research works with FTL-FM. In addition, we overview advanced +efficiency-improving and privacy-preserving techniques because efficiency and +privacy are critical concerns in FTL-FM. Last, we discuss opportunities and +future research directions of FTL-FM. + +
+
+ comment: In progress. fixed some typos, errors, and revised the text a little + bit +
+
+
+
+
+ + ♻ ☆ Order Optimal Bounds for One-Shot Federated Learning over non-Convex + Loss Functions + + +
+ We consider the problem of federated learning in a one-shot setting in which +there are $m$ machines, each observing $n$ sample functions from an unknown +distribution on non-convex loss functions. Let $F:[-1,1]^d\to\mathbb{R}$ be the +expected loss function with respect to this unknown distribution. The goal is +to find an estimate of the minimizer of $F$. Based on its observations, each +machine generates a signal of bounded length $B$ and sends it to a server. The +server collects signals of all machines and outputs an estimate of the +minimizer of $F$. We show that the expected loss of any algorithm is lower +bounded by $\max\big(1/(\sqrt{n}(mB)^{1/d}), 1/\sqrt{mn}\big)$, up to a +logarithmic factor. We then prove that this lower bound is order optimal in $m$ +and $n$ by presenting a distributed learning algorithm, called Multi-Resolution +Estimator for Non-Convex loss function (MRE-NC), whose expected loss matches +the lower bound for large $mn$ up to polylogarithmic factors. + +
+
+
+
+
+ + ♻ ☆ Hilbert Curve Projection Distance for Distribution Comparison + + +
+ Distribution comparison plays a central role in many machine learning tasks
+like data classification and generative modeling. In this study, we propose a
+novel metric, called Hilbert curve projection (HCP) distance, to measure the
+distance between two probability distributions with low complexity. In
+particular, we first project two high-dimensional probability distributions
+using Hilbert curve to obtain a coupling between them, and then calculate the
+transport distance between these two distributions in the original space,
+according to the coupling. We show that HCP distance is a proper metric and is
+well-defined for probability measures with bounded supports. Furthermore, we
+demonstrate that the modified empirical HCP distance with the $L_p$ cost in the
+$d$-dimensional space converges to its population counterpart at a rate of no
+more than $O(n^{-1/(2\max\{d,p\})})$. To suppress the curse-of-dimensionality,
+we also develop two variants of the HCP distance using (learnable) subspace
+projections. Experiments on both synthetic and real-world data show that our
+HCP distance works as an effective surrogate of the Wasserstein distance with
+low complexity and overcomes the drawbacks of the sliced Wasserstein distance.
+
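+ A rough sketch of the projection step, assuming the third-party
+`hilbertcurve` package and equal sample sizes; the real HCP construction
+(coupling and costs) is more careful than this:
+
+import numpy as np
+from hilbertcurve.hilbertcurve import HilbertCurve
+
+def hcp_distance(X, Y, p=8, order=2):
+    d = X.shape[1]
+    hc = HilbertCurve(p, d)
+    def hilbert_order(Z):
+        lo, hi = Z.min(0), Z.max(0)
+        grid = ((Z - lo) / (hi - lo + 1e-12) * (2 ** p - 1)).astype(int)
+        return np.argsort(hc.distances_from_points(grid.tolist()))
+    Xs, Ys = X[hilbert_order(X)], Y[hilbert_order(Y)]  # couple by curve order
+    return (np.linalg.norm(Xs - Ys, axis=1) ** order).mean() ** (1 / order)
+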
+
+ comment: 33 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Cool-chic video: Learned video coding with 800 parameters + + +
+ We propose a lightweight learned video codec with 900 multiplications per
+decoded pixel and 800 parameters overall. To the best of our knowledge, this is
+one of the neural video codecs with the lowest decoding complexity. It is built
+upon the overfitted image codec Cool-chic and supplements it with an inter
+coding module to leverage the video's temporal redundancies. The proposed model
+is able to compress videos using both low-delay and random access
+configurations and achieves rate-distortion close to AVC while outperforming
+other overfitted codecs such as FFNeRV. The system is made open-source:
+orange-opensource.github.io/Cool-Chic.
+
+
+ comment: 10 pages, published in Data Compression Conference 2024 +
+
+
+
+
+ + ♻ ☆ Causal Fair Metric: Bridging Causality, Individual Fairness, and + Adversarial Robustness + + +
+ Despite the essential need for comprehensive considerations in responsible
+AI, factors like robustness, fairness, and causality are often studied in
+isolation. Adversarial perturbation, used to identify vulnerabilities in
+models, and individual fairness, which aims for equitable treatment of similar
+individuals despite initial differences, both depend on metrics to generate
+comparable input data instances. Previous attempts to define such joint metrics
+often lack general assumptions about data or structural causal models and were
+unable to reflect counterfactual proximity. To address this, our paper
+introduces a causal fair metric formulated on the basis of causal structures
+encompassing sensitive attributes and protected causal perturbation. To enhance
+the practicality of our metric, we propose metric learning as a method for
+metric estimation and deployment in real-world problems in the absence of
+structural causal models. We also demonstrate the application of our novel
+metric in classifiers. Empirical evaluation of real-world and synthetic
+datasets illustrates the effectiveness of our proposed metric in achieving an
+accurate classifier with fairness, resilience to adversarial perturbations, and
+a nuanced understanding of causal relationships.
+
+
+
+
+
+ + ♻ ☆ Scaling Transformer to 1M tokens and beyond with RMT + + +
+ A major limitation for the broader scope of problems solvable by transformers +is the quadratic scaling of computational complexity with input size. In this +study, we investigate the recurrent memory augmentation of pre-trained +transformer models to extend input context length while linearly scaling +compute. Our approach demonstrates the capability to store information in +memory for sequences of up to an unprecedented two million tokens while +maintaining high retrieval accuracy. Experiments with language modeling tasks +show perplexity improvement as the number of processed input segments +increases. These results underscore the effectiveness of our method, which has +significant potential to enhance long-term dependency handling in natural +language understanding and generation tasks, as well as enable large-scale +context processing for memory-intensive applications. + +
+
+
+
+
+ + ♻ ☆ PAC-Bayes-Chernoff bounds for unbounded losses + + +
+ We introduce a new PAC-Bayes oracle bound for unbounded losses. This result
+can be understood as a PAC-Bayesian version of the Cram\'er-Chernoff bound. The
+proof technique relies on controlling the tails of certain random variables
+involving the Cram\'er transform of the loss. We highlight several applications
+of the main theorem. First, we show that our result naturally allows exact
+optimization of the free parameter on many PAC-Bayes bounds. Second, we recover
+and generalize previous results. Finally, we show that our approach allows
+working with richer assumptions that result in more informative and potentially
+tighter bounds. In this direction, we provide a general bound under a new
+"model-dependent bounded CGF" assumption from which we obtain bounds based on
+parameter norms and log-Sobolev inequalities. All these bounds can be minimized
+to obtain novel posteriors.
+
+
+ comment: Updated Section 5 +
+
+
+
+
+ + ♻ ☆ Vanilla Bayesian Optimization Performs Great in High Dimensions + + +
+ High-dimensional problems have long been considered the Achilles' heel of +Bayesian optimization algorithms. Spurred by the curse of dimensionality, a +large collection of algorithms aim to make it more performant in this setting, +commonly by imposing various simplifying assumptions on the objective. In this +paper, we identify the degeneracies that make vanilla Bayesian optimization +poorly suited to high-dimensional tasks, and further show how existing +algorithms address these degeneracies through the lens of lowering the model +complexity. Moreover, we propose an enhancement to the prior assumptions that +are typical to vanilla Bayesian optimization algorithms, which reduces the +complexity to manageable levels without imposing structural restrictions on the +objective. Our modification - a simple scaling of the Gaussian process +lengthscale prior with the dimensionality - reveals that standard Bayesian +optimization works drastically better than previously thought in high +dimensions, clearly outperforming existing state-of-the-art algorithms on +multiple commonly considered real-world high-dimensional tasks. + +
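+ One way to realize "scale the lengthscale prior with dimensionality" is to
+shift a log-normal prior's location by log(sqrt(d)), so typical prior
+lengthscales grow like sqrt(d); the constants below are illustrative, not the
+paper's recommended values:
+
+import math
+import numpy as np
+
+def lengthscale_prior(d, base_loc=0.0, scale=1.0):
+    # (loc, scale) of a LogNormal prior over the GP lengthscale
+    return base_loc + 0.5 * math.log(d), scale
+
+rng = np.random.default_rng(0)
+for d in (2, 20, 200):
+    loc, scale = lengthscale_prior(d)
+    draws = rng.lognormal(mean=loc, sigma=scale, size=10_000)
+    print(d, float(np.median(draws)))  # median grows like sqrt(d)
+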
+
+
+
+
+ + ♻ ☆ Generative Modeling through the Semi-dual Formulation of Unbalanced + Optimal Transport + + +
+ The Optimal Transport (OT) problem investigates a transport map that bridges
+two distributions while minimizing a given cost function. In this regard, OT
+between a tractable prior distribution and the data has been utilized for
+generative modeling tasks. However, OT-based methods are susceptible to
+outliers and face optimization challenges during training. In this paper, we
+propose a novel generative model based on the semi-dual formulation of
+Unbalanced Optimal Transport (UOT). Unlike OT, UOT relaxes the hard constraint
+on distribution matching. This approach provides better robustness against
+outliers, stability during training, and faster convergence. We validate these
+properties empirically through experiments. Moreover, we study the theoretical
+upper bound of the divergence between distributions in UOT. Our model
+outperforms existing OT-based generative models, achieving FID scores of 2.97
+on CIFAR-10 and 6.36 on CelebA-HQ-256. The code is available at
+\url{https://github.com/Jae-Moo/UOTM}.
+
+
+ comment: 23 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ LPNL: Scalable Link Prediction with Large Language Models + + +
+ Exploring the application of large language models (LLMs) to graph learning
+is an emerging endeavor. However, the vast amount of information inherent in
+large graphs poses significant challenges to this process. This work focuses on
+the link prediction task and introduces $\textbf{LPNL}$ (Link Prediction via
+Natural Language), a framework based on large language models designed for
+scalable link prediction on large-scale heterogeneous graphs. We design novel
+prompts for link prediction that articulate graph details in natural language.
+We propose a two-stage sampling pipeline to extract crucial information from
+the graphs, and a divide-and-conquer strategy to control the input tokens
+within predefined limits, addressing the challenge of overwhelming information.
+We fine-tune a T5 model using a self-supervised learning objective designed for
+link prediction. Extensive experimental results demonstrate that LPNL
+outperforms multiple advanced baselines in link prediction tasks on large-scale
+graphs.
+
+
+
+
+
+ + ♻ ☆ Linear Alignment of Vision-language Models for Image Captioning + + +
+ Recently, vision-language models like CLIP have advanced the state of the art +in a variety of multi-modal tasks including image captioning and caption +evaluation. Many approaches adapt CLIP-style models to a downstream task by +training a mapping network between CLIP and a language model. This is costly as +it usually involves calculating gradients for large models. We propose a more +efficient training protocol that fits a linear mapping between image and text +embeddings of CLIP via a closed-form solution. This bypasses the need for +gradient computation and results in a lightweight captioning method called +ReCap, which can be trained up to 1000 times faster than existing lightweight +methods. Moreover, we propose two new learning-based image-captioning metrics +that build on CLIP score along with our linear mapping. Furthermore, we combine +ReCap with our new metrics to design an iterative datastore-augmentation loop +(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k, +VizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art +lightweight methods on established metrics while outperforming them on our new +metrics, which are better aligned with human ratings on Flickr8k-Expert and +Flickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to +other domains and that our DAL leads to a performance boost. + +
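+ The gradient-free fit described above amounts to a closed-form (ridge)
+least-squares problem between embedding spaces; the shapes and regularizer
+below are assumptions for illustration:
+
+import numpy as np
+
+def fit_linear_map(X, Y, lam=1e-3):
+    """X: (n, d_img) image embeddings, Y: (n, d_txt) text embeddings."""
+    d = X.shape[1]
+    return np.linalg.solve(X.T @ X + lam * np.eye(d), X.T @ Y)
+
+X = np.random.randn(512, 768)   # stand-ins for CLIP image embeddings
+Y = np.random.randn(512, 768)   # stand-ins for text embeddings
+W = fit_linear_map(X, Y)        # maps image space to text space
+print(W.shape)                  # (768, 768)
+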
+
+ comment: 8 pages (+ references and appendix) +
+
+
+
+
+ + ♻ ☆ XLand-MiniGrid: Scalable Meta-Reinforcement Learning Environments in JAX NeurIPS 2023 + + +
+ Inspired by the diversity and depth of XLand and the simplicity and +minimalism of MiniGrid, we present XLand-MiniGrid, a suite of tools and +grid-world environments for meta-reinforcement learning research. Written in +JAX, XLand-MiniGrid is designed to be highly scalable and can potentially run +on GPU or TPU accelerators, democratizing large-scale experimentation with +limited resources. Along with the environments, XLand-MiniGrid provides +pre-sampled benchmarks with millions of unique tasks of varying difficulty and +easy-to-use baselines that allow users to quickly start training adaptive +agents. In addition, we have conducted a preliminary analysis of scaling and +generalization, showing that our baselines are capable of reaching millions of +steps per second during training and validating that the proposed benchmarks +are challenging. + +
+
+ comment: NeurIPS 2023, Workshop, Source code: + https://github.com/corl-team/xland-minigrid +
+
+
+
+
+ + ♻ ☆ Large Margin Mechanism and Pseudo Query Set on Cross-Domain Few-Shot + Learning CVPR + + +
+ In recent years, few-shot learning problems have received a lot of attention.
+While methods in most previous works were trained and tested on datasets in one
+single domain, cross-domain few-shot learning is a brand-new branch of few-shot
+learning problems, where models handle datasets in different domains between
+training and testing phases. In this paper, to address the setting where the
+model is pre-trained (meta-trained) on a single dataset and fine-tuned on
+datasets from four different domains, including common objects, satellite
+images, and medical images, we propose a novel large margin fine-tuning method
+(LMM-PQS), which generates pseudo query images from support images and
+fine-tunes the feature extraction modules with a large margin mechanism
+inspired by methods in face recognition. According to the experiment results,
+LMM-PQS surpasses the baseline models by a significant margin and demonstrates
+that our approach is robust and can easily adapt pre-trained models to new
+domains with little data.
+
+
+ comment: Full version of the CDFSL competition report (in CVPRW'20), archived +
+
+
+
+
+ + ♻ ☆ Exploiting Low-level Representations for Ultra-Fast Road Segmentation + + +
+ Achieving both real-time performance and accuracy on embedded platforms has
+always been the pursuit of road segmentation methods. To this end, many
+lightweight networks have been proposed. However, they ignore the fact that
+roads are "stuff" (background or environmental elements) rather than "things"
+(specific identifiable objects), which inspires us to explore the feasibility
+of representing roads with low-level instead of high-level features.
+Surprisingly, we find that the primary stage of mainstream network models is
+sufficient to represent most pixels of the road for segmentation. Motivated by
+this, we propose a Low-level Feature Dominated Road Segmentation network
+(LFD-RoadSeg). Specifically, LFD-RoadSeg employs a bilateral structure. The
+spatial detail branch is firstly designed to extract low-level feature
+representation for the road by the first stage of ResNet-18. To suppress
+texture-less regions mistaken for the road in the low-level feature, the
+context semantic branch is then designed to extract the context feature in a
+fast manner. To this end, in the second branch, we asymmetrically downsample
+the input image and design an aggregation module to achieve comparable
+receptive fields to the third stage of ResNet-18 but with less time
+consumption. Finally, to segment the road from the low-level feature, a
+selective fusion module is proposed to calculate pixel-wise attention between
+the low-level representation and context feature, and suppress the non-road
+low-level response by this attention. On KITTI-Road, LFD-RoadSeg achieves a
+maximum F1-measure (MaxF) of 95.21% and an average precision of 93.71%, while
+reaching 238 FPS on a single TITAN Xp and 54 FPS on a Jetson TX2, all with a
+compact model size of just 936k parameters. The source code is available at
+https://github.com/zhouhuan-hust/LFD-RoadSeg.
+
+
+ comment: 11 pages, 7 figures, IEEE TITS +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Continual Learning: Theory, Method and + Application + + +
+ To cope with real-world dynamics, an intelligent system needs to +incrementally acquire, update, accumulate, and exploit knowledge throughout its +lifetime. This ability, known as continual learning, provides a foundation for +AI systems to develop themselves adaptively. In a general sense, continual +learning is explicitly limited by catastrophic forgetting, where learning a new +task usually results in a dramatic performance degradation of the old tasks. +Beyond this, increasingly numerous advances have emerged in recent years that +largely extend the understanding and application of continual learning. The +growing and widespread interest in this direction demonstrates its realistic +significance as well as complexity. In this work, we present a comprehensive +survey of continual learning, seeking to bridge the basic settings, theoretical +foundations, representative methods, and practical applications. Based on +existing theoretical and empirical results, we summarize the general objectives +of continual learning as ensuring a proper stability-plasticity trade-off and +an adequate intra/inter-task generalizability in the context of resource +efficiency. Then we provide a state-of-the-art and elaborated taxonomy, +extensively analyzing how representative methods address continual learning, +and how they are adapted to particular challenges in realistic applications. +Through an in-depth discussion of promising directions, we believe that such a +holistic perspective can greatly facilitate subsequent exploration in this +field and beyond. + +
+
+ comment: The concise version is in IEEE Transactions on Pattern Analysis and + Machine Intelligence (TPAMI) +
+
+
+
+
+ + ♻ ☆ Personas as a Way to Model Truthfulness in Language Models + + +
+ Large language models (LLMs) are trained on vast amounts of text from the
+internet, which contains both factual and misleading information about the
+world. While unintuitive from a classic view of LMs, recent work has shown that
+the truth value of a statement can be elicited from the model's
+representations. This paper presents an explanation for why LMs appear to know
+the truth despite not being trained with truth labels. We hypothesize that the
+pretraining data is generated by groups of (un)truthful agents whose outputs
+share common features, forming an (un)truthful persona. By training on this
+data, LMs can infer and represent the persona in their activation space. This
+allows the model to separate truth from falsehood and control the truthfulness
+of its generation. We show evidence for the persona hypothesis via two
+observations: (1) we can probe whether a model's answer will be truthful before
+it is generated; (2) finetuning a model on a set of facts improves its
+truthfulness on unseen topics. Next, using arithmetic as a synthetic
+environment, we show that structures of the pretraining data are crucial for
+the model to infer the truthful persona. Overall, our findings suggest that
+models can exploit hierarchical structures in the data to learn abstract
+concepts like truthfulness.
+
+
+
+
+
+ + ♻ ☆ GIT-Mol: A Multi-modal Large Language Model for Molecular Science with + Graph, Image, and Text + + +
+ Large language models have made significant strides in natural language
+processing, enabling innovative applications in molecular science by processing
+textual representations of molecules. However, most existing language models
+cannot capture the rich information in complex molecular structures or images.
+In this paper, we introduce GIT-Mol, a multi-modal large language model that
+integrates the Graph, Image, and Text information. To facilitate the
+integration of multi-modal molecular data, we propose GIT-Former, a novel
+architecture that is capable of aligning all modalities into a unified latent
+space. We achieve a 5%-10% accuracy increase in property prediction and a
+20.2% boost in molecule generation validity compared to the baselines. With the
+any-to-language molecular translation strategy, our model has the potential to
+perform more downstream tasks, such as compound name recognition and chemical
+reaction prediction.
+
+
+ comment: The article has been accepted by Computers in Biology and Medicine, + with 14 pages and 4 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Self-Distillation for Minimizing Client Drift in Heterogeneous + Federated Learning + + +
+ Federated Learning (FL) is a machine learning paradigm that enables clients +to jointly train a global model by aggregating the locally trained models +without sharing any local training data. In practice, there can often be +substantial heterogeneity (e.g., class imbalance) across the local data +distributions observed by each of these clients. Under such non-iid data +distributions across clients, FL suffers from the 'client-drift' problem where +every client drifts to its own local optimum. This results in slower +convergence and poor performance of the aggregated model. To address this +limitation, we propose a novel regularization technique based on adaptive +self-distillation (ASD) for training models on the client side. Our +regularization scheme adaptively adjusts to the client's training data based on +the global model entropy and the client's label distribution. The proposed +regularization can be easily integrated atop existing, state-of-the-art FL +algorithms, leading to a further boost in the performance of these +off-the-shelf methods. We theoretically explain how ASD reduces client-drift +and also explain its generalization ability. We demonstrate the efficacy of our +approach through extensive experiments on multiple real-world benchmarks and +show substantial gains in performance over state-of-the-art methods. + +
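+ On the client, the regularizer can be pictured as a distillation term pulling
+local predictions toward the global model's, with an adaptively chosen weight;
+the exact adaptive rule in ASD (based on global-model entropy and the client's
+label distribution) is only gestured at here:
+
+import torch
+import torch.nn.functional as F
+
+def asd_objective(local_logits, global_logits, labels, weight):
+    ce = F.cross_entropy(local_logits, labels)
+    kl = F.kl_div(F.log_softmax(local_logits, dim=1),
+                  F.softmax(global_logits, dim=1),
+                  reduction="batchmean")
+    return ce + weight * kl   # weight adapts per client in the actual method
+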
+
+
+
+
+ + ♻ ☆ Improved Bayes Risk Can Yield Reduced Social Welfare Under Competition NeurIPS 2023 + + +
+ As the scale of machine learning models increases, trends such as scaling +laws anticipate consistent downstream improvements in predictive accuracy. +However, these trends take the perspective of a single model-provider in +isolation, while in reality providers often compete with each other for users. +In this work, we demonstrate that competition can fundamentally alter the +behavior of these scaling trends, even causing overall predictive accuracy +across users to be non-monotonic or decreasing with scale. We define a model of +competition for classification tasks, and use data representations as a lens +for studying the impact of increases in scale. We find many settings where +improving data representation quality (as measured by Bayes risk) decreases the +overall predictive accuracy across users (i.e., social welfare) for a +marketplace of competing model-providers. Our examples range from closed-form +formulas in simple settings to simulations with pretrained representations on +CIFAR-10. At a conceptual level, our work suggests that favorable scaling +trends for individual model-providers need not translate to downstream +improvements in social welfare in marketplaces with multiple model providers. + +
+
+ comment: Appeared at NeurIPS 2023; this is the full version +
+
+
+
+
+ + ♻ ☆ Bayes-Optimal Classifiers under Group Fairness + + +
+ Machine learning algorithms are becoming integrated into more and more
+high-stakes decision-making processes, such as in social welfare issues. Due to
+the need to mitigate the potentially disparate impacts from algorithmic
+predictions, many approaches have been proposed in the emerging area of fair
+machine learning. However, the fundamental problem of characterizing
+Bayes-optimal classifiers under various group fairness constraints has only
+been investigated in some special cases. Based on the classical Neyman-Pearson
+argument (Neyman and Pearson, 1933; Shao, 2003) for optimal hypothesis testing,
+this paper provides a unified framework for deriving Bayes-optimal classifiers
+under group fairness. This enables us to propose a group-based thresholding
+method, called FairBayes, which can directly control disparity and achieve an
+essentially optimal fairness-accuracy tradeoff. These advantages are supported
+by thorough experiments.
+
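+ Group-based thresholding is easy to sketch: choose one cutoff per group so
+that group-wise positive rates match a target (demographic parity below);
+FairBayes' actual rule for picking the thresholds is derived, not this
+heuristic:
+
+import numpy as np
+
+def group_thresholds(scores, groups, target_rate):
+    thresholds = {}
+    for g in np.unique(groups):
+        s = np.sort(scores[groups == g])
+        k = min(int((1 - target_rate) * len(s)), len(s) - 1)
+        thresholds[g] = s[k]
+    return thresholds
+
+scores = np.random.rand(1000)
+groups = np.random.randint(0, 2, size=1000)
+thr = group_thresholds(scores, groups, target_rate=0.3)
+preds = scores >= np.array([thr[g] for g in groups])
+print([float(preds[groups == g].mean()) for g in (0, 1)])  # ~0.3 each
+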
+
+ comment: This technical report has been largely superseded by our later paper: + "Bayes-Optimal Fair Classification with Linear Disparity Constraints via + Pre-, In-, and Post-processing'' (arXiv:2402.02817). Please cite that one + instead of this technical report +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ BioNet-XR: Biological Network Visualization Framework for Virtual + Reality and Mixed Reality Environments + + +
+ Protein-protein interaction networks (PPIN) enable the study of cellular
+processes in organisms. Visualizing PPINs in extended reality (XR), including
+virtual reality (VR) and mixed reality (MR), is crucial for exploring
+subnetworks, evaluating protein positions, and collaboratively analyzing and
+discussing networks with the help of recent technological advancements. Here,
+we present BioNet-XR, a 3D visualization framework, to visualize PPINs in VR
+and MR environments. BioNet-XR was developed with the Unity3D game engine. Our
+framework provides state-of-the-art methods and visualization features,
+including teleportation between nodes, general and first-person views to
+explore the network, and subnetwork construction via PageRank, Steiner tree,
+and all-pairs shortest path algorithms for a given set of initial nodes. We
+used usability tests to gather feedback from both specialists
+(bioinformaticians) and generalists (multidisciplinary groups), addressing the
+need for usability evaluations of visualization tools. In the MR version of
+BioNet-XR, users can seamlessly transition to real-world environments and
+interact with protein interaction networks. BioNet-XR is highly modular,
+adaptable to the visualization of other biological networks, such as metabolic
+and regulatory networks, and extensible with additional network methods.
+
+
+
+
+
+ + ☆ Sentiment-enhanced Graph-based Sarcasm Explanation in Dialogue + + +
+ Sarcasm Explanation in Dialogue (SED) is a new yet challenging task, which +aims to generate a natural language explanation for the given sarcastic +dialogue that involves multiple modalities (i.e., utterance, video, and audio). +Although existing studies have achieved great success based on the generative +pretrained language model BART, they overlook exploiting the sentiments +residing in the utterance, video and audio, which are vital clues for sarcasm +explanation. In fact, it is non-trivial to incorporate sentiments for boosting +SED performance, due to three main challenges: 1) diverse effects of utterance +tokens on sentiments; 2) gap between video-audio sentiment signals and the +embedding space of BART; and 3) various relations among utterances, utterance +sentiments, and video-audio sentiments. To tackle these challenges, we propose +a novel sEntiment-enhanceD Graph-based multimodal sarcasm Explanation +framework, named EDGE. In particular, we first propose a lexicon-guided +utterance sentiment inference module, where a heuristic utterance sentiment +refinement strategy is devised. We then develop a module named Joint Cross +Attention-based Sentiment Inference (JCA-SI) by extending the multimodal +sentiment analysis model JCA to derive the joint sentiment label for each +video-audio clip. Thereafter, we devise a context-sentiment graph to +comprehensively model the semantic relations among the utterances, utterance +sentiments, and video-audio sentiments, to facilitate sarcasm explanation +generation. Extensive experiments on the publicly released dataset WITS verify +the superiority of our model over cutting-edge methods. + +
+
+
+
+
+ + ♻ ☆ Perceptual-oriented Learned Image Compression with Dynamic Kernel + + +
+ In this paper, we extend our prior work, DKIC, and propose a
+perceptual-oriented learned image compression method, PO-DKIC. Specifically,
+DKIC adopts a dynamic kernel-based dynamic residual block group to enhance the
+transform coding and an asymmetric space-channel context entropy model to
+facilitate the estimation of Gaussian parameters. Based on DKIC, PO-DKIC
+introduces PatchGAN and LPIPS loss to enhance visual quality. Furthermore, to
+maximize the overall perceptual quality under a rate constraint, we formulate
+this challenge as a constrained programming problem and solve it with linear
+integer programming. The experiments demonstrate that our
+proposed method can generate realistic images with richer textures and finer
+details when compared to state-of-the-art image compression techniques.
+ &#13;
+
+
+
+
+ + ♻ ☆ Diffusion Models, Image Super-Resolution And Everything: A Survey + + +
+ Diffusion Models (DMs) have disrupted the image Super-Resolution (SR) field
+and further closed the gap between image quality and human perceptual
+preferences. They are easy to train and can produce very high-quality samples
+that exceed the realism of those produced by previous generative methods.
+Despite their promising results, they also come with new challenges that need
+further research: high computational demands, comparability, lack of
+explainability, color shifts, and more. Unfortunately, entry into this field is
+overwhelming because of the abundance of publications. To address this, we
+provide a unified account of the theoretical foundations underlying DMs applied
+to image SR and offer a detailed analysis that underscores the unique
+characteristics and methodologies within this domain, distinct from broader
+existing reviews in the field. This survey articulates a cohesive understanding
+of DM principles and explores current research avenues, including alternative
+input domains, conditioning techniques, guidance mechanisms, corruption spaces,
+and zero-shot learning approaches. By offering a detailed examination of the
+evolution and current trends in image SR through the lens of DMs, this survey
+sheds light on the existing challenges and charts potential future directions,
+aiming to inspire further innovation in this rapidly advancing area.
+ &#13;
+
+
+
+
+ + ♻ ☆ Semantic2Graph: Graph-based Multi-modal Feature Fusion for Action + Segmentation in Videos + + +
+ Video action segmentation has been widely applied in many fields. Most
+previous studies employed video-based vision models for this purpose. However,
+they often rely on a large receptive field, LSTM, or Transformer methods to
+capture long-term dependencies within videos, leading to significant
+computational resource requirements. To address this challenge, graph-based
+models were proposed. However, previous graph-based models are less accurate.
+Hence, this study introduces a graph-structured approach named Semantic2Graph,
+to model long-term dependencies in videos, thereby reducing computational costs
+and raising accuracy. We construct a frame-level graph structure of the
+video. Temporal edges are utilized to model the temporal relations and
+action order within videos. Additionally, we have designed positive and
+negative semantic edges, accompanied by corresponding edge weights, to capture
+both long-term and short-term semantic relationships in video actions. Node
+attributes encompass a rich set of multi-modal features extracted from video
+content, graph structures, and label text, encompassing visual, structural, and
+semantic cues. To synthesize this multi-modal information effectively, we
+employ a graph neural network (GNN) model to fuse multi-modal features for node
+action label classification. Experimental results demonstrate that
+Semantic2Graph outperforms state-of-the-art methods in terms of performance,
+particularly on benchmark datasets such as GTEA and 50Salads. Multiple ablation
+experiments further validate the effectiveness of semantic features in
+enhancing model performance. Notably, the inclusion of semantic edges in
+Semantic2Graph allows for the cost-effective capture of long-term dependencies,
+affirming its utility in addressing the challenges posed by computational
+resource constraints in video-based vision models.
+ &#13;
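+ The graph-construction idea can be sketched as follows (toy labels and random
+stand-in features; the paper's actual edge-weighting scheme is more refined):
+
+    import networkx as nx
+    import numpy as np
+
+    labels = [0, 0, 1, 1, 0, 0]              # stand-in per-frame action labels
+    feats = np.random.rand(len(labels), 16)  # stand-in multi-modal features
+
+    G = nx.Graph()
+    for i, f in enumerate(feats):
+        G.add_node(i, x=f)                   # node attribute = frame feature
+
+    # Temporal edges: consecutive frames, encoding temporal relations and order.
+    for i in range(len(labels) - 1):
+        G.add_edge(i, i + 1, kind="temporal", weight=1.0)
+
+    # Semantic edges: positive weight between same-label frames, negative
+    # otherwise, capturing long-range relations without a large receptive field.
+    for i in range(len(labels)):
+        for j in range(i + 2, len(labels)):
+            w = 1.0 if labels[i] == labels[j] else -1.0
+            G.add_edge(i, j, kind="semantic", weight=w)
+
+    print(G.number_of_edges(), "edges")  # a GNN then classifies node labels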
+
+ comment: 13 pages, 3 figures, 9 tables. Published on Applied Intelligence +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 110 + +
+
+
+ + ☆ Nevermind: Instruction Override and Moderation in Large Language Models + + +
+ Given the impressive capabilities of recent Large Language Models (LLMs), we
+investigate and benchmark the most popular proprietary models and open-source
+models of different sizes on the task of explicit instruction following in
+conflicting situations, e.g. overrides. These include the ability of the model
+to override the knowledge within the weights of the model, the ability to
+override (or moderate) extracted knowledge in the prompt, and lastly the
+ability to perform a full jailbreak. Our experiments suggest several key
+findings for improving instruction following: larger models perform best in
+following instructions that override internal and contextual instructions, and
+are obedient, even to a fault. When scaling to longer contexts via RoPE
+scaling, a significant buffer needs to be maintained from the edge of the
+perplexity cliff in order to maintain instruction following capabilities.
+Finally, we observe that improving instruction following, and consequently
+instruction overrides/jailbreaks, is fundamentally at odds with the ability of
+a language model to follow given safety filters or guidelines. Thus, we
+postulate that the most effective approach for safe, trustworthy AI should be
+handled external to the LLM itself.
+ &#13;
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open + Language Models + + +
+ Mathematical reasoning poses a significant challenge for language models due +to its complex and structured nature. In this paper, we introduce DeepSeekMath +7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B +math-related tokens sourced from Common Crawl, together with natural language +and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the +competition-level MATH benchmark without relying on external toolkits and +voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. +Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. +The mathematical reasoning capability of DeepSeekMath is attributed to two key +factors: First, we harness the significant potential of publicly available web +data through a meticulously engineered data selection pipeline. Second, we +introduce Group Relative Policy Optimization (GRPO), a variant of Proximal +Policy Optimization (PPO), that enhances mathematical reasoning abilities while +concurrently optimizing the memory usage of PPO. + +
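+ The group-relative idea in GRPO can be sketched in a few lines (a simplified
+illustration of the advantage computation only; the full objective also
+includes a PPO-style clipped ratio and a KL penalty):
+
+    import numpy as np
+
+    def group_relative_advantages(rewards, eps=1e-8):
+        """Normalize rewards within a group of samples for one prompt.
+
+        The group mean acts as the baseline, so no learned value network
+        (critic) is needed, which is what saves PPO's memory.
+        """
+        r = np.asarray(rewards, dtype=float)
+        return (r - r.mean()) / (r.std() + eps)
+
+    # Four sampled solutions to the same math problem, scored by a reward model.
+    print(group_relative_advantages([0.1, 0.9, 0.4, 0.6]))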
+
+
+
+
+ + ☆ GUARD: Role-playing to Generate Natural-language Jailbreakings to Test + Guideline Adherence of Large Language Models + + +
+ The discovery of "jailbreaks" that bypass the safety filters of Large Language
+Models (LLMs) and elicit harmful responses has encouraged the community to
+implement safety measures. One major safety measure is to proactively test the
+LLMs with jailbreaks prior to the release. Therefore, such testing requires a
+method that can generate jailbreaks at scale and efficiently. In this paper, we
+follow a novel yet intuitive strategy to generate jailbreaks in the style of
+human-generated jailbreaks. We propose a role-playing system that assigns four
+different roles to the user LLMs to collaborate on new jailbreaks. Furthermore,
+we collect existing jailbreaks and split them into independent
+characteristics by clustering frequency and semantic patterns sentence by
+sentence. We organize these characteristics into a knowledge graph, making them
+more accessible and easier to retrieve. Our system of different roles
+leverages this knowledge graph to generate new jailbreaks, which have proved
+effective in inducing LLMs to generate unethical or guideline-violating
+responses. In addition, we also pioneer a setting in our system that
+automatically follows government-issued guidelines to generate jailbreaks,
+testing whether LLMs follow the guidelines accordingly. We refer to our system as
+GUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have
+empirically validated the effectiveness of GUARD on three cutting-edge
+open-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a
+widely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the
+realm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing
+GUARD's versatility and contributing valuable insights for the development of
+safer, more reliable LLM-based applications across diverse modalities.
+ &#13;
+
+ comment: 22 pages
+&#13;
+
+
+
+
+ + ☆ Deal, or no deal (or who knows)? Forecasting Uncertainty in + Conversations using Large Language Models + + +
+ Effective interlocutors account for the uncertain goals, beliefs, and +emotions of others. But even the best human conversationalist cannot perfectly +anticipate the trajectory of a dialogue. How well can language models represent +inherent uncertainty in conversations? We propose FortUne Dial, an expansion of +the long-standing "conversation forecasting" task: instead of just accuracy, +evaluation is conducted with uncertainty-aware metrics, effectively enabling +abstention on individual instances. We study two ways in which language models +potentially represent outcome uncertainty (internally, using scores and +directly, using tokens) and propose fine-tuning strategies to improve +calibration of both representations. Experiments on eight difficult negotiation +corpora demonstrate that our proposed fine-tuning strategies (a traditional +supervision strategy and an off-policy reinforcement learning strategy) can +calibrate smaller open-source models to compete with pre-trained models 10x +their size. + +
+
+ comment: 2 Figures; 7 Tables; 27 pages +
+
+
+
+
+ + ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information + Seeking in Large Language Models + + +
+ In the face of uncertainty, the ability to seek information is of fundamental +importance. In many practical applications, such as medical diagnosis and +troubleshooting, the information needed to solve the task is not initially +given, and has to be actively sought by asking follow-up questions (for +example, a doctor asking a patient for more details about their symptoms). In +this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to augment +large language models with the ability to actively seek information by asking +effective questions. UoT combines 1) an uncertainty-aware simulation approach +which enables the model to simulate possible future scenarios and how likely +they are to occur, 2) uncertainty-based rewards motivated by information gain +which incentivizes the model to seek information, and 3) a reward propagation +scheme to select the optimal question to ask in a way that maximizes the +expected reward. In experiments on medical diagnosis, troubleshooting and the +'20 Questions' game, UoT achieves an average performance improvement of 57.8% +in the rate of successful task completion across multiple LLMs compared with +direct prompting, and also improves efficiency (i.e., the number of questions +needed to complete the task). + +
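+ The information-gain intuition behind the uncertainty-based reward can be
+sketched as follows (toy numbers and hypothetical function names; the paper's
+reward design and propagation scheme are more elaborate):
+
+    import math
+
+    def entropy(p):
+        return -sum(x * math.log2(x) for x in p if x > 0)
+
+    def expected_info_gain(prior, yes_given_h):
+        """Expected entropy reduction over hypotheses from one yes/no question.
+
+        prior: P(h) over candidate hypotheses (e.g., diagnoses)
+        yes_given_h: P(answer = yes | h) for each hypothesis
+        """
+        p_yes = sum(p * q for p, q in zip(prior, yes_given_h))
+        post_yes = [p * q / p_yes for p, q in zip(prior, yes_given_h)]
+        p_no = 1 - p_yes
+        post_no = [p * (1 - q) / p_no for p, q in zip(prior, yes_given_h)]
+        return entropy(prior) - (p_yes * entropy(post_yes)
+                                 + p_no * entropy(post_no))
+
+    prior = [0.5, 0.3, 0.2]                 # three candidate diagnoses
+    print(expected_info_gain(prior, [0.9, 0.1, 0.1]))  # a discriminative question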
+
+ comment: Under review +
+
+
+
+
+ + ☆ ISPA: Inter-Species Phonetic Alphabet for Transcribing Animal Sounds ICASSP 2024 + + +
+ Traditionally, bioacoustics has relied on spectrograms and continuous, +per-frame audio representations for the analysis of animal sounds, also serving +as input to machine learning models. Meanwhile, the International Phonetic +Alphabet (IPA) system has provided an interpretable, language-independent +method for transcribing human speech sounds. In this paper, we introduce ISPA +(Inter-Species Phonetic Alphabet), a precise, concise, and interpretable system +designed for transcribing animal sounds into text. We compare acoustics-based +and feature-based methods for transcribing and classifying animal sounds, +demonstrating their comparable performance with baseline methods utilizing +continuous, dense audio representations. By representing animal sounds with +text, we effectively treat them as a "foreign language," and we show that +established human language ML paradigms and models, such as language models, +can be successfully applied to improve performance. + +
+
+ comment: Accepted at XAI-AI Workshop (IEEEXplore track) @ ICASSP 2024 +
+
+
+
+
+ + ☆ Understanding the Reasoning Ability of Language Models From the + Perspective of Reasoning Paths Aggregation + + +
+ Pre-trained language models (LMs) are able to perform complex reasoning +without explicit fine-tuning. To understand how pre-training with a next-token +prediction objective contributes to the emergence of such reasoning capability, +we propose that we can view an LM as deriving new conclusions by aggregating +indirect reasoning paths seen at pre-training time. We found this perspective +effective in two important cases of reasoning: logic reasoning with knowledge +graphs (KGs) and math reasoning with math word problems (MWPs). More +specifically, we formalize the reasoning paths as random walk paths on the +knowledge/reasoning graphs. Analyses of learned LM distributions suggest that a +weighted sum of relevant random walk path probabilities is a reasonable way to +explain how LMs reason. Experiments and analysis on multiple KG and MWP +datasets reveal the effect of training on random walk paths and suggest that +augmenting unlabeled random walk reasoning paths can improve real-world +multi-step reasoning performance. + +
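+ The path-aggregation view can be illustrated on a toy graph (hypothetical
+entities; the paper formalizes this as random walks on knowledge/reasoning
+graphs and relates the weights to the learned LM distribution):
+
+    from collections import defaultdict
+
+    # Toy knowledge graph: entity -> successor entities.
+    kg = {
+        "Socrates": ["human"],
+        "human": ["mammal", "reasoner"],
+        "mammal": ["animal"],
+        "reasoner": [],
+        "animal": [],
+    }
+
+    def aggregate_random_walks(start, steps):
+        """Sum the probabilities of all random-walk paths (up to `steps` hops)
+        that end at each node, under uniform edge choice."""
+        dist = {start: 1.0}
+        reached = defaultdict(float)
+        for _ in range(steps):
+            nxt = defaultdict(float)
+            for node, p in dist.items():
+                succ = kg[node]
+                if not succ:          # dead end: the path terminates here
+                    reached[node] += p
+                    continue
+                for s in succ:
+                    nxt[s] += p / len(succ)
+            dist = nxt
+        for node, p in dist.items():
+            reached[node] += p
+        return dict(reached)
+
+    print(aggregate_random_walks("Socrates", 3))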
+
+
+
+
+ + ☆ Skill Set Optimization: Reinforcing Language Model Behavior via + Transferable Skills + + +
+ Large language models (LLMs) have recently been used for sequential decision +making in interactive environments. However, leveraging environment reward +signals for continual LLM actor improvement is not straightforward. We propose +Skill Set Optimization (SSO) for improving LLM actor performance through +constructing and refining sets of transferable skills. SSO constructs skills by +extracting common subtrajectories with high rewards and generating subgoals and +instructions to represent each skill. These skills are provided to the LLM +actor in-context to reinforce behaviors with high rewards. Then, SSO further +refines the skill set by pruning skills that do not continue to result in high +rewards. We evaluate our method in the classic videogame NetHack and the text +environment ScienceWorld to demonstrate SSO's ability to optimize a set of +skills and perform in-context policy improvement. SSO outperforms baselines by +40% in our custom NetHack task and outperforms the previous state-of-the-art in +ScienceWorld by 35%. + +
+
+ comment: 8 pages, preprint +
+
+
+
+
+ + ☆ JOBSKAPE: A Framework for Generating Synthetic Job Postings to Enhance + Skill Matching EACL + + +
+ Recent approaches in skill matching, employing synthetic training data for
+classification or similarity model training, have shown promising results,
+reducing the need for time-consuming and expensive annotations. However,
+previous synthetic datasets have limitations, such as featuring only one skill
+per sentence and generally comprising short sentences. In this paper, we
+introduce JobSkape, a framework to generate synthetic data that tackles these
+limitations, specifically designed to enhance skill-to-taxonomy matching.
+Within this framework, we create SkillSkape, a comprehensive open-source
+synthetic dataset of job postings tailored for skill-matching tasks. We
+introduce several offline metrics that show that our dataset resembles
+real-world data. Additionally, we present a multi-step pipeline for skill
+extraction and matching tasks using large language models (LLMs), benchmarking
+against known supervised methodologies. We show that the downstream
+evaluation results on real-world data can beat the baselines, underscoring the
+dataset's efficacy and adaptability.
+ &#13;
+
+ comment: Published at NLP4HR 2024 (EACL Workshop) +
+
+
+
+
+ + ☆ English Prompts are Better for NLI-based Zero-Shot Emotion + Classification than Target-Language Prompts + + +
+ Emotion classification in text is a challenging and subjective task, due to +the involved cognitive inference processes that are required to interpret a +textual stimulus. In addition, the set of emotion categories is highly +domain-specific. For instance, literature analysis might require the use of +aesthetic emotions (e.g., finding something beautiful), and social media +analysis could benefit from fine-grained sets (e.g., separating anger from +annoyance) in contrast to basic emotion categories. This renders the task an +interesting field for zero-shot classifications, in which the label set is not +known at model development time. Unfortunately, most resources for emotion +analysis are English, and therefore, most studies on emotion analysis have been +performed in English, including those that involve prompting language models +for text labels. This leaves us with a research gap that we address in this +paper: In which language should we prompt for emotion labels on non-English +texts? This is particularly of interest when we have access to a multilingual +large language model, because we could request labels with English prompts even +for non-English data. Our experiments with natural language inference-based +language models show that it is consistently better to use English prompts even +if the data is in a different language. + +
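+ A minimal sketch of NLI-based zero-shot emotion classification with an
+English prompt on non-English text (the model choice and hypothesis template
+are illustrative, not the paper's exact setup):
+
+    from transformers import pipeline
+
+    # A multilingual NLI model (an illustrative choice).
+    clf = pipeline("zero-shot-classification",
+                   model="joeddav/xlm-roberta-large-xnli")
+
+    text = "Ich kann nicht glauben, dass das schon wieder passiert ist!"  # German
+    labels = ["anger", "joy", "sadness", "fear"]
+
+    # English hypothesis template applied to non-English input, mirroring the
+    # finding that English prompts outperform target-language prompts.
+    result = clf(text, candidate_labels=labels,
+                 hypothesis_template="This text expresses {}.")
+    print(result["labels"][0], result["scores"][0])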
+
+ comment: submitted to the PromptEng workshop at The Web Conf +
+
+
+
+
+ + ☆ "Define Your Terms" : Enhancing Efficient Offensive Speech + Classification with Definition EACL 2024 + + +
+ The propagation of offensive content through social media channels has
+garnered the attention of the research community. Multiple works have proposed
+various semantically related yet subtly distinct categories of offensive
+speech. In this work, we explore meta-learning approaches to leverage the
+diversity of offensive speech corpora to enhance their reliable and efficient
+detection. We propose a joint embedding architecture that incorporates the
+input's label and definition for classification via a Prototypical Network. Our
+model achieves at least 75% of the maximal F1-score while using less than 10%
+of the available training data across 4 datasets. Our experimental findings
+also provide a case study of training strategies valuable to combat resource
+scarcity.
+ &#13;
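+ The prototype-based classification step can be sketched as follows (toy 2-D
+embeddings; the paper's joint embedding additionally encodes each label's
+definition):
+
+    import numpy as np
+
+    def prototypes(support_emb, support_labels):
+        """Class prototype = mean embedding of that class's support examples."""
+        return {c: support_emb[support_labels == c].mean(axis=0)
+                for c in np.unique(support_labels)}
+
+    def classify(query_emb, protos):
+        """Assign each query to the nearest prototype (Euclidean distance)."""
+        classes = list(protos)
+        d = np.stack([np.linalg.norm(query_emb - protos[c], axis=1)
+                      for c in classes], axis=1)
+        return np.array(classes)[d.argmin(axis=1)]
+
+    emb = np.array([[0.0, 1.0], [0.1, 0.9], [1.0, 0.0], [0.9, 0.1]])
+    y = np.array([0, 0, 1, 1])
+    print(classify(np.array([[0.2, 0.8], [0.8, 0.3]]), prototypes(emb, y)))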
+
+ comment: Accepted to Main Conference, EACL 2024 +
+
+
+
+
+ + ☆ BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity + Text Embeddings Through Self-Knowledge Distillation + + +
+ In this paper, we present a new embedding model, called M3-Embedding, which
+is distinguished for its versatility in Multi-Linguality, Multi-Functionality,
+and Multi-Granularity. It can support more than 100 working languages, leading
+to new state-of-the-art performances on multi-lingual and cross-lingual
+retrieval tasks. It can simultaneously perform the three common retrieval
+functionalities of embedding models: dense retrieval, multi-vector retrieval,
+and sparse retrieval, which provides a unified model foundation for real-world
+IR applications. It is able to process inputs of different granularities,
+spanning from short sentences to long documents of up to 8192 tokens. The
+effective training of M3-Embedding involves the following technical
+contributions. We propose a novel self-knowledge distillation approach, where
+the relevance scores from different retrieval functionalities can be integrated
+as the teacher signal to enhance the training quality. We also optimize the
+batching strategy, enabling a large batch size and high training throughput to
+ensure the discriminativeness of embeddings. To the best of our knowledge,
+M3-Embedding is the first embedding model to realize such strong
+versatility. The model and code will be publicly available at
+https://github.com/FlagOpen/FlagEmbedding.
+ &#13;
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Isotropy, Clusters, and Classifiers + + +
+ Whether embedding spaces use all their dimensions equally, i.e., whether they +are isotropic, has been a recent subject of discussion. Evidence has been +accrued both for and against enforcing isotropy in embedding spaces. In the +present paper, we stress that isotropy imposes requirements on the embedding +space that are not compatible with the presence of clusters -- which also +negatively impacts linear classification objectives. We demonstrate this fact +empirically and use it to shed light on previous results from the literature. + +
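+ One common way to quantify isotropy, in the spirit of the partition-function
+measure of Mu and Viswanath (2018), can be sketched as follows (an
+illustration of the quantity under discussion, not the paper's code):
+
+    import numpy as np
+
+    def isotropy_score(W):
+        """min/max of Z(a) = sum_i exp(a . w_i) over principal directions a;
+        a score near 1.0 indicates an isotropic embedding space."""
+        W = W - W.mean(axis=0)               # center the embeddings
+        _, _, Vt = np.linalg.svd(W, full_matrices=False)
+        Z = np.exp(W @ Vt.T).sum(axis=0)     # partition function per direction
+        return Z.min() / Z.max()
+
+    rng = np.random.default_rng(0)
+    isotropic = rng.normal(size=(1000, 50))
+    clustered = np.vstack([rng.normal(+3, 0.1, (500, 50)),
+                           rng.normal(-3, 0.1, (500, 50))])
+    # The clustered space scores far lower, matching the paper's point that
+    # clusters and isotropy are in tension.
+    print(isotropy_score(isotropic), isotropy_score(clustered))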
+
+
+
+
+ + ☆ Unified Hallucination Detection for Multimodal Large Language Models + + +
+ Despite significant strides in multimodal tasks, Multimodal Large Language +Models (MLLMs) are plagued by the critical issue of hallucination. The reliable +detection of such hallucinations in MLLMs has, therefore, become a vital aspect +of model evaluation and the safeguarding of practical application deployment. +Prior research in this domain has been constrained by a narrow focus on +singular tasks, an inadequate range of hallucination categories addressed, and +a lack of detailed granularity. In response to these challenges, our work +expands the investigative horizons of hallucination detection. We present a +novel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate +the evaluation of advancements in hallucination detection methods. +Additionally, we unveil a novel unified multimodal hallucination detection +framework, UNIHD, which leverages a suite of auxiliary tools to validate the +occurrence of hallucinations robustly. We demonstrate the effectiveness of +UNIHD through meticulous evaluation and comprehensive analysis. We also provide +strategic insights on the application of specific tools for addressing various +categories of hallucinations. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ CIDAR: Culturally Relevant Instruction Dataset For Arabic + + +
+ Instruction tuning has emerged as a prominent methodology for teaching Large
+Language Models (LLMs) to follow instructions. However, current instruction
+datasets predominantly cater to English or are derived from English-dominated
+LLMs, resulting in inherent biases toward Western culture. This bias
+significantly impacts the linguistic structures of non-English languages such
+as Arabic, which has a distinct grammar reflective of the diverse cultures
+across the Arab region. This paper addresses this limitation by introducing
+CIDAR: https://hf.co/datasets/arbml/CIDAR, the first open Arabic
+instruction-tuning dataset culturally aligned by human reviewers. CIDAR
+contains 10,000 instruction and output pairs that represent the Arab region. We
+discuss the cultural relevance of CIDAR by analyzing models fine-tuned on it
+and comparing them to models fine-tuned on other datasets. Our experiments show
+that CIDAR can help enrich research efforts in aligning LLMs with the Arabic
+culture. All the code is available at https://github.com/ARBML/CIDAR.
+ &#13;
+
+
+
+
+ + ☆ Multi: Multimodal Understanding Leaderboard with Text and Images + + +
+ Rapid progress in multimodal large language models (MLLMs) highlights the
+need to introduce challenging yet realistic benchmarks to the academic
+community. Existing benchmarks primarily focus on simple natural image
+understanding, but Multi emerges as a cutting-edge benchmark for MLLMs,
+offering a comprehensive dataset for evaluating MLLMs on understanding
+complex figures and tables and on scientific questions. This benchmark,
+reflecting current realistic examination styles, provides multimodal inputs and
+requires responses that are either precise or open-ended, similar to real-life
+school tests. It challenges MLLMs with a variety of tasks, ranging from formula
+derivation to image detail analysis and cross-modality reasoning. Multi
+includes over 18,000 questions, with a focus on science-based QA in diverse
+formats. We also introduce Multi-Elite, a 500-question subset for testing the
+extremities of MLLMs, and Multi-Extend, which enhances In-Context Learning
+research with more than 4,500 knowledge pieces. Our evaluation indicates
+significant potential for MLLM advancement, with GPT-4V achieving a 63.7%
+accuracy rate on Multi, in contrast to other MLLMs scoring between 31.3% and
+53.7%. Multi serves not only as a robust evaluation platform but also paves the
+way for the development of expert-level AI.
+ &#13;
+
+ comment: Details and access are available at: + https://OpenDFM.github.io/MULTI-Benchmark/ +
+
+
+
+
+ + ☆ Accurate and Well-Calibrated ICD Code Assignment Through Attention Over + Diverse Label Embeddings EACL2024 + + +
+ Although the International Classification of Diseases (ICD) has been adopted
+worldwide, manually assigning ICD codes to clinical text is time-consuming,
+error-prone, and expensive, motivating the development of automated approaches.
+This paper describes a novel approach for automated ICD coding, combining
+several ideas from previous related work. We specifically employ a strong
+Transformer-based model as a text encoder and, to handle lengthy clinical
+narratives, we explore either (a) adapting the base encoder model into a
+Longformer, or (b) dividing the text into chunks and processing each chunk
+independently. The representations produced by the encoder are combined with a
+label embedding mechanism that explores diverse ICD code synonyms. Experiments
+with different splits of the MIMIC-III dataset show that the proposed approach
+outperforms the current state-of-the-art models in ICD coding, with the label
+embeddings significantly contributing to the good performance. Our approach
+also leads to properly calibrated classification results, which can effectively
+inform downstream tasks such as quantification.
+ &#13;
+
+ comment: Accepted to EACL2024 +
+
+
+
+
+ + ☆ Homograph Attacks on Maghreb Sentiment Analyzers NeurIPS + + +
+ We examine the impact of homograph attacks on the Sentiment Analysis (SA)
+task of different Arabic dialects from the Maghreb North-African countries.
+Homograph attacks result in a 65.3% decrease in transformer classification
+performance, from an F1-score of 0.95 to 0.33, when data is written in
+"Arabizi". The goal of this study is to highlight LLMs' weaknesses and to
+prioritize ethical and responsible Machine Learning.
+ &#13;
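+ A homograph attack of this kind can be sketched in a few lines (an
+illustrative toy; the mapping shows only a small sample of visually
+confusable characters):
+
+    # Map Latin characters to visually confusable Cyrillic homoglyphs.
+    HOMOGLYPHS = {"a": "\u0430", "e": "\u0435", "o": "\u043e", "c": "\u0441"}
+
+    def homograph_attack(text):
+        """Swap characters for homoglyphs; the string looks unchanged to a
+        human reader but tokenizes differently, which is what degrades the
+        classifier."""
+        return "".join(HOMOGLYPHS.get(ch, ch) for ch in text)
+
+    original = "ce produit est correct"      # Latin-script ("Arabizi"-style) text
+    attacked = homograph_attack(original)
+    print(original == attacked, attacked)    # False, but visually identical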
+
+ comment: NAML, North Africans in Machine Leaning, NeurIPS, Neural Information + Processing Systems +
+
+
+
+
+ + ☆ Linguistic features for sentence difficulty prediction in ABSA + + +
+ One of the challenges of natural language understanding is to deal with the +subjectivity of sentences, which may express opinions and emotions that add +layers of complexity and nuance. Sentiment analysis is a field that aims to +extract and analyze these subjective elements from text, and it can be applied +at different levels of granularity, such as document, paragraph, sentence, or +aspect. Aspect-based sentiment analysis is a well-studied topic with many +available data sets and models. However, there is no clear definition of what +makes a sentence difficult for aspect-based sentiment analysis. In this paper, +we explore this question by conducting an experiment with three data sets: +"Laptops", "Restaurants", and "MTSC" (Multi-Target-dependent Sentiment +Classification), and a merged version of these three datasets. We study the +impact of domain diversity and syntactic diversity on difficulty. We use a +combination of classifiers to identify the most difficult sentences and analyze +their characteristics. We employ two ways of defining sentence difficulty. The +first one is binary and labels a sentence as difficult if the classifiers fail +to correctly predict the sentiment polarity. The second one is a six-level +scale based on how many of the top five best-performing classifiers can +correctly predict the sentiment polarity. We also define 9 linguistic features +that, combined, aim at estimating the difficulty at sentence level. + +
+
+
+
+
+ + ☆ Video-LaVIT: Unified Video-Language Pre-training with Decoupled + Visual-Motional Tokenization + + +
+ In light of recent advances in multimodal Large Language Models (LLMs), there +is increasing attention to scaling them from image-text data to more +informative real-world videos. Compared to static images, video poses unique +challenges for effective large-scale pre-training due to the modeling of its +spatiotemporal dynamics. In this paper, we address such limitations in +video-language pre-training with an efficient video decomposition that +represents each video as keyframes and temporal motions. These are then adapted +to an LLM using well-designed tokenizers that discretize visual and temporal +information as a few tokens, thus enabling unified generative pre-training of +videos, images, and text. At inference, the generated tokens from the LLM are +carefully recovered to the original continuous pixel space to create various +video content. Our proposed framework is both capable of comprehending and +generating image and video content, as demonstrated by its competitive +performance across 13 multimodal benchmarks in image and video understanding +and generation. Our code and models will be available at +https://video-lavit.github.io. + +
+
+
+
+
+ + ☆ Sociolinguistically Informed Interpretability: A Case Study on Hinglish + Emotion Classification EACL + + +
+ Emotion classification is a challenging task in NLP due to the inherent +idiosyncratic and subjective nature of linguistic expression, especially with +code-mixed data. Pre-trained language models (PLMs) have achieved high +performance for many tasks and languages, but it remains to be seen whether +these models learn and are robust to the differences in emotional expression +across languages. Sociolinguistic studies have shown that Hinglish speakers +switch to Hindi when expressing negative emotions and to English when +expressing positive emotions. To understand if language models can learn these +associations, we study the effect of language on emotion prediction across 3 +PLMs on a Hinglish emotion classification dataset. Using LIME and token level +language ID, we find that models do learn these associations between language +choice and emotional expression. Moreover, having code-mixed data present in +the pre-training can augment that learning when task-specific data is scarce. +We also conclude from the misclassifications that the models may overgeneralise +this heuristic to other infrequent examples where this sociolinguistic +phenomenon does not apply. + +
+
+ comment: 5 pages, Accepted to SIGTYP 2024 @ EACL +
+
+
+
+
+ + ☆ Constrained Decoding for Cross-lingual Label Projection ICLR 2024 + + +
+ Zero-shot cross-lingual transfer utilizing multilingual LLMs has become a
+popular learning paradigm for low-resource languages with no labeled training
+data. However, for NLP tasks that involve fine-grained predictions on words and
+phrases, the performance of zero-shot cross-lingual transfer learning lags far
+behind supervised fine-tuning methods. Therefore, it is common to exploit
+translation and label projection to further improve the performance by (1)
+translating training data that is available in a high-resource language (e.g.,
+English) together with the gold labels into low-resource languages, and/or (2)
+translating test data in low-resource languages to a high-resource language to
+run inference on, then projecting the predicted span-level labels back onto the
+original test data. However, state-of-the-art marker-based label projection
+methods suffer from translation quality degradation due to the extra label
+markers injected in the input to the translation model. In this work, we
+explore a new direction that leverages constrained decoding for label
+projection to overcome the aforementioned issues. Our new method not only can
+preserve the quality of translated texts but also has the versatility of being
+applicable to both translating training and translating test data strategies.
+This versatility is crucial as our experiments reveal that translating test
+data can lead to a considerable boost in performance compared to translating
+only training data. We evaluate on two cross-lingual transfer tasks, namely
+Named Entity Recognition and Event Argument Extraction, spanning 20 languages.
+The results demonstrate that our approach outperforms the state-of-the-art
+marker-based method by a large margin and also shows better performance than
+other label projection methods that rely on external word alignment.
+ &#13;
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ☆ Intent-based Prompt Calibration: Enhancing prompt optimization with + synthetic boundary cases + + +
+ Prompt engineering is a challenging and important task due to the high
+sensitivity of Large Language Models (LLMs) to the given prompt and the
+inherent ambiguity of a textual task instruction. Automatic prompt engineering
+is essential to achieve optimized performance from LLMs. Recent studies have
+demonstrated the capabilities of LLMs to automatically conduct prompt
+engineering by employing a meta-prompt that incorporates the outcomes of the
+last trials and proposes an improved prompt. However, this requires a
+high-quality benchmark to compare different prompts, which is difficult and
+expensive to acquire in many real-world use cases. In this work, we introduce a
+new method for automatic prompt engineering, using a calibration process that
+iteratively refines the prompt to match the user intent. During the optimization
+process, the system jointly generates synthetic data of boundary use cases and
+optimizes the prompt according to the generated dataset. We demonstrate the
+effectiveness of our method with respect to strong proprietary models on
+real-world tasks such as moderation and generation. Our method outperforms
+state-of-the-art methods with a limited number of annotated samples.
+Furthermore, we validate the advantages of each one of the system's key
+components. Our system is built in a modular way, facilitating easy adaptation
+to other tasks. The code is available
+$\href{https://github.com/Eladlev/AutoPrompt}{here}$.
+ &#13;
+
+
+
+
+ + ☆ Multilingual transformer and BERTopic for short text topic modeling: The + case of Serbian + + +
+ This paper presents the results of the first application of BERTopic, a
+state-of-the-art topic modeling technique, to short text written in a
+morphologically rich language. We applied BERTopic with three multilingual
+embedding models on two levels of text preprocessing (partial and full) to
+evaluate its performance on partially preprocessed short text in Serbian. We
+also compared it to LDA and NMF on fully preprocessed text. The experiments
+were conducted on a dataset of tweets expressing hesitancy toward COVID-19
+vaccination. Our results show that with adequate parameter setting, BERTopic
+can yield informative topics even when applied to partially preprocessed short
+text. When the same parameters are applied in both preprocessing scenarios,
+the performance drop on partially preprocessed text is minimal. Compared to LDA
+and NMF, judging by the keywords, BERTopic offers more informative topics and
+gives novel insights when the number of topics is not limited. The findings of
+this paper can be significant for researchers working with other
+morphologically rich low-resource languages and short text.
+ &#13;
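+ A minimal BERTopic run on short multilingual text might look as follows (the
+toy corpus and parameter values are illustrative, not the paper's
+configuration):
+
+    from bertopic import BERTopic
+
+    docs = [
+        "Vakcina protiv kovida je bezbedna",     # toy Serbian tweets
+        "Ne verujem u efikasnost vakcine",
+        "Nezeljeni efekti posle druge doze",
+        "Vakcinacija dece pocinje sutra",
+    ] * 50  # BERTopic needs a reasonably sized corpus to form topics
+
+    # language="multilingual" selects a multilingual sentence-transformer;
+    # min_topic_size is one of the parameters that matters for short text.
+    topic_model = BERTopic(language="multilingual", min_topic_size=10)
+    topics, probs = topic_model.fit_transform(docs)
+    print(topic_model.get_topic_info().head())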
+
+
+
+
+ + ☆ Multi-Lingual Malaysian Embedding: Leveraging Large Language Models for + Semantic Representations + + +
+ In this work, we present a comprehensive exploration of finetuning Malaysian +language models, specifically Llama2 and Mistral, on embedding tasks involving +negative and positive pairs. We release two distinct models tailored for +Semantic Similarity and Retrieval-Augmented Generation (RAG). + For Semantic Similarity, our 600 million parameter Llama2 model outperforms +OpenAI text-embedding-ada-002 across all recall@k metrics for b.cari.com.my, +c.cari.com.my, Malay news, and Malaysian Twitter test sets. + In the realm of RAG models, our approach proves competitive with OpenAI +text-embedding-ada-002 in the Malaysian context. Notably, our 2 billion +parameter Llama2 model achieves superior Recall@5, Recall@10 for the "Melayu" +keyword research papers dataset and excels in Recall@3, Recall@5, and Recall@10 +for the lom.agc.gov.my dataset. + These findings underscore the effectiveness of our finetuning strategy and +highlight the performance gains in both Semantic Similarity and RAG tasks. + All models released at +https://huggingface.co/collections/mesolitica/malaysian-embedding-6523612bfe5881ad35f81b99 + +
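+ The recall@k metric used throughout these comparisons can be computed as
+follows (toy document ids):
+
+    def recall_at_k(retrieved, relevant, k):
+        """Fraction of relevant documents found in the top-k retrieved list."""
+        hits = len(set(retrieved[:k]) & set(relevant))
+        return hits / len(relevant)
+
+    retrieved = ["d3", "d7", "d1", "d9", "d2"]   # ranked by embedding similarity
+    relevant = {"d1", "d2"}
+    for k in (3, 5, 10):
+        print(f"recall@{k} = {recall_at_k(retrieved, relevant, k):.2f}")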
+
+
+
+
+ + ☆ A Comprehensive Study of the Current State-of-the-Art in Nepali + Automatic Speech Recognition Systems + + +
+ In this paper, we examine the research conducted in the field of Nepali
+Automatic Speech Recognition (ASR). The primary objective of this survey is to
+conduct a comprehensive review of the works on Nepali Automatic Speech
+Recognition Systems completed to date, explore the different datasets used,
+examine the technology utilized, and take account of the obstacles encountered
+in implementing the Nepali ASR system. In tandem with the global trend of
+ever-increasing research on speech recognition, the number of
+Nepalese ASR-related projects is also growing. Nevertheless, the investigation
+of language and acoustic models of the Nepali language has not received
+adequate attention compared to languages that possess ample resources. In this
+context, we provide a framework as well as directions for future
+investigations.
+ &#13;
+
+ comment: Accepted in International Conference on Technologies for Computer, + Electrical, Electronics & Communication (ICT-CEEL 2023) +
+
+
+
+
+ + ☆ EasyInstruct: An Easy-to-use Instruction Processing Framework for Large + Language Models + + +
+ In recent years, instruction tuning has gained increasing attention and +emerged as a crucial technique to enhance the capabilities of Large Language +Models (LLMs). To construct high-quality instruction datasets, many instruction +processing approaches have been proposed, aiming to achieve a delicate balance +between data quantity and data quality. Nevertheless, due to inconsistencies +that persist among various instruction processing methods, there is no standard +open-source instruction processing implementation framework available for the +community, which hinders practitioners from further developing and advancing. +To facilitate instruction processing research and development, we present +EasyInstruct, an easy-to-use instruction processing framework for LLMs, which +modularizes instruction generation, selection, and prompting, while also +considering their combination and interaction. EasyInstruct is publicly +released and actively maintained at https://github.com/zjunlp/EasyInstruct, +along with a running demo App at +https://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for +broader research centered on instruction data. + +
+
+ comment: Ongoing work; the project website is at + https://zjunlp.github.io/project/EasyInstruct, code is at + https://github.com/zjunlp/EasyInstruct, demo is at + https://huggingface.co/spaces/zjunlp/EasyInstruct +
+
+
+
+
+ + ☆ SIDU-TXT: An XAI Algorithm for NLP with a Holistic Assessment Approach + + +
+ Explainable AI (XAI) aids in deciphering 'black-box' models. While several
+methods have been proposed and evaluated primarily in the image domain, the
+exploration of explainability in the text domain remains a growing research
+area. In this paper, we delve into the applicability of XAI methods for the
+text domain. In this context, the 'Similarity Difference and Uniqueness' (SIDU)
+XAI method, recognized for its superior capability in localizing entire salient
+regions in image-based classification, is extended to textual data. The extended
+method, SIDU-TXT, utilizes feature activation maps from 'black-box' models to
+generate heatmaps at a granular, word-based level, thereby providing
+explanations that highlight contextually significant textual elements crucial
+for model predictions. Given the absence of a unified standard for assessing
+XAI methods, this study applies a holistic three-tiered comprehensive
+evaluation framework: Functionally-Grounded, Human-Grounded and
+Application-Grounded, to assess the effectiveness of the proposed SIDU-TXT
+across various experiments. We find that, in a sentiment analysis task on a
+movie review dataset, SIDU-TXT excels in both functionally and human-grounded
+evaluations, demonstrating superior performance through quantitative and
+qualitative analyses compared to benchmarks like Grad-CAM and LIME. In the
+application-grounded evaluation within the sensitive and complex legal domain
+of asylum decision-making, SIDU-TXT and Grad-CAM demonstrate comparable
+performances, each with its own set of strengths and weaknesses. However, both
+methods fall short of entirely fulfilling the sophisticated criteria of expert
+expectations, highlighting the imperative need for additional research in XAI
+methods suitable for such domains.
+ &#13;
+
+ comment: Preprint submitted to Elsevier on Jan 5th, 2024 +
+
+
+
+
+ + ☆ Automatic Combination of Sample Selection Strategies for Few-Shot + Learning + + +
+ In few-shot learning, such as meta-learning, few-shot fine-tuning or
+in-context learning, the limited number of samples used to train a model has a
+significant impact on the overall success. Although a large number of sample
+selection strategies exist, their impact on the performance of few-shot
+learning is not extensively known, as most of them have so far been evaluated
+in typical supervised settings only. In this paper, we thoroughly investigate
+the impact of 20 sample selection strategies on the performance of 5 few-shot
+learning approaches over 8 image and 6 text datasets. In addition, we propose a
+new method for automatic combination of sample selection strategies (ACSESS)
+that leverages the strengths and complementary information of the individual
+strategies. The experimental results show that our method consistently
+outperforms the individual selection strategies, as well as the recently
+proposed method for selecting support examples for in-context learning. We also
+show a strong modality, dataset and approach dependence for the majority of
+strategies as well as their dependence on the number of shots - demonstrating
+that the sample selection strategies play a significant role at lower numbers
+of shots, but regress toward random selection at higher numbers of shots.
+ &#13;
+
+
+
+
+ + ☆ UniMem: Towards a Unified View of Long-Context Large Language Models + + +
+ Long-context processing is a critical capability whose absence constrains the
+applicability of large language models. Although there exist various methods
+devoted to enhancing the long-context processing ability of large language
+models (LLMs), they are developed in an isolated manner and lack systematic
+analysis and integration of their strengths, hindering further developments. In
+this paper, we introduce UniMem, a unified framework that reformulates existing
+long-context methods from the view of memory augmentation of LLMs. UniMem is
+characterized by four key dimensions: Memory Management, Memory Writing, Memory
+Reading, and Memory Injection, providing a systematic theory for understanding
+various long-context methods. We reformulate 16 existing methods based on
+UniMem and recast four representative methods (Transformer-XL, Memorizing
+Transformer, RMT, and Longformer) into equivalent UniMem forms to reveal their
+design principles and strengths. Based on these analyses, we propose UniMix, an
+innovative approach that integrates the strengths of these algorithms.
+Experimental results show that UniMix achieves superior performance in handling
+long contexts with significantly lower perplexity than baselines.
+ &#13;
+
+
+
+
+ + ☆ Decoding-time Realignment of Language Models + + +
+ Aligning language models with human preferences is crucial for reducing +errors and biases in these models. Alignment techniques, such as reinforcement +learning from human feedback (RLHF), are typically cast as optimizing a +tradeoff between human preference rewards and a proximity regularization term +that encourages staying close to the unaligned model. Selecting an appropriate +level of regularization is critical: insufficient regularization can lead to +reduced model capabilities due to reward hacking, whereas excessive +regularization hinders alignment. Traditional methods for finding the optimal +regularization level require retraining multiple models with varying +regularization strengths. This process, however, is resource-intensive, +especially for large models. To address this challenge, we propose +decoding-time realignment (DeRa), a simple method to explore and evaluate +different regularization strengths in aligned models without retraining. DeRa +enables control over the degree of alignment, allowing users to smoothly +transition between unaligned and aligned models. It also enhances the +efficiency of hyperparameter tuning by enabling the identification of effective +regularization strengths using a validation dataset. + +
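+ One way to realize such decoding-time interpolation is a log-space mix of the
+two models' next-token logits (a sketch of the core idea under that
+assumption; the paper derives the exact form from the KL-regularized
+alignment objective):
+
+    import torch
+
+    def dera_logits(logits_sft, logits_aligned, lam):
+        """Decoding-time realignment as a log-space interpolation.
+
+        lam = 0 recovers the unaligned (SFT) model, lam = 1 the aligned model;
+        intermediate values trade reward against proximity without retraining.
+        """
+        return (1.0 - lam) * logits_sft + lam * logits_aligned
+
+    vocab = 8
+    logits_sft = torch.randn(vocab)
+    logits_aligned = torch.randn(vocab)
+    for lam in (0.0, 0.5, 1.0):
+        probs = torch.softmax(dera_logits(logits_sft, logits_aligned, lam), -1)
+        print(lam, probs.argmax().item())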
+
+
+
+
+ + ☆ Conversation Reconstruction Attack Against GPT Models + + +
+ In recent times, significant advancements have been made in the field of
+large language models (LLMs), represented by GPT series models. To optimize
+task execution, users often engage in multi-round conversations with GPT models
+hosted in cloud environments. These multi-round conversations, potentially
+replete with private information, require transmission and storage within the
+cloud. However, this operational paradigm introduces additional attack
+surfaces. In this paper, we first introduce a specific Conversation
+Reconstruction Attack targeting GPT models. Our introduced Conversation
+Reconstruction Attack is composed of two steps: hijacking a session and
+reconstructing the conversations. Subsequently, we offer an exhaustive
+evaluation of the privacy risks inherent in conversations when GPT models are
+subjected to the proposed attack. We find, however, that GPT-4 demonstrates a
+certain robustness to the proposed attack. We then introduce two advanced
+attacks aimed at better reconstructing previous conversations, specifically the
+UNR attack and the PBU attack. Our experimental findings indicate that the PBU
+attack yields substantial performance across all models, achieving semantic
+similarity scores exceeding 0.60, while the UNR attack is effective solely on
+GPT-3.5. Our results highlight the privacy risks associated with
+conversations involving GPT models and aim to draw the community's attention to
+preventing the potential misuse of these models' remarkable capabilities. We
+will responsibly disclose our findings to the suppliers of related large
+language models.
+ &#13;
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ☆ Putting Context in Context: the Impact of Discussion Structure on Text + Classification EACL 2024 + + +
+ Current text classification approaches usually focus on the content to be +classified. Contextual aspects (both linguistic and extra-linguistic) are +usually neglected, even in tasks based on online discussions. Still in many +cases the multi-party and multi-turn nature of the context from which these +elements are selected can be fruitfully exploited. In this work, we propose a +series of experiments on a large dataset for stance detection in English, in +which we evaluate the contribution of different types of contextual +information, i.e. linguistic, structural and temporal, by feeding them as +natural language input into a transformer-based model. We also experiment with +different amounts of training data and analyse the topology of local discussion +networks in a privacy-compliant way. Results show that structural information +can be highly beneficial to text classification but only under certain +circumstances (e.g. depending on the amount of training data and on discussion +chain complexity). Indeed, we show that contextual information on smaller +datasets from other classification tasks does not yield significant +improvements. Our framework, based on local discussion networks, allows the +integration of structural information, while minimising user profiling, thus +preserving their privacy. + +
+
+ comment: Accepted to EACL 2024 main conference +
+
+
+
+
+ + ☆ Automated Cognate Detection as a Supervised Link Prediction Task with + Cognate Transformer EACL-2024 + + +
+ Identification of cognates across related languages is one of the primary
+problems in historical linguistics. Automated cognate identification is helpful
+for several downstream tasks including identifying sound correspondences,
+proto-language reconstruction, phylogenetic classification, etc. Previous
+state-of-the-art methods for cognate identification are mostly based on
+distributions of phonemes computed across multilingual wordlists and make
+little use of the cognacy labels that define links among cognate clusters. In
+this paper, we present a transformer-based architecture inspired by
+computational biology for the task of automated cognate detection. Beyond a
+certain amount of supervision, this method performs better than the existing
+methods, and shows steady improvement with further increase in supervision,
+thereby proving the efficacy of utilizing the labeled information. We also
+demonstrate that accepting multiple sequence alignments as input and having an
+end-to-end architecture with a link prediction head saves much computation time
+while simultaneously yielding superior performance.
+ &#13;
+
+ comment: Accepted to EACL-2024 main conference +
+
+
+
+
+ + ☆ A Computational Model for the Assessment of Mutual Intelligibility Among + Closely Related Languages + + +
+ Closely related languages show linguistic similarities that allow speakers of
+one language to understand speakers of another language without having actively
+learned it. Mutual intelligibility varies in degree and is typically tested in
+psycholinguistic experiments. To study mutual intelligibility computationally,
+we propose a computer-assisted method using the Linear Discriminative Learner,
+a computational model developed to approximate the cognitive processes by which
+humans learn languages, which we expand with multilingual semantic vectors and
+multilingual sound classes. We test the model on cognate data from German,
+Dutch, and English, three closely related Germanic languages. We find that our
+model's comprehension accuracy depends on 1) the automatic trimming of
+inflections and 2) the language pair for which comprehension is tested. Our
+multilingual modelling approach not only offers new methodological findings
+for automatic testing of mutual intelligibility across languages but also
+extends the use of Linear Discriminative Learning to multilingual settings.
+ &#13;
+
+ comment: To appear in: Proceedings of the 6th Workshop on Research in + Computational Linguistic Typology and Multilingual NLP (SIGTYP 2024) +
+
+
+
+
+ + ☆ LLM Agents in Interaction: Measuring Personality Consistency and + Linguistic Alignment in Interacting Populations of Large Language Models EACL 2024 + + +
+ While both agent interaction and personalisation are vibrant topics in +research on large language models (LLMs), there has been limited focus on the +effect of language interaction on the behaviour of persona-conditioned LLM +agents. Such an endeavour is important to ensure that agents remain consistent +to their assigned traits yet are able to engage in open, naturalistic +dialogues. In our experiments, we condition GPT-3.5 on personality profiles +through prompting and create a two-group population of LLM agents using a +simple variability-inducing sampling algorithm. We then administer personality +tests and submit the agents to a collaborative writing task, finding that +different profiles exhibit different degrees of personality consistency and +linguistic alignment to their conversational partners. Our study seeks to lay +the groundwork for better understanding of dialogue-based interaction between +LLMs and highlights the need for new approaches to crafting robust, more +human-like LLM personas for interactive environments. + +
+
+ comment: To appear in Proceedings of the 1st Personalization of Generative AI + Workshop, EACL 2024 +
+
+
+
+
+ + ☆ Approximate Attributions for Off-the-Shelf Siamese Transformers EACL 2024 + + +
+ Siamese encoders such as sentence transformers are among the least understood +deep models. Established attribution methods cannot tackle this model class +since it compares two inputs rather than processing a single one. To address +this gap, we have recently proposed an attribution method specifically for +Siamese encoders (M\"oller et al., 2023). However, it requires models to be +adjusted and fine-tuned and therefore cannot be directly applied to +off-the-shelf models. In this work, we reassess these restrictions and propose +(i) a model with exact attribution ability that retains the original model's +predictive performance and (ii) a way to compute approximate attributions for +off-the-shelf models. We extensively compare approximate and exact attributions +and use them to analyze the models' attendance to different linguistic aspects. +We gain insights into which syntactic roles Siamese transformers attend to, +confirm that they mostly ignore negation, explore how they judge semantically +opposite adjectives, and find that they exhibit lexical bias. + +
+
+ comment: Accepted for EACL 2024, St. Julian's, Malta +
+
+
+
+
+ + ☆ How do Large Language Models Learn In-Context? Query and Key Matrices of + In-Context Heads are Two Towers for Metric Learning + + +
+ We explore the mechanism of in-context learning and propose a hypothesis
+using a locate-and-project method. In shallow layers, the features of
+demonstrations are merged into their corresponding labels, and the features of
+the input text are aggregated into the last token. In deep layers, in-context
+heads make great contributions. In each in-context head, the value-output
+matrix extracts the labels' features. Query and key matrices compute the
+attention weights between the input text and each demonstration. The larger the
+attention weight is, the more label information is transferred into the last
+token for predicting the next word. Query and key matrices can be regarded as
+two towers for learning the similarity metric between the input text and each
+demonstration. Based on this hypothesis, we explain why imbalanced labels and
+demonstration order affect predictions. We conduct experiments on GPT2 large,
+Llama 7B, 13B and 30B. The results support our analysis. Overall, our study
+provides a new method and a reasonable hypothesis for understanding the
+mechanism of in-context learning. Our code will be released on GitHub.
+ &#13;
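+ The two-tower similarity view can be sketched as follows (random toy
+features; W_q and W_k stand in for an in-context head's query and key
+projections):
+
+    import numpy as np
+
+    def softmax(x):
+        e = np.exp(x - x.max())
+        return e / e.sum()
+
+    d = 16
+    rng = np.random.default_rng(1)
+    W_q, W_k = rng.normal(size=(d, d)), rng.normal(size=(d, d))
+
+    last_token = rng.normal(size=d)      # aggregated input-text feature
+    demos = rng.normal(size=(4, d))      # one feature per demonstration
+
+    # The query and key towers compute a similarity metric between the input
+    # text and each demonstration.
+    q = W_q @ last_token
+    K = demos @ W_k.T
+    attn = softmax(K @ q / np.sqrt(d))
+    print(attn)  # larger weight -> more of that demo's label info is copied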
+
+ comment: preprint +
+
+
+
+
+ + ☆ EEVEE: An Easy Annotation Tool for Natural Language Processing EACL + 2024 + + +
+ Annotation tools are the starting point for creating Natural Language +Processing (NLP) datasets. There is a wide variety of tools available; setting +up these tools is however a hindrance. We propose EEVEE, an annotation tool +focused on simplicity, efficiency, and ease of use. It can run directly in the +browser (no setup required) and uses tab-separated files (as opposed to +character offsets or task-specific formats) for annotation. It allows for +annotation of multiple tasks on a single dataset and supports four task-types: +sequence labeling, span labeling, text classification and seq2seq. + +
+
+ comment: 6 pages; accepted to The Linguistic Annotation Workshop (LAW) at EACL + 2024 +
+
+
+
+
+ + ☆ Comparing Knowledge Sources for Open-Domain Scientific Claim + Verification EACL 2024 + + +
+ The increasing rate at which scientific knowledge is discovered and health
+claims shared online has highlighted the importance of developing efficient
+fact-checking systems for scientific claims. The usual setting for this task in
+the literature assumes that the documents containing the evidence for claims
+are already provided and annotated or contained in a limited corpus. This
+renders the systems unrealistic for real-world settings where knowledge sources
+with potentially millions of documents need to be queried to find relevant
+evidence. In this paper, we perform an array of experiments to test the
+performance of open-domain claim verification systems. We test the final
+verdict prediction of systems on four datasets of biomedical and health claims
+in different settings. While keeping the pipeline's evidence selection and
+verdict prediction parts constant, document retrieval is performed over three
+common knowledge sources (PubMed, Wikipedia, Google) and using two different
+information retrieval techniques. We show that PubMed works better with
+specialized biomedical claims, while Wikipedia is more suited for everyday
+health concerns. Likewise, BM25 excels in retrieval precision, while semantic
+search excels in recall of relevant evidence. We discuss the results, outline
+frequent retrieval patterns and challenges, and provide promising future
+directions.
+ &#13;
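+ The BM25 side of such a pipeline can be sketched with the rank_bm25 package
+(a toy corpus; the paper retrieves from PubMed, Wikipedia, and Google):
+
+    from rank_bm25 import BM25Okapi
+
+    corpus = [
+        "Vitamin C supplementation does not prevent the common cold.",
+        "Randomized trial of zinc lozenges for cold duration.",
+        "Daily aspirin reduces risk of cardiovascular events.",
+    ]
+    tokenized = [doc.lower().split() for doc in corpus]
+    bm25 = BM25Okapi(tokenized)
+
+    query = "does vitamin c prevent colds".split()
+    scores = bm25.get_scores(query)
+    best = max(range(len(corpus)), key=lambda i: scores[i])
+    print(corpus[best])  # the evidence document passed to verdict prediction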
+
+ comment: Accepted to EACL 2024 +
+
+
+
+
+ + ☆ With a Little Help from my (Linguistic) Friends: Topic Segmentation of + Multi-party Casual Conversations + + +
+ Topics play an important role in the global organisation of a conversation, as what is currently discussed constrains the possible contributions of the participants. Understanding the way topics are organised in interaction would provide insight into the structure of dialogue beyond the sequence of utterances. However, studying this high-level structure is a complex task that we approach by first segmenting dialogues into smaller, topically coherent sets of utterances. Understanding the interactions between these segments would then enable us to propose a model of topic organisation at the dialogue level. In this paper we work with open-domain conversations and try to reach a level of accuracy comparable to recent machine-learning-based topic segmentation models, but with a formal approach. The features we identify as meaningful for this task help us better understand the topical structure of a conversation.
+
+
+
+
+ + ☆ Are Sounds Sound for Phylogenetic Reconstruction? + + +
+ In traditional studies on language evolution, scholars often emphasize the importance of sound laws and sound correspondences for phylogenetic inference of language family trees. To date, however, computational approaches have typically not taken this potential into account. Most computational studies still rely on lexical cognates as the major data source for phylogenetic reconstruction in linguistics, although a few studies exist in which authors praise the benefits of comparing words at the level of sound sequences. Building on (a) ten diverse datasets from different language families, and (b) state-of-the-art methods for automated cognate and sound correspondence detection, we test, for the first time, the performance of sound-based versus cognate-based approaches to phylogenetic reconstruction. Our results show that phylogenies reconstructed from lexical cognates are topologically closer to the gold-standard phylogenies, by approximately one third with respect to the generalized quartet distance on average, than phylogenies reconstructed from sound correspondences.
+
comment: Paper accepted for SIGTYP (2024): Häuser, Luise; Jäger, Gerhard; List, Johann-Mattis; Rama, Taraka; and Stamatakis, Alexandros (2024): Are sounds sound for phylogenetic reconstruction? In: Proceedings of the 6th Workshop on Research in Computational Linguistic Typology and Multilingual NLP (SIGTYP 2024)
+
+
+
+
+ + ☆ Graph-enhanced Large Language Models in Asynchronous Plan Reasoning + + +
+ Reasoning about asynchronous plans is challenging since it requires +sequential and parallel planning to optimize time costs. Can large language +models (LLMs) succeed at this task? Here, we present the first large-scale +study investigating this question. We find that a representative set of closed +and open-source LLMs, including GPT-4 and LLaMA-2, behave poorly when not +supplied with illustrations about the task-solving process in our benchmark +AsyncHow. We propose a novel technique called Plan Like a Graph (PLaG) that +combines graphs with natural language prompts and achieves state-of-the-art +results. We show that although PLaG can boost model performance, LLMs still +suffer from drastic degradation when task complexity increases, highlighting +the limits of utilizing LLMs for simulating digital devices. We see our study +as an exciting step towards using LLMs as efficient autonomous agents. + +
+
+
+
+
+ + ☆ KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language + Models + + +
+ The lottery ticket hypothesis posits the existence of ``winning tickets'' within a randomly initialized neural network. Do winning tickets exist for LLMs in fine-tuning scenarios, and how can we find them? In this paper, we propose KS-Lottery, a method to identify a small subset of LLM parameters that is highly effective for multilingual fine-tuning. Our key idea is to use the Kolmogorov-Smirnov test to analyze the distribution shift of parameters before and after fine-tuning. We further prove theoretically that KS-Lottery can find certified winning tickets in the embedding layer: fine-tuning on the found parameters is guaranteed to perform as well as full fine-tuning. Comparing KS-Lottery with other parameter-efficient tuning algorithms on translation tasks, the experimental results show that KS-Lottery finds a much smaller set of parameters for fine-tuning while achieving performance comparable to full fine-tuning of the LLM. Surprisingly, we find that fine-tuning the embeddings of just 18 tokens in LLaMA suffices to reach full fine-tuning translation performance. Code and models will be released to the public.
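A hedged sketch of the core idea (function and parameter names are illustrative, not the released code): rank parameter tensors by the Kolmogorov-Smirnov statistic between their pre- and post-fine-tuning value distributions and keep only the most-shifted ones for tuning:

```python
import torch
from scipy.stats import ks_2samp

def ks_shift(before: torch.Tensor, after: torch.Tensor) -> float:
    """KS statistic between flattened pre/post parameter distributions."""
    return ks_2samp(before.flatten().numpy(), after.flatten().numpy()).statistic

def find_winning_tickets(base_model, tuned_model, top_k=20):
    # base_model / tuned_model are assumed to be two snapshots of the same LLM.
    shifts = {
        name: ks_shift(p_before.detach(), p_after.detach())
        for (name, p_before), (_, p_after) in zip(
            base_model.named_parameters(), tuned_model.named_parameters()
        )
    }
    # The parameters whose distributions shifted most are the candidates
    # to keep trainable; all others can stay frozen.
    return sorted(shifts, key=shifts.get, reverse=True)[:top_k]
```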
+
+
+
+
+ + ☆ Rethinking Optimization and Architecture for Tiny Language Models + + +
+ The power of large language models (LLMs) has been demonstrated through numerous data and computing resources. However, applying language models on mobile devices faces a huge challenge in computation and memory costs; that is, tiny language models with high performance are urgently required. Limited by the highly complex training process, many details of optimizing language models are seldom studied carefully. In this study, based on a tiny language model with 1B parameters, we carefully design a series of empirical studies to analyze the effect of each component. Three perspectives are mainly discussed, i.e., neural architecture, parameter initialization, and optimization strategy. Several design formulas are empirically shown to be especially effective for tiny language models, including tokenizer compression, architecture tweaking, parameter inheritance, and multiple-round training. We then train PanGu-$\pi$-1B Pro and PanGu-$\pi$-1.5B Pro on 1.6T multilingual corpora, following the established formulas. Experimental results demonstrate that the improved optimization and architecture yield a notable average improvement of 8.87 on benchmark evaluation sets for PanGu-$\pi$-1B Pro. Besides, PanGu-$\pi$-1.5B Pro surpasses a range of SOTA models with larger model sizes, validating its superior performance. The code will be released soon (https://github.com/YuchuanTian/RethinkTinyLM).
+
+
+
+
+ + ☆ From Partial to Strictly Incremental Constituent Parsing EACL 2024 + + +
+ We study incremental constituent parsers to assess their capacity to output +trees based on prefix representations alone. Guided by strictly left-to-right +generative language models and tree-decoding modules, we build parsers that +adhere to a strong definition of incrementality across languages. This builds +upon work that asserted incrementality, but that mostly only enforced it on +either the encoder or the decoder. Finally, we conduct an analysis against +non-incremental and partially incremental models. + +
+
+ comment: Accepted at EACL 2024 +
+
+
+
+
+ + ☆ Dual Knowledge Distillation for Efficient Sound Event Detection ICASSP 2024 + + +
+ Sound event detection (SED) is essential for recognizing specific sounds and their temporal locations within acoustic signals. This becomes particularly challenging for on-device applications, where computational resources are limited. To address this issue, we introduce a novel framework referred to as dual knowledge distillation for developing efficient SED systems. Our proposed dual knowledge distillation commences with temporal-averaging knowledge distillation (TAKD), utilizing a mean student model derived from the temporal averaging of the student model's parameters. This allows the student model to indirectly learn from a pre-trained teacher model, ensuring stable knowledge distillation. Subsequently, we introduce embedding-enhanced feature distillation (EEFD), which incorporates an embedding distillation layer within the student model to bolster contextual learning. On the DCASE 2023 Task 4A public evaluation dataset, our proposed SED system with dual knowledge distillation, having merely one-third of the baseline model's parameters, demonstrates superior performance in terms of PSDS1 and PSDS2. This highlights the importance of the proposed dual knowledge distillation for compact SED systems, which can be ideal for edge devices.
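A minimal sketch of the temporal-averaging step behind TAKD: the "mean student" is an exponential moving average of the student's parameters (the momentum value and exact update schedule here are assumptions, not the paper's settings):

```python
import torch

@torch.no_grad()
def update_mean_student(mean_student, student, momentum=0.999):
    """EMA update: mean_param <- m * mean_param + (1 - m) * student_param."""
    for p_mean, p in zip(mean_student.parameters(), student.parameters()):
        p_mean.mul_(momentum).add_(p, alpha=1.0 - momentum)
```

Called once per training step, this keeps the mean student a smoothed, more stable copy of the student for the distillation target.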
+
+ comment: Accepted to ICASSP 2024 (Deep Neural Network Model Compression + Workshop) +
+
+
+
+
+ + ☆ List-aware Reranking-Truncation Joint Model for Search and + Retrieval-augmented Generation WWW 2024 + + +
+ The results of information retrieval (IR) are usually presented in the form of a ranked list of candidate documents, such as web search for humans and retrieval-augmented generation for large language models (LLMs). List-aware retrieval aims to capture list-level contextual features to return a better list, mainly through reranking and truncation. Reranking finely re-scores the documents in the list. Truncation dynamically determines the cut-off point of the ranked list to achieve a trade-off between overall relevance and avoiding misinformation from irrelevant documents. Previous studies treat these as two separate tasks and model them separately. However, the separation is not optimal. First, it is hard to share the contextual information of the ranking list between the two tasks. Second, the separate pipeline usually suffers from the error accumulation problem, where a small error from the reranking stage can largely affect the truncation stage. To solve these problems, we propose a Reranking-Truncation joint model (GenRT) that can perform the two tasks concurrently. GenRT integrates reranking and truncation via a generative paradigm based on an encoder-decoder architecture. We also design novel loss functions for joint optimization to make the model learn both tasks. Sharing parameters in the joint model is conducive to making full use of the common modeling information of the two tasks. Besides, the two tasks are performed concurrently and co-optimized to solve the error accumulation problem between separate stages. Experiments on public learning-to-rank benchmarks and open-domain Q&A tasks show that our method achieves SOTA performance on both reranking and truncation tasks for web search and retrieval-augmented LLMs.
+
+ comment: Accepted by WWW 2024 +
+
+
+
+
+ + ☆ KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache + + +
+ Efficiently serving large language models (LLMs) requires batching many requests together to reduce the cost per request. Yet the key-value (KV) cache, which stores attention keys and values to avoid re-computation, significantly increases memory demands and becomes the new bottleneck in speed and memory usage. This memory demand increases with larger batch sizes and longer context lengths. Additionally, inference speed is limited by the size of the KV cache, as the GPU's SRAM must load the entire KV cache from main GPU memory for each token generated, leaving the computational core idle during this process. A straightforward and effective solution for reducing KV cache size is quantization, which decreases the total bytes taken by the KV cache. However, there is a lack of in-depth studies exploring the element distribution of the KV cache to understand the hardness and limitations of KV cache quantization. To fill the gap, we conducted a comprehensive study on the element distribution in the KV cache of popular LLMs. Our findings indicate that the key cache should be quantized per-channel, i.e., grouping elements along the channel dimension and quantizing them together, whereas the value cache should be quantized per-token. From this analysis, we developed a tuning-free 2bit KV cache quantization algorithm named KIVI. With a hardware-friendly implementation, KIVI enables Llama (Llama-2), Falcon, and Mistral models to maintain almost the same quality while using $\mathbf{2.6\times}$ less peak memory (including the model weight). This reduction in memory usage enables up to $\mathbf{4\times}$ larger batch sizes, bringing $\mathbf{2.35\times \sim 3.47\times}$ throughput on real LLM inference workloads. The source code is available at https://github.com/jy-yuan/KIVI.
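A simplified sketch of the asymmetric scheme the study motivates: 2-bit quantization with keys scaled per-channel and values per-token. Group sizes, zero-point packing, and the fused kernels in the real KIVI implementation differ from this illustration:

```python
import torch

def quantize_2bit(x: torch.Tensor, dim: int):
    """Asymmetric 2-bit quantization along `dim` (4 levels: 0..3)."""
    mn = x.amin(dim=dim, keepdim=True)
    scale = (x.amax(dim=dim, keepdim=True) - mn).clamp_min(1e-8) / 3
    q = ((x - mn) / scale).round().clamp(0, 3)
    return q, scale, mn

def dequantize(q, scale, mn):
    return q * scale + mn

keys = torch.randn(128, 64)          # (tokens, channels)
values = torch.randn(128, 64)
k_q = quantize_2bit(keys, dim=0)     # keys: one scale/offset per channel
v_q = quantize_2bit(values, dim=1)   # values: one scale/offset per token
print((dequantize(*k_q) - keys).abs().mean())
```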
+
+
+
+
+ + ☆ Exploiting Class Probabilities for Black-box Sentence-level Attacks EACL 2024 + + +
+ Sentence-level attacks craft adversarial sentences that are synonymous with correctly-classified sentences but are misclassified by text classifiers. Under the black-box setting, classifiers are only accessible through their feedback to queried inputs, which is predominantly available in the form of class probabilities. Even though utilizing class probabilities results in stronger attacks, existing attacks use either no feedback or only the class labels, due to the challenges of using class probabilities for sentence-level attacks. Overcoming these challenges, we develop a novel algorithm that uses class probabilities for black-box sentence-level attacks, investigate the effect of class probabilities on the attack's success, and examine whether it is worthwhile or practical for black-box sentence-level attacks to use class probabilities. We conduct extensive evaluations of the proposed attack, comparing it with baselines across various classifiers and benchmark datasets.
+
+ comment: EACL 2024 Findings +
+
+
+
+
+ + ☆ Large Language Models are Geographically Biased + + +
+ Large Language Models (LLMs) inherently carry the biases contained in their +training corpora, which can lead to the perpetuation of societal harm. As the +impact of these foundation models grows, understanding and evaluating their +biases becomes crucial to achieving fairness and accuracy. We propose to study +what LLMs know about the world we live in through the lens of geography. This +approach is particularly powerful as there is ground truth for the numerous +aspects of human life that are meaningfully projected onto geographic space +such as culture, race, language, politics, and religion. We show various +problematic geographic biases, which we define as systemic errors in geospatial +predictions. Initially, we demonstrate that LLMs are capable of making accurate +zero-shot geospatial predictions in the form of ratings that show strong +monotonic correlation with ground truth (Spearman's $\rho$ of up to 0.89). We +then show that LLMs exhibit common biases across a range of objective and +subjective topics. In particular, LLMs are clearly biased against locations +with lower socioeconomic conditions (e.g. most of Africa) on a variety of +sensitive subjective topics such as attractiveness, morality, and intelligence +(Spearman's $\rho$ of up to 0.70). Finally, we introduce a bias score to +quantify this and find that there is significant variation in the magnitude of +bias across existing LLMs. + +
+
+
+
+
+ + ☆ Multi-step Problem Solving Through a Verifier: An Empirical Analysis on + Model-induced Process Supervision + + +
+ Process supervision, using a trained verifier to evaluate the intermediate steps generated by a reasoner, has demonstrated significant improvements in multi-step problem solving. In this paper, to avoid the expensive human annotation effort for verifier training data, we introduce Model-induced Process Supervision (MiPS), a novel method for automating data curation. MiPS annotates an intermediate step by sampling completions of the partial solution through the reasoning model and computing an accuracy, defined as the proportion of correct completions. Errors in the reasoner would cause MiPS to underestimate the accuracy of intermediate steps; we therefore suggest, and empirically show, that verification focusing on high predicted scores of the verifier should be preferred over focusing on low predicted scores, contrary to prior work. Our approach significantly improves the performance of PaLM 2 on math and coding tasks (accuracy +0.67% on GSM8K, +4.16% on MATH, +0.92% on MBPP compared with a verifier trained with output supervision). Additionally, our study demonstrates that the verifier exhibits strong generalization ability across different reasoning models.
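A hedged sketch of MiPS-style step annotation; `sample_completion` stands in for an actual LLM call, and the sample count is an assumption:

```python
def mips_step_score(question, steps_so_far, gold_answer,
                    sample_completion, n_samples=16):
    """Score a solution prefix by the fraction of sampled completions
    that reach the known final answer; this becomes the verifier's
    training target for that intermediate step."""
    correct = 0
    for _ in range(n_samples):
        answer = sample_completion(question, steps_so_far)
        correct += int(answer == gold_answer)
    return correct / n_samples
```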
+
+
+
+
+ + ☆ RACER: An LLM-powered Methodology for Scalable Analysis of + Semi-structured Mental Health Interviews + + +
+ Semi-structured interviews (SSIs) are a commonly employed data-collection +method in healthcare research, offering in-depth qualitative insights into +subject experiences. Despite their value, the manual analysis of SSIs is +notoriously time-consuming and labor-intensive, in part due to the difficulty +of extracting and categorizing emotional responses, and challenges in scaling +human evaluation for large populations. In this study, we develop RACER, a +Large Language Model (LLM) based expert-guided automated pipeline that +efficiently converts raw interview transcripts into insightful domain-relevant +themes and sub-themes. We used RACER to analyze SSIs conducted with 93 +healthcare professionals and trainees to assess the broad personal and +professional mental health impacts of the COVID-19 crisis. RACER achieves +moderately high agreement with two human evaluators (72%), which approaches the +human inter-rater agreement (77%). Interestingly, LLMs and humans struggle with +similar content involving nuanced emotional, ambivalent/dialectical, and +psychological statements. Our study highlights the opportunities and challenges +in using LLMs to improve research efficiency and opens new avenues for scalable +analysis of SSIs in healthcare research. + +
+
+
+
+
+ + ☆ VlogQA: Task, Dataset, and Baseline Models for Vietnamese Spoken-Based + Machine Reading Comprehension EACL 2024 + + +
+ This paper presents the development process of a Vietnamese spoken-language corpus for machine reading comprehension (MRC) tasks and provides insights into the challenges and opportunities of using real-world data for such tasks. Existing MRC corpora in Vietnamese mainly focus on formal written documents such as Wikipedia articles, online newspapers, or textbooks. In contrast, VlogQA consists of 10,076 question-answer pairs based on 1,230 transcript documents sourced from YouTube -- an extensive source of user-uploaded content covering the topics of food and travel. By capturing the spoken language of native Vietnamese speakers in natural settings, a corner largely overlooked in Vietnamese research, the corpus provides a valuable resource for future research on reading comprehension for the Vietnamese language. Regarding performance evaluation, our deep-learning models achieved the highest F1 score of 75.34% on the test set, indicating significant progress in machine reading comprehension for Vietnamese spoken-language data. In terms of EM, the highest score we accomplished is 53.97%, which reflects the challenge of processing spoken content and highlights the need for further improvement.
+
+ comment: Accepted as main conference paper at EACL 2024 +
+
+
+
+
+ + ☆ Chain-of-Feedback: Mitigating the Effects of Inconsistency in Responses + + +
+ Large Language Models (LLMs) frequently struggle with knowledge-intensive questions, often behaving inconsistently by providing different outputs despite being given the same input. Response quality worsens when the user expresses a firm opposing stance, which causes the LLM to adjust its response despite a correct initial one. These behaviors decrease the reliability and validity of the responses provided by these models. In this paper, we attempt to 1) raise awareness of the inherent risks of overly relying on AI agents like ChatGPT by showing how Chain-of-Feedback (CoF) triggers LLMs to deviate further from the actual answer and 2) suggest a novel prompting method, Recursive Chain of Feedback (R-CoF), which we are studying further. The CoF setup takes in an open-ended multi-step question and then repetitively provides meaningless feedback requesting another attempt. Our preliminary experiments show that such feedback only decreases the quality of the response. To mitigate the effects of the aforementioned inconsistencies, we present a novel method of recursively revising the initial incorrect reasoning provided by the LLM by repetitively breaking down each incorrect step into smaller individual problems.
+
+ comment: Still Ongoing Work +
+
+
+
+
+ + ☆ Distinguishing the Knowable from the Unknowable with Language Models + + +
+ We study the feasibility of identifying epistemic uncertainty (reflecting a +lack of knowledge), as opposed to aleatoric uncertainty (reflecting entropy in +the underlying distribution), in the outputs of large language models (LLMs) +over free-form text. In the absence of ground-truth probabilities, we explore a +setting where, in order to (approximately) disentangle a given LLM's +uncertainty, a significantly larger model stands in as a proxy for the ground +truth. We show that small linear probes trained on the embeddings of frozen, +pretrained models accurately predict when larger models will be more confident +at the token level and that probes trained on one text domain generalize to +others. Going further, we propose a fully unsupervised method that achieves +non-trivial accuracy on the same task. Taken together, we interpret these +results as evidence that LLMs naturally contain internal representations of +different types of uncertainty that could potentially be leveraged to devise +more informative indicators of model confidence in diverse practical settings. + +
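A minimal sketch of the probing setup described above, with synthetic stand-in data: a linear probe on a small frozen model's hidden states predicts whether a larger proxy model will be confident at that token position (the feature dimension and labeling rule are assumptions):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# X: hidden states from the small frozen model, one row per token position.
# y: 1 where the larger proxy model's next-token distribution was low-entropy.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 768))
y = (rng.random(1000) > 0.5).astype(int)

probe = LogisticRegression(max_iter=1000).fit(X[:800], y[:800])
print("held-out accuracy:", probe.score(X[800:], y[800:]))
```

With real embeddings in place of the random arrays, held-out accuracy above chance is the paper's evidence that uncertainty type is linearly represented.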
+
+
+
+
+ + ☆ VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language + Navigation AAAI 2024 + + +
+ Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate +through realistic 3D outdoor environments based on natural language +instructions. The performance of existing VLN methods is limited by +insufficient diversity in navigation environments and limited training data. To +address these issues, we propose VLN-Video, which utilizes the diverse outdoor +environments present in driving videos in multiple cities in the U.S. augmented +with automatically generated navigation instructions and actions to improve +outdoor VLN performance. VLN-Video combines the best of intuitive classical +approaches and modern deep learning techniques, using template infilling to +generate grounded navigation instructions, combined with an image rotation +similarity-based navigation action predictor to obtain VLN style data from +driving videos for pretraining deep learning VLN models. We pre-train the model +on the Touchdown dataset and our video-augmented dataset created from driving +videos with three proxy tasks: Masked Language Modeling, Instruction and +Trajectory Matching, and Next Action Prediction, so as to learn +temporally-aware and visually-aligned instruction representations. The learned +instruction representation is adapted to the state-of-the-art navigator when +fine-tuning on the Touchdown dataset. Empirical results demonstrate that +VLN-Video significantly outperforms previous state-of-the-art models by 2.1% in +task completion rate, achieving a new state-of-the-art on the Touchdown +dataset. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Resolving Transcription Ambiguity in Spanish: A Hybrid Acoustic-Lexical + System for Punctuation Restoration EACL 2024 + + +
+ Punctuation restoration is a crucial step after Automatic Speech Recognition (ASR) to enhance transcript readability and facilitate subsequent NLP tasks. Nevertheless, conventional lexical-based approaches are inadequate for the punctuation restoration task in Spanish, where ambiguity can often be found between unpunctuated declaratives and questions. In this study, we propose a novel hybrid acoustic-lexical punctuation restoration system for Spanish transcription, which consolidates acoustic and lexical signals through a modular process. Our experimental results show that the proposed system can effectively improve the F1 score of question marks and overall punctuation restoration on both public and internal Spanish conversational datasets. Additionally, benchmark comparison against Large Language Models (LLMs) indicates the superiority of our approach in accuracy, reliability, and latency. Furthermore, we demonstrate that the Word Error Rate (WER) of the ASR module also benefits from our proposed system.
+
+ comment: Accepted to UnImplicit workshop at EACL 2024 +
+
+
+
+
+ + ☆ Evaluating the Factuality of Zero-shot Summarizers Across Varied Domains + + +
+ Recent work has shown that large language models (LLMs) are capable of generating summaries zero-shot (i.e., without explicit supervision) that, under human assessment, are often comparable or even preferable to manually composed reference summaries. However, this prior work has focused almost exclusively on evaluating news article summarization. How do zero-shot summarizers perform in other (potentially more specialized) domains? In this work we evaluate zero-shot generated summaries across specialized domains including biomedical articles and legal bills (in addition to standard news benchmarks for reference). We focus especially on the factuality of outputs. We acquire annotations from domain experts to identify inconsistencies in summaries and systematically categorize these errors. We analyze whether the prevalence of a given domain in the pretraining corpus affects the extractiveness and faithfulness of generated summaries of articles in that domain. We release all collected annotations to facilitate additional research toward measuring and realizing factually accurate summarization, beyond news articles. The dataset can be downloaded from https://github.com/sanjanaramprasad/zero_shot_faceval_domains
+
+
+
+
+ + ☆ Neural networks for abstraction and reasoning: Towards broad + generalization in machines + + +
+ For half a century, artificial intelligence research has attempted to reproduce the human qualities of abstraction and reasoning - creating computer systems that can learn new concepts from a minimal set of examples, in settings where humans find this easy. While specific neural networks are able to solve an impressive range of problems, broad generalisation to situations outside their training data has proved elusive. In this work, we look at several novel approaches for solving the Abstraction & Reasoning Corpus (ARC), a dataset of abstract visual reasoning tasks introduced to test algorithms on broad generalization. Despite three international competitions with $100,000 in prizes, the best algorithms still fail to solve a majority of ARC tasks and rely on complex hand-crafted rules, without using machine learning at all. We revisit whether recent advances in neural networks allow progress on this task. First, we adapt the DreamCoder neurosymbolic reasoning solver to ARC. DreamCoder automatically writes programs in a bespoke domain-specific language to perform reasoning, using a neural network to mimic human intuition. We present the Perceptual Abstraction and Reasoning Language (PeARL), which allows DreamCoder to solve ARC tasks, and propose a new recognition model that significantly improves on the previous best implementation. We also propose a new encoding and augmentation scheme that allows large language models (LLMs) to solve ARC tasks, and find that the largest models can solve some ARC tasks. LLMs solve a different group of problems from state-of-the-art solvers, and provide an interesting way to complement other approaches. We perform an ensemble analysis, combining models to achieve better results than any system alone. Finally, we publish the arckit Python library to make future research on ARC easier.
+
+ comment: 32 pages main text, 17 pages +
+
+
+
+
+ + ☆ An Inpainting-Infused Pipeline for Attire and Background Replacement + + +
+ In recent years, groundbreaking advancements in Generative Artificial Intelligence (GenAI) have triggered a transformative paradigm shift, significantly influencing various domains. In this work, we explore an integrated approach leveraging advanced GenAI and computer vision techniques, with an emphasis on image manipulation. The methodology unfolds through several stages: depth estimation, the creation of inpaint masks based on depth information, the generation and replacement of backgrounds using Stable Diffusion in conjunction with Latent Consistency Models (LCMs), and the subsequent replacement of clothes and application of aesthetic changes through an inpainting pipeline. Experiments conducted in this study underscore the methodology's efficacy, highlighting its potential to produce visually captivating content. The convergence of these advanced techniques allows users to input photographs of individuals and manipulate them to modify clothing and background based on specific prompts, without manually supplying inpainting masks, effectively placing the subjects within the vast landscape of creative imagination.
+
+
+
+
+ + ☆ Attention Meets Post-hoc Interpretability: A Mathematical Perspective + + +
+ Attention-based architectures, in particular transformers, are at the heart +of a technological revolution. Interestingly, in addition to helping obtain +state-of-the-art results on a wide range of applications, the attention +mechanism intrinsically provides meaningful insights on the internal behavior +of the model. Can these insights be used as explanations? Debate rages on. In +this paper, we mathematically study a simple attention-based architecture and +pinpoint the differences between post-hoc and attention-based explanations. We +show that they provide quite different results, and that, despite their +limitations, post-hoc methods are capable of capturing more useful insights +than merely examining the attention weights. + +
+
+
+
+
+ + ☆ Harnessing PubMed User Query Logs for Post Hoc Explanations of + Recommended Similar Articles + + +
+ Searching for a related article based on a reference article is an integral part of scientific research. PubMed, like many academic search engines, has a "similar articles" feature that recommends articles relevant to the current article viewed by a user. Explaining recommended items can be of great utility to users, particularly in the literature search process. With more than a million biomedical papers published each year, explaining the recommended similar articles would help researchers and clinicians search for related articles. Nonetheless, the majority of current literature recommendation systems lack explanations for their suggestions. We employ a post hoc approach to explaining recommendations by identifying relevant tokens in the titles of similar articles. Our major contribution is building PubCLogs by repurposing 5.6 million pairs of coclicked articles from PubMed's user query logs. Using our PubCLogs dataset, we train the Highlight Similar Article Title (HSAT) model, a transformer-based model designed to select the most relevant parts of the title of a similar article based on the title and abstract of a seed article. HSAT demonstrates strong performance in our empirical evaluations, achieving an F1 score of 91.72 percent on the PubCLogs test set, considerably outperforming several baselines including BM25 (70.62), MPNet (67.11), MedCPT (62.22), GPT-3.5 (46.00), and GPT-4 (64.89). Additional evaluations on a separate, manually annotated test set further verify HSAT's performance. Moreover, participants in our user study indicate a preference for HSAT due to its superior balance between conciseness and comprehensiveness. Our study suggests that repurposing the user query logs of academic search engines can be a promising way to train state-of-the-art models for explaining literature recommendations.
+
+
+
+
+ + ☆ SWAG: Storytelling With Action Guidance + + +
+ Automated long-form story generation typically employs long-context large +language models (LLMs) for one-shot creation, which can produce cohesive but +not necessarily engaging content. We introduce Storytelling With Action +Guidance (SWAG), a novel approach to storytelling with LLMs. Our approach +reduces story writing to a search problem through a two-model feedback loop: +one LLM generates story content, and another auxiliary LLM is used to choose +the next best "action" to steer the story's future direction. Our results show +that SWAG can substantially outperform previous end-to-end story generation +techniques when evaluated by GPT-4 and through human evaluation, and our SWAG +pipeline using only open-source models surpasses GPT-3.5-Turbo. + +
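A hedged sketch of SWAG's two-model feedback loop: one LLM drafts the next passage while an auxiliary LLM picks the next "action" steering the plot. `generate` and `choose_action` stand in for actual LLM calls; the action set and turn count are assumptions:

```python
def swag_story(premise, generate, choose_action, actions, n_turns=10):
    """Iterate writer and action-guidance models to grow a story."""
    story, action = premise, "begin the story"
    for _ in range(n_turns):
        story += "\n" + generate(story, action)   # writer model drafts content
        action = choose_action(story, actions)    # guidance model steers plot
    return story

# Example action set one might search over:
actions = ["introduce a rival", "raise the stakes", "reveal a secret",
           "change the setting", "resolve a conflict"]
```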
+
+
+
+
+ + ☆ Arabic Synonym BERT-based Adversarial Examples for Text Classification + + +
+ Text classification systems have been proven vulnerable to adversarial text examples: modified versions of the original text examples that often go unnoticed by human eyes yet can force text classification models to alter their predictions. Often, research quantifying the impact of adversarial text attacks has been applied only to models trained in English. In this paper, we introduce the first word-level study of adversarial attacks in Arabic. Specifically, we use a synonym (word-level) attack based on a Masked Language Modeling (MLM) task with a BERT model in a black-box setting to assess the robustness of state-of-the-art text classification models to adversarial attacks in Arabic. To evaluate the grammatical and semantic similarity of the adversarial examples produced by our synonym BERT-based attack, we invite four human evaluators to assess and compare them with their original counterparts. We also study the transferability of these Arabic adversarial examples to various models and investigate the effectiveness of defense mechanisms against them on BERT models. We find that fine-tuned BERT models are more susceptible to our synonym attacks than the other Deep Neural Network (DNN) models we trained, such as WordCNN and WordLSTM. We also find that fine-tuned BERT models are more susceptible to transferred attacks. Lastly, we find that fine-tuned BERT models successfully regain at least 2% in accuracy after applying adversarial training as an initial defense mechanism.
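A minimal sketch of an MLM-based word-level attack of this kind: mask one word at a time, let a BERT fill-mask model propose contextual substitutes, and keep the first substitution that flips the victim's label. The multilingual checkpoint and `victim_predict` callable are illustrative assumptions, not the paper's exact setup:

```python
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="bert-base-multilingual-cased")

def attack(sentence: str, victim_predict):
    """Greedy one-word substitution attack against a black-box classifier."""
    original_label = victim_predict(sentence)
    words = sentence.split()
    for i in range(len(words)):
        masked = " ".join(words[:i] + [fill_mask.tokenizer.mask_token]
                          + words[i + 1:])
        for candidate in fill_mask(masked, top_k=5):
            adversarial = masked.replace(fill_mask.tokenizer.mask_token,
                                         candidate["token_str"])
            if victim_predict(adversarial) != original_label:
                return adversarial  # label flipped: attack succeeded
    return None
```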
+
+ comment: This paper is accepted at The 18th Conference of the European Chapter + of the Association for Computational Linguistics (Student Research Workshop), + March 17-22, 2024 +
+
+
+
+
+ + ☆ Psychological Assessments with Large Language Models: A Privacy-Focused + and Cost-Effective Approach EACL 2024 + + +
+ This study explores the use of Large Language Models (LLMs) to analyze text +comments from Reddit users, aiming to achieve two primary objectives: firstly, +to pinpoint critical excerpts that support a predefined psychological +assessment of suicidal risk; and secondly, to summarize the material to +substantiate the preassigned suicidal risk level. The work is circumscribed to +the use of "open-source" LLMs that can be run locally, thereby enhancing data +privacy. Furthermore, it prioritizes models with low computational +requirements, making it accessible to both individuals and institutions +operating on limited computing budgets. The implemented strategy only relies on +a carefully crafted prompt and a grammar to guide the LLM's text completion. +Despite its simplicity, the evaluation metrics show outstanding results, making +it a valuable privacy-focused and cost-effective approach. This work is part of +the Computational Linguistics and Clinical Psychology (CLPsych) 2024 shared +task. + +
+
+ comment: Accepted to the Workshop on Computational Linguistics and Clinical + Psychology (CLPsych) at EACL 2024 +
+
+
+
+
+ + ♻ ☆ Guiding Language Model Math Reasoning with Planning Tokens + + +
+ Large language models (LLMs) have recently attracted considerable interest +for their ability to perform complex reasoning tasks, such as chain-of-thought +reasoning. However, most of the existing approaches to enhance this ability +rely heavily on data-driven methods, while neglecting the structural aspects of +the model's reasoning capacity. We find that while LLMs can manage individual +reasoning steps well, they struggle with maintaining consistency across an +entire reasoning chain. To solve this, we introduce planning tokens at the +start of each reasoning step, serving as a guide for the model, and add their +embeddings to the model parameters. Our approach requires a negligible increase +in trainable parameters (just 0.001%) and can be applied through either full +fine-tuning or a more parameter-efficient scheme. We demonstrate our method's +effectiveness by applying it to three different LLMs, showing notable accuracy +improvements across three math word problem datasets w.r.t. standard +fine-tuning baselines. + +
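A hedged sketch of the lightest variant described above: planning tokens are appended to the vocabulary, and in principle only their new embedding rows need training (token names and the GPT-2 checkpoint are illustrative assumptions):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Hypothetical planning-token names marking the type of each reasoning step.
planning_tokens = ["<plan_arith>", "<plan_lookup>", "<plan_combine>"]
tokenizer.add_tokens(planning_tokens, special_tokens=True)
model.resize_token_embeddings(len(tokenizer))  # adds trainable embedding rows

# During fine-tuning, each reasoning step is prefixed with a planning token:
step = "<plan_arith> 48 / 2 = 24 apples remain."
print(tokenizer.tokenize(step))
```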
+
+
+
+
+ + ♻ ☆ One Pass Streaming Algorithm for Super Long Token Attention + Approximation in Sublinear Space + + +
+ Attention computation takes both $O(n^2)$ time complexity and $O(n^2)$ space complexity simultaneously, which makes deploying Large Language Models (LLMs) in streaming applications involving long contexts require substantial computational resources. At the recent OpenAI DevDay (Nov 6, 2023), OpenAI released a new model able to support a 128K-long document; in our paper, we focus on the memory-efficiency issue when the context length $n$ is much greater than 128K ($n \gg 2^d$). Considering a single-layer self-attention with Query, Key, and Value matrices $Q, K, V \in \mathbb{R}^{n \times d}$, the polynomial method approximates the attention output $T \in \mathbb{R}^{n \times d}$. It accomplishes this by constructing $U_1, U_2 \in \mathbb{R}^{n \times t}$ to expedite the computation of attention ${\sf Attn}(Q, K, V)$ within $n^{1+o(1)}$ time. Despite this, computing the approximated attention matrix $U_1U_2^\top \in \mathbb{R}^{n \times n}$ still requires $O(n^2)$ space, leading to significant memory usage. In response to these challenges, we introduce a new algorithm that reads the data in only one streaming pass. This method employs sublinear space $o(n)$ to store three sketch matrices, alleviating the need for exact $K, V$ storage. Notably, our algorithm exhibits exceptional memory-efficient performance with super-long tokens: as the token length $n$ increases, our error guarantee diminishes while the memory usage remains nearly constant. This unique attribute underscores the potential of our technique for efficiently handling LLMs in streaming applications.
+
+
+
+
+ + ♻ ☆ Comparative Analysis of LLaMA and ChatGPT Embeddings for Molecule + Embedding + + +
+ Purpose: Large Language Models (LLMs) like ChatGPT and LLaMA are increasingly +recognized for their potential in the field of cheminformatics, particularly in +interpreting Simplified Molecular Input Line Entry System (SMILES), a standard +method for representing chemical structures. These LLMs can decode SMILES +strings into vector representations, providing a novel approach to +understanding chemical graphs. + Methods: We investigate the performance of ChatGPT and LLaMA in embedding +SMILES strings. Our evaluation focuses on two key applications: molecular +property (MP) prediction and drug-drug interaction (DDI) prediction, both +essential in drug development and healthcare. + Results: We find that SMILES embeddings generated using LLaMA outperform +those from ChatGPT in both MP and DDI prediction tasks. Notably, LLaMA-based +SMILES embeddings show results comparable to existing methods in both +prediction tasks. + Conclusion: The application of LLMs in cheminformatics, particularly in +utilizing SMILES embeddings, shows significant promise for advancing drug +development. This includes improving the prediction of chemical properties and +facilitating the drug discovery process. GitHub: +https://github.com/sshaghayeghs/LLaMA-VS-ChatGPT + +
+
+
+
+
+ + ♻ ☆ Weak-to-Strong Jailbreaking on Large Language Models + + +
+ Large language models (LLMs) are vulnerable to jailbreak attacks, which result in harmful, unethical, or biased text generations. However, existing jailbreaking methods are computationally costly. In this paper, we propose the weak-to-strong jailbreaking attack, an efficient method to attack aligned LLMs to produce harmful text. Our key intuition is based on the observation that jailbroken and aligned models differ only in their initial decoding distributions. The weak-to-strong attack's key technical insight is using two smaller models (a safe and an unsafe one) to adversarially modify a significantly larger safe model's decoding probabilities. We evaluate the weak-to-strong attack on 5 diverse LLMs from 3 organizations. The results show our method can increase the misalignment rate to over 99% on two datasets with just one forward pass per example. Our study exposes an urgent safety issue that needs to be addressed when aligning LLMs. As an initial attempt, we propose a defense strategy to protect against such attacks, but creating more advanced defenses remains challenging. The code for replicating the method is available at https://github.com/XuandongZhao/weak-to-strong
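A simplified sketch of the weak-to-strong decoding idea: the large safe model's next-token logits are shifted by the log-ratio of a small unsafe model to a small safe one. The function name and the amplification factor `alpha` are assumptions; see the repository above for the authors' implementation:

```python
import torch

def weak_to_strong_logits(logits_large_safe: torch.Tensor,
                          logits_small_unsafe: torch.Tensor,
                          logits_small_safe: torch.Tensor,
                          alpha: float = 1.0) -> torch.Tensor:
    """Shift the large model's decoding distribution toward the small
    unsafe model's behavior using only per-step logit arithmetic."""
    log_ratio = (torch.log_softmax(logits_small_unsafe, dim=-1)
                 - torch.log_softmax(logits_small_safe, dim=-1))
    return logits_large_safe + alpha * log_ratio
```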
+
+
+
+
+ + ♻ ☆ Fundamental Limitations of Alignment in Large Language Models + + +
+ An important aspect of developing language models that interact with humans is aligning their behavior to be useful and unharmful for their human users. This is usually achieved by tuning the model in a way that enhances desired behaviors and inhibits undesired ones, a process referred to as alignment. In this paper, we propose a theoretical approach called Behavior Expectation Bounds (BEB) which allows us to formally investigate several inherent characteristics and limitations of alignment in large language models. Importantly, we prove that within the limits of this framework, for any behavior that has a finite probability of being exhibited by the model, there exist prompts that can trigger the model into outputting this behavior, with probability that increases with the length of the prompt. This implies that any alignment process that attenuates an undesired behavior but does not remove it altogether is not safe against adversarial prompting attacks. Furthermore, our framework hints at the mechanism by which leading alignment approaches such as reinforcement learning from human feedback make the LLM prone to being prompted into undesired behaviors. This theoretical result has been demonstrated experimentally at large scale by the so-called contemporary "ChatGPT jailbreaks", where adversarial users trick the LLM into breaking its alignment guardrails by triggering it into acting as a malicious persona. Our results expose fundamental limitations in the alignment of LLMs and bring to the forefront the need to devise reliable mechanisms for ensuring AI safety.
+
+
+
+
+ + ♻ ☆ SLANG: New Concept Comprehension of Large Language Models + + +
+ The dynamic nature of language, particularly evident in the realm of slang and memes on the Internet, poses serious challenges to the adaptability of large language models (LLMs). Traditionally anchored to static datasets, these models often struggle to keep up with the rapid linguistic evolution characteristic of online communities. This research aims to bridge this gap by enhancing LLMs' comprehension of evolving new concepts on the Internet, without the high cost of continual retraining. In pursuit of this goal, we propose a new benchmark, $\textbf{SLANG}$, which autonomously integrates novel data to keep the dataset up-to-date, to assess LLMs' capability in comprehending emerging concepts, and an approach, $\textbf{FOCUS}$, which uses causal inference to help LLMs understand new phrases and their colloquial context. Our benchmark and approach digest real-world instances of linguistic shifts, which serve as contextual beacons, to form more precise and contextually relevant connections between newly emerging expressions and their meanings. The empirical analysis shows that our causal inference-based approach outperforms traditional models in terms of precision and relevance in the comprehension of Internet slang and memes.
+
+
+
+
+ + ♻ ☆ AI-as-exploration: Navigating intelligence space + + +
+ Artificial Intelligence is a field that lives many lives, and the term has come to encompass a motley collection of scientific and commercial endeavours. In this paper, I articulate the contours of a rather neglected but central scientific role that AI has to play, which I dub `AI-as-exploration'. The basic thrust of AI-as-exploration is that of creating and studying systems that can reveal candidate building blocks of intelligence that may differ from the forms of human and animal intelligence we are familiar with. In other words, I suggest that AI is one of the best tools we have for exploring intelligence space, namely the space of possible intelligent systems. I illustrate the value of AI-as-exploration by focusing on a specific case study, i.e., recent work on the capacity to combine novel and invented concepts in humans and Large Language Models. I show that the latter, despite showing human-level accuracy in such a task, most probably solve it in ways radically different from those hypothesised for humans, but no less relevant to intelligence research.
+
+
+
+
+ + ♻ ☆ HumBEL: A Human-in-the-Loop Approach for Evaluating Demographic Factors + of Language Models in Human-Machine Conversations + + +
+ While demographic factors like age and gender change the way people talk, and +in particular, the way people talk to machines, there is little investigation +into how large pre-trained language models (LMs) can adapt to these changes. To +remedy this gap, we consider how demographic factors in LM language skills can +be measured to determine compatibility with a target demographic. We suggest +clinical techniques from Speech Language Pathology, which has norms for +acquisition of language skills in humans. We conduct evaluation with a domain +expert (i.e., a clinically licensed speech language pathologist), and also +propose automated techniques to complement clinical evaluation at scale. +Empirically, we focus on age, finding LM capability varies widely depending on +task: GPT-3.5 mimics the ability of humans ranging from age 6-15 at tasks +requiring inference, and simultaneously, outperforms a typical 21 year old at +memorization. GPT-3.5 also has trouble with social language use, exhibiting +less than 50% of the tested pragmatic skills. Findings affirm the importance of +considering demographic alignment and conversational goals when using LMs as +public-facing tools. Code, data, and a package will be available. + +
+
+ comment: 17 pages, 9 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Transfer Learning for the Prediction of Entity Modifiers in Clinical + Text: Application to Opioid Use Disorder Case Detection + + +
+ Background: The semantics of entities extracted from a clinical text can be dramatically altered by modifiers, including entity negation, uncertainty, conditionality, severity, and subject. Existing models for determining modifiers of clinical entities involve regular expressions or feature weights that are trained independently for each modifier.
 Methods: We develop and evaluate a multi-task transformer architecture in which modifiers are learned and predicted jointly, using the publicly available SemEval 2015 Task 14 corpus and a new Opioid Use Disorder (OUD) dataset that contains modifiers shared with SemEval as well as novel modifiers specific to OUD. We evaluate the effectiveness of our multi-task learning approach against previously published systems and assess the feasibility of transfer learning for clinical entity modifiers when only a portion of clinical modifiers is shared.
 Results: Our approach achieved state-of-the-art results on the ShARe corpus from SemEval 2015 Task 14, showing an increase of 1.1% in weighted accuracy, 1.7% in unweighted accuracy, and 10% in micro F1 scores.
 Conclusions: We show that learned weights from our shared model can be effectively transferred to a new, partially matched dataset, validating the use of transfer learning for clinical text modifiers.
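A hedged sketch of a joint multi-task design of this kind: a shared transformer encoder with one classification head per modifier. The label sets, head structure, and encoder checkpoint are illustrative assumptions, not the paper's architecture:

```python
import torch.nn as nn
from transformers import AutoModel

class ModifierModel(nn.Module):
    def __init__(self, modifiers=None):
        super().__init__()
        modifiers = modifiers or {"negation": 2, "uncertainty": 2, "severity": 4}
        self.encoder = AutoModel.from_pretrained("bert-base-uncased")
        hidden = self.encoder.config.hidden_size
        # One linear head per modifier; the encoder is shared across tasks.
        self.heads = nn.ModuleDict(
            {name: nn.Linear(hidden, n) for name, n in modifiers.items()}
        )

    def forward(self, input_ids, attention_mask):
        pooled = self.encoder(
            input_ids, attention_mask=attention_mask
        ).last_hidden_state[:, 0]  # [CLS] representation
        # Joint prediction: per-modifier losses are summed during training.
        return {name: head(pooled) for name, head in self.heads.items()}
```

Transfer to a partially matched dataset then amounts to reusing the shared encoder and any overlapping heads while initializing new heads for novel modifiers.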
+
+ comment: 18 pages, 2 figures, 6 tables. To be submitted to the Journal of + Biomedical Semantics +
+
+
+
+
+ + ♻ ☆ Data Diversity Matters for Robust Instruction Tuning + + +
+ Recent works have shown that by curating high-quality and diverse instruction tuning datasets, we can significantly improve instruction-following capabilities. However, creating such datasets is difficult, and most works rely on manual curation or proprietary language models. Automatic data curation is difficult because it is still not clear how to define diversity for instruction tuning, how diversity and quality depend on one another, and how to optimize dataset quality and diversity. To resolve these issues, we propose a new algorithm, Quality-Diversity Instruction Tuning (QDIT). QDIT provides a simple method to simultaneously control dataset diversity and quality, allowing us to conduct an in-depth study on the effect of diversity and quality on instruction tuning performance. From this study we draw two key insights: (1) there is a natural tradeoff between data diversity and quality, and (2) increasing data diversity significantly improves worst-case instruction-following performance, thereby improving robustness. We validate the performance of QDIT on several large-scale instruction tuning datasets, where we find it can substantially improve worst-case and average-case performance compared to quality-driven data selection.
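A hedged sketch of quality-diversity data selection in this spirit (the paper's exact objective may differ): greedily pick examples that trade off a quality score against a facility-location diversity gain, with `lam` controlling the balance:

```python
import numpy as np

def quality_diversity_select(quality, embeddings, k, lam=0.5):
    """quality: (n,) scores; embeddings: (n, d) unit-normalized rows.
    Returns indices of k selected examples."""
    sim = embeddings @ embeddings.T        # pairwise similarity
    coverage = np.zeros(len(quality))      # best similarity to selected set
    selected = []
    for _ in range(k):
        # Facility-location marginal gain: how much each candidate would
        # improve coverage of the full dataset.
        gain = np.maximum(sim - coverage, 0).mean(axis=1)
        score = lam * quality + (1 - lam) * gain
        score[selected] = -np.inf          # never re-pick an example
        best = int(np.argmax(score))
        selected.append(best)
        coverage = np.maximum(coverage, sim[best])
    return selected
```

Setting `lam=1.0` recovers pure quality-driven selection, the baseline the abstract compares against.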
+
+ comment: 22 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ DoGE: Domain Reweighting with Generalization Estimation + + +
+ The coverage and composition of the pretraining data significantly impacts +the generalization ability of Large Language Models (LLMs). Despite its +importance, recent LLMs still rely on heuristics and trial and error to +increase or reduce the influence of data-domains. We propose DOmain reweighting +with Generalization Estimation (DoGE), which optimizes the probability of +sampling from each domain (domain weights) in a principled way. Our approach is +a two-stage process consisting of (i) training a proxy model to obtain domain +weights using a bi-level optimization algorithm; (ii) training a larger base +model by sampling training domains according to the learned domain weights. In +our experiments, we extensively show how DoGE improves the generalization of +the base model to any target data mixture. On the SlimPajama dataset, our base +model gets better perplexity and few-shot reasoning accuracies across $6$ tasks +compared to baseline methods. Moreover, aiming to generalize to out-of-domain +target tasks, which is unseen in the pretraining corpus (OOD domain), DoGE can +effectively identify inter-domain dependencies, and consistently achieves +better test perplexity on the target domain. + +
+
+
+
+
+ + ♻ ☆ Large Multilingual Models Pivot Zero-Shot Multimodal Learning across + Languages + + +
+ Recently there has been a significant surge in multimodal learning in terms +of both image-to-text and text-to-image generation. However, the success is +typically limited to English, leaving other languages largely behind. Building +a competitive counterpart in other languages is highly challenging due to the +low-resource nature of non-English multimodal data (i.e., lack of large-scale, +high-quality image-text data). In this work, we propose MPM, an effective +training paradigm for training large multimodal models in non-English +languages. MPM demonstrates that Multilingual language models can Pivot +zero-shot Multimodal learning across languages. Specifically, based on a strong +multilingual large language model, multimodal models pretrained on English-only +image-text data can well generalize to other languages in a (quasi)-zero-shot +manner, even surpassing models trained on image-text data in native languages. +Taking Chinese as a practice of MPM, we build large multimodal models VisCPM in +image-to-text and text-to-image generation, which achieve state-of-the-art +(open-source) performance in Chinese. To facilitate future research, we +open-source codes and model weights at https://github.com/OpenBMB/VisCPM.git. + +
+
+ comment: https://github.com/OpenBMB/VisCPM.git +
+
+
+
+
+ + ♻ ☆ Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature + + +
+ As Distributed Ledger Technologies (DLTs) rapidly evolve, their impacts +extend beyond technology, influencing environmental and societal aspects. This +evolution has increased publications, making manual literature analysis +increasingly challenging. We address this with a Natural Language Processing +(NLP)-based systematic literature review method to explore the intersection of +Distributed Ledger Technology (DLT) with its Environmental, Social, and +Governance (ESG) aspects. Our approach involves building and refining a +directed citation network from 107 seed papers to a corpus of 24,539 +publications and fine-tuning a transformer-based language model for Named +Entity Recognition (NER) on DLT and ESG domains. Applying this model, we +distilled the corpus to 505 key publications, enabling an inaugural literature +review and temporal graph analysis of DLT's evolution in ESG contexts. Our +contributions include an adaptable and scalable NLP-driven systematic +literature review methodology and a unique NER dataset of 54,808 entities, +tailored for DLT and ESG research. Our inaugural literature review demonstrates +their applicability and effectiveness in analyzing DLT's evolution and impacts, +proving invaluable for stakeholders in the DLT domain. + +
+
+
+
+
+ + ♻ ☆ Do LLMs exhibit human-like response biases? A case study in survey + design + + +
+ As large language models (LLMs) become more capable, there is growing +excitement about the possibility of using LLMs as proxies for humans in +real-world tasks where subjective labels are desired, such as in surveys and +opinion polling. One widely-cited barrier to the adoption of LLMs is their +sensitivity to prompt wording - but interestingly, humans also display +sensitivities to instruction changes in the form of response biases. As such, +we argue that if LLMs are going to be used to approximate human opinions, it is +necessary to investigate the extent to which LLMs also reflect human response +biases, if at all. In this work, we use survey design as a case study, where +human response biases caused by permutations in wordings of "prompts" have been +extensively studied. Drawing from prior work in social psychology, we design a +dataset and propose a framework to evaluate whether LLMs exhibit human-like +response biases in survey questionnaires. Our comprehensive evaluation of nine +models shows that popular open and commercial LLMs generally fail to reflect +human-like behavior. These inconsistencies tend to be more prominent in models +that have been instruction fine-tuned. Furthermore, even if a model shows a +significant change in the same direction as humans, we find that perturbations +that are not meant to elicit significant changes in humans may also result in a +similar change. These results highlight the potential pitfalls of using LLMs to +substitute humans in parts of the annotation pipeline, and further underscore +the importance of finer-grained characterizations of model behavior. Our code, +dataset, and collected samples are available at +https://github.com/lindiatjuatja/BiasMonkey + +
+
+
+
+
+ + ♻ ☆ Hybrid Retrieval-Augmented Generation for Real-time Composition + Assistance + + +
+ Retrieval augmentation enhances performance of traditional language models by +incorporating additional context. However, the computational demands for +retrieval augmented large language models (LLMs) pose a challenge when applying +them to real-time tasks, such as composition assistance. To address this +limitation, we propose the Hybrid Retrieval-Augmented Generation (HybridRAG) +framework, a novel approach that efficiently combines a cloud-based LLM with a +smaller, client-side, language model through retrieval augmented memory. This +integration enables the client model to generate effective responses, +benefiting from the LLM's capabilities and contextual information. +Additionally, through an asynchronous memory update mechanism, the client model +can deliver real-time completions swiftly to user inputs without the need to +wait for responses from the cloud. Our experiments on five benchmark datasets +demonstrate that HybridRAG significantly improves utility over client-only +models while maintaining low latency. + +
+
+
+
+
+ + ♻ ☆ Tradeoffs Between Alignment and Helpfulness in Language Models + + +
+ Language model alignment has become an important component of AI safety, +allowing safe interactions between humans and language models, by enhancing +desired behaviors and inhibiting undesired ones. It is often done by tuning the +model or inserting preset aligning prompts. Recently, representation +engineering, a method which alters the model's behavior via changing its +representations post-training, was shown to be effective in aligning LLMs (Zou +et al., 2023a). Representation engineering yields gains in alignment-oriented +tasks such as resistance to adversarial attacks and reduction of social biases, +but has also been shown to cause a decrease in the ability of the model to perform +basic tasks. In this paper, we study the tradeoff between the increase in +alignment and the decrease in helpfulness of the model. We propose a theoretical +framework which provides bounds for these two quantities, and demonstrate their +relevance empirically. Interestingly, we find that while helpfulness +generally decreases, it does so quadratically with the norm of the +representation engineering vector, while alignment increases linearly with +it, indicating a regime in which it is efficient to use representation +engineering. We validate our findings empirically, and chart the boundaries to +the usefulness of representation engineering for alignment. +
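+ A compact way to read the scaling result above (the constants $c_1, c_2 > 0$ are hypothetical stand-ins for the paper's bounds): for a representation-engineering vector $v$,
+ $\Delta_{\mathrm{alignment}}(v) \approx c_1 \lVert v \rVert, \qquad \Delta_{\mathrm{helpfulness}}(v) \approx -c_2 \lVert v \rVert^2,$
+ so for small $\lVert v \rVert$ the linear alignment gain dominates the quadratic helpfulness loss, which is precisely the regime where representation engineering is efficient.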
+
+
+
+
+ + ♻ ☆ The Parrot Dilemma: Human-Labeled vs. LLM-augmented Data in + Classification Tasks EACL 2024 + + +
+ In the realm of Computational Social Science (CSS), practitioners often +navigate complex, low-resource domains and face the costly and time-intensive +challenges of acquiring and annotating data. We aim to establish a set of +guidelines to address such challenges, comparing the use of human-labeled data +with synthetically generated data from GPT-4 and Llama-2 in ten distinct CSS +classification tasks of varying complexity. Additionally, we examine the impact +of training data sizes on performance. Our findings reveal that models trained +on human-labeled data consistently exhibit superior or comparable performance +compared to their synthetically augmented counterparts. Nevertheless, synthetic +augmentation proves beneficial, particularly in improving performance on rare +classes within multi-class tasks. Furthermore, we leverage GPT-4 and Llama-2 +for zero-shot classification and find that, while they generally display strong +performance, they often fall short when compared to specialized classifiers +trained on moderately sized training sets. + +
+
+ comment: Accepted at EACL 2024. 14 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ StepCoder: Improve Code Generation with Reinforcement Learning from + Compiler Feedback + + +
+ The advancement of large language models (LLMs) has significantly propelled +the field of code generation. Previous work integrated reinforcement learning +(RL) with compiler feedback for exploring the output space of LLMs to enhance +code generation quality. However, the lengthy code generated by LLMs in +response to complex human requirements makes RL exploration a challenge. Also, +since unit tests may not cover complicated code, optimizing LLMs on these +unexecuted code snippets is ineffective. To tackle these +challenges, we introduce StepCoder, a novel RL framework for code generation, +consisting of two main components: CCCS addresses the exploration challenge by +breaking the long-sequence code generation task into a Curriculum of Code +Completion Subtasks, while FGO only optimizes the model by masking the +unexecuted code segments to provide Fine-Grained Optimization. In addition, we +construct the APPS+ dataset for RL training, which is manually +verified to ensure the correctness of unit tests. Experimental results show +that our method improves the ability to explore the output space and +outperforms state-of-the-art approaches on the corresponding benchmarks. Our +dataset APPS+ and StepCoder are available online. +
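+ A minimal sketch of the fine-grained masking idea behind FGO (tensor shapes and the normalization are assumptions, not the authors' exact implementation):
+ import torch
+ import torch.nn.functional as F
+
+ def fgo_loss(logits, target_ids, executed_mask):
+     # logits: (seq, vocab); target_ids: (seq,);
+     # executed_mask: (seq,) bool, True only for tokens exercised by unit tests
+     per_token = F.cross_entropy(logits, target_ids, reduction="none")
+     m = executed_mask.float()
+     # Unexecuted code segments contribute nothing to the optimization signal
+     return (per_token * m).sum() / m.sum().clamp(min=1.0)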
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ The DURel Annotation Tool: Human and Computational Measurement of + Semantic Proximity, Sense Clusters and Semantic Change EACL + + +
+ We present the DURel tool that implements the annotation of semantic +proximity between uses of words in an online, open-source interface. The tool +supports standardized human annotation as well as computational annotation, +building on recent advances with Word-in-Context models. Annotator judgments +are clustered with automatic graph clustering techniques and visualized for +analysis. This makes it possible to measure word senses with simple and intuitive +micro-task judgments between use pairs, requiring minimal preparation effort. +The tool offers additional functionality to compare the agreement between +annotators, guaranteeing the inter-subjectivity of the obtained judgments, and to +calculate summary statistics giving insights into sense frequency +distributions, semantic variation, or changes of senses over time. +
+
+ comment: EACL Demo, 7 pages +
+
+
+
+
+ + ♻ ☆ LoTR: Low Tensor Rank Weight Adaptation + + +
+ In this paper, we generalize and extend the idea of low-rank adaptation (LoRA) +of large language models (LLMs) based on the Transformer architecture. Widely used +LoRA-like methods of fine-tuning LLMs are based on matrix factorization of the +gradient update. We introduce LoTR, a novel approach for parameter-efficient +fine-tuning of LLMs which represents a gradient update to parameters in the form +of a tensor decomposition. The low-rank adapter for each layer is constructed as a +product of three matrices, and the tensor structure arises from sharing the left and +right multipliers of this product among layers. Simultaneous compression of a +sequence of layers with a low-rank tensor representation allows LoTR to achieve +even better parameter efficiency than LoRA, especially for deep models. +Moreover, the core tensor does not depend on the original weight dimension and can +be made arbitrarily small, which allows for extremely cheap and fast downstream +fine-tuning. +
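+ A minimal PyTorch sketch of the shared-factor structure described above (naming and initialization are assumptions; the per-layer update is dW_l = A G_l B with A and B shared across layers):
+ import torch
+ import torch.nn as nn
+
+ class LoTR(nn.Module):
+     def __init__(self, d_out, d_in, rank, n_layers):
+         super().__init__()
+         self.A = nn.Parameter(torch.randn(d_out, rank) * 0.01)  # left factor, shared
+         self.B = nn.Parameter(torch.zeros(rank, d_in))          # right factor, shared
+         self.G = nn.ParameterList(                              # tiny per-layer cores
+             [nn.Parameter(torch.randn(rank, rank) * 0.01) for _ in range(n_layers)]
+         )
+
+     def delta_w(self, layer):
+         # Weight update for one layer; zero at init because B starts at zero
+         return self.A @ self.G[layer] @ self.B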
+
+ comment: Submitted; missing author and sections were added; +
+
+
+
+
+ + ♻ ☆ SIB-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic + Classification in 200+ Languages and Dialects EACL 2024 + + +
+ Despite the progress we have recorded in the last few years in multilingual +natural language processing, evaluation is typically limited to a small set of +languages with available datasets, which excludes a large number of low-resource +languages. In this paper, we created SIB-200 -- a large-scale open-sourced +benchmark dataset for topic classification in 200 languages and dialects -- to +address the lack of evaluation datasets for Natural Language Understanding +(NLU). For many of the languages covered in SIB-200, this is the first publicly +available evaluation dataset for NLU. The dataset is based on the Flores-200 +machine translation corpus. We annotated the English portion of the dataset and +extended the sentence-level annotation to the remaining 203 languages covered +in the corpus. Despite the simplicity of this task, our evaluations in the +fully supervised setting, the cross-lingual transfer setting, and the large +language model prompting setting show that there is still a large gap between the +performance of high-resource and low-resource languages when multilingual +evaluation is scaled to numerous world languages. We found that languages +unseen during the pre-training of multilingual language models, +under-represented language families (like Nilotic and Atlantic-Congo), and +languages from Africa, the Americas, Oceania, and South East Asia +often have the lowest performance on our topic classification dataset. We hope +our dataset will encourage a more inclusive evaluation of multilingual language +models on a more diverse set of languages. https://github.com/dadelani/sib-200 +
+
+ comment: Accepted to EACL 2024 (main conference) +
+
+
+
+
+ + ♻ ☆ Language Models as Inductive Reasoners EACL 2024 + + +
+ Inductive reasoning is a core component of human intelligence. In past +computer science research on inductive reasoning, formal language has been used to +represent knowledge (more specifically, facts and rules). +However, formal language can cause systematic problems for inductive reasoning, +such as the inability to handle raw input like natural language, +sensitivity to mislabeled data, and incapacity to handle ambiguous input. To +this end, we propose a new paradigm (task) for inductive reasoning, which is to +induce natural language rules from natural language facts, and create a dataset +termed DEER containing 1.2k rule-fact pairs for the task, where rules and facts +are written in natural language. New automatic metrics are also proposed and +analysed for the evaluation of this task. With DEER, we investigate a modern +approach for inductive reasoning where we use natural language as the +representation for knowledge instead of formal language and use pretrained +language models as ''reasoners''. Moreover, we provide the first +comprehensive analysis of how well pretrained language models can induce +natural language rules from natural language facts. We also propose a new +framework for this task, drawing insights from the philosophy literature, which, +as we show in the experiment section, surpasses baselines in both automatic and +human evaluations. We discuss our future perspectives for inductive +reasoning in Section 7. Dataset and code are available at +https://github.com/ZonglinY/Inductive_Reasoning. +
+
+ comment: Accepted by EACL 2024 +
+
+
+
+
+ + ♻ ☆ An energy-based comparative analysis of common approaches to text + classification in the Legal domain + + +
+ Most Machine Learning research evaluates the best solutions in terms of +performance. However, in the race for the best-performing model, many important +aspects are often overlooked when, on the contrary, they should be carefully +considered. In fact, sometimes the gaps in performance between different +approaches are negligible, whereas factors such as production costs, energy +consumption, and carbon footprint must be taken into consideration. Large Language +Models (LLMs) are extensively adopted to address NLP problems in academia and +industry. In this work, we present a detailed quantitative comparison of LLM +and traditional approaches (e.g., SVM) on the LexGLUE benchmark, which takes +into account both performance (standard indices) and alternative metrics such +as timing, power consumption, and cost: in a word, the carbon footprint. In our +analysis, we considered the prototyping phase (model selection by +training-validation-test iterations) and the in-production phase separately, since +they follow different implementation procedures and also require different +resources. The results indicate that very often, the simplest algorithms +achieve performance very close to that of large LLMs but with very low power +consumption and lower resource demands. The results obtained could suggest that +companies should include additional evaluations when choosing Machine Learning +(ML) solutions. +
+
+ comment: Presented at The 4th International Conference on NLP & Text Mining + (NLTM 2024), January 27-28 2024, Copenhagen, Denmark - 12 pages, 1 figure, 7 + tables +
+
+
+
+
+ + ♻ ☆ SemScore: Automated Evaluation of Instruction-Tuned LLMs based on + Semantic Textual Similarity + + +
+ Instruction-tuned Large Language Models (LLMs) have recently showcased +remarkable advancements in their ability to generate fitting responses to +natural language instructions. However, many current works rely on manual +evaluation to judge the quality of generated responses. Since such manual +evaluation is time-consuming, it does not easily scale to the evaluation of +multiple models and model variants. In this short paper, we propose a +straightforward but remarkably effective evaluation metric called SemScore, in +which we directly compare model outputs to gold target responses using semantic +textual similarity (STS). We conduct a comparative evaluation of the model +outputs of 12 prominent instruction-tuned LLMs using 8 widely-used evaluation +metrics for text generation. We find that our proposed SemScore metric +outperforms all other, in many cases more complex, evaluation metrics in terms +of correlation to human evaluation. These findings indicate the utility of our +proposed metric for the evaluation of instruction-tuned LLMs. + +
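+ To make the metric concrete, here is a minimal sketch of an STS-based comparison using the sentence-transformers library (the specific encoder checkpoint is an assumed choice, not necessarily the one used in the paper):
+ from sentence_transformers import SentenceTransformer, util
+
+ encoder = SentenceTransformer("all-mpnet-base-v2")  # assumed STS encoder
+
+ def semscore(model_output: str, gold_response: str) -> float:
+     # Cosine similarity between sentence embeddings of output and gold target
+     embs = encoder.encode([model_output, gold_response], convert_to_tensor=True)
+     return util.cos_sim(embs[0], embs[1]).item()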
+
+
+
+
+ + ♻ ☆ ICU: Conquering Language Barriers in Vision-and-Language Modeling by + Dividing the Tasks into Image Captioning and Language Understanding EMNLP 2023 + + +
+ Most multilingual vision-and-language (V&L) research aims to accomplish +multilingual and multimodal capabilities within one model. However, the +scarcity of multilingual captions for images has hindered development. To +overcome this obstacle, we propose ICU, Image Caption Understanding, which +divides a V&L task into two stages: a V&L model performs image captioning in +English, and a multilingual language model (mLM), in turn, takes the caption as +the alt text and performs cross-lingual language understanding. The burden of +multilingual processing is lifted off the V&L model and placed on the mLM. Since +multilingual text data is comparatively more abundant and of higher quality, ICU +can help V&L models overcome language barriers. In experiments +on two tasks across 9 languages in the IGLUE benchmark, we show that ICU can +achieve new state-of-the-art results for five languages, and comparable results +for the rest. +
+
+ comment: EMNLP 2023 (Findings) +
+
+
+
+
+ + ♻ ☆ The Anatomy of Conspirators: Unveiling Traits using a Comprehensive + Twitter Dataset + + +
+ The discourse around conspiracy theories is currently thriving amidst the +rampant misinformation in online environments. Research in this field has been +focused on detecting conspiracy theories on social media, often relying on +limited datasets. In this study, we present a novel methodology for +constructing a Twitter dataset that encompasses accounts engaged in +conspiracy-related activities throughout the year 2022. Our approach centers on +data collection that is independent of specific conspiracy theories and +information operations. Additionally, our dataset includes a control group +comprising randomly selected users who can be fairly compared to the +individuals involved in conspiracy activities. This comprehensive collection +effort yielded a total of 15K accounts and 37M tweets extracted from their +timelines. We conduct a comparative analysis of the two groups across three +dimensions: topics, profiles, and behavioral characteristics. The results +indicate that conspiracy and control users exhibit similarity in terms of their +profile metadata characteristics. However, they diverge significantly in terms +of behavior and activity, particularly regarding the discussed topics, the +terminology used, and their stance on trending subjects. In addition, we find +no significant disparity in the presence of bot users between the two groups. +Finally, we develop a classifier to identify conspiracy users using features +borrowed from bot, troll and linguistic literature. The results demonstrate a +high accuracy level (with an F1 score of 0.94), enabling us to uncover the most +discriminating features associated with conspiracy-related accounts. + +
+
+
+
+
+ + ♻ ☆ Linear Alignment of Vision-language Models for Image Captioning + + +
+ Recently, vision-language models like CLIP have advanced the state of the art +in a variety of multi-modal tasks including image captioning and caption +evaluation. Many approaches adapt CLIP-style models to a downstream task by +training a mapping network between CLIP and a language model. This is costly as +it usually involves calculating gradients for large models. We propose a more +efficient training protocol that fits a linear mapping between image and text +embeddings of CLIP via a closed-form solution. This bypasses the need for +gradient computation and results in a lightweight captioning method called +ReCap, which can be trained up to 1000 times faster than existing lightweight +methods. Moreover, we propose two new learning-based image-captioning metrics +that build on CLIP score along with our linear mapping. Furthermore, we combine +ReCap with our new metrics to design an iterative datastore-augmentation loop +(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k, +VizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art +lightweight methods on established metrics while outperforming them on our new +metrics, which are better aligned with human ratings on Flickr8k-Expert and +Flickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to +other domains and that our DAL leads to a performance boost. + +
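+ A minimal sketch of fitting a linear image-to-text embedding map in closed form, as the training protocol above describes (the ridge regularizer lam is an assumed detail):
+ import numpy as np
+
+ def fit_linear_map(img_embs: np.ndarray, txt_embs: np.ndarray, lam: float = 1e-3):
+     # Ridge regression in closed form: W = (X^T X + lam I)^{-1} X^T Y,
+     # so no gradient computation through CLIP or the language model is needed.
+     d = img_embs.shape[1]
+     return np.linalg.solve(img_embs.T @ img_embs + lam * np.eye(d),
+                            img_embs.T @ txt_embs)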
+
+ comment: 8 pages (+ references and appendix) +
+
+
+
+
+ + ♻ ☆ DialogStudio: Towards Richest and Most Diverse Unified Dataset + Collection for Conversational AI EACL 2024 + + +
+ Despite advancements in conversational AI, language models encounter +challenges in handling diverse conversational tasks, and existing dialogue +dataset collections often lack diversity and comprehensiveness. To tackle these +issues, we introduce DialogStudio: the largest and most diverse collection of +dialogue datasets, unified under a consistent format while preserving their +original information. Our collection encompasses data from open-domain +dialogues, task-oriented dialogues, natural language understanding, +conversational recommendation, dialogue summarization, and knowledge-grounded +dialogues, making it an incredibly rich and diverse resource for dialogue +research and model training. To further enhance the utility of DialogStudio, we +identify the license for each dataset and design external-knowledge and +domain-aware prompts for selected dialogues to facilitate instruction-aware +fine-tuning. Furthermore, we develop conversational AI models using the dataset +collection, and our experiments in both zero-shot and few-shot learning +scenarios demonstrate the superiority of DialogStudio. To improve transparency +and support dataset and task-based research, as well as language model +pre-training, all datasets, licenses, codes, and models associated with +DialogStudio are made publicly +accessible\footnote{\url{https://github.com/salesforce/DialogStudio}}. +
+
+ comment: 17 pages, accepted by EACL 2024 Findings as a long paper. All + datasets, licenses, codes, and models are available at at + https://github.com/salesforce/DialogStudio +
+
+
+
+
+ + ♻ ☆ LAraBench: Benchmarking Arabic AI with Large Language Models + + +
+ Recent advancements in Large Language Models (LLMs) have significantly +influenced the landscape of language and speech research. Despite this +progress, these models lack specific benchmarking against state-of-the-art +(SOTA) models tailored to particular languages and tasks. LAraBench addresses +this gap for Arabic Natural Language Processing (NLP) and Speech Processing +tasks, including sequence tagging and content classification across different +domains. We utilized models such as GPT-3.5-turbo, GPT-4, BLOOMZ, +Jais-13b-chat, Whisper, and USM, employing zero and few-shot learning +techniques to tackle 33 distinct tasks across 61 publicly available datasets. +This involved 98 experimental setups, encompassing ~296K data points, ~46 hours +of speech, and 30 sentences for Text-to-Speech (TTS). This effort resulted in +330+ sets of experiments. Our analysis focused on measuring the performance gap +between SOTA models and LLMs. The overarching trend observed was that SOTA +models generally outperformed LLMs in zero-shot learning, with a few +exceptions. Notably, larger computational models with few-shot learning +techniques managed to reduce these performance gaps. Our findings provide +valuable insights into the applicability of LLMs for Arabic NLP and speech +processing tasks. + +
+
+ comment: Foundation Models, Large Language Models, Arabic NLP, Arabic Speech, + Arabic AI, GPT3.5 Evaluation, USM Evaluation, Whisper Evaluation, GPT-4, + BLOOMZ, Jais13b +
+
+
+
+
+ + ♻ ☆ TravelPlanner: A Benchmark for Real-World Planning with Language Agents + + +
+ Planning has been part of the core pursuit for artificial intelligence since +its conception, but earlier AI agents mostly focused on constrained settings +because many of the cognitive substrates necessary for human-level planning +have been lacking. Recently, language agents powered by large language models +(LLMs) have shown interesting capabilities such as tool use and reasoning. Are +these language agents capable of planning in more complex settings that are out +of the reach of prior AI agents? To advance this investigation, we propose +TravelPlanner, a new planning benchmark that focuses on travel planning, a +common real-world planning scenario. It provides a rich sandbox environment, +various tools for accessing nearly four million data records, and 1,225 +meticulously curated planning intents and reference plans. Comprehensive +evaluations show that current language agents are not yet capable of +handling such complex planning tasks; even GPT-4 only achieves a success rate of +0.6%. Language agents struggle to stay on task, use the right tools to collect +information, or keep track of multiple constraints. However, we note that the +mere possibility for language agents to tackle such a complex problem is in +itself non-trivial progress. TravelPlanner provides a challenging yet +meaningful testbed for future language agents. +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Robust Prompt Optimization for Large Language Models Against + Distribution Shifts EMNLP 2023 + + +
+ Large Language Models (LLMs) have demonstrated significant ability in various +Natural Language Processing tasks. However, their effectiveness is highly +dependent on the phrasing of the task prompt, leading to research on automatic +prompt optimization using labeled task data. We reveal that these prompt +optimization techniques are vulnerable to distribution shifts such as +subpopulation shifts, which are common for LLMs in real-world scenarios such as +customer review analysis. In this light, we propose a new problem of robust +prompt optimization for LLMs against distribution shifts, which requires that the +prompt optimized over the labeled source group simultaneously generalize to +an unlabeled target group. To solve this problem, we propose the Generalized Prompt +Optimization framework, which incorporates the unlabeled data from the target +group into prompt optimization. Extensive experimental results demonstrate the +effectiveness of the proposed framework, with significant performance +improvement on the target group and comparable performance on the source group. +
+
+ comment: EMNLP 2023 Main +
+
+
+
+
+ + ♻ ☆ AccentFold: A Journey through African Accents for Zero-Shot ASR + Adaptation to Target Accents EACL + + +
+ Despite advancements in speech recognition, accented speech remains +challenging. While previous approaches have focused on modeling techniques or +creating accented speech datasets, gathering sufficient data for the multitude +of accents, particularly in the African context, remains impractical due to +their sheer diversity and associated budget constraints. To address these +challenges, we propose AccentFold, a method that exploits spatial relationships +between learned accent embeddings to improve downstream Automatic Speech +Recognition (ASR). Our exploratory analysis of speech embeddings representing +100+ African accents reveals interesting spatial accent relationships +highlighting geographic and genealogical similarities, capturing consistent +phonological and morphological regularities, all learned empirically from +speech. Furthermore, we discover accent relationships previously +uncharacterized by the Ethnologue. Through empirical evaluation, we demonstrate +the effectiveness of AccentFold by showing that, for out-of-distribution (OOD) +accents, sampling accent subsets for training based on AccentFold information +outperforms strong baselines with a relative WER improvement of 4.6%. AccentFold +presents a promising approach for improving ASR performance on accented speech, +particularly in the context of African accents, where data scarcity and budget +constraints pose significant challenges. Our findings emphasize the potential +of leveraging linguistic relationships to improve zero-shot ASR adaptation to +target accents. +
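+ A minimal sketch of how embedding proximity could drive the subset-sampling step above (the distance metric and dictionary format are assumptions, not the authors' code):
+ import numpy as np
+
+ def nearest_accents(target_emb, accent_embs, k=5):
+     # accent_embs: mapping from accent name to its learned embedding vector
+     names = list(accent_embs)
+     dists = np.linalg.norm(np.stack([accent_embs[n] for n in names]) - target_emb, axis=1)
+     return [names[i] for i in np.argsort(dists)[:k]]  # accents whose data to sample for training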
+
+ comment: Accepted to EACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ SqueezeLLM: Dense-and-Sparse Quantization + + +
+ Generative Large Language Models (LLMs) have demonstrated remarkable results +for a wide range of tasks. However, deploying these models for inference has +been a significant challenge due to their unprecedented resource requirements. +This has forced existing deployment frameworks to use multi-GPU inference +pipelines, which are often complex and costly, or to use smaller and less +performant models. In this work, we demonstrate that the main bottleneck for +generative inference with LLMs is memory bandwidth, rather than compute, +specifically for single batch inference. While quantization has emerged as a +promising solution by representing model weights with reduced precision, +previous efforts have often resulted in notable performance degradation. To +address this, we introduce SqueezeLLM, a post-training quantization framework +that not only enables lossless compression to ultra-low precisions of up to +3-bit, but also achieves higher quantization performance under the same memory +constraint. Our framework incorporates two novel ideas: (i) sensitivity-based +non-uniform quantization, which searches for the optimal bit precision +assignment based on second-order information; and (ii) the Dense-and-Sparse +decomposition that stores outliers and sensitive weight values in an efficient +sparse format. When applied to the LLaMA models, our 3-bit quantization +significantly reduces the perplexity gap from the FP16 baseline by up to 2.1x +as compared to the state-of-the-art methods with the same memory requirement. +Furthermore, when deployed on an A6000 GPU, our quantized models achieve up to +2.3x speedup compared to the baseline. Our code is open-sourced and available +online. + +
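+ A minimal sketch of the Dense-and-Sparse decomposition idea (the percentile threshold and plain k-means are simplifications; the paper assigns the non-uniform codebook using second-order sensitivity information):
+ import numpy as np
+ from scipy.sparse import csr_matrix
+ from sklearn.cluster import KMeans
+
+ def dense_and_sparse(W: np.ndarray, outlier_pct: float = 0.45, n_bits: int = 3):
+     cutoff = np.percentile(np.abs(W), 100 - outlier_pct)
+     sparse_part = np.where(np.abs(W) > cutoff, W, 0.0)  # outliers stay full precision
+     dense_part = W - sparse_part
+     km = KMeans(n_clusters=2 ** n_bits, n_init=1).fit(dense_part.reshape(-1, 1))
+     quantized = km.cluster_centers_[km.labels_].reshape(W.shape)  # non-uniform codebook
+     return quantized, csr_matrix(sparse_part)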
+
+
+
+
+ + ♻ ☆ CABINET: Content Relevance based Noise Reduction for Table Question + Answering ICLR 2024 + + +
+ Table understanding capability of Large Language Models (LLMs) has been +extensively studied through the task of question-answering (QA) over tables. +Typically, only a small part of the whole table is relevant to derive the +answer for a given question. The irrelevant parts act as noise and are +distracting information, resulting in sub-optimal performance due to the +vulnerability of LLMs to noise. To mitigate this, we propose CABINET (Content +RelevAnce-Based NoIse ReductioN for TablE QuesTion-Answering) - a framework to +enable LLMs to focus on relevant tabular data by suppressing extraneous +information. CABINET comprises an Unsupervised Relevance Scorer (URS), trained +differentially with the QA LLM, that weighs the table content based on its +relevance to the input question before feeding it to the question-answering LLM +(QA LLM). To further aid the relevance scorer, CABINET employs a weakly +supervised module that generates a parsing statement describing the criteria of +rows and columns relevant to the question and highlights the content of +corresponding table cells. CABINET significantly outperforms various tabular +LLM baselines, as well as GPT3-based in-context learning methods, is more +robust to noise, maintains outperformance on tables of varying sizes, and +establishes new SoTA performance on WikiTQ, FeTaQA, and WikiSQL datasets. We +release our code and datasets at https://github.com/Sohanpatnaik106/CABINET_QA. + +
+
+ comment: Accepted at ICLR 2024 (spotlight) +
+
+
+
+
+ + ♻ ☆ ChatTraffic: Text-to-Traffic Generation via Diffusion Model + + +
+ Traffic prediction is one of the most significant foundations in Intelligent +Transportation Systems (ITS). Traditional traffic prediction methods rely only +on historical traffic data to predict traffic trends and face two main +challenges: 1) insensitivity to unusual events, and 2) limited performance in +long-term prediction. In this work, we explore how generative models combined +with text describing the traffic system can be applied to traffic generation, +and name the task Text-to-Traffic Generation (TTG). The key challenge of the +TTG task is how to associate text with the spatial structure of the road +network and traffic data for generating traffic situations. To this end, we +propose ChatTraffic, the first diffusion model for text-to-traffic generation. +To guarantee the consistency between synthetic and real data, we augment a +diffusion model with a Graph Convolutional Network (GCN) to extract spatial +correlations of traffic data. In addition, we construct a large dataset +containing text-traffic pairs for the TTG task. We benchmarked our model +qualitatively and quantitatively on the released dataset. The experimental +results indicate that ChatTraffic can generate realistic traffic situations +from the text. Our code and dataset are available at +https://github.com/ChyaZhang/ChatTraffic. +
+
+
+
+
+ + ♻ ☆ Loss Masking Is Not Needed in Decoder-only Transformer for + Discrete-token-based ASR ICASSP 2024 + + +
+ Recently, unified speech-text models, such as SpeechGPT, VioLA, and +AudioPaLM, have achieved remarkable performance on various speech tasks. These +models discretize speech signals into tokens (speech discretization) and use a +shared vocabulary for both text and speech tokens. Then they train a single +decoder-only Transformer on a mixture of speech tasks. However, these models +rely on the Loss Masking strategy for the ASR task, which ignores the +dependency among speech tokens. In this paper, we propose to model speech +tokens in an autoregressive way, similar to text. We find that applying the +conventional cross-entropy loss on input speech tokens does not consistently +improve the ASR performance over the Loss Masking approach. To address this +issue, we propose a novel approach denoted Smoothed Label Distillation (SLD), +which applies a KL divergence loss with smoothed labels on speech tokens. Our +experiments show that SLD effectively models speech tokens and outperforms Loss +Masking for decoder-only Transformers in ASR tasks with different speech +discretization methods. The source code can be found here: +https://github.com/alibaba-damo-academy/SpokenNLP/tree/main/sld + +
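+ A minimal sketch of a smoothed-label KL distillation loss of the kind SLD describes (the smoothing constant eps and tensor shapes are assumptions):
+ import torch
+ import torch.nn.functional as F
+
+ def sld_loss(logits, speech_targets, eps=0.1):
+     # KL divergence between a label-smoothed target distribution and the
+     # model's prediction over speech tokens; logits: (seq, vocab), speech_targets: (seq,)
+     V = logits.size(-1)
+     log_p = F.log_softmax(logits, dim=-1)
+     q = torch.full_like(log_p, eps / (V - 1))
+     q.scatter_(-1, speech_targets.unsqueeze(-1), 1.0 - eps)
+     return F.kl_div(log_p, q, reduction="batchmean")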
+
+ comment: 5 pages, accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Divergences between Language Models and Human Brains + + +
+ Do machines and humans process language in similar ways? Recent research has +hinted in the affirmative, finding that brain signals can be effectively +predicted using the internal representations of language models (LMs). Although +such results are thought to reflect shared computational principles between LMs +and human brains, there are also clear differences in how LMs and humans +represent and use language. In this work, we systematically explore the +divergences between human and machine language processing by examining the +differences between LM representations and human brain responses to language as +measured by Magnetoencephalography (MEG) across two datasets in which subjects +read and listened to narrative stories. Using a data-driven approach, we +identify two domains that are not captured well by LMs: social/emotional +intelligence and physical commonsense. We then validate these domains with +human behavioral experiments and show that fine-tuning LMs on these domains can +improve their alignment with human brain responses. + +
+
+
+
+
+ + ♻ ☆ Exposing Limitations of Language Model Agents in Sequential-Task + Compositions on the Web + + +
+ Language model agents (LMA) recently emerged as a promising paradigm for +multi-step decision-making tasks, often outperforming humans and other +reinforcement learning agents. Despite the promise, their performance on +real-world applications that often involve combinations of tasks is still +underexplored. In this work, we introduce a new benchmark, called CompWoB -- 50 +new compositional web automation tasks reflecting more realistic assumptions. +We show that while existing prompted LMAs (gpt-3.5-turbo or gpt-4) achieve +94.0% average success rate on base tasks, their performance degrades to 24.9% +success rate on compositional tasks. On the other hand, transferred LMAs +(finetuned only on base tasks) show a smaller generalization gap, dropping from +85.4% to 54.8%. By balancing data distribution across tasks, we train a new +model, HTML-T5++, that surpasses human-level performance (95.2%) on MiniWoB, +and achieves the best zero-shot performance on CompWoB (61.5%). While these +results highlight the promise of small-scale finetuned and transferred models for task +compositionality, their performance further degrades under instruction +compositions that change the combination order. In contrast to the +recent remarkable success of LMAs, our benchmark and detailed analysis emphasize +the necessity of building LMAs that are robust and generalizable to task +compositionality for real-world deployment. +
+
+ comment: Code: + https://github.com/google-research/google-research/tree/master/compositional_rl/compwob +
+
+
+
+
+ + ♻ ☆ PhoGPT: Generative Pre-training for Vietnamese + + +
+ We open-source a state-of-the-art 4B-parameter generative model series for +Vietnamese, which includes the base pre-trained monolingual model PhoGPT-4B and +its chat variant, PhoGPT-4B-Chat. The base model, PhoGPT-4B, with exactly 3.7B +parameters, is pre-trained from scratch on a Vietnamese corpus of 102B tokens, +with an 8192 context length, employing a vocabulary of 20480 token types. The +chat variant, PhoGPT-4B-Chat, is the modeling output obtained by fine-tuning +PhoGPT-4B on a dataset of 70K instructional prompts and their responses, along +with an additional 290K conversations. We demonstrate its strong performance +compared to previous closed-source and open-source 7B-parameter models. Our +PhoGPT models are available at: https://github.com/VinAIResearch/PhoGPT + +
+
+ comment: PhoGPT-4B Technical Report - 5 pages +
+
+
+
+
+ + ♻ ☆ NESTLE: a No-Code Tool for Statistical Analysis of Legal Corpus EACL 2024 + + +
+ The statistical analysis of a large-scale legal corpus can provide valuable +legal insights. For such analysis one needs to (1) select a subset of the +corpus using document retrieval tools, (2) structure text using information +extraction (IE) systems, and (3) visualize the data for the statistical +analysis. Each process demands either specialized tools or programming skills, +whereas no comprehensive unified "no-code" tools have been available. Here we +provide NESTLE, a no-code tool for large-scale statistical analysis of legal +corpora. Powered by a Large Language Model (LLM) and an internal custom +end-to-end IE system, NESTLE can extract any type of information that has not +been predefined in the IE system, opening up the possibility of unlimited +customizable statistical analysis of the corpus without writing a single line +of code. We validate our system on 15 Korean precedent IE tasks and 3 legal +text classification tasks from LexGLUE. The comprehensive experiments reveal +that NESTLE can achieve GPT-4-comparable performance by training the internal IE +module with 4 human-labeled and 192 LLM-labeled examples. +
+
+ comment: EACL 2024 System Demonstration Track +
+
+
+
+
+ + ♻ ☆ Improving Grounded Language Understanding in a Collaborative Environment + by Interacting with Agents Through Help Feedback EACL 2024 + + +
+ Many approaches to Natural Language Processing (NLP) tasks treat them +as single-step problems, where an agent receives an instruction, executes it, +and is evaluated based on the final outcome. However, human language is +inherently interactive, as evidenced by the back-and-forth nature of human +conversations. In light of this, we posit that human-AI collaboration should +also be interactive, with humans monitoring the work of AI agents and providing +feedback that the agent can understand and utilize. Further, the AI agent +should be able to detect when it needs additional information and proactively +ask for help. Enabling this scenario would lead to more natural, efficient, and +engaging human-AI collaborations. + In this work, we explore these directions using the challenging task defined +by the IGLU competition, an interactive grounded language understanding task in +a MineCraft-like world. We explore multiple types of help players can give to +the AI to guide it and analyze the impact of this help on AI behavior, +resulting in performance improvements. +
+
+ comment: Findings of EACL 2024 +
+
+
+
+
+ + ♻ ☆ O3D: Offline Data-driven Discovery and Distillation for Sequential + Decision-Making with Large Language Models + + +
+ Recent advancements in large language models (LLMs) have exhibited promising +performance in solving sequential decision-making problems. By imitating +few-shot examples provided in the prompts (i.e., in-context learning), an LLM +agent can interact with an external environment and complete given tasks +without additional training. However, such few-shot examples are often +insufficient to generate high-quality solutions for complex and long-horizon +tasks, while the limited context length cannot consume larger-scale +demonstrations with long interaction horizons. To this end, we propose an +offline learning framework that utilizes offline data at scale (e.g., logs of +human interactions) to improve LLM-powered policies without finetuning. The +proposed method O3D (Offline Data-driven Discovery and Distillation) +automatically discovers reusable skills and distills generalizable knowledge +across multiple tasks based on offline interaction data, advancing the +capability of solving downstream tasks. Empirical results on two interactive +decision-making benchmarks (ALFWorld and WebShop) verify that O3D can notably +enhance the decision-making capabilities of LLMs through the offline discovery +and distillation process, and consistently outperform baselines across various +LLMs. +
+
+
+
+
+ + ♻ ☆ Peering Through Preferences: Unraveling Feedback Acquisition for + Aligning Large Language Models ICLR 2024 + + +
+ Aligning large language models (LLMs) with human values and intents +critically involves the use of human or AI feedback. While dense feedback +annotations are expensive to acquire and integrate, sparse feedback presents a +structural design choice between ratings (e.g., score Response A on a scale of +1-7) and rankings (e.g., is Response A better than Response B?). In this work, +we analyze the effect of this design choice for the alignment and evaluation of +LLMs. We uncover an inconsistency problem wherein the preferences inferred from +ratings and rankings significantly disagree 60% of the time for both human and AI +annotators. Our subsequent analysis identifies various facets of annotator +biases that explain this phenomenon; for example, human annotators rate denser +responses higher while preferring accuracy during pairwise judgments. To our +surprise, we also observe that the choice of feedback protocol has a +significant effect on the evaluation of aligned LLMs. In particular, we find +that LLMs that leverage rankings data for alignment (say model X) are preferred +over those that leverage ratings data (say model Y), with a rank-based +evaluation protocol (is X/Y's response better than the reference response?) but not +with a rating-based evaluation protocol (score X/Y's response on a scale +of 1-7). Our findings thus shed light on critical gaps in methods for +evaluating the real-world utility of language models and their strong +dependence on the feedback protocol used for alignment. Our code and data are +available at https://github.com/Hritikbansal/sparse_feedback. +
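+ A minimal sketch of measuring the ratings-vs-rankings disagreement described above (the data format is a hypothetical simplification):
+ def inconsistency_rate(ratings, rankings):
+     # ratings: list of (score_a, score_b) pairs; rankings: list of "A"/"B" choices
+     clashes = total = 0
+     for (sa, sb), choice in zip(ratings, rankings):
+         if sa == sb:
+             continue  # tied ratings imply no preference; skip
+         total += 1
+         implied = "A" if sa > sb else "B"
+         clashes += implied != choice
+     return clashes / max(total, 1)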
+
+ comment: 31 pages, Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Assessing the nature of large language models: A caution against + anthropocentrism + + +
+ Generative AI models garnered a large amount of public attention and +speculation with the release of OpenAI's chatbot, ChatGPT. At least two opinion +camps exist: one excited about the possibilities these models offer for fundamental +changes to human tasks, and another highly concerned about the power these models +seem to have. To address these concerns, we assessed several LLMs, primarily +GPT-3.5, using standard, normed, and validated cognitive and personality +measures. For this seedling project, we developed a battery of tests that +allowed us to estimate the boundaries of some of these models' capabilities, how +stable those capabilities are over a short period of time, and how they compare +to humans. Our results indicate that LLMs are unlikely to have developed +sentience, although their ability to respond to personality inventories is +interesting. GPT-3.5 did display large variability in both cognitive and +personality measures over repeated observations, which would not be expected if it +had a human-like personality. Variability notwithstanding, LLMs display what in +a human would be considered poor mental health, including low self-esteem, +marked dissociation from reality, and in some cases narcissism and psychopathy, +despite upbeat and helpful responses. +
+
+ comment: 31 pages, 6 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 130 + +
+
+
+ + ☆ Test-Time Adaptation for Depth Completion + + +
+ It is common to observe performance degradation when transferring models +trained on some (source) datasets to target testing data due to a domain gap +between them. Existing methods for bridging this gap, such as domain adaptation +(DA), may require the source data on which the model was trained (often not +available), while others, i.e., source-free DA, require many passes through the +testing data. We propose an online test-time adaptation method for depth +completion, the task of inferring a dense depth map from a single image and +associated sparse depth map, that closes the performance gap in a single pass. +We first present a study on how the domain shift in each data modality affects +model performance. Based on our observations that the sparse depth modality +exhibits a much smaller covariate shift than the image, we design an embedding +module trained in the source domain that preserves a mapping from features +encoding only sparse depth to those encoding image and sparse depth. During +test time, sparse depth features are projected using this map as a proxy for +source domain features and are used as guidance to train a set of auxiliary +parameters (i.e., adaptation layer) to align image and sparse depth features +from the target test domain to that of the source domain. We evaluate our +method on indoor and outdoor scenarios and show that it improves over baselines +by an average of 21.1%. + +
+
+
+
+
+ + ☆ HASSOD: Hierarchical Adaptive Self-Supervised Object Detection NeurIPS 2023 + + +
+ The human visual perception system demonstrates exceptional capabilities in +learning without explicit supervision and understanding the part-to-whole +composition of objects. Drawing inspiration from these two abilities, we +propose Hierarchical Adaptive Self-Supervised Object Detection (HASSOD), a +novel approach that learns to detect objects and understand their compositions +without human supervision. HASSOD employs a hierarchical adaptive clustering +strategy to group regions into object masks based on self-supervised visual +representations, adaptively determining the number of objects per image. +Furthermore, HASSOD identifies the hierarchical levels of objects in terms of +composition, by analyzing coverage relations between masks and constructing +tree structures. This additional self-supervised learning task leads to +improved detection performance and enhanced interpretability. Lastly, we +abandon the inefficient multi-round self-training process utilized in prior +methods and instead adapt the Mean Teacher framework from semi-supervised +learning, which leads to a smoother and more efficient training process. +Through extensive experiments on prevalent image datasets, we demonstrate the +superiority of HASSOD over existing methods, thereby advancing the state of the +art in self-supervised object detection. Notably, we improve Mask AR from 20.2 +to 22.5 on LVIS, and from 17.0 to 26.0 on SA-1B. Project page: +https://HASSOD-NeurIPS23.github.io. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ V-IRL: Grounding Virtual Intelligence in Real Life + + +
+ There is a sensory gulf between the Earth that humans inhabit and the digital +realms in which modern AI agents are created. To develop AI agents that can +sense, think, and act as flexibly as humans in real-world settings, it is +imperative to bridge the realism gap between the digital and physical worlds. +How can we embody agents in an environment as rich and diverse as the one we +inhabit, without the constraints imposed by real hardware and control? Towards +this end, we introduce V-IRL: a platform that enables agents to scalably +interact with the real world in a virtual yet realistic environment. Our +platform serves as a playground for developing agents that can accomplish +various practical tasks and as a vast testbed for measuring progress in +capabilities spanning perception, decision-making, and interaction with +real-world data across the entire globe. + +
+
+ comment: Project page: https://virl-platform.github.io +
+
+
+
+
+ + ☆ AONeuS: A Neural Rendering Framework for Acoustic-Optical Sensor Fusion + + +
+ Underwater perception and 3D surface reconstruction are challenging problems +with broad applications in construction, security, marine archaeology, and +environmental monitoring. Treacherous operating conditions, fragile +surroundings, and limited navigation control often dictate that submersibles +restrict their range of motion and, thus, the baseline over which they can +capture measurements. In the context of 3D scene reconstruction, it is +well-known that smaller baselines make reconstruction more challenging. Our +work develops a physics-based multimodal acoustic-optical neural surface +reconstruction framework (AONeuS) capable of effectively integrating +high-resolution RGB measurements with low-resolution depth-resolved imaging +sonar measurements. By fusing these complementary modalities, our framework can +reconstruct accurate high-resolution 3D surfaces from measurements captured +over heavily-restricted baselines. Through extensive simulations and in-lab +experiments, we demonstrate that AONeuS dramatically outperforms recent +RGB-only and sonar-only surface reconstruction methods based on inverse +differentiable rendering. A website visualizing the results of our paper is +located at this address: https://aoneus.github.io/ +
+
+ comment: First two authors contributed equally. Paper website: + https://aoneus.github.io/ +
+
+
+
+
+ + ☆ 4D Gaussian Splatting: Towards Efficient Novel View Synthesis for + Dynamic Scenes + + +
+ We consider the problem of novel view synthesis (NVS) for dynamic scenes. +Recent neural approaches have accomplished exceptional NVS results for static +3D scenes, but extensions to 4D time-varying scenes remain non-trivial. Prior +efforts often encode dynamics by learning a canonical space plus implicit or +explicit deformation fields, which struggle in challenging scenarios like +sudden movements or capturing high-fidelity renderings. In this paper, we +introduce 4D Gaussian Splatting (4DGS), a novel method that represents dynamic +scenes with anisotropic 4D XYZT Gaussians, inspired by the success of 3D +Gaussian Splatting in static scenes. We model dynamics at each timestamp by +temporally slicing the 4D Gaussians, which naturally compose dynamic 3D +Gaussians and can be seamlessly projected into images. As an explicit +spatial-temporal representation, 4DGS demonstrates powerful capabilities for +modeling complicated dynamics and fine details, especially for scenes with +abrupt motions. We further implement our temporal slicing and splatting +techniques in a highly optimized CUDA acceleration framework, achieving +real-time inference rendering speeds of up to 277 FPS on an RTX 3090 GPU and +583 FPS on an RTX 4090 GPU. Rigorous evaluations on scenes with diverse motions +showcase the superior efficiency and effectiveness of 4DGS, which consistently +outperforms existing methods both quantitatively and qualitatively. + +
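+ One natural way to formalize the temporal slicing described above is the standard conditional-Gaussian identity (a sketch; the paper's exact parameterization may differ). Write a 4D Gaussian over $(x,y,z,t)$ with mean $(\mu_s, \mu_t)$ and covariance blocks $\Sigma_{ss}, \Sigma_{st}, \Sigma_{tt}$; conditioning on a timestamp $t$ yields a 3D Gaussian with
+ $\mu_{s|t} = \mu_s + \Sigma_{st}\Sigma_{tt}^{-1}(t - \mu_t), \qquad \Sigma_{s|t} = \Sigma_{ss} - \Sigma_{st}\Sigma_{tt}^{-1}\Sigma_{ts},$
+ with opacity modulated by the temporal marginal $\exp\!\big(-\tfrac{1}{2}(t-\mu_t)^2/\Sigma_{tt}\big)$, so each timestamp produces ordinary 3D Gaussians that can be splatted as usual.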
+
+
+
+
+ + ☆ Do Diffusion Models Learn Semantically Meaningful and Efficient + Representations? + + +
+ Diffusion models are capable of impressive feats of image generation with +uncommon juxtapositions such as astronauts riding horses on the moon with +properly placed shadows. These outputs indicate the ability to perform +compositional generalization, but how do the models do so? We perform +controlled experiments on conditional DDPMs learning to generate 2D spherical +Gaussian bumps centered at specified $x$- and $y$-positions. Our results show +that the emergence of semantically meaningful latent representations is key to +achieving high performance. Over the course of learning, the model traverses three +distinct phases of latent representations: (phase A) no latent structure, (phase B) +a 2D manifold of disordered states, and (phase C) a 2D ordered manifold. +Corresponding to each of these phases, we identify qualitatively different +generation behaviors: 1) multiple bumps are generated, 2) one bump is generated +but at inaccurate $x$ and $y$ locations, 3) a bump is generated at the correct +$x$ and $y$ location. Furthermore, we show that even under imbalanced datasets +where features ($x$- versus $y$-positions) are represented with skewed +frequencies, the learning process for $x$ and $y$ is coupled rather than +factorized, demonstrating that simple vanilla-flavored diffusion models cannot +learn efficient representations in which localization in $x$ and $y$ is +factorized into separate 1D tasks. These findings suggest the need for future +work to find inductive biases that will push generative models to discover and +exploit factorizable independent structures in their inputs, which will be +required to vault these models into more data-efficient regimes. +
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Swin-UMamba: Mamba-based UNet with ImageNet-based pretraining + + +
+ Accurate medical image segmentation demands the integration of multi-scale +information, spanning from local features to global dependencies. However, it +is challenging for existing methods to model long-range global information, +where convolutional neural networks (CNNs) are constrained by their local +receptive fields, and vision transformers (ViTs) suffer from the high quadratic +complexity of their attention mechanism. Recently, Mamba-based models have +gained great attention for their impressive ability in long sequence modeling. +Several studies have demonstrated that these models can outperform popular +vision models in various tasks, offering higher accuracy, lower memory +consumption, and less computational burden. However, existing Mamba-based +models are mostly trained from scratch and do not explore the power of +pretraining, which has been proven to be quite effective for data-efficient +medical image analysis. This paper introduces a novel Mamba-based model, +Swin-UMamba, designed specifically for medical image segmentation tasks, +leveraging the advantages of ImageNet-based pretraining. Our experimental +results reveal the vital role of ImageNet-based pretraining in enhancing the +performance of Mamba-based models. Swin-UMamba demonstrates superior +performance by a large margin compared to CNNs, ViTs, and the latest Mamba-based +models. Notably, on the AbdomenMRI, Endoscopy, and Microscopy datasets, Swin-UMamba +outperforms its closest counterpart U-Mamba by an average score of 3.58%. The +code and models of Swin-UMamba are publicly available at: +https://github.com/JiarunLiu/Swin-UMamba +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ GUARD: Role-playing to Generate Natural-language Jailbreakings to Test + Guideline Adherence of Large Language Models + + +
+ The discovery of "jailbreaks" that bypass the safety filters of Large Language +Models (LLMs) and elicit harmful responses has encouraged the community to implement +safety measures. One major safety measure is to proactively test LLMs with +jailbreaks prior to release. Therefore, such testing requires a method +that can generate jailbreaks at scale and efficiently. In this paper, we +follow a novel yet intuitive strategy to generate jailbreaks in the style of +human-written ones. We propose a role-playing system that assigns four +different roles to user LLMs that collaborate on new jailbreaks. Furthermore, +we collect existing jailbreaks and split them into different independent +characteristics by clustering frequency and semantic patterns sentence by +sentence. We organize these characteristics into a knowledge graph, making them +more accessible and easier to retrieve. The different roles in our system +leverage this knowledge graph to generate new jailbreaks, which have proved +effective in inducing LLMs to generate unethical or guideline-violating +responses. In addition, we also pioneer a setting in our system that +automatically follows government-issued guidelines to generate jailbreaks that +test whether LLMs follow the guidelines accordingly. We refer to our system as +GUARD (Guideline Upholding through Adaptive Role-play Diagnostics). We have +empirically validated the effectiveness of GUARD on three cutting-edge +open-sourced LLMs (Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a +widely-utilized commercial LLM (ChatGPT). Moreover, our work extends to the +realm of vision language models (MiniGPT-v2 and Gemini Vision Pro), showcasing +GUARD's versatility and contributing valuable insights for the development of +safer, more reliable LLM-based applications across diverse modalities. +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ Zero-shot Object-Level OOD Detection with Context-Aware Inpainting + + +
+ Machine learning algorithms are increasingly provided as black-box cloud +services or pre-trained models, without access to their training data. This +motivates the problem of zero-shot out-of-distribution (OOD) detection. +Concretely, we aim to detect OOD objects that do not belong to the classifier's +label set but are erroneously classified as in-distribution (ID) objects. Our +approach, RONIN, uses an off-the-shelf diffusion model to replace detected +objects via inpainting. RONIN conditions the inpainting process on the +predicted ID label, drawing the input object closer to the in-distribution +domain. As a result, the reconstructed object is very close to the original in +the ID cases and far in the OOD cases, allowing RONIN to effectively +distinguish ID and OOD samples. Through extensive experiments, we +demonstrate that RONIN achieves competitive results compared to previous +approaches across several datasets, both in zero-shot and non-zero-shot +settings. +
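+ A minimal sketch of the reconstruction-based scoring idea above (inpaint_fn and embed_fn are hypothetical callables standing in for the diffusion inpainter and a feature extractor; the similarity measure is an assumption):
+ import numpy as np
+
+ def ronin_score(image, box, predicted_label, inpaint_fn, embed_fn):
+     # box = (y0, y1, x0, x1) around the detected object
+     y0, y1, x0, x1 = box
+     recon = inpaint_fn(image, box, prompt=predicted_label)  # ID-label-conditioned inpainting
+     a, b = embed_fn(image[y0:y1, x0:x1]), embed_fn(recon[y0:y1, x0:x1])
+     cos = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))
+     return 1.0 - cos  # large distance: reconstruction diverged, likely OOD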
+
+
+
+
+ + ☆ InstanceDiffusion: Instance-level Control for Image Generation + + +
+ Text-to-image diffusion models produce high quality images but do not offer +control over individual instances in the image. We introduce InstanceDiffusion +that adds precise instance-level control to text-to-image diffusion models. +InstanceDiffusion supports free-form language conditions per instance and +allows flexible ways to specify instance locations such as simple single +points, scribbles, bounding boxes or intricate instance segmentation masks, and +combinations thereof. We propose three major changes to text-to-image models +that enable precise instance-level control. Our UniFusion block enables +instance-level conditions for text-to-image models, the ScaleU block improves +image fidelity, and our Multi-instance Sampler improves generations for +multiple instances. InstanceDiffusion significantly surpasses specialized +state-of-the-art models for each location condition. Notably, on the COCO +dataset, we outperform previous state-of-the-art by 20.4% AP$_{50}^\text{box}$ +for box inputs, and 25.4% IoU for mask inputs. + +
+
+ comment: Preprint; Project page: + https://people.eecs.berkeley.edu/~xdwang/projects/InstDiff/ +
+
+
+
+
+ + ☆ Training-Free Consistent Text-to-Image Generation + + +
+ Text-to-image models offer a new level of creative flexibility by allowing +users to guide the image generation process through natural language. However, +using these models to consistently portray the same subject across diverse +prompts remains challenging. Existing approaches fine-tune the model to teach +it new words that describe specific user-provided subjects or add image +conditioning to the model. These methods require lengthy per-subject +optimization or large-scale pre-training. Moreover, they struggle to align +generated images with text prompts and face difficulties in portraying multiple +subjects. Here, we present ConsiStory, a training-free approach that enables +consistent subject generation by sharing the internal activations of the +pretrained model. We introduce a subject-driven shared attention block and +correspondence-based feature injection to promote subject consistency between +images. Additionally, we develop strategies to encourage layout diversity while +maintaining subject consistency. We compare ConsiStory to a range of baselines, +and demonstrate state-of-the-art performance on subject consistency and text +alignment, without requiring a single optimization step. Finally, ConsiStory +can naturally extend to multi-subject scenarios, and even enable training-free +personalization for common objects. + +
+
+ comment: Project page is in https://consistory-paper.github.io +
+
+
+
+
+ + ☆ Towards a Flexible Scale-out Framework for Efficient Visual Data Query + Processing + + +
+ There is growing interest in visual data management systems that support +queries with specialized operations ranging from resizing an image to running +complex machine learning models. With a plethora of such operations, the basic +need to receive query responses in minimal time takes a hit, especially when +the client desires to run multiple such operations in a single query. Existing +systems provide an ad-hoc approach where different solutions are clubbed +together to provide an end-to-end visual data management system. Unlike such +solutions, the Visual Data Management System (VDMS) natively executes queries +with multiple operations, thus providing an end-to-end solution. However, a +fixed subset of native operations and a synchronous threading architecture +limit its generality and scalability. + In this paper, we develop VDMS-Async that adds the capability to run +user-defined operations with VDMS and execute operations within a query on a +remote server. VDMS-Async utilizes an event-driven architecture to create an +efficient pipeline for executing operations within a query. Our experiments +have shown that VDMS-Async reduces the query execution time by 2-3X compared to +existing state-of-the-art systems. Further, remote operations coupled with an +event-driven architecture enables VDMS-Async to scale query execution time +linearly with the addition of every new remote server. We demonstrate a 64X +reduction in query execution time when adding 64 remote servers. + +
+
+
+
+
+ + ☆ CLIP Can Understand Depth + + +
+ Recent studies on generalizing CLIP for monocular depth estimation reveal +that CLIP pre-trained on web-crawled data is inefficient for deriving proper +similarities between image patches and depth-related prompts. In this paper, we +adapt CLIP to dense monocular depth estimation of meaningful quality, without +fine-tuning its original vision-language alignment. By jointly training a +compact deconvolutional decoder with a tiny learnable embedding matrix named +mirror, as a static prompt for its text encoder, CLIP is enabled to understand +depth. With this approach, our model exhibits impressive performance matching +several previous state-of-the-art vision-only models on the NYU Depth v2 and +KITTI datasets, outperforming every CLIP-based depth estimation model by a +large margin. Experiments on temporal depth consistency and spatial continuity +demonstrate that the prior knowledge of CLIP can be effectively refined by our +proposed framework. Furthermore, an ablation study on mirror proves that the +resulting model estimates depth utilizing knowledge not only from the image +encoder but also from the text encoder, despite not being given any +human-written prompt. This research demonstrates that through minimal +adjustments, the prior knowledge of vision-language foundation models, such as +CLIP, can be generalized even to domains where learning during pretraining is +challenging. We facilitate future work on methods to adjust the suboptimal +prior knowledge of vision-language models using non-human language prompts, +achieving performance on par with task-specific state-of-the-art +methodologies. + +
+
+
+
+
+ + ☆ SGS-SLAM: Semantic Gaussian Splatting For Neural Dense SLAM + + +
+ Semantic understanding plays a crucial role in Dense Simultaneous +Localization and Mapping (SLAM), facilitating comprehensive scene +interpretation. Recent advancements that integrate Gaussian Splatting into SLAM +systems have demonstrated its effectiveness in generating high-quality +renderings through the use of explicit 3D Gaussian representations. Building on +this progress, we propose SGS-SLAM, the first semantic dense visual SLAM system +grounded in 3D Gaussians, which provides precise 3D semantic segmentation +alongside high-fidelity reconstructions. Specifically, we propose to employ +multi-channel optimization during the mapping process, integrating appearance, +geometric, and semantic constraints with key-frame optimization to enhance +reconstruction quality. Extensive experiments demonstrate that SGS-SLAM +delivers state-of-the-art performance in camera pose estimation, map +reconstruction, and semantic segmentation, outperforming existing methods +while preserving real-time rendering ability. + +
+
+
+
+
+ + ☆ FROSTER: Frozen CLIP Is A Strong Teacher for Open-Vocabulary Action + Recognition ICLR 2024 + + +
+ In this paper, we introduce FROSTER, an effective framework for +open-vocabulary action recognition. The CLIP model has achieved remarkable +success in a range of image-based tasks, benefiting from its strong +generalization capability stemming from pretraining on massive image-text +pairs. However, applying CLIP directly to the open-vocabulary action +recognition task is challenging due to the absence of temporal information in +CLIP's pretraining. Further, fine-tuning CLIP on action recognition datasets +may lead to overfitting and hinder its generalizability, resulting in +unsatisfactory results when dealing with unseen actions. + To address these issues, FROSTER employs a residual feature distillation +approach to ensure that CLIP retains its generalization capability while +effectively adapting to the action recognition task. Specifically, the residual +feature distillation treats the frozen CLIP model as a teacher to maintain the +generalizability exhibited by the original CLIP and supervises the feature +learning for the extraction of video-specific features to bridge the gap +between images and videos. Meanwhile, it uses a residual sub-network for +feature distillation to reach a balance between the two distinct objectives of +learning generalizable and video-specific features. + We extensively evaluate FROSTER on open-vocabulary action recognition +benchmarks under both base-to-novel and cross-dataset settings. FROSTER +consistently achieves state-of-the-art performance on all datasets across the +board. Project page: https://visual-ai.github.io/froster. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ ActiveAnno3D -- An Active Learning Framework for Multi-Modal 3D Object + Detection + + +
+ The curation of large-scale datasets is still costly and requires significant +time and resources. Data is often manually labeled, and the challenge of +creating high-quality datasets remains. In this work, we fill the research gap +using active learning for multi-modal 3D object detection. We propose +ActiveAnno3D, an active learning framework to select data samples for labeling +that are of maximum informativeness for training. We explore various continuous +training methods and integrate the most efficient method regarding +computational demand and detection performance. Furthermore, we perform +extensive experiments and ablation studies with BEVFusion and PV-RCNN on the +nuScenes and TUM Traffic Intersection datasets. We show that we can achieve +almost the same performance with PV-RCNN and the entropy-based query strategy +when using only half of the training data (77.25 mAP compared to 83.50 mAP) of +the TUM Traffic Intersection dataset. BEVFusion achieved an mAP of 64.31 when +using half of the training data and 75.0 mAP when using the complete nuScenes +dataset. We integrate our active learning framework into the proAnno labeling +tool to enable AI-assisted data selection and labeling and minimize the +labeling costs. Finally, we provide code, weights, and visualization results on +our website: https://active3d-framework.github.io/active3d-framework. + +
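+ The entropy-based query strategy can be sketched in a few lines; aggregating +per-object detector confidences into one (N, C) probability array per sample is +an assumption here, not necessarily the paper's exact scheme.
+
+ ```python
+ import numpy as np
+
+ def entropy_query(probs, k):
+     """Pick the k unlabeled samples with the highest predictive entropy.
+
+     probs: (N, C) array of class probabilities from the current detector;
+     returns indices of the k most informative samples to label next.
+     """
+     eps = 1e-12
+     entropy = -(probs * np.log(probs + eps)).sum(axis=1)
+     return np.argsort(entropy)[-k:][::-1]
+ ```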
+
+
+
+
+ + ☆ CT-based Anatomical Segmentation for Thoracic Surgical Planning: A + Benchmark Study for 3D U-shaped Deep Learning Models + + +
+ Recent rising interests in patient-specific thoracic surgical planning and +simulation require efficient and robust creation of digital anatomical models +from automatic medical image segmentation algorithms. Deep learning (DL) is now +state-of-the-art in various radiological tasks, and U-shaped DL models have +particularly excelled in medical image segmentation since the inception of the +2D UNet. To date, many variants of U-shaped models have been proposed by the +integration of different attention mechanisms and network configurations. +Leveraging the recent development of large multi-label databases, systematic +benchmark studies for these models can provide valuable insights for clinical +deployment and future model designs, but such studies are still rare. We +conduct the first benchmark study for variants of 3D U-shaped models (3DUNet, +STUNet, AttentionUNet, SwinUNETR, FocalSegNet, and a novel 3D SwinUnet with +four variants) with a focus on CT-based anatomical segmentation for thoracic +surgery. Our study systematically examines the impact of different attention +mechanisms, number of resolution stages, and network configurations on +segmentation accuracy and computational complexity. To allow cross-reference +with other recent benchmarking studies, we also included a performance +assessment of the BTCV abdominal structural segmentation. With the STUNet +ranking at the top, our study demonstrated the value of CNN-based U-shaped +models for the investigated tasks and the benefit of residual blocks in network +configuration designs to boost segmentation performance. + +
+
+
+
+
+ + ☆ IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of + brain MR images + + +
+ In MRI studies, the aggregation of imaging data from multiple acquisition +sites enhances sample size but may introduce site-related variabilities that +hinder consistency in subsequent analyses. Deep learning methods for image +translation have emerged as a solution for harmonizing MR images across sites. +In this study, we introduce IGUANe (Image Generation with Unified Adversarial +Networks), an original 3D model that leverages the strengths of domain +translation and straightforward application of style transfer methods for +multicenter brain MR image harmonization. IGUANe extends the CycleGAN +architecture by integrating an arbitrary number of domains for training through +a many-to-one strategy. During inference, the model can be applied to any +image, even from an unknown acquisition site, making it a universal generator +for harmonization. Trained on a dataset comprising T1-weighted images from 11 +different scanners, IGUANe was evaluated on data from unseen sites. The +assessments included the transformation of MR images with traveling subjects, +the preservation of pairwise distances between MR images within domains, the +evolution of volumetric patterns related to age and Alzheimer's disease (AD), +and the performance in age regression and patient classification tasks. +Comparisons with other harmonization and normalization methods suggest that +IGUANe better preserves individual information in MR images and is more +suitable for maintaining and reinforcing variabilities related to age and AD. +Future studies may further assess IGUANe in other multicenter contexts, either +using the same model or retraining it for applications to different image +modalities. + +
+
+ comment: 23 pages, 8 figures +
+
+
+
+
+ + ☆ Organic or Diffused: Can We Distinguish Human Art from AI-generated + Images? + + +
+ The advent of generative AI images has completely disrupted the art world. +Identifying AI-generated images from human art is a challenging problem whose +impact is growing over time. The failure to address this problem allows bad +actors to defraud individuals paying a premium for human art, and companies +whose stated policies forbid AI imagery. This is also critical for AI model +trainers, who need to filter training data to avoid potential model collapse. +There are several different approaches to distinguishing human art from AI +images, including classifiers trained by supervised learning, research tools +targeting diffusion models, and identification by professional artists using +their knowledge of artistic techniques. In this paper, we seek to understand +how well these approaches can perform against today's modern generative models +in both benign and adversarial settings. We curate real human art across 7 +styles, generate matching images from 5 generative models, and apply 8 +detectors (5 automated detectors and 3 different human groups including 180 +crowdworkers, 4000+ professional artists, and 13 expert artists experienced at +detecting AI). Both Hive and expert artists do very well, but make mistakes in +different ways (Hive is weaker against adversarial perturbations while expert +artists produce more false positives). We believe these weaknesses will +remain as models continue to evolve, and use our data to demonstrate why a +combined team of human and automated detectors provides the best combination of +accuracy and robustness. + +
+
+
+
+
+ + ☆ Towards mitigating uncann(eye)ness in face swaps via gaze-centric loss + terms + + +
+ Advances in face swapping have enabled the automatic generation of highly +realistic faces. Yet face swaps are perceived differently than when looking at +real faces, with key differences in viewer behavior surrounding the eyes. Face +swapping algorithms generally place no emphasis on the eyes, relying on pixel +or feature matching losses that consider the entire face to guide the training +process. We further investigate viewer perception of face swaps, focusing our +analysis on the presence of an uncanny valley effect. We additionally propose a +novel loss equation for the training of face swapping models, leveraging a +pretrained gaze estimation network to directly improve representation of the +eyes. We confirm that viewed face swaps do elicit uncanny responses from +viewers. Our proposed improvements significantly reduce viewing angle errors +between face swaps and their source material. Our method additionally reduces +the prevalence of the eyes as a deciding factor when viewers perform deepfake +detection tasks. Our findings have implications for face swapping for special +effects, as digital avatars, as privacy mechanisms, and more; negative +responses from users could limit effectiveness in said applications. Our gaze +improvements are a first step towards alleviating negative viewer perceptions +via a targeted approach. + +
+
+ comment: Accepted to Computers and Graphics Special Issue: Eye Gaze + Visualization, Interaction, Synthesis, and Analysis +
+
+
+
+
+ + ☆ Multi: Multimodal Understanding Leaderboard with Text and Images + + +
+ Rapid progress in multimodal large language models (MLLMs) highlights the +need to introduce challenging yet realistic benchmarks to the academic +community. Existing benchmarks primarily focus on simple natural image +understanding, but Multi emerges as a cutting-edge benchmark for MLLMs, +offering a comprehensive dataset for evaluating MLLMs on understanding complex +figures, tables, and scientific questions. This benchmark, reflecting current +realistic examination styles, provides multimodal inputs and requires responses +that are either precise or open-ended, similar to real-life school tests. It +challenges MLLMs with a variety of tasks, ranging from formula derivation to +image detail analysis and cross-modality reasoning. Multi includes over 18,000 +questions, with a focus on science-based QA in diverse formats. We also +introduce Multi-Elite, a 500-question subset for testing the extremities of +MLLMs, and Multi-Extend, which enhances In-Context Learning research with more +than 4,500 knowledge pieces. Our evaluation indicates significant potential for +MLLM advancement, with GPT-4V achieving a 63.7% accuracy rate on Multi, in +contrast to other MLLMs scoring between 31.3% and 53.7%. Multi serves not only +as a robust evaluation platform but also paves the way for the development of +expert-level AI. + +
+
+ comment: Details and access are available at: + https://OpenDFM.github.io/MULTI-Benchmark/ +
+
+
+
+
+ + ☆ RRWNet: Recursive Refinement Network for Effective Retinal Artery/Vein + Segmentation and Classification + + +
+ The caliber and configuration of retinal blood vessels serve as important +biomarkers for various diseases and medical conditions. A thorough analysis of +the retinal vasculature requires the segmentation of blood vessels and their +classification into arteries and veins, which is typically performed on color +fundus images obtained by retinography, a widely used imaging technique. +Nonetheless, manually performing these tasks is labor-intensive and prone to +human error. Various automated methods have been proposed to address this +problem. However, the current state of the art in artery/vein segmentation and +classification faces challenges due to manifest classification errors that +affect the topological consistency of segmentation maps. This study presents an +innovative end-to-end framework, RRWNet, designed to recursively refine +semantic segmentation maps and correct manifest classification errors. The +framework consists of a fully convolutional neural network with a Base +subnetwork that generates base segmentation maps from input images, and a +Recursive Refinement subnetwork that iteratively and recursively improves these +maps. Evaluation on public datasets demonstrates the state-of-the-art +performance of the proposed method, yielding more topologically consistent +segmentation maps with fewer manifest classification errors than existing +approaches. In addition, the Recursive Refinement module proves effective in +post-processing segmentation maps from other methods, automatically correcting +classification errors and improving topological consistency. The model code, +weights, and predictions are publicly available at +https://github.com/j-morano/rrwnet. + +
+
+
+
+
+ + ☆ Direct-a-Video: Customized Video Generation with User-Directed Camera + Movement and Object Motion + + +
+ Recent text-to-video diffusion models have achieved impressive progress. In +practice, users often desire the ability to control object motion and camera +movement independently for customized video creation. However, current methods +lack the focus on separately controlling object motion and camera movement in a +decoupled manner, which limits the controllability and flexibility of +text-to-video models. In this paper, we introduce Direct-a-Video, a system that +allows users to independently specify motions for one or multiple objects +and/or camera movements, as if directing a video. We propose a simple yet +effective strategy for the decoupled control of object motion and camera +movement. Object motion is controlled through spatial cross-attention +modulation using the model's inherent priors, requiring no additional +optimization. For camera movement, we introduce new temporal cross-attention +layers to interpret quantitative camera movement parameters. We further employ +an augmentation-based approach to train these layers in a self-supervised +manner on a small-scale dataset, eliminating the need for explicit motion +annotation. Both components operate independently, allowing individual or +combined control, and can generalize to open-domain scenarios. Extensive +experiments demonstrate the superiority and effectiveness of our method. +Project page: https://direct-a-video.github.io/. + +
+
+
+
+
+ + ☆ Video-LaVIT: Unified Video-Language Pre-training with Decoupled + Visual-Motional Tokenization + + +
+ In light of recent advances in multimodal Large Language Models (LLMs), there +is increasing attention to scaling them from image-text data to more +informative real-world videos. Compared to static images, video poses unique +challenges for effective large-scale pre-training due to the modeling of its +spatiotemporal dynamics. In this paper, we address such limitations in +video-language pre-training with an efficient video decomposition that +represents each video as keyframes and temporal motions. These are then adapted +to an LLM using well-designed tokenizers that discretize visual and temporal +information as a few tokens, thus enabling unified generative pre-training of +videos, images, and text. At inference, the generated tokens from the LLM are +carefully recovered to the original continuous pixel space to create various +video content. Our proposed framework is both capable of comprehending and +generating image and video content, as demonstrated by its competitive +performance across 13 multimodal benchmarks in image and video understanding +and generation. Our code and models will be available at +https://video-lavit.github.io. + +
+
+
+
+
+ + ☆ GPU-Accelerated 3D Polygon Visibility Volumes for Synergistic Perception + and Navigation + + +
+ UAV missions often require specific geometric constraints to be satisfied +between ground locations and the vehicle location. Such requirements are +typical for contexts where line-of-sight must be maintained between the vehicle +location and the ground control location and are also important in surveillance +applications where the UAV wishes to be able to sense, e.g., with a camera +sensor, a specific region within a complex geometric environment. This problem +is further complicated when the ground location is generalized to a convex 2D +polygonal region. This article describes the theory and implementation of a +system which can quickly calculate the 3D volume that encloses all 3D +coordinates from which a 2D convex planar region can be entirely viewed; +referred to as a visibility volume. The proposed approach computes visibility +volumes by combining GPU-accelerated depth map computation with geometric +boolean operations. Solutions to this problem require complex 3D geometric +analysis techniques that must execute using arbitrary precision arithmetic on a +collection of discontinuous and non-analytic surfaces. Post-processing steps +incorporate navigational constraints to further restrict the enclosed +coordinates to include both visibility and navigation constraints. Integration +of sensing visibility constraints with navigational constraints yields a range +of navigable space where a vehicle will satisfy both perceptual sensing and +navigational needs of the mission. This algorithm then provides a synergistic +perception and navigation sensitive solution yielding a volume of coordinates +in 3D that satisfy both the mission path and sensing needs. + +
+
+
+
+
+ + ☆ Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks ICLR2024 + + +
+ Gradient inversion attacks aim to reconstruct local training data from +intermediate gradients exposed in the federated learning framework. Despite +successful attacks, all previous methods, starting from reconstructing a single +data point and then relaxing the single-image limit to batch level, are only +tested under hard label constraints. Even for single-image reconstruction, we +still lack an analysis-based algorithm to recover augmented soft labels. In +this work, we change the focus from enlarging batch size to investigating the +hard label constraints, considering a more realistic circumstance where label +smoothing and mixup techniques are used in the training process. In particular, +we are the first to propose a novel algorithm that simultaneously recovers the +ground-truth augmented label and the input feature of the last fully-connected +layer from single-input gradients, and provide a necessary condition for any +analytical-based label recovery methods. Extensive experiments verify the +label recovery accuracy, as well as its benefits for the subsequent image +reconstruction. We believe soft labels in classification tasks are worth +further attention in gradient inversion attacks. + +
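+ A toy numpy illustration of why single-input last-layer gradients leak +augmented soft labels (this shows the analytic principle only, not the paper's +full algorithm): for softmax cross-entropy, dL/db = p - y and dL/dW = (p - y) +x^T, and in federated learning the attacker also holds the shared weights W +and b.
+
+ ```python
+ import numpy as np
+
+ def softmax(z):
+     e = np.exp(z - z.max())
+     return e / e.sum()
+
+ rng = np.random.default_rng(0)
+ C, D = 5, 8
+ W, b = rng.normal(size=(C, D)), rng.normal(size=C)
+ x = rng.normal(size=D)                 # input feature of the last FC layer
+ y = np.full(C, 0.02); y[3] = 0.92      # label-smoothed soft target
+
+ p = softmax(W @ x + b)
+ grad_W, grad_b = np.outer(p - y, x), p - y   # gradients seen by the server
+
+ i = np.argmax(np.abs(grad_b))
+ x_hat = grad_W[i] / grad_b[i]                # every row of grad_W is c_i * x
+ y_hat = softmax(W @ x_hat + b) - grad_b      # y = p - dL/db
+ print(np.allclose(x_hat, x), np.allclose(y_hat, y))   # True True
+ ```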
+
+ comment: ICLR2024 poster. The prior submission version had a bug in the image + reconstruction implementation, which has been corrected without harm to the + main conclusions. +
+
+
+
+
+ + ☆ Good Teachers Explain: Explanation-Enhanced Knowledge Distillation + + +
+ Knowledge Distillation (KD) has proven effective for compressing large +teacher models into smaller student models. While it is well known that student +models can achieve similar accuracies as the teachers, it has also been shown +that they nonetheless often do not learn the same function. It is, however, +often highly desirable that the student's and teacher's functions share similar +properties such as basing the prediction on the same input features, as this +ensures that students learn the 'right features' from the teachers. In this +work, we explore whether this can be achieved by not only optimizing the +classic KD loss but also the similarity of the explanations generated by the +teacher and the student. Despite the idea being simple and intuitive, we find +that our proposed 'explanation-enhanced' KD (e$^2$KD) (1) consistently provides +large gains in terms of accuracy and student-teacher agreement, (2) ensures +that the student learns from the teacher to be right for the right reasons and +to give similar explanations, and (3) is robust with respect to the model +architectures, the amount of training data, and even works with 'approximate', +pre-computed explanations. + +
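+ One plausible form of the combined objective, sketched in PyTorch: the +temperature-scaled KL term is the classic KD loss, and the explanation term +assumes attribution maps (e.g. Grad-CAM) of matching shape for teacher and +student; the paper's exact formulation may differ.
+
+ ```python
+ import torch.nn.functional as F
+
+ def e2kd_loss(s_logits, t_logits, s_expl, t_expl, T=4.0, lam=1.0):
+     """Explanation-enhanced knowledge distillation loss (sketch)."""
+     kd = F.kl_div(F.log_softmax(s_logits / T, dim=1),
+                   F.softmax(t_logits / T, dim=1),
+                   reduction="batchmean") * T * T
+     sim = F.cosine_similarity(s_expl.flatten(1), t_expl.flatten(1), dim=1)
+     return kd + lam * (1.0 - sim).mean()   # push explanations to agree
+ ```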
+
+ comment: 21 pages, 12 figures +
+
+
+
+
+ + ☆ Transcending Adversarial Perturbations: Manifold-Aided Adversarial + Examples with Legitimate Semantics + + +
+ Deep neural networks are significantly vulnerable to adversarial examples +manipulated by malicious tiny perturbations. Although most conventional +adversarial attacks ensure the visual imperceptibility between adversarial +examples and corresponding raw images by minimizing their geometric distance, +these constraints on geometric distance lead to limited attack transferability, +inferior visual quality, and human-imperceptible interpretability. In this +paper, we propose a supervised semantic-transformation generative model to +generate adversarial examples with real and legitimate semantics, wherein an +unrestricted adversarial manifold containing continuous semantic variations is +constructed for the first time to realize a legitimate transition from +non-adversarial examples to adversarial ones. Comprehensive experiments on +MNIST and industrial defect datasets show that our adversarial examples not +only exhibit better visual quality but also achieve superior attack +transferability and more effective explanations for model vulnerabilities, +indicating their great potential as generic adversarial examples. The code and +pre-trained models are available at https://github.com/shuaili1027/MAELS.git. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object + Detector + + +
+ This paper addresses the challenge of cross-domain few-shot object detection +(CD-FSOD), aiming to develop an accurate object detector for novel domains with +minimal labeled examples. While transformer-based open-set detectors, e.g., +DE-ViT [zhang2023detect], have excelled in both open-vocabulary object +detection and traditional few-shot object detection by detecting categories +beyond those seen during training, we naturally raise two key questions: +1) can such open-set detection methods easily generalize to CD-FSOD? 2) If not, +how can we enhance the results of open-set methods when faced with significant +domain gaps? To address the first question, we introduce several metrics to +quantify domain variances and establish a new CD-FSOD benchmark with diverse +domain metric values. Some State-Of-The-Art (SOTA) open-set object detection +methods are evaluated on this benchmark, with evident performance degradation +observed across out-of-domain datasets. This indicates the failure of adopting +open-set detectors directly for CD-FSOD. Subsequently, to overcome the +performance degradation issue and answer the second question, we endeavor to +enhance the vanilla DE-ViT. With several novel components including finetuning, +a learnable prototype module, and a lightweight attention module, we present an +improved Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO). Experiments +show that our CD-ViTO achieves impressive results on both out-of-domain and +in-domain target datasets, establishing new SOTAs for both CD-FSOD and FSOD. +All the datasets, codes, and models will be released to the community. + +
+
+
+
+
+ + ☆ AI-Enhanced Virtual Reality in Medicine: A Comprehensive Survey + + +
+ With the rapid advance of computer graphics and artificial intelligence +technologies, the ways we interact with the world have undergone a +transformative shift. Virtual Reality (VR) technology, aided by artificial +intelligence (AI), has emerged as a dominant interaction media in multiple +application areas, thanks to its advantage of providing users with immersive +experiences. Among those applications, medicine is considered one of the most +promising areas. In this paper, we present a comprehensive examination of the +burgeoning field of AI-enhanced VR applications in medical care and services. +By introducing a systematic taxonomy, we meticulously classify the pertinent +techniques and applications into three well-defined categories based on +different phases of medical diagnosis and treatment: Visualization Enhancement, +VR-related Medical Data Processing, and VR-assisted Intervention. This +categorization enables a structured exploration of the diverse roles that +AI-powered VR plays in the medical domain, providing a framework for a more +comprehensive understanding and evaluation of these technologies. To the best +of our knowledge, this is the first systematic survey of AI-powered VR systems +in medical settings, laying a foundation for future research in this +interdisciplinary domain. + +
+
+
+
+
+ + ☆ Visual Text Meets Low-level Vision: A Comprehensive Survey on Visual + Text Processing + + +
+ Visual text, a pivotal element in both document and scene images, speaks +volumes and attracts significant attention in the computer vision domain. +Beyond visual text detection and recognition, the field of visual text +processing has experienced a surge in research, driven by the advent of +fundamental generative models. However, challenges persist due to the unique +properties and features that distinguish text from general objects. Effectively +leveraging these unique textual characteristics is crucial in visual text +processing, as observed in our study. In this survey, we present a +comprehensive, multi-perspective analysis of recent advancements in this field. +Initially, we introduce a hierarchical taxonomy encompassing areas ranging from +text image enhancement and restoration to text image manipulation, followed by +different learning paradigms. Subsequently, we conduct an in-depth discussion +of how specific textual features such as structure, stroke, semantics, style, +and spatial context are seamlessly integrated into various tasks. Furthermore, +we explore available public datasets and benchmark the reviewed methods on +several widely-used datasets. Finally, we identify principal challenges and +potential avenues for future research. Our aim is to establish this survey as a +fundamental resource, fostering continued exploration and innovation in the +dynamic area of visual text processing. + +
+
+
+
+
+ + ☆ PFDM: Parser-Free Virtual Try-on via Diffusion Model ICASSP 2024 + + +
+ Virtual try-on can significantly improve the garment shopping experiences in +both online and in-store scenarios, attracting broad interest in computer +vision. However, to achieve high-fidelity try-on performance, most +state-of-the-art methods still rely on accurate segmentation masks, which are +often produced by near-perfect parsers or manual labeling. To overcome the +bottleneck, we propose a parser-free virtual try-on method based on the +diffusion model (PFDM). Given two images, PFDM can "wear" garments on the +target person seamlessly by implicitly warping without any other information. +To learn the model effectively, we synthesize many pseudo-images and construct +sample pairs by wearing various garments on persons. Supervised by the +large-scale expanded dataset, we fuse the person and garment features using a +proposed Garment Fusion Attention (GFA) mechanism. Experiments demonstrate that +our proposed PFDM can successfully handle complex cases, synthesize +high-fidelity images, and outperform both state-of-the-art parser-free and +parser-based models. + +
+
+ comment: Accepted by IEEE ICASSP 2024 +
+
+
+
+
+ + ☆ InteractiveVideo: User-Centric Controllable Video Generation with + Synergistic Multimodal Instructions + + +
+ We introduce $\textit{InteractiveVideo}$, a user-centric framework for video +generation. Different from traditional generative approaches that operate based +on user-provided images or text, our framework is designed for dynamic +interaction, allowing users to instruct the generative model through various +intuitive mechanisms during the whole generation process, e.g. text and image +prompts, painting, drag-and-drop, etc. We propose a Synergistic Multimodal +Instruction mechanism, designed to seamlessly integrate users' multimodal +instructions into generative models, thus facilitating a cooperative and +responsive interaction between user inputs and the generative process. This +approach enables iterative and fine-grained refinement of the generation result +through precise and effective user instructions. With +$\textit{InteractiveVideo}$, users are given the flexibility to meticulously +tailor key aspects of a video. They can paint the reference image, edit +semantics, and adjust video motions until their requirements are fully met. +Code, models, and demo are available at +https://github.com/invictus717/InteractiveVideo + +
+
+ comment: Code, models, and demo are available at + https://github.com/invictus717/InteractiveVideo +
+
+
+
+
+ + ☆ Taylor Videos for Action Recognition + + +
+ Effectively extracting motions from video is a critical and long-standing +problem for action recognition. This problem is very challenging because +motions (i) do not have an explicit form, (ii) have various concepts such as +displacement, velocity, and acceleration, and (iii) often contain noise caused +by unstable pixels. Addressing these challenges, we propose the Taylor video, a +new video format that highlights the dominant motions (e.g., a waving hand) in +each of its frames, named Taylor frames. Taylor video is named after the Taylor +series, which approximates a function at a given point using important terms. +In the scenario of videos, we define an implicit motion-extraction function +which aims to extract motions from a temporal block of video. In this block, +using the frames, the difference frames, and higher-order difference frames, we +perform Taylor expansion to approximate this function at the starting frame. We +show that the summation of the higher-order terms in the Taylor series gives us +dominant motion patterns, from which static objects and small, unstable motions +are removed. Experimentally we show that Taylor videos are effective inputs to +popular architectures including 2D CNNs, 3D CNNs, and transformers. When used +individually, Taylor videos yield competitive action recognition accuracy +compared to RGB videos and optical flow. When fused with RGB or optical flow +videos, further accuracy improvement is achieved. + +
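+ The construction admits a compact sketch: k-th order difference frames +approximate temporal derivatives at the block's first frame, and summing the +factorial-weighted higher-order terms suppresses static content; the exact +truncation order and normalization are assumptions here.
+
+ ```python
+ import numpy as np
+ from math import factorial
+
+ def taylor_frame(block, K=3):
+     """Taylor frame from a temporal block of grayscale frames (T, H, W).
+
+     Requires T >= K + 1 so that K-th order differences exist.
+     """
+     diffs = block.astype(np.float64)
+     frame = np.zeros_like(diffs[0])
+     for k in range(1, K + 1):
+         diffs = np.diff(diffs, axis=0)    # k-th order difference frames
+         frame += diffs[0] / factorial(k)  # Taylor term at the first frame
+     return frame                          # dominant motion, statics removed
+ ```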
+
+ comment: Research report +
+
+
+
+
+ + ☆ [Citation needed] Data usage and citation practices in medical imaging + conferences + + +
+ Medical imaging papers often focus on methodology, but the quality of the +algorithms and the validity of the conclusions are highly dependent on the +datasets used. As creating datasets requires a lot of effort, researchers often +use publicly available datasets. There is, however, no adopted standard for +citing the datasets used in scientific papers, which makes tracking dataset +usage difficult. In this work, we present two open-source tools we created that +could help with the detection of dataset usage, a pipeline +\url{https://github.com/TheoSourget/Public_Medical_Datasets_References} using +OpenAlex and full-text analysis, and a PDF annotation software +\url{https://github.com/TheoSourget/pdf_annotator} used in our study to +manually label the presence of datasets. We applied both tools in a study of +the usage of 20 publicly available medical datasets in papers from MICCAI and +MIDL. We compute the proportion, and its evolution between 2013 and 2023, of +three types of presence in a paper: cited, mentioned in the full text, or both +cited and mentioned. Our findings demonstrate that usage is concentrated on a +limited set of datasets. We also highlight differing citation practices, which +make automated tracking difficult. + +
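+ A simplified stand-in for the detection logic (the released pipeline combines +OpenAlex metadata with full-text analysis; the matching rule below is purely +illustrative).
+
+ ```python
+ import re
+
+ def presence_type(full_text, reference_titles, dataset_name):
+     """Classify a paper's relation to a dataset: cited, mentioned, or both."""
+     mentioned = re.search(rf"\b{re.escape(dataset_name)}\b",
+                           full_text, re.IGNORECASE) is not None
+     cited = any(dataset_name.lower() in ref.lower()
+                 for ref in reference_titles)
+     if cited and mentioned:
+         return "cited and mentioned"
+     return "cited" if cited else ("mentioned" if mentioned else "absent")
+ ```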
+
+ comment: Submitted to MIDL conference +
+
+
+
+
+ + ☆ Text-Guided Image Clustering + + +
+ Image clustering divides a collection of images into meaningful groups, +typically interpreted post-hoc via human-given annotations. Those are usually +in the form of text, raising the question of whether text can serve as an +abstraction for image clustering. Current image clustering methods, however, +neglect the use of generated textual descriptions. We, therefore, propose +Text-Guided Image Clustering, i.e., generating text using image captioning and +visual question-answering (VQA) models and subsequently clustering the +generated text. Further, we introduce a novel approach to inject task- or +domain knowledge for clustering by prompting VQA models. Across eight diverse +image clustering datasets, our results show that the obtained text +representations often outperform image features. Additionally, we propose a +counting-based cluster explainability method. Our evaluations show that the +derived keyword-based explanations describe clusters better than the respective +cluster accuracy suggests. Overall, this research challenges traditional +approaches and paves the way for a paradigm shift in image clustering, using +generated text. + +
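+ The core pipeline then reduces to clustering generated text. A minimal sketch +with scikit-learn, assuming caption generation (e.g. by a captioning or VQA +model) has happened upstream; TF-IDF plus k-means is one simple choice of text +representation and clustering, not necessarily the paper's.
+
+ ```python
+ from sklearn.cluster import KMeans
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ def cluster_by_captions(captions, n_clusters=8, seed=0):
+     """Assign each image a cluster id via its generated caption text."""
+     X = TfidfVectorizer(stop_words="english").fit_transform(captions)
+     km = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed)
+     return km.fit_predict(X)
+ ```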
+
+
+
+
+ + ☆ A Safety-Adapted Loss for Pedestrian Detection in Automated Driving + + +
+ In safety-critical domains like automated driving (AD), errors by the object +detector may endanger pedestrians and other vulnerable road users (VRU). As +common evaluation metrics are not an adequate safety indicator, recent works +employ approaches to identify safety-critical VRU and back-annotate the risk to +the object detector. However, those approaches do not consider the safety +factor in the deep neural network (DNN) training process. Thus, +state-of-the-art DNNs penalize all misdetections equally, irrespective of their +criticality. Subsequently, to mitigate the occurrence of critical failure +cases, i.e., false negatives, a safety-aware training strategy might be +required to enhance the detection performance for critical pedestrians. In this +paper, we propose a novel safety-aware loss variation that leverages the +estimated per-pedestrian criticality scores during training. We exploit the +reachability set-based time-to-collision (TTC-RSB) metric from the motion +domain along with distance information to account for the worst-case threat +quantifying the criticality. Our evaluation results using RetinaNet and FCOS on +the nuScenes dataset demonstrate that training the models with our safety-aware +loss function mitigates the misdetection of critical pedestrians without +sacrificing performance for the general case, i.e., pedestrians outside the +safety-critical zone. + +
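+ One plausible shape for such a loss, sketched in PyTorch: the per-pedestrian +criticality scores are assumed to be precomputed from the TTC-RSB metric and +distance, and the linear up-weighting is an illustrative choice rather than the +paper's exact formulation.
+
+ ```python
+ import torch.nn.functional as F
+
+ def safety_adapted_loss(logits, targets, criticality, alpha=2.0):
+     """Criticality-weighted classification loss for pedestrian detection.
+
+     criticality: per-target scores in [0, 1]; misdetecting pedestrians in
+     the safety-critical zone costs more than misdetecting distant ones.
+     """
+     ce = F.cross_entropy(logits, targets, reduction="none")
+     return ((1.0 + alpha * criticality) * ce).mean()
+ ```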
+
+
+
+
+ + ☆ Unsupervised semantic segmentation of high-resolution UAV imagery for + road scene parsing + + +
+ Two challenges are presented when parsing road scenes in UAV images. First, +the high resolution of UAV images makes processing difficult. Second, +supervised deep learning methods require a large amount of manual annotations +to train robust and accurate models. In this paper, an unsupervised road +parsing framework that leverages recent advances in vision language models and +foundational computer vision models is introduced. Initially, a vision language +model is employed to efficiently process ultra-large resolution UAV images to +quickly detect road regions of interest in the images. Subsequently, the vision +foundation model SAM is utilized to generate masks for the road regions without +category information. Following that, a self-supervised representation learning +network extracts feature representations from all masked regions. Finally, an +unsupervised clustering algorithm is applied to cluster these feature +representations and assign IDs to each cluster. The masked regions are combined +with the corresponding IDs to generate initial pseudo-labels, which initiate an +iterative self-training process for regular semantic segmentation. The proposed +method achieves an impressive 89.96% mIoU on the development dataset without +relying on any manual annotation. Particularly noteworthy is the extraordinary +flexibility of the proposed method, which even goes beyond the limitations of +human-defined categories and is able to acquire knowledge of new categories +from the dataset itself. + +
+
+
+
+
+ + ☆ Retrieval-Augmented Score Distillation for Text-to-3D Generation + + +
+ Text-to-3D generation has achieved significant success by incorporating +powerful 2D diffusion models, but insufficient 3D prior knowledge also leads to +the inconsistency of 3D geometry. Recently, since large-scale multi-view +datasets have been released, fine-tuning the diffusion model on these datasets +has become the mainstream approach to solving the 3D inconsistency problem. +However, this approach is confronted with fundamental difficulties regarding +the limited quality and diversity of 3D data compared with 2D data. To sidestep +these trade-offs, we explore a retrieval-augmented approach tailored for score +distillation, dubbed RetDream. We postulate that both the expressiveness of 2D +diffusion models and the geometric consistency of 3D assets can be fully +leveraged by employing the semantically relevant assets directly within the +optimization process. To this end, we introduce a novel framework for +retrieval-based quality enhancement in text-to-3D generation. We leverage the +retrieved asset to incorporate its geometric prior in the variational objective +and adapt the diffusion model's 2D prior toward view consistency, achieving +drastic improvements in both geometry and fidelity of generated scenes. We +conduct extensive experiments to demonstrate that RetDream exhibits superior +quality with increased geometric consistency. Project page is available at +https://ku-cvlab.github.io/RetDream/. + +
+
+ comment: Project Page: https://ku-cvlab.github.io/RetDream/ +
+
+
+
+
+ + ☆ Delving into Multi-modal Multi-task Foundation Models for Road Scene + Understanding: From Learning Paradigm Perspectives + + +
+ Foundation models have indeed made a profound impact on various fields, +emerging as pivotal components that significantly shape the capabilities of +intelligent systems. In the context of intelligent vehicles, leveraging the +power of foundation models has proven to be transformative, offering notable +advancements in visual understanding. Equipped with multi-modal and multi-task +learning capabilities, multi-modal multi-task visual understanding foundation +models (MM-VUFMs) effectively process and fuse data from diverse modalities and +simultaneously handle various driving-related tasks with powerful adaptability, +contributing to a more holistic understanding of the surrounding scene. In this +survey, we present a systematic analysis of MM-VUFMs specifically designed for +road scenes. Our objective is not only to provide a comprehensive overview of +common practices, referring to task-specific models, unified multi-modal +models, unified multi-task models, and foundation model prompting techniques, +but also to highlight their advanced capabilities in diverse learning +paradigms. These paradigms include open-world understanding, efficient transfer +for road scenes, continual learning, interactive and generative capability. +Moreover, we provide insights into key challenges and future trends, such as +closed-loop driving systems, interpretability, embodied driving agents, and +world models. To facilitate researchers in staying abreast of the latest +developments in MM-VUFMs for road scenes, we have established a continuously +updated repository at https://github.com/rolsheng/MM-VUFM4DS + +
+
+ comment: 24 pages, 9 figures, 1 table +
+
+
+
+
+ + ☆ One-class anomaly detection through color-to-thermal AI for building + envelope inspection + + +
+ We present a label-free method for detecting anomalies during thermographic +inspection of building envelopes. It is based on the AI-driven prediction of +thermal distributions from color images. Effectively the method performs as a +one-class classifier of the thermal image regions with high mismatch between +the predicted and actual thermal distributions. The algorithm can learn to +identify certain features as normal or anomalous by selecting the target sample +used for training. We demonstrate this principle by training the algorithm +with data collected at different outdoor temperatures, which leads to the +detection of thermal bridges. The method can be implemented to assist human +professionals during routine building inspections or combined with mobile +platforms for automating the examination of large areas. + +
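+ The one-class decision then reduces to thresholding the prediction mismatch. +A sketch, assuming a trained color-to-thermal network has already produced +pred_thermal; the window size and threshold are illustrative choices.
+
+ ```python
+ import numpy as np
+
+ def thermal_anomaly_mask(pred_thermal, real_thermal, win=16, k=2.0):
+     """Flag windows where predicted and measured temperatures disagree."""
+     err = np.abs(pred_thermal.astype(np.float64) - real_thermal)
+     score = np.zeros_like(err)
+     H, W = err.shape
+     for i in range(0, H - win + 1, win):
+         for j in range(0, W - win + 1, win):
+             score[i:i + win, j:j + win] = err[i:i + win, j:j + win].mean()
+     return score > score.mean() + k * score.std()   # anomalous regions
+ ```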
+
+
+
+
+ + ☆ AdaTreeFormer: Few Shot Domain Adaptation for Tree Counting from a + Single High-Resolution Image + + +
+ The process of estimating and counting tree density using only a single +aerial or satellite image is a difficult task in the fields of photogrammetry +and remote sensing. However, it plays a crucial role in the management of +forests. The huge variety of trees in varied topography severely hinders tree +counting models from performing well. The purpose of this paper is to propose a +framework that is learnt from the source domain with sufficient labeled trees +and is adapted to the target domain with only a limited number of labeled +trees. Our method, termed AdaTreeFormer, contains one shared encoder with a +hierarchical feature extraction scheme to extract robust features from the +source and target domains. It also consists of three subnets: two for +extracting self-domain attention maps from source and target domains +respectively and one for extracting cross-domain attention maps. For the +latter, an attention-to-adapt mechanism is introduced to distill relevant +information from different domains while generating tree density maps; a +hierarchical cross-domain feature alignment scheme is proposed that +progressively aligns the features from the source and target domains. We also +adopt adversarial learning into the framework to further reduce the gap between +source and target domains. Our AdaTreeFormer is evaluated on six designed +domain adaptation tasks using three tree counting datasets, i.e., Jiangsu, +Yosemite, and London, and outperforms state-of-the-art methods significantly. + +
+
+
+
+
+ + ☆ HoughToRadon Transform: New Neural Network Layer for Features + Improvement in Projection Space + + +
+ In this paper, we introduce the HoughToRadon Transform layer, a novel layer +designed to improve the speed of neural networks that incorporate the Hough +Transform to solve semantic image segmentation problems. By placing it after a +Hough Transform layer, "inner" convolutions receive modified feature maps with +new beneficial properties, such as a smaller area of processed images and +parameter space linearity by angle and shift. These properties are not present +in the Hough Transform alone. Furthermore, the HoughToRadon Transform layer +allows us to adjust the size of intermediate feature maps using two new +parameters, thus allowing us to balance the speed and quality of the resulting +neural network. Our experiments on the open MIDV-500 dataset show that this new +approach leads to time savings in document segmentation tasks and achieves +state-of-the-art 97.7% accuracy, outperforming the more computationally complex +HoughEncoder. + +
+
+
+
+
+ + ☆ Exploring the Synergies of Hybrid CNNs and ViTs Architectures for + Computer Vision: A survey + + +
+ The hybrid of Convolutional Neural Network (CNN) and Vision Transformers +(ViT) architectures has emerged as a groundbreaking approach, pushing the +boundaries of computer vision (CV). This comprehensive review provides a +thorough examination of the literature on state-of-the-art hybrid CNN-ViT +architectures, exploring the synergies between these two approaches. The main +content of this survey includes: (1) a background on the vanilla CNN and ViT, +(2) systematic review of various taxonomic hybrid designs to explore the +synergy achieved through merging CNNs and ViTs models, (3) comparative analysis +and application task-specific synergy between different hybrid architectures, +(4) challenges and future directions for hybrid models, (5) lastly, the survey +concludes with a summary of key findings and recommendations. Through this +exploration of hybrid CV architectures, the survey aims to serve as a guiding +resource, fostering a deeper understanding of the intricate dynamics between +CNNs and ViTs and their collective impact on shaping the future of CV +architectures. + +
+
+
+
+
+ + ☆ Panoramic Image Inpainting With Gated Convolution And Contextual + Reconstruction Loss ICASSP 2024 + + +
+ Deep learning-based methods have demonstrated encouraging results in tackling +the task of panoramic image inpainting. However, it is challenging for existing +methods to distinguish valid pixels from invalid pixels and find suitable +references for corrupted areas, thus leading to artifacts in the inpainted +results. In response to these challenges, we propose a panoramic image +inpainting framework that consists of a Face Generator, a Cube Generator, a +side branch, and two discriminators. We use the Cubemap Projection (CMP) format +as network input. The generator employs gated convolutions to distinguish valid +pixels from invalid ones, while a side branch is designed utilizing contextual +reconstruction (CR) loss to guide the generators to find the most suitable +reference patch for inpainting the missing region. The proposed method is +compared with state-of-the-art (SOTA) methods on SUN360 Street View dataset in +terms of PSNR and SSIM. Experimental results and ablation study demonstrate +that the proposed method outperforms SOTA both quantitatively and +qualitatively. + +
+
+ comment: Copyright 2024 IEEE - to appear in IEEE ICASSP 2024 +
+
+
+
+
+ + ☆ Instance Segmentation XXL-CT Challenge of a Historic Airplane + + +
+ Instance segmentation of compound objects in XXL-CT imagery poses a unique +challenge in non-destructive testing. This complexity arises from the lack of +known reference segmentation labels, limited applicable segmentation tools, as +well as partially degraded image quality. To assess recent advancements in the +field of machine learning-based image segmentation, the "Instance Segmentation +XXL-CT Challenge of a Historic Airplane" was conducted. The challenge aimed to +explore automatic or interactive instance segmentation methods for an efficient +delineation of the different aircraft components, such as screws, rivets, metal +sheets or pressure tubes. We report the organization and outcome of this +challenge and describe the capabilities and limitations of the submitted +segmentation methods. + +
+
+
+
+
+ + ☆ Pixel-Wise Color Constancy via Smoothness Techniques in Multi-Illuminant + Scenes ICIP 2024 + + +
+ Most scenes are illuminated by several light sources, where the traditional +assumption of uniform illumination is invalid. This issue is ignored in most +color constancy methods, primarily due to the complex spatial impact of +multiple light sources on the image. Moreover, most existing multi-illuminant +methods fail to preserve the smooth change of illumination, which stems from +spatial dependencies in natural images. Motivated by this, we propose a novel +multi-illuminant color constancy method, by learning pixel-wise illumination +maps caused by multiple light sources. The proposed method enforces smoothness +within neighboring pixels, by regularizing the training with the total +variation loss. Moreover, a bilateral filter is further employed to enhance +the natural appearance of the estimated images, while preserving the edges. +Additionally, we propose a label-smoothing technique that enables the model to +generalize well despite the uncertainties in ground truth. Quantitative and +qualitative experiments demonstrate that the proposed method outperforms the +state-of-the-art. + +
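+ The smoothness term at the heart of this approach is a standard total +variation penalty on the predicted pixel-wise illumination map; a PyTorch +sketch (its weighting against the main objective is left as an assumption).
+
+ ```python
+ import torch
+
+ def total_variation_loss(illum):
+     """TV penalty on a (B, 3, H, W) per-pixel illuminant map.
+
+     Penalizing differences between neighboring pixels enforces the smooth
+     spatial change of illumination across the scene.
+     """
+     dh = (illum[..., 1:, :] - illum[..., :-1, :]).abs().mean()
+     dw = (illum[..., :, 1:] - illum[..., :, :-1]).abs().mean()
+     return dh + dw
+ ```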
+
+ comment: Copyright 2024 IEEE - Submitted to IEEE ICIP 2024 +
+
+
+
+
+ + ☆ ViewFusion: Learning Composable Diffusion Models for Novel View + Synthesis + + +
+ Deep learning is providing a wealth of new approaches to the old problem of +novel view synthesis, from Neural Radiance Field (NeRF) based approaches to +end-to-end style architectures. Each approach offers specific strengths but +also comes with specific limitations in their applicability. This work +introduces ViewFusion, a state-of-the-art end-to-end generative approach to +novel view synthesis with unparalleled flexibility. ViewFusion works by +simultaneously applying a diffusion denoising step to any number of input views +of a scene, then combining the noise gradients obtained for each view with an +(inferred) pixel-weighting mask, ensuring that for each region of the target +scene only the most informative input views are taken into account. Our +approach resolves several limitations of previous approaches by (1) being +trainable and generalizing across multiple scenes and object classes, (2) +adaptively taking in a variable number of pose-free views at both train and +test time, (3) generating plausible views even in severely undetermined +conditions (thanks to its generative nature) -- all while generating views of +quality on par with or even better than state-of-the-art methods. Limitations +include the lack of a 3D scene embedding, which results in relatively slow +inference, and evaluation only on the relatively small NMR dataset. Code is +available. + +
+
+
+
+
+ + ☆ Motion-Aware Video Frame Interpolation + + +
+ Video frame interpolation methods aim to synthesize novel frames between +existing ones, with the intent of increasing the video's frame rate. However, +current methods are prone to image blurring and spurious artifacts in +challenging scenarios involving occlusions and discontinuous motion. Moreover, +they typically rely on optical flow estimation, which adds complexity to +modeling and computational costs. To address these issues, we introduce a +Motion-Aware Video Frame Interpolation (MA-VFI) network, which directly +estimates intermediate optical flow from consecutive frames by introducing a +novel hierarchical pyramid module. It not only extracts global semantic +relationships and spatial details from input frames with different receptive +fields, enabling the model to capture intricate motion patterns, but also +effectively reduces the required computational cost and complexity. +Subsequently, a cross-scale motion structure is presented to estimate and +refine intermediate flow maps from the extracted features. This approach +facilitates the interplay between input frame features and flow maps during the +frame interpolation process and markedly improves the precision of the +intermediate flows. Finally, a loss centered on the intermediate flow is +designed to guide its prediction, further refining the precision of the +intermediate flow maps. Experiments illustrate that MA-VFI surpasses several +representative VFI methods across various datasets, and can enhance efficiency +while maintaining commendable efficacy. + +
+
+
+
+
+ + ☆ Exploring Federated Self-Supervised Learning for General Purpose Audio + Understanding + + +
+ The integration of Federated Learning (FL) and Self-supervised Learning (SSL)
+offers a unique and synergistic combination for exploiting audio data for
+general-purpose audio understanding without compromising user data privacy.
+However, few efforts have been made to investigate SSL models in the FL regime
+for general-purpose audio understanding, especially when the training data is
+generated by large-scale heterogeneous audio sources. In this paper, we
+evaluate the performance of feature-matching and predictive audio-SSL
+techniques when integrated into large-scale FL settings simulated with
+non-independent and identically distributed (non-IID) data. We propose a novel
+Federated SSL (F-SSL) framework, dubbed FASSL, that enables learning
+intermediate feature representations from large-scale decentralized
+heterogeneous clients holding unlabelled audio data. We find that audio F-SSL
+approaches perform on par with centralized audio-SSL approaches on the
+audio-retrieval task. Extensive experiments demonstrate the effectiveness and
+significance of FASSL as it assists in obtaining the optimal global model for
+state-of-the-art FL aggregation methods.
+
+
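+ For reference, the standard FL aggregation step such frameworks build on
+(FedAvg-style weighted averaging of client weights) can be sketched as
+follows; this is the generic algorithm, not FASSL's specific aggregation
+scheme.
+
+```python
+import torch
+
+def fedavg(client_states: list, client_sizes: list) -> dict:
+    # client_states: list of model state_dicts from participating clients.
+    # client_sizes:  number of local samples per client (aggregation weights).
+    total = float(sum(client_sizes))
+    merged = {}
+    for key in client_states[0]:
+        merged[key] = sum(
+            state[key] * (n / total)
+            for state, n in zip(client_states, client_sizes)
+        )
+    return merged
+```
+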
+
+
+
+
+ + ☆ Time-, Memory- and Parameter-Efficient Visual Adaptation + + +
+ As foundation models become more popular, there is a growing need to
+efficiently finetune them for downstream tasks. Although numerous adaptation
+methods have been proposed, they are designed to be efficient only in terms of
+how many parameters are trained. They, however, typically still require
+backpropagating gradients throughout the model, meaning that their
+training-time and memory costs are not reduced as significantly. We propose an
+adaptation method which does not backpropagate gradients through the backbone.
+We achieve this by designing a lightweight network in parallel that operates
+on features from the frozen, pretrained backbone. As a result, our method is
+efficient not only in terms of parameters, but also in training time and
+memory usage. Our approach achieves state-of-the-art accuracy-parameter
+trade-offs on the popular VTAB benchmark, and we further show that we
+outperform prior works with respect to training time and memory usage as
+well. We further demonstrate the training efficiency and scalability of our
+method by adapting a vision transformer backbone of 4 billion parameters for
+the computationally demanding task of video classification, without any
+intricate model parallelism. Here, we outperform a prior adapter-based method
+which could only scale to a 1 billion parameter backbone, as well as
+fully-finetuning a smaller backbone, with the same GPU and less training time.
+
+
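+ The core idea, training a lightweight parallel network on frozen backbone
+features so that no gradients or activations are kept for the backbone, can be
+sketched as below. The dimensions and head architecture are assumptions for
+illustration, not the paper's design.
+
+```python
+import torch
+import torch.nn as nn
+
+class FrozenBackboneAdapter(nn.Module):
+    def __init__(self, backbone: nn.Module, feat_dim: int, n_classes: int):
+        super().__init__()
+        self.backbone = backbone.eval()
+        for p in self.backbone.parameters():
+            p.requires_grad = False        # backbone receives no gradients
+        self.head = nn.Sequential(
+            nn.Linear(feat_dim, 256), nn.ReLU(), nn.Linear(256, n_classes)
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        with torch.no_grad():              # no activations stored for backprop
+            feats = self.backbone(x)
+        return self.head(feats)            # only the head is trained
+```
+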
+
+
+
+
+ + ☆ Time-Distributed Backdoor Attacks on Federated Spiking Learning + + +
+ This paper investigates the vulnerability of spiking neural networks (SNNs)
+and federated learning (FL) to backdoor attacks using neuromorphic data.
+Despite the efficiency of SNNs and the privacy advantages of FL, particularly
+in low-powered devices, we demonstrate that these systems are susceptible to
+such attacks. We first assess the viability of using FL with SNNs on
+neuromorphic data, demonstrating its potential usage. Then, we evaluate the
+transferability of known FL attack methods to SNNs, finding that these lead to
+suboptimal attack performance. Therefore, we explore backdoor attacks
+involving single and multiple attackers to improve the attack performance. Our
+primary contribution is developing a novel attack strategy tailored to SNNs
+and FL, which distributes the backdoor trigger temporally and across malicious
+devices, enhancing the attack's effectiveness and stealthiness. In the best
+case, we achieve a 100% attack success rate, an MSE of 0.13, and an SSIM of
+98.9%. Moreover, we adapt and evaluate an existing defense against backdoor
+attacks, revealing its inadequacy in protecting SNNs. This study underscores
+the need for robust security measures in deploying SNNs and FL, particularly
+in the context of backdoor attacks.
+
+
+
+
+
+
+ + ☆ Enhancing Compositional Generalization via Compositional Feature + Alignment + + +
+ Real-world applications of machine learning models often confront data
+distribution shifts, wherein discrepancies exist between the training and test
+data distributions. In the common multi-domain multi-class setup, as the
+number of classes and domains scales up, it becomes infeasible to gather
+training data for every domain-class combination. This challenge naturally
+motivates the quest for models with Compositional Generalization (CG) ability,
+i.e., models that can generalize to unseen domain-class combinations. To delve
+into the CG challenge, we develop CG-Bench, a suite of CG benchmarks derived
+from existing real-world image datasets, and observe that the prevalent
+pretraining-finetuning paradigm on foundation models, such as CLIP and DINOv2,
+struggles with the challenge. To address it, we propose Compositional Feature
+Alignment (CFA), a simple two-stage finetuning technique that i) learns two
+orthogonal linear heads on a pretrained encoder with respect to class and
+domain labels, and ii) fine-tunes the encoder with the newly learned heads
+frozen. We theoretically and empirically justify that CFA encourages
+compositional feature learning of pretrained models. We further conduct
+extensive experiments on CG-Bench for CLIP and DINOv2, two powerful pretrained
+vision foundation models. Experimental results show that CFA outperforms
+common finetuning techniques in compositional generalization, corroborating
+CFA's efficacy in compositional feature learning.
+
+
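+ A minimal sketch of the two-stage recipe, with an explicit orthogonality
+penalty between the class and domain heads, is shown below. The dimensions,
+the penalty form, and the training details are assumptions; the paper's exact
+procedure may differ.
+
+```python
+import torch
+import torch.nn as nn
+
+encoder = nn.Linear(512, 512)                 # stand-in for a pretrained encoder
+class_head = nn.Linear(512, 100, bias=False)  # class logits
+domain_head = nn.Linear(512, 10, bias=False)  # domain logits
+
+def orthogonality_penalty() -> torch.Tensor:
+    # Encourage the two heads to span orthogonal subspaces: ||W_c W_d^T||_F^2.
+    return (class_head.weight @ domain_head.weight.T).pow(2).sum()
+
+# Stage 1: train both heads (encoder frozen), adding orthogonality_penalty()
+# to the class and domain classification losses.
+# Stage 2: freeze the heads and fine-tune only the encoder:
+for p in list(class_head.parameters()) + list(domain_head.parameters()):
+    p.requires_grad = False
+```
+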
+
+ comment: Code is released at + https://github.com/Haoxiang-Wang/Compositional-Feature-Alignment +
+
+
+
+
+ + ☆ Perceptual Learned Image Compression via End-to-End JND-Based + Optimization ICIP 2024 + + +
+ The emerging field of Learned image Compression (LC) achieves significant
+improvements in coding efficiency by end-to-end training of neural networks
+for compression. An important benefit of this approach over traditional codecs
+is that any optimization criterion can be directly applied to the
+encoder-decoder networks during training. Perceptual optimization of LC to
+comply with the Human Visual System (HVS) is among such criteria, and it has
+not been fully explored yet. This paper addresses this gap by proposing a
+novel framework to integrate Just Noticeable Distortion (JND) principles into
+LC. Leveraging existing JND datasets, three perceptual optimization methods
+are proposed to integrate JND into the LC training process: (1) a Pixel-Wise
+JND Loss (PWL) that prioritizes pixel-by-pixel fidelity in reproducing JND
+characteristics, (2) an Image-Wise JND Loss (IWL) that emphasizes overall
+imperceptible degradation levels, and (3) a Feature-Wise JND Loss (FWL) that
+aligns the reconstructed image features with perceptually significant
+features. Experimental evaluations demonstrate the effectiveness of JND
+integration, highlighting improvements in rate-distortion performance and
+visual quality compared to baseline methods. The proposed methods add no extra
+complexity after training.
+
+
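+ To illustrate the flavor of such objectives, the sketch below implements a
+JND-style pixel-wise loss that only penalizes reconstruction errors exceeding
+a per-pixel JND threshold. This is a hedged, generic sketch, not the paper's
+exact PWL definition.
+
+```python
+import torch
+
+def pixel_wise_jnd_loss(original: torch.Tensor,
+                        reconstructed: torch.Tensor,
+                        jnd_map: torch.Tensor) -> torch.Tensor:
+    # jnd_map: per-pixel visibility thresholds from a JND dataset or model.
+    # Errors below the threshold are treated as imperceptible and ignored.
+    visible_error = torch.relu((original - reconstructed).abs() - jnd_map)
+    return visible_error.mean()
+```
+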
+
+ comment: Copyright 2024 IEEE - Submitted to IEEE ICIP 2024 +
+
+
+
+
+ + ☆ SynthVision -- Harnessing Minimal Input for Maximal Output in Computer + Vision Models using Synthetic Image data + + +
+ Rapid development of disease detection computer vision models is vital in
+response to urgent medical crises like epidemics or events of bioterrorism.
+However, traditional data gathering methods are too slow for these scenarios,
+necessitating innovative approaches to quickly generate reliable models from
+minimal data. We demonstrate our new approach by building a comprehensive
+computer vision model for detecting Human Papillomavirus (HPV) genital warts
+using only synthetic data. Our study employed a two-phase experimental design
+using diffusion models. In the first phase, diffusion models were utilized to
+generate a large number of diverse synthetic images from 10 HPV guide images,
+explicitly focusing on accurately depicting genital warts. The second phase
+involved training and testing a vision model on this synthetic dataset. This
+method aimed to assess the effectiveness of diffusion models in rapidly
+generating high-quality training data and the subsequent impact on the vision
+model's performance in medical image recognition. The findings revealed
+significant insights into the performance of the vision model trained on
+synthetic images generated through diffusion models. The vision model showed
+exceptional performance in accurately identifying cases of genital warts,
+achieving an accuracy of 96% and underscoring its effectiveness in medical
+image classification. For HPV cases, the model demonstrated a high precision
+of 99% and a recall of 94%. For normal cases, the precision was 95% with an
+impressive recall of 99%. These metrics indicate the model's capability to
+correctly identify true positive cases and minimize false positives. The model
+achieved an F1 score of 96% for HPV cases and 97% for normal cases. The high
+F1 scores across both categories highlight the balanced nature of the model's
+precision and recall, ensuring reliability and robustness in its predictions.
+
+
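+ As a quick sanity check, the reported F1 scores follow directly from the
+stated precision and recall values (F1 is their harmonic mean):
+
+```python
+def f1(precision: float, recall: float) -> float:
+    return 2 * precision * recall / (precision + recall)
+
+print(round(f1(0.99, 0.94), 2))  # HPV cases:    0.96
+print(round(f1(0.95, 0.99), 2))  # normal cases: 0.97
+```
+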
+
+ comment: 12 pages 5 figures 1 table +
+
+
+
+
+ + ☆ Multi-scale fMRI time series analysis for understanding + neurodegeneration in MCI + + +
+ In this study, we present a technique that spans multi-scale views applied
+to resting-state fMRI volumes: a global scale, examining the brain network as
+a whole, and a local scale, examining each individual ROI that constitutes the
+network. Deep learning based classification is utilized for understanding
+neurodegeneration. The novelty of the proposed approach lies in utilizing two
+extreme scales of analysis. One branch considers the entire network within a
+graph-analysis framework. Concurrently, the second branch scrutinizes each ROI
+within a network independently, focusing on the evolution of dynamics. For
+each subject, the graph-based branch employs partial correlation to profile
+the subject as a single graph in which each ROI is a node, providing insights
+into differences in levels of participation. In contrast, the non-linear
+branch employs recurrence plots to profile a subject as a multichannel 2D
+image, revealing distinctions in the underlying dynamics. The proposed
+approach is employed for classification of a cohort of 50 healthy controls
+(HC) and 50 subjects with Mild Cognitive Impairment (MCI), sourced from the
+ADNI dataset. The results point to: (1) reduced activity in ROIs such as the
+PCC in MCI; (2) greater activity in the occipital region in MCI, which is not
+seen in HC; and (3) when analysed for dynamics, all ROIs in MCI show greater
+predictability in their time series.
+
+
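+ The recurrence-plot profiling used by the local-scale branch can be sketched
+in a few lines. This is the textbook construction for a single ROI time
+series; the threshold `eps` is an assumed hyperparameter, not the paper's
+setting.
+
+```python
+import numpy as np
+
+def recurrence_plot(x: np.ndarray, eps: float) -> np.ndarray:
+    # x: 1D ROI time series of length T. The recurrence matrix marks
+    # R[i, j] = 1 whenever the states at times i and j are within eps.
+    d = np.abs(x[:, None] - x[None, :])   # pairwise distances, shape (T, T)
+    return (d < eps).astype(np.uint8)
+```
+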
+
+ comment: 12 pages, 3 figures and 4 tables +
+
+
+
+
+ + ☆ Extreme Two-View Geometry From Object Poses with Diffusion Models + + +
+ Humans have an incredible ability to effortlessly perceive the viewpoint
+difference between two images containing the same object, even when the
+viewpoint change is astonishingly vast with no co-visible regions in the
+images. This remarkable skill, however, has proven to be a challenge for
+existing camera pose estimation methods, which often fail when faced with
+large viewpoint differences due to the lack of overlapping local features for
+matching. In this paper, we aim to effectively harness the power of object
+priors to accurately determine two-view geometry in the face of extreme
+viewpoint changes. In our method, we first mathematically transform the
+relative camera pose estimation problem into an object pose estimation
+problem. Then, to estimate the object pose, we utilize the object priors
+learned by the diffusion model Zero123 to synthesize novel-view images of the
+object. The novel-view images are matched to determine the object pose and
+thus the two-view camera pose. In experiments, our method demonstrates
+extraordinary robustness and resilience to large viewpoint changes,
+consistently estimating two-view poses with exceptional generalization ability
+across both synthetic and real-world datasets. Code will be available at
+https://github.com/scy639/Extreme-Two-View-Geometry-From-Object-Poses-with-Diffusion-Models.
+
+
+
+
+
+
+ + ☆ Joint Attention-Guided Feature Fusion Network for Saliency Detection of + Surface Defects + + +
+ Surface defect inspection plays an important role in industrial
+manufacturing and production. Although Convolutional Neural Network (CNN)
+based defect inspection methods have made huge leaps, they still face
+challenges such as defect scale variation, complex backgrounds, and low
+contrast. To address these issues, we propose a joint attention-guided feature
+fusion network (JAFFNet) for saliency detection of surface defects, based on
+an encoder-decoder network. JAFFNet mainly incorporates a joint
+attention-guided feature fusion (JAFF) module into the decoding stages to
+adaptively fuse low-level and high-level features. The JAFF module learns to
+emphasize defect features and suppress background noise during feature fusion,
+which is beneficial for detecting low-contrast defects. In addition, JAFFNet
+introduces a dense receptive field (DRF) module following the encoder to
+capture features with rich contextual information, which helps detect defects
+of different scales. The JAFF module mainly utilizes a learned joint
+channel-spatial attention map, provided by high-level semantic features, to
+guide feature fusion, making the model pay more attention to defect features.
+The DRF module utilizes a sequence of multi-receptive-field (MRF) units, each
+taking as input all preceding MRF feature maps together with the original
+input. The obtained DRF features capture rich contextual information over a
+large range of receptive fields. Extensive experiments conducted on
+SD-saliency-900, Magnetic tile, and DAGM 2007 indicate that our method
+achieves promising performance in comparison with other state-of-the-art
+methods, while reaching a real-time defect detection speed of 66 FPS.
+
+
+
+
+
+
+ + ☆ Transmission Line Detection Based on Improved Hough Transform + + +
+ To address the challenges of low detection accuracy and high false positive
+rates for transmission lines in UAV (Unmanned Aerial Vehicle) images, we
+exploit their linear features and spatial distribution, and introduce an
+enhanced randomized Hough transform technique tailored for detecting
+transmission lines against complex backgrounds. By employing the Hessian
+matrix for initial preprocessing of transmission lines, and utilizing boundary
+search and pixel-row segmentation, our approach distinguishes transmission
+line areas from the background. We significantly reduce both false positives
+and missed detections, thereby improving the accuracy of transmission line
+identification. Experiments demonstrate that our method not only processes
+images more rapidly, but also yields superior detection results compared to
+conventional and randomized Hough transform methods.
+
+
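+ For context, the standard probabilistic Hough baseline that such methods
+improve upon can be run in a few lines of OpenCV. The file name and parameter
+values below are illustrative assumptions, not the paper's tuned settings.
+
+```python
+import cv2
+import numpy as np
+
+img = cv2.imread("uav_frame.png", cv2.IMREAD_GRAYSCALE)  # hypothetical input
+edges = cv2.Canny(img, 50, 150)
+lines = cv2.HoughLinesP(edges, rho=1, theta=np.pi / 180, threshold=80,
+                        minLineLength=100, maxLineGap=10)
+if lines is not None:
+    for x1, y1, x2, y2 in lines[:, 0]:
+        cv2.line(img, (x1, y1), (x2, y2), 255, 2)  # overlay detected segments
+```
+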
+
+
+
+
+ + ☆ DisDet: Exploring Detectability of Backdoor Attack on Diffusion Models + + +
+ In the exciting generative AI era, the diffusion model has emerged as a very
+powerful and widely adopted content generation and editing tool for various
+data modalities, making the study of its potential security risks necessary
+and critical. Very recently, some pioneering works have shown the
+vulnerability of the diffusion model to backdoor attacks, calling for in-depth
+analysis and investigation of the security challenges of this popular and
+fundamental AI technique.
+ In this paper, for the first time, we systematically explore the
+detectability of the poisoned noise input for backdoored diffusion models, an
+important yet little-explored performance metric in existing works. Starting
+from the perspective of a defender, we first analyze the properties of the
+trigger pattern in existing diffusion backdoor attacks, discovering the
+important role of distribution discrepancy in Trojan detection. Based on this
+finding, we propose a low-cost trigger detection mechanism that can
+effectively identify the poisoned input noise. We then take a further step to
+study the same problem from the attack side, proposing a backdoor attack
+strategy that can learn an unnoticeable trigger to evade our proposed
+detection scheme.
+ Empirical evaluations across various diffusion models and datasets
+demonstrate the effectiveness of the proposed trigger detection and
+detection-evading attack strategy. For trigger detection, our distribution
+discrepancy-based solution achieves a 100\% detection rate for the Trojan
+triggers used in existing works. For evading trigger detection, our proposed
+stealthy trigger design performs end-to-end learning to make the distribution
+of poisoned noise input approach that of benign noise, enabling a nearly
+100\% detection pass rate with very high attack and benign performance for
+the backdoored diffusion models.
+
+
+
+
+
+
+ + ☆ Improving Robustness of LiDAR-Camera Fusion Model against Weather + Corruption from Fusion Strategy Perspective + + +
+ In recent years, LiDAR-camera fusion models have markedly advanced 3D object
+detection for autonomous driving. However, their robustness against common
+weather corruptions such as fog, rain, snow, and sunlight in the intricate
+physical world remains underexplored. In this paper, we evaluate the
+robustness of fusion models from the perspective of fusion strategies on a
+corrupted dataset. Based on this evaluation, we further propose a concise yet
+practical fusion strategy to enhance the robustness of fusion models, namely,
+flexibly weighting and fusing the features from the LiDAR and camera sources
+to adapt to varying weather scenarios. Experiments conducted on four types of
+fusion models, each with two distinct lightweight implementations, confirm the
+broad applicability and effectiveness of the approach.
+
+
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Using Motion Cues to Supervise Single-Frame Body Pose and Shape + Estimation in Low Data Regimes + + +
+ When enough annotated training data is available, supervised deep-learning
+algorithms excel at estimating human body pose and shape using a single
+camera. The effects of having too little such data can be mitigated by using
+other information sources, such as databases of body shapes, to learn priors.
+Unfortunately, such sources are not always available either. We show that, in
+such cases, easy-to-obtain unannotated videos can be used instead to provide
+the required supervisory signals. Given a model trained on too little
+annotated data, we compute poses in consecutive frames along with the optical
+flow between them. We then enforce consistency between the image optical flow
+and the flow that can be inferred from the change in pose from one frame to
+the next. This provides enough additional supervision to effectively refine
+the network weights and to perform on par with methods trained using far more
+annotated data.
+
+
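+ The consistency term described above can be sketched as a masked discrepancy
+between the pose-induced flow and the image-estimated flow. The snippet below
+is a hedged illustration under assumed shapes and masking; the paper's exact
+formulation may differ.
+
+```python
+import torch
+
+def pose_flow_consistency(pose_flow: torch.Tensor,
+                          image_flow: torch.Tensor,
+                          visibility: torch.Tensor) -> torch.Tensor:
+    # pose_flow:  (B, 2, H, W) flow induced by the predicted pose change.
+    # image_flow: (B, 2, H, W) optical flow estimated from the two frames.
+    # visibility: (B, H, W) soft mask of pixels where the flows should agree.
+    diff = (pose_flow - image_flow).norm(dim=1)             # per-pixel error
+    return (diff * visibility).sum() / visibility.sum().clamp(min=1.0)
+```
+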
+
+ comment: 21 pages; TMLR +
+
+
+
+
+ + ☆ InVA: Integrative Variational Autoencoder for Harmonization of + Multi-modal Neuroimaging Data + + +
+ There is significant interest in exploring non-linear associations among
+multiple images derived from diverse imaging modalities. While there is a
+growing literature on image-on-image regression to delineate predictive
+inference of an image based on multiple images, existing approaches have
+limitations in efficiently borrowing information between multiple imaging
+modalities when predicting an image. Building on the literature of
+Variational Autoencoders (VAEs), this article proposes a novel approach,
+referred to as the Integrative Variational Autoencoder (\texttt{InVA}), which
+borrows information from multiple images obtained from different sources to
+draw predictive inference of an image. The proposed approach captures complex
+non-linear associations between the outcome image and input images while
+allowing rapid computation. Numerical results demonstrate substantial
+advantages of \texttt{InVA} over VAEs, which typically do not allow borrowing
+information between input images. The proposed framework offers highly
+accurate predictive inference for costly positron emission tomography (PET)
+from multiple measures of cortical structure in human brain scans readily
+available from magnetic resonance imaging (MRI).
+
+
+
+
+
+
+ + ☆ ToonAging: Face Re-Aging upon Artistic Portrait Style Transfer + + +
+ Face re-aging is a prominent field in computer vision and graphics, with
+significant applications in photorealistic domains such as movies,
+advertising, and live streaming. Recently, the need to apply face re-aging to
+non-photorealistic rendering (NPR) images, like comics, illustrations, and
+animations, has emerged as an extension in various entertainment sectors.
+However, the absence of a network capable of seamlessly editing the apparent
+age in NPR images means that these tasks have been confined to a naive
+approach that applies each task sequentially, often resulting in unpleasant
+artifacts and a loss of facial attributes due to domain discrepancies. In this
+paper, we introduce a novel one-stage method for face re-aging combined with
+portrait style transfer, executed in a single generative step. We leverage
+existing face re-aging and style transfer networks, both trained within the
+same photorealistic (PR) domain. Our method uniquely fuses distinct latent
+vectors, each responsible for managing aging-related attributes and NPR
+appearance. Adopting an exemplar-based approach, our method offers greater
+flexibility than domain-level fine-tuning approaches, which typically require
+separate training or fine-tuning for each domain. This effectively addresses
+the limitation of requiring paired datasets for re-aging and domain-level,
+data-driven approaches for stylization. Our experiments show that our model
+can effortlessly generate re-aged images while simultaneously transferring
+the style of examples, maintaining both a natural appearance and
+controllability.
+
+
+
+ comment: 8 pages, 9 figures, 1 table +
+
+
+
+
+ + ☆ Fast and Accurate Cooperative Radio Map Estimation Enabled by GAN + + +
+ In the 6G era, real-time radio resource monitoring and management are
+urgently needed to support diverse wireless-empowered applications. This
+calls for fast and accurate estimation of the distribution of radio
+resources, usually represented by the spatial signal power strength over the
+geographical environment, known as a radio map. In this paper, we present a
+cooperative radio map estimation (CRME) approach enabled by the generative
+adversarial network (GAN), called GAN-CRME, which features fast and accurate
+radio map estimation without requiring information about the transmitters.
+The radio map is inferred by exploiting the interaction between distributed
+received signal strength (RSS) measurements at mobile users and the
+geographical map using a deep neural network estimator, resulting in low
+data-acquisition cost and computational complexity. Moreover, a GAN-based
+learning algorithm is proposed to boost the inference capability of the deep
+neural network estimator by harnessing the power of generative AI. Simulation
+results showcase that the proposed GAN-CRME is even capable of coarse error
+correction when the geographical map information is inaccurate.
+
+
+
+
+
+
+ + ☆ FDNet: Frequency Domain Denoising Network For Cell Segmentation in + Astrocytes Derived From Induced Pluripotent Stem Cells + + +
+ Induced pluripotent stem cells (iPSCs) artificially generated from somatic
+cells play an important role in disease modeling and drug screening for
+neurodegenerative diseases. Astrocytes differentiated from iPSCs are important
+targets for investigating neuronal metabolism. The astrocyte differentiation
+progress can be monitored through the variations of morphology observed in
+microscopy images at different differentiation stages, then determined by
+molecular biology techniques upon maturation. However, the astrocytes usually
+``perfectly'' blend into the background, and some of them are covered by
+interference information (i.e., dead cells, media sediments, and cell
+debris), which makes them difficult to observe. Due to the lack of annotated
+datasets, existing state-of-the-art deep learning approaches cannot be used
+to address this issue. In this paper, we introduce a new task named astrocyte
+segmentation with a novel dataset, called IAI704, which contains 704 images
+and their corresponding pixel-level annotation masks. Moreover, a novel
+frequency domain denoising network, named FDNet, is proposed for astrocyte
+segmentation. In detail, our FDNet consists of a contextual information
+fusion module (CIF), an attention block (AB), and a Fourier transform block
+(FTB). CIF and AB fuse multi-scale feature embeddings to localize the
+astrocytes. FTB transforms feature embeddings into the frequency domain and
+applies a high-pass filter to eliminate interference information.
+Experimental results demonstrate the superiority of our proposed FDNet over
+state-of-the-art substitutes in astrocyte segmentation, providing insights
+for iPSC differentiation progress prediction.
+
+
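+ The frequency-domain high-pass step can be sketched with a 2D FFT: shift the
+spectrum, zero out a low-frequency square around the center, and transform
+back. This is a generic sketch with an assumed cutoff parameterization, not
+FDNet's exact FTB.
+
+```python
+import torch
+
+def fourier_high_pass(feat: torch.Tensor, cutoff: int) -> torch.Tensor:
+    # feat: (B, C, H, W) feature map; cutoff: half-size of the suppressed
+    # low-frequency square (assumed parameterization).
+    B, C, H, W = feat.shape
+    spec = torch.fft.fftshift(torch.fft.fft2(feat), dim=(-2, -1))
+    cy, cx = H // 2, W // 2
+    spec[..., cy - cutoff:cy + cutoff, cx - cutoff:cx + cutoff] = 0
+    out = torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1)))
+    return out.real
+```
+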
+
+ comment: Accepted by The IEEE International Symposium on Biomedical Imaging + (ISBI) 2024 +
+
+
+
+
+ + ☆ Representation Surgery for Multi-Task Model Merging + + +
+ Multi-task learning (MTL) compresses the information from multiple tasks
+into a unified backbone to improve computational efficiency and
+generalization. Recent work directly merges multiple independently trained
+models to perform MTL instead of collecting their raw data for joint
+training, greatly expanding the application scenarios of MTL. However, by
+visualizing the representation distribution of existing model merging
+schemes, we find that the merged model often suffers from a dilemma of
+representation bias: there is a significant discrepancy between the
+representation distributions of the merged and individual models, resulting
+in poor performance of the merged MTL model. In this paper, we propose a
+representation surgery solution called "Surgery" to reduce representation
+bias in the merged model. Specifically, Surgery is a lightweight
+task-specific module that takes the representation of the merged model as
+input and attempts to output the biases contained in it. We then design an
+unsupervised optimization objective that updates the Surgery module by
+minimizing the distance between the merged model's representation and the
+individual models' representations. Extensive experiments demonstrate
+significant MTL performance improvements when our Surgery module is applied
+to state-of-the-art (SOTA) model merging schemes.
+
+
+
+
+
+
+ + ☆ Image-Caption Encoding for Improving Zero-Shot Generalization + + +
+ Recent advances in vision-language models have combined contrastive
+approaches with generative methods to achieve state-of-the-art (SOTA) results
+on downstream inference tasks like zero-shot image classification. However, a
+persistent issue of these models for image classification is their
+out-of-distribution (OOD) generalization capability. We first show that when
+an OOD data point is misclassified, the correct class can typically be found
+among the Top-K predicted classes. In order to steer the model prediction
+toward the correct class within the top predicted classes, we propose the
+Image-Caption Encoding (ICE) method, a straightforward approach that directly
+enforces consistency between the image-conditioned and caption-conditioned
+predictions at evaluation time only. Intuitively, we take advantage of unique
+properties of the generated captions to guide our local search for the
+correct class label within the Top-K predicted classes. We show that our
+method can be easily combined with other SOTA methods to enhance Top-1 OOD
+accuracies by 0.5% on average and up to 3% on challenging datasets. Our code:
+https://github.com/Chris210634/ice
+
+
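+ The test-time re-ranking idea can be sketched as follows: restrict attention
+to the Top-K image-conditioned classes, then re-score them with the
+caption-conditioned prediction. This is a hedged sketch of the general idea;
+`k`, `alpha`, and the combination rule are assumptions rather than the
+paper's exact procedure.
+
+```python
+import torch
+
+def ice_rerank(image_logits: torch.Tensor, caption_logits: torch.Tensor,
+               k: int = 5, alpha: float = 0.5) -> torch.Tensor:
+    # image_logits, caption_logits: (B, num_classes) class scores.
+    img_probs = image_logits.softmax(dim=-1)
+    cap_probs = caption_logits.softmax(dim=-1)
+    topk = img_probs.topk(k, dim=-1).indices                # (B, k) candidates
+    combined = img_probs.gather(-1, topk) + alpha * cap_probs.gather(-1, topk)
+    best = combined.argmax(dim=-1, keepdim=True)            # index within Top-K
+    return topk.gather(-1, best).squeeze(-1)                # (B,) class labels
+```
+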
+
+
+
+
+ + ☆ Learning with Mixture of Prototypes for Out-of-Distribution Detection ICLR 2024 + + +
+ Out-of-distribution (OOD) detection aims to detect testing samples far away
+from the in-distribution (ID) training data, which is crucial for the safe
+deployment of machine learning models in the real world. Distance-based OOD
+detection methods have emerged alongside enhanced deep representation
+learning. They identify unseen OOD samples by measuring their distances from
+ID class centroids or prototypes. However, existing approaches learn the
+representation under oversimplified data assumptions, e.g., modeling the ID
+data of each class with a single centroid class prototype, or using loss
+functions not designed for OOD detection, which overlook the natural
+diversities within the data. Naively enforcing data samples of each class to
+be compact around only one prototype leads to inadequate modeling of
+realistic data and limited performance. To tackle these issues, we propose
+PrototypicAl Learning with a Mixture of prototypes (PALM), which models each
+class with multiple prototypes to capture sample diversities, and learns more
+faithful and compact sample embeddings to enhance OOD detection. Our method
+automatically identifies and dynamically updates prototypes, assigning each
+sample to a subset of prototypes via reciprocal neighbor soft assignment
+weights. PALM optimizes a maximum likelihood estimation (MLE) loss to
+encourage the sample embeddings to be compact around their associated
+prototypes, as well as a contrastive loss on all prototypes to enhance
+intra-class compactness and inter-class discrimination at the prototype
+level. Moreover, the automatic estimation of prototypes enables our approach
+to be extended to the challenging OOD detection task with unlabelled ID data.
+Extensive experiments demonstrate the superiority of PALM, achieving a
+state-of-the-art average AUROC of 93.82% on the challenging CIFAR-100
+benchmark. Code is available at https://github.com/jeff024/PALM.
+
+
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ☆ Densely Decoded Networks with Adaptive Deep Supervision for Medical + Image Segmentation + + +
+ Medical image segmentation using deep neural networks has been highly
+successful. However, the effectiveness of these networks is often limited by
+inadequate dense prediction and an inability to extract robust features. To
+achieve refined dense prediction, we propose densely decoded networks (ddn)
+by selectively introducing 'crutch' network connections. Such 'crutch'
+connections in each upsampling stage of the network decoder (1) enhance
+target localization by incorporating high-resolution features from the
+encoder, and (2) improve segmentation by facilitating multi-stage contextual
+information flow. Furthermore, we present a training strategy based on
+adaptive deep supervision (ads), which exploits and adapts specific
+attributes of the input dataset for robust feature extraction. In particular,
+ads strategically locates and deploys auxiliary supervision by matching the
+average input object size with the layer-wise effective receptive fields
+(lerf) of a network, resulting in a class of ddns. Such inclusion of a
+'companion objective' from a specific hidden layer helps the model pay close
+attention to some distinct input-dependent features, which the network might
+otherwise 'ignore' during training. Our new networks and training strategy
+are validated on 4 diverse datasets of different modalities, demonstrating
+their effectiveness.
+
+
+
+
+
+
+ + ☆ Decoder-Only Image Registration + + +
+ In unsupervised medical image registration, the predominant approaches
+involve the utilization of an encoder-decoder network architecture, allowing
+for precise prediction of dense, full-resolution displacement fields from
+given paired images. Despite its widespread use in the literature, we
+question the necessity of making both the encoder and the decoder learnable
+in such an architecture. For this, we propose a novel network architecture,
+termed LessNet in this paper, which contains only a learnable decoder while
+entirely omitting a learnable encoder. LessNet substitutes the learnable
+encoder with simple, handcrafted features, eliminating the need to learn
+(optimize) network parameters in the encoder altogether. Consequently, this
+leads to a compact, efficient, and decoder-only architecture for 3D medical
+image registration. Evaluated on two publicly available brain MRI datasets,
+we demonstrate that our decoder-only LessNet can effectively and efficiently
+learn both dense displacement and diffeomorphic deformation fields in 3D.
+Furthermore, our decoder-only LessNet achieves registration performance
+comparable to state-of-the-art methods such as VoxelMorph and TransMorph,
+while requiring significantly fewer computational resources. Our code and
+pre-trained models are available at https://github.com/xi-jia/LessNet.
+
+
+
+
+
+
+ + ☆ VLN-Video: Utilizing Driving Videos for Outdoor Vision-and-Language + Navigation AAAI 2024 + + +
+ Outdoor Vision-and-Language Navigation (VLN) requires an agent to navigate
+through realistic 3D outdoor environments based on natural language
+instructions. The performance of existing VLN methods is limited by
+insufficient diversity in navigation environments and limited training data.
+To address these issues, we propose VLN-Video, which utilizes the diverse
+outdoor environments present in driving videos from multiple cities in the
+U.S., augmented with automatically generated navigation instructions and
+actions, to improve outdoor VLN performance. VLN-Video combines the best of
+intuitive classical approaches and modern deep learning techniques, using
+template infilling to generate grounded navigation instructions, combined
+with an image-rotation-similarity-based navigation action predictor, to
+obtain VLN-style data from driving videos for pretraining deep learning VLN
+models. We pre-train the model on the Touchdown dataset and our
+video-augmented dataset created from driving videos with three proxy tasks:
+Masked Language Modeling, Instruction and Trajectory Matching, and Next
+Action Prediction, so as to learn temporally-aware and visually-aligned
+instruction representations. The learned instruction representation is
+adapted to the state-of-the-art navigator when fine-tuning on the Touchdown
+dataset. Empirical results demonstrate that VLN-Video significantly
+outperforms previous state-of-the-art models by 2.1% in task completion rate,
+achieving a new state-of-the-art on the Touchdown dataset.
+
+
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Robust Analysis of Multi-Task Learning on a Complex Vision System + + +
+ Multi-task learning (MTL) has been widely studied in the past decade, and
+dozens of optimization algorithms have been proposed for different settings.
+While each of them claims improvement when applied to certain models on
+certain datasets, there is still a lack of deep understanding of their
+performance in complex real-world scenarios. We identify the gaps between
+research and application and make the following 4 contributions. (1) We
+comprehensively evaluate a large set of existing MTL optimization algorithms
+on the MetaGraspNet dataset designed for the robotic grasping task, which is
+complex and has high real-world application value, and identify the
+best-performing methods. (2) We empirically compare method performance when
+applied to feature-level gradients versus parameter-level gradients over a
+large set of MTL optimization algorithms, and conclude that the feature-level
+gradient surrogate is reasonable when method-specific theoretical guarantees
+exist, but is not generalizable to all methods. (3) We provide insights on
+the problem of task interference and show that the existing perspectives of
+gradient angles and relative gradient norms do not precisely reflect the
+challenges of MTL, as the rankings of the methods based on these two
+indicators do not align well with those based on test-set performance. (4) We
+provide a novel view of the task interference problem from the perspective of
+the latent space induced by the feature extractor, and provide training
+monitoring results based on feature disentanglement.
+
+
+
+
+
+
+ + ☆ One-shot Neural Face Reenactment via Finding Directions in GAN's Latent + Space + + +
+ In this paper, we present our framework for neural face/head reenactment +whose goal is to transfer the 3D head orientation and expression of a target +face to a source face. Previous methods focus on learning embedding networks +for identity and head pose/expression disentanglement which proves to be a +rather hard task, degrading the quality of the generated images. We take a +different approach, bypassing the training of such networks, by using +(fine-tuned) pre-trained GANs which have been shown capable of producing +high-quality facial images. Because GANs are characterized by weak +controllability, the core of our approach is a method to discover which +directions in latent GAN space are responsible for controlling head pose and +expression variations. We present a simple pipeline to learn such directions +with the aid of a 3D shape model which, by construction, inherently captures +disentangled directions for head pose, identity, and expression. Moreover, we +show that by embedding real images in the GAN latent space, our method can be +successfully used for the reenactment of real-world faces. Our method features +several favorable properties including using a single source image (one-shot) +and enabling cross-person reenactment. Extensive qualitative and quantitative +results show that our approach typically produces reenacted faces of notably +higher quality than those produced by state-of-the-art methods for the standard +benchmarks of VoxCeleb1 & 2. + +
+
+ comment: Preprint version, accepted for publication in International Journal + of Computer Vision (IJCV) +
+
+
+
+
+ + ☆ AnaMoDiff: 2D Analogical Motion Diffusion via Disentangled Denoising + + +
+ We present AnaMoDiff, a novel diffusion-based method for 2D motion analogies +that is applied to raw, unannotated videos of articulated characters. Our goal +is to accurately transfer motions from a 2D driving video onto a source +character, with its identity, in terms of appearance and natural movement, well +preserved, even when there may be significant discrepancies between the source +and driving characters in their part proportions and movement speed and styles. +Our diffusion model transfers the input motion via a latent optical flow (LOF) +network operating in a noised latent space, which is spatially aware, efficient +to process compared to the original RGB videos, and artifact-resistant through +the diffusion denoising process even amid dense movements. To accomplish both +motion analogy and identity preservation, we train our denoising model in a +feature-disentangled manner, operating at two noise levels. While +identity-revealing features of the source are learned via conventional noise +injection, motion features are learned from LOF-warped videos by only injecting +noise with large values, with the stipulation that motion properties involving +pose and limbs are encoded by higher-level features. Experiments demonstrate +that our method achieves the best trade-off between motion analogy and identity +preservation. + +
+
+
+
+
+ + ☆ Improving Pediatric Low-Grade Neuroepithelial Tumors Molecular Subtype + Identification Using a Novel AUROC Loss Function for Convolutional Neural + Networks + + +
+ Pediatric Low-Grade Neuroepithelial Tumors (PLGNT) are the most common
+pediatric cancer type, accounting for 40% of brain tumors in children, and
+identifying the PLGNT molecular subtype is crucial for treatment planning.
+However, the gold standard for determining the PLGNT subtype is biopsy, which
+can be impractical or dangerous for patients. This research improves the
+performance of Convolutional Neural Networks (CNNs) in classifying PLGNT
+subtypes from MRI scans by introducing a loss function that specifically
+improves the model's Area Under the Receiver Operating Characteristic (ROC)
+Curve (AUROC), offering a non-invasive diagnostic alternative. In this study,
+a retrospective dataset of 339 children with PLGNT (143 BRAF fusion, 71 BRAF
+V600E mutation, and 125 non-BRAF) was curated. We employed a CNN model with
+Monte Carlo random data splitting. The baseline model was trained using
+binary cross entropy (BCE) and achieved an AUROC of 86.11% for
+differentiating BRAF fusion and BRAF V600E mutations, which was improved to
+87.71% using our proposed AUROC loss function (p-value 0.045). With
+multiclass classification, the AUROC improved from 74.42% to 76.59% (p-value
+0.0016).
+
+
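+ A common way to optimize AUROC directly is a differentiable pairwise-ranking
+surrogate: penalize positive/negative score pairs that are ordered
+incorrectly. The sketch below shows this generic surrogate; it is an
+assumption-laden illustration, not the paper's exact loss.
+
+```python
+import torch
+
+def soft_auroc_loss(scores: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    # scores: (N,) predicted scores; labels: (N,) binary {0, 1} ground truth.
+    # 1 - AUROC equals the fraction of misordered positive/negative pairs;
+    # the sigmoid relaxes the non-differentiable 0/1 indicator.
+    pos = scores[labels == 1]
+    neg = scores[labels == 0]
+    diffs = pos.unsqueeze(1) - neg.unsqueeze(0)   # all (pos, neg) score gaps
+    return torch.sigmoid(-diffs).mean()
+```
+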
+
+
+
+
+ + ☆ nnMamba: 3D Biomedical Image Segmentation, Classification and Landmark + Detection with State Space Model + + +
+ In the field of biomedical image analysis, the quest for architectures
+capable of effectively capturing long-range dependencies is paramount,
+especially when dealing with 3D image segmentation, classification, and
+landmark detection. Traditional Convolutional Neural Networks (CNNs) struggle
+with limited receptive fields, and Transformers incur a heavy computational
+load when applied to high-dimensional medical images. In this paper, we
+introduce nnMamba, a novel architecture that integrates the strengths of CNNs
+with the advanced long-range modeling capabilities of State Space Sequence
+Models (SSMs). nnMamba adds SSMs to the convolutional residual block to
+extract local features and model complex dependencies. For different tasks,
+we build different blocks to learn the relevant features. Extensive
+experiments demonstrate nnMamba's superiority over state-of-the-art methods
+in a suite of challenging tasks, including 3D image segmentation,
+classification, and landmark detection. nnMamba emerges as a robust solution,
+offering both the local representation ability of CNNs and the efficient
+global context processing of SSMs, setting a new standard for long-range
+dependency modeling in medical image analysis. Code is available at
+https://github.com/lhaof/nnMamba
+
+
+
+ comment: 7 pages, Code is available at https://github.com/lhaof/nnMamba +
+
+
+
+
+ + ☆ An Inpainting-Infused Pipeline for Attire and Background Replacement + + +
+ In recent years, groundbreaking advancements in Generative Artificial
+Intelligence (GenAI) have triggered a transformative paradigm shift,
+significantly influencing various domains. In this work, we specifically
+explore an integrated approach leveraging advanced techniques in GenAI and
+computer vision, with an emphasis on image manipulation. The methodology
+unfolds through several stages: depth estimation, the creation of inpainting
+masks based on depth information, the generation and replacement of
+backgrounds utilizing Stable Diffusion in conjunction with Latent Consistency
+Models (LCMs), and the subsequent replacement of clothes and application of
+aesthetic changes through an inpainting pipeline. Experiments conducted in
+this study underscore the methodology's efficacy, highlighting its potential
+to produce visually captivating content. The convergence of these advanced
+techniques allows users to input photographs of individuals and manipulate
+them to modify clothing and background based on specific prompts, without
+manually provided inpainting masks, effectively placing the subjects within
+the vast landscape of creative imagination.
+
+
+
+
+
+
+ + ☆ Beyond Strong labels: Weakly-supervised Learning Based on Gaussian + Pseudo Labels for The Segmentation of Ellipse-like Vascular Structures in + Non-contrast CTs + + +
+ Deep-learning-based automated segmentation of vascular structures in
+preoperative CT scans contributes to computer-assisted diagnosis and
+intervention procedures for vascular diseases. While CT angiography (CTA) is
+the common standard, non-contrast CT imaging is significant as a
+contrast-risk-free alternative that avoids the complications associated with
+contrast agents. However, the challenges of labor-intensive labeling and high
+labeling variability, due to the ambiguity of vascular boundaries, hinder
+conventional strong-label-based, fully-supervised learning on non-contrast
+CTs. This paper introduces a weakly-supervised framework that exploits the
+elliptical topology of vascular cross-sections in 2D slices, comprising 1) an
+efficient annotation process based on predefined standards, 2) ellipse-fitting
+processing, 3) the generation of 2D Gaussian heatmaps serving as pseudo
+labels, and 4) a training process based on a combination of a voxel
+reconstruction loss and a distribution loss with the pseudo labels. We assess
+the effectiveness of the proposed method on one local and two public datasets
+comprising non-contrast CT scans, focusing particularly on the abdominal
+aorta. On the local dataset, our weakly-supervised learning approach based on
+pseudo labels outperforms strong-label-based fully-supervised learning (by
+1.54\% Dice score on average) while reducing labeling time by around 82.0\%.
+The efficiency of generating pseudo labels allows the inclusion of
+label-agnostic external data in the training set, leading to an additional
+performance improvement (2.74\% Dice score on average) with a 66.3\%
+reduction in labeling time, which remains considerably less than that of
+strong labels. On the public datasets, the pseudo labels achieve an overall
+improvement of 1.95\% in Dice score for 2D models, and a reduction of 11.65
+voxel spacings in Hausdorff distance for the 3D model.
+
+
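+ Step 3 above, rendering a 2D Gaussian heatmap pseudo label from a fitted
+ellipse, can be sketched as follows. The mapping from ellipse axes to Gaussian
+spreads is an assumption, and ellipse rotation is omitted for brevity.
+
+```python
+import numpy as np
+
+def gaussian_heatmap(h: int, w: int, cx: float, cy: float,
+                     sx: float, sy: float) -> np.ndarray:
+    # (cx, cy): ellipse center; (sx, sy): spreads derived from the fitted
+    # ellipse axes (assumed mapping). Returns an (h, w) pseudo-label map.
+    ys, xs = np.mgrid[0:h, 0:w]
+    return np.exp(-(((xs - cx) ** 2) / (2 * sx ** 2)
+                    + ((ys - cy) ** 2) / (2 * sy ** 2)))
+```
+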
+
+
+
+
+ + ☆ Hyper-Diffusion: Estimating Epistemic and Aleatoric Uncertainty with a + Single Model + + +
+ Estimating and disentangling epistemic uncertainty (uncertainty that can be +reduced with more training data) and aleatoric uncertainty (uncertainty that is +inherent to the task at hand) is critically important when applying machine +learning (ML) to high-stakes applications such as medical imaging and weather +forecasting. Conditional diffusion models' breakthrough ability to accurately +and efficiently sample from the posterior distribution of a dataset now makes +uncertainty estimation conceptually straightforward: One need only train and +sample from a large ensemble of diffusion models. Unfortunately, training such +an ensemble becomes computationally intractable as the complexity of the model +architecture grows. + In this work we introduce a new approach to ensembling, hyper-diffusion, +which allows one to accurately estimate epistemic and aleatoric uncertainty +with a single model. Unlike existing Monte Carlo dropout based single-model +ensembling methods, hyper-diffusion offers the same prediction accuracy as +multi-model ensembles. We validate our approach on two distinct tasks: x-ray +computed tomography (CT) reconstruction and weather temperature forecasting. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Assessing the Efficacy of Invisible Watermarks in AI-Generated Medical + Images + + +
+ AI-generated medical images are gaining growing popularity due to their
+potential to address the data scarcity challenge in the real world. However,
+the accurate identification of these synthetic images, particularly when they
+exhibit remarkable realism rivaling their real counterparts, remains a
+concern. To mitigate this challenge, image generators such as DALLE and
+Imagen have integrated digital watermarks aimed at facilitating the
+discernment of synthetic images' authenticity. These watermarks are embedded
+within the image pixels and are invisible to the human eye while remaining
+detectable. Nevertheless, a comprehensive investigation into the potential
+impact of these invisible watermarks on the utility of synthetic medical
+images has been lacking. In this study, we propose the incorporation of
+invisible watermarks into synthetic medical images and seek to evaluate their
+efficacy in the context of downstream classification tasks. Our goal is to
+pave the way for discussions on the viability of such watermarks in boosting
+the detectability of synthetic medical images, fortifying ethical standards,
+and safeguarding against data pollution and potential scams.
+
+
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Physics-Encoded Graph Neural Networks for Deformation Prediction under + Contact ICRA2024 + + +
+ In robotics, it is crucial to understand object deformation during tactile
+interactions. A precise understanding of deformation can elevate robotic
+simulations and has broad implications across different industries. We
+introduce a method for such predictions using Physics-Encoded Graph Neural
+Networks (GNNs). Similar to robotic grasping and manipulation scenarios, we
+focus on modeling the dynamics between a rigid mesh contacting a deformable
+mesh under external forces. Our approach represents both the soft body and
+the rigid body within graph structures, where nodes hold the physical states
+of the meshes. We also incorporate cross-attention mechanisms to capture the
+interplay between the objects. By jointly learning geometry and physics, our
+model reconstructs consistent and detailed deformations. We have made our
+code and dataset public to advance research in robotic simulation and
+grasping.
+
+
+
+ comment: Accepted at 2024 IEEE International Conference on Robotics and + Automation (ICRA2024) +
+
+
+
+
+ + ☆ Constrained Multiview Representation for Self-supervised Contrastive + Learning + + +
+ Representation learning constitutes a pivotal cornerstone in contemporary
+deep learning paradigms, offering a conduit to elucidate distinctive features
+within the latent space and interpret deep models. Nevertheless, the inherent
+complexity of anatomical patterns and the random nature of lesion
+distribution in medical image segmentation pose significant challenges to the
+disentanglement of representations and the understanding of salient features.
+Methods guided by the maximization of mutual information, particularly within
+the framework of contrastive learning, have demonstrated remarkable success
+and superiority in decoupling densely intertwined representations. However,
+the effectiveness of contrastive learning highly depends on the quality of
+the positive and negative sample pairs: naively averaging mutual information
+across unselected views can obstruct the learning strategy, so the selection
+of views is vital. In this work, we introduce a novel approach predicated on
+representation distance-based mutual information (MI) maximization for
+measuring the significance of different views, aiming at conducting more
+efficient contrastive learning and representation disentanglement.
+Additionally, we introduce an MI re-ranking strategy for representation
+selection, benefiting both continuous MI estimation and the measurement of
+representation significance distance. Specifically, we harness multi-view
+representations extracted from the frequency domain, re-evaluating their
+significance based on mutual information across varying frequencies, thereby
+facilitating a multifaceted contrastive learning approach to bolster semantic
+comprehension. Statistical results under five metrics demonstrate that our
+proposed framework proficiently constrains the MI maximization-driven
+representation selection and steers the multi-view contrastive learning
+process.
+
+
+
+ comment: 11 pages, 9 figures, 2 algorithms +
+
+
+
+
+ + ☆ Denoising Diffusion via Image-Based Rendering ICLR 2024 + + +
+ Generating 3D scenes is a challenging open problem, which requires +synthesizing plausible content that is fully consistent in 3D space. While +recent methods such as neural radiance fields excel at view synthesis and 3D +reconstruction, they cannot synthesize plausible details in unobserved regions +since they lack a generative capability. Conversely, existing generative +methods are typically not capable of reconstructing detailed, large-scale +scenes in the wild, as they use limited-capacity 3D scene representations, +require aligned camera poses, or rely on additional regularizers. In this work, +we introduce the first diffusion model able to perform fast, detailed +reconstruction and generation of real-world 3D scenes. To achieve this, we make +three contributions. First, we introduce a new neural scene representation, +IB-planes, that can efficiently and accurately represent large 3D scenes, +dynamically allocating more capacity as needed to capture details visible in +each image. Second, we propose a denoising-diffusion framework to learn a prior +over this novel 3D scene representation, using only 2D images without the need +for any additional supervision signal such as masks or depths. This supports 3D +reconstruction and generation in a unified architecture. Third, we develop a +principled approach to avoid trivial 3D solutions when integrating the +image-based rendering with the diffusion model, by dropping out representations +of some images. We evaluate the model on several challenging datasets of real +and synthetic images, and demonstrate superior results on generation, novel +view synthesis and 3D reconstruction. + +
+
+ comment: Accepted at ICLR 2024. Project page: + https://anciukevicius.github.io/generative-image-based-rendering +
+
+
+
+
+ + ☆ A Computer Vision Based Approach for Stalking Detection Using a + CNN-LSTM-MLP Hybrid Fusion Model + + +
+ Criminal and suspicious activity detection has become a popular research
+topic in recent years, and the rapid growth of computer vision technologies
+has had a crucial impact on solving this issue. However, physical stalking
+detection remains a little-explored area despite the evolution of modern
+technology. Nowadays, stalking in public places has become a common
+occurrence, with women being the most affected. Stalking is a visible action
+that usually precedes a crime: the stalker follows, loiters around, and
+stares at the victim before committing offences such as assault, kidnapping,
+or rape. Detecting stalking is therefore essential, as these crimes can be
+prevented at the outset. In this research, we propose a novel deep
+learning-based hybrid fusion model to detect potential stalkers from a single
+video with a minimal number of frames. We extract multiple relevant features,
+such as facial landmarks, head pose estimation, and relative distance, as
+numerical values from video frames. This data is fed into a multilayer
+perceptron (MLP) to classify between stalking and non-stalking scenarios.
+Simultaneously, the video frames are fed into a combination of convolutional
+and LSTM models to extract spatio-temporal features. We use a fusion of these
+numerical and spatio-temporal features to build a classifier that detects
+stalking incidents. Additionally, we introduce a dataset consisting of
+stalking and non-stalking videos gathered from various feature films and
+television series, which is also used to train the model. The experimental
+results show the efficiency and dynamism of our proposed stalker detection
+system, achieving 89.58% testing accuracy, a significant improvement over
+state-of-the-art approaches.
+
+
+
+ comment: Under review for publication in the PLOS ONE journal, 17 pages, 9 + figures +
+
+
+
+
+ + ♻ ☆ Denoising-Diffusion Alignment for Continuous Sign Language Recognition + + +
+ As a key to social good, continuous sign language recognition (CSLR) aims to
+promote active and accessible communication for the hearing impaired. Current
+CSLR research adopts a cross-modality alignment scheme to learn the mapping
+between video clips and textual glosses. However, this local alignment
+method, especially under weak data annotation, ignores the contextual
+information of the modalities and directly reduces the generalization of
+visual features. To this end, we propose a novel Denoising-Diffusion global
+Alignment scheme (DDA), which focuses on modeling the mapping between the
+entire video and the gloss sequence. DDA consists of a partial noising
+process strategy and a denoising-diffusion autoencoder. The former achieves
+efficient guidance of the text modality to the visual modality; the latter
+learns the global alignment information of the two modalities in a denoising
+manner. Our DDA confirms the feasibility of diffusion models for visual
+representation learning in CSLR. Experiments on three public benchmarks
+demonstrate that our method achieves state-of-the-art performance.
+Furthermore, the proposed method can serve as a plug-and-play optimization
+that generalizes to other CSLR methods.
+
+
+
+
+
+
+ + ♻ ☆ Lumiere: A Space-Time Diffusion Model for Video Generation + + +
+ We introduce Lumiere -- a text-to-video diffusion model designed for +synthesizing videos that portray realistic, diverse and coherent motion -- a +pivotal challenge in video synthesis. To this end, we introduce a Space-Time +U-Net architecture that generates the entire temporal duration of the video at +once, through a single pass in the model. This is in contrast to existing video +models which synthesize distant keyframes followed by temporal super-resolution +-- an approach that inherently makes global temporal consistency difficult to +achieve. By deploying both spatial and (importantly) temporal down- and +up-sampling and leveraging a pre-trained text-to-image diffusion model, our +model learns to directly generate a full-frame-rate, low-resolution video by +processing it in multiple space-time scales. We demonstrate state-of-the-art +text-to-video generation results, and show that our design easily facilitates a +wide range of content creation tasks and video editing applications, including +image-to-video, video inpainting, and stylized generation. + +
+
+ comment: Webpage: https://lumiere-video.github.io/ | Video: + https://www.youtube.com/watch?v=wxLr02Dz2Sc +
+
+
+
+
+ + ♻ ☆ DiffusionWorldViewer: Exposing and Broadening the Worldview Reflected by + Generative Text-to-Image Models + + +
+ Generative text-to-image (TTI) models produce high-quality images from short +textual descriptions and are widely used in academic and creative domains. Like +humans, TTI models have a worldview, a conception of the world learned from +their training data and task that influences the images they generate for a +given prompt. However, the worldviews of TTI models are often hidden from +users, making it challenging for users to build intuition about TTI outputs, +and they are often misaligned with users' worldviews, resulting in output +images that do not match user expectations. In response, we introduce +DiffusionWorldViewer, an interactive interface that exposes a TTI model's +worldview across output demographics and provides editing tools for aligning +output images with user perspectives. In a user study with 18 diverse TTI +users, we find that DiffusionWorldViewer helps users represent their varied +viewpoints in generated images and challenge the limited worldview reflected in +current TTI models. + +
+
+ comment: 20 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Memory-Assisted Sub-Prototype Mining for Universal Domain Adaptation + + +
+ Universal domain adaptation aims to align the classes and reduce the feature +gap between the same category of the source and target domains. The target +private category is set as the unknown class during the adaptation process, as +it is not included in the source domain. However, most existing methods +overlook the intra-class structure within a category, especially in cases where +there exists significant concept shift between the samples belonging to the +same category. When samples with large concept shift are forced to be pushed +together, it may negatively affect the adaptation performance. Moreover, from +the interpretability aspect, it is unreasonable to align visual features with +significant differences, such as fighter jets and civil aircraft, into the same +category. Unfortunately, due to such semantic ambiguity and annotation cost, +categories are not always classified in detail, making it difficult for the +model to perform precise adaptation. To address these issues, we propose a +novel Memory-Assisted Sub-Prototype Mining (MemSPM) method that can learn the +differences between samples belonging to the same category and mine sub-classes +when there exists significant concept shift between them. By doing so, our +model learns a more reasonable feature space that enhances the transferability +and reflects the inherent differences among samples annotated as the same +category. We evaluate the effectiveness of our MemSPM method over multiple +scenarios, including UniDA, OSDA, and PDA. Our method achieves state-of-the-art +performance on four benchmarks in most cases. + +
+
+
+
+
+ + ♻ ☆ Large Multilingual Models Pivot Zero-Shot Multimodal Learning across + Languages + + +
+ Recently, there has been a significant surge in multimodal learning in
+terms of both image-to-text and text-to-image generation. However, this
+success is typically limited to English, leaving other languages largely
+behind. Building a competitive counterpart in other languages is highly
+challenging due to the low-resource nature of non-English multimodal data
+(i.e., the lack of large-scale, high-quality image-text data). In this work,
+we propose MPM, an effective training paradigm for training large multimodal
+models in non-English languages. MPM demonstrates that Multilingual language
+models can Pivot zero-shot Multimodal learning across languages.
+Specifically, based on a strong multilingual large language model,
+multimodal models pretrained on English-only image-text data generalize well
+to other languages in a (quasi-)zero-shot manner, even surpassing models
+trained on image-text data in native languages. Taking Chinese as a case
+study of MPM, we build large multimodal models VisCPM for image-to-text and
+text-to-image generation, which achieve state-of-the-art (open-source)
+performance in Chinese. To facilitate future research, we open-source the
+code and model weights at https://github.com/OpenBMB/VisCPM.git.
+
+</p>
+
+ comment: https://github.com/OpenBMB/VisCPM.git +
+
+
+
+
+ + ♻ ☆ Applications of artificial intelligence in the analysis of + histopathology images of gliomas: a review + + +
+ In recent years, the diagnosis of gliomas has become increasingly complex. +Analysis of glioma histopathology images using artificial intelligence (AI) +offers new opportunities to support diagnosis and outcome prediction. To give +an overview of the current state of research, this review examines 70 publicly +available research studies that have proposed AI-based methods for whole-slide +histopathology images of human gliomas, covering the diagnostic tasks of +subtyping (16/70), grading (23/70), molecular marker prediction (13/70), and +survival prediction (27/70). All studies were reviewed with regard to +methodological aspects as well as clinical applicability. It was found that the +focus of current research is the assessment of hematoxylin and eosin-stained +tissue sections of adult-type diffuse gliomas. The majority of studies (49/70) +are based on the publicly available glioblastoma and low-grade glioma datasets +from The Cancer Genome Atlas (TCGA) and only a few studies employed other +datasets in isolation (10/70) or in addition to the TCGA datasets (11/70). +Current approaches mostly rely on convolutional neural networks (53/70) for +analyzing tissue at 20x magnification (30/70). A new field of research is the +integration of clinical data, omics data, or magnetic resonance imaging +(27/70). So far, AI-based methods have achieved promising results, but are not +yet used in real clinical settings. Future work should focus on the independent +validation of methods on larger, multi-site datasets with high-quality and +up-to-date clinical and molecular pathology annotations to demonstrate routine +applicability. + +
+
+
+
+
+ + ♻ ☆ LKCA: Large Kernel Convolutional Attention + + +
+ We revisit the relationship between attention mechanisms and large kernel
+ConvNets in visual transformers and propose a new spatial attention named
+Large Kernel Convolutional Attention (LKCA). It simplifies the attention
+operation by replacing it with a single large kernel convolution. LKCA
+combines the advantages of convolutional neural networks and visual
+transformers, possessing a large receptive field, locality, and parameter
+sharing. We explain the superiority of LKCA from both the convolution and
+attention perspectives, providing equivalent code implementations for each
+view. Experiments confirm that LKCA implemented from the convolutional and
+attention perspectives exhibits equivalent performance. We extensively
+experiment with the LKCA variant of ViT on both classification and
+segmentation tasks. The experiments demonstrate that LKCA exhibits
+competitive performance in visual tasks. Our code will be made publicly
+available at https://github.com/CatworldLee/LKCA.
+
+</p>
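Since the abstract states that the attention operation is replaced by a single large-kernel convolution, a minimal sketch of that substitution might look as follows. The depthwise layout, the 23x23 kernel, and the 1x1 projection are assumptions for illustration; the paper's released code is authoritative.

```python
import torch.nn as nn

class LargeKernelConvAttention(nn.Module):
    """Sketch: spatial attention replaced by one large-kernel convolution."""
    def __init__(self, dim, kernel_size=23):
        super().__init__()
        # Depthwise large-kernel conv: large receptive field, locality,
        # and parameter sharing, as the abstract highlights.
        self.dw = nn.Conv2d(dim, dim, kernel_size,
                            padding=kernel_size // 2, groups=dim)
        self.proj = nn.Conv2d(dim, dim, 1)   # 1x1 conv for channel mixing

    def forward(self, x):                     # x: (B, C, H, W) token grid
        return self.proj(self.dw(x))
```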
+
+
+
+
+ + ♻ ☆ EarthGPT: A Universal Multi-modal Large Language Model for Multi-sensor + Image Comprehension in Remote Sensing Domain + + +
+ Multi-modal large language models (MLLMs) have demonstrated remarkable
+success in vision and visual-language tasks within the natural image domain.
+Owing to the significant differences between natural and remote sensing (RS)
+images, the development of MLLMs in the RS domain is still in its infancy.
+To fill the gap, this paper proposes a pioneering MLLM named EarthGPT that
+uniformly integrates various multi-sensor RS interpretation tasks for
+universal RS image comprehension. EarthGPT features three key techniques: a
+visual-enhanced perception mechanism, a cross-modal mutual comprehension
+approach, and a unified instruction tuning method for multi-sensor
+multi-task learning in the RS domain. More importantly, we construct
+MMRS-1M, a large-scale multi-sensor multi-modal RS instruction-following
+dataset comprising over 1M image-text pairs based on 34 existing diverse RS
+datasets and including multi-sensor images such as optical, synthetic
+aperture radar (SAR), and infrared. The MMRS-1M dataset addresses the lack
+of RS expert knowledge in MLLMs and stimulates the development of MLLMs in
+the RS domain. Extensive experiments demonstrate EarthGPT's superior
+performance in various RS visual interpretation tasks compared with other
+specialist models and MLLMs, proving the effectiveness of the proposed
+EarthGPT and offering a versatile paradigm for open-set reasoning tasks.
+
+</p>
+
+
+
+
+ + ♻ ☆ In-Domain Self-Supervised Learning Improves Remote Sensing Image Scene + Classification + + +
+ We investigate the utility of in-domain self-supervised pre-training of +vision models in the analysis of remote sensing imagery. Self-supervised +learning (SSL) has emerged as a promising approach for remote sensing image +classification due to its ability to exploit large amounts of unlabeled data. +Unlike traditional supervised learning, SSL aims to learn representations of +data without the need for explicit labels. This is achieved by formulating +auxiliary tasks that can be used for pre-training models before fine-tuning +them on a given downstream task. A common approach in practice to SSL +pre-training is utilizing standard pre-training datasets, such as ImageNet. +While relevant, such a general approach can have a sub-optimal influence on the +downstream performance of models, especially on tasks from challenging domains +such as remote sensing. In this paper, we analyze the effectiveness of SSL +pre-training by employing the iBOT framework coupled with Vision transformers +trained on Million-AID, a large and unlabeled remote sensing dataset. We +present a comprehensive study of different self-supervised pre-training +strategies and evaluate their effect across 14 downstream datasets with diverse +properties. Our results demonstrate that leveraging large in-domain datasets +for self-supervised pre-training consistently leads to improved predictive +downstream performance, compared to the standard approaches found in practice. + +
+
+
+
+
+ + ♻ ☆ Cascaded Scaling Classifier: class incremental learning with probability + scaling + + +
+ Humans are capable of acquiring new knowledge and transferring learned
+knowledge into different domains while incurring little forgetting. The same
+ability, called Continual Learning, is challenging to achieve with neural
+networks, since learning new tasks causes forgetting of past learned ones.
+This forgetting can be mitigated by replaying stored samples from past
+tasks, but a large memory size may be needed for long sequences of tasks;
+moreover, replay can lead to overfitting on the saved samples. In this
+paper, we propose a novel regularisation approach and a novel incremental
+classifier called, respectively, Margin Dampening and Cascaded Scaling
+Classifier. The former combines a soft constraint and a knowledge
+distillation approach to preserve past learned knowledge while allowing the
+model to learn new patterns effectively. The latter is a gated incremental
+classifier that helps the model modify past predictions without directly
+interfering with them, achieved by modifying the output of the model with
+auxiliary scaling functions. We empirically show that our approach performs
+well on multiple benchmarks against well-established baselines, and we also
+study each component of our proposal and how their combinations affect the
+final results.
+
+</p>
+
+ comment: Paper under review. The official code is available + https://github.com/jaryP/Cascaded-Scaling-Classifier +
+
+
+
+
+ + ♻ ☆ Interactive Humanoid: Online Full-Body Motion Reaction Synthesis with + Social Affordance Canonicalization and Forecasting + + +
+ We focus on the human-humanoid interaction task, optionally with an object.
+We propose a new task named online full-body motion reaction synthesis,
+which generates humanoid reactions based on the human actor's motions.
+Previous work focuses only on human interaction without objects and
+generates body reactions without hands. Moreover, it does not consider the
+online setting, in which information beyond the current moment cannot be
+observed in practical situations. To support this task, we construct two
+datasets named HHI and CoChair and propose a unified method. Specifically,
+we propose to construct a social affordance representation. We first select
+a social affordance carrier and use SE(3)-Equivariant Neural Networks to
+learn the local frame for the carrier, then we canonicalize the social
+affordance. Besides, we propose a social affordance forecasting scheme that
+enables the reactor to predict based on the imagined future. Experiments
+demonstrate that our approach can effectively generate high-quality
+reactions on HHI and CoChair. Furthermore, we also validate our method on
+the existing human interaction datasets InterHuman and Chi3D.
+
+</p>
+
+
+
+
+ + ♻ ☆ Real-time High-Resolution Neural Network with Semantic Guidance for + Crack Segmentation + + +
+ Deep learning plays an important role in crack segmentation, but most works
+utilize off-the-shelf or improved models that have not been specifically
+developed for this task. High-resolution convolutional neural networks that
+are sensitive to an object's location and detail help improve the
+performance of crack segmentation, yet conflict with real-time detection.
+This paper describes HrSegNet, a high-resolution network with semantic
+guidance specifically designed for crack segmentation, which guarantees
+real-time inference speed while preserving crack details. After evaluation
+on the composite dataset CrackSeg9k and the scenario-specific datasets
+Asphalt3k and Concrete3k, HrSegNet obtains state-of-the-art segmentation
+performance and efficiency that far exceed those of the compared models.
+This approach demonstrates that the trade-off between high-resolution
+modeling and real-time detection can be managed, which fosters the use of
+edge devices to analyze cracks in real-world applications.
+
+</p>
+
+
+
+
+ + ♻ ☆ Navigating Neural Space: Revisiting Concept Activation Vectors to + Overcome Directional Divergence + + +
+ With a growing interest in understanding neural network prediction
+strategies, Concept Activation Vectors (CAVs) have emerged as a popular tool
+for modeling human-understandable concepts in the latent space. Commonly,
+CAVs are computed by leveraging linear classifiers that optimize the
+separability of latent representations of samples with and without a given
+concept. However, in this paper we show that such a separability-oriented
+computation leads to solutions that may diverge from the actual goal of
+precisely modeling the concept direction. This discrepancy can be attributed
+to the significant influence of distractor directions, i.e., signals
+unrelated to the concept, which are picked up by the filters (i.e., weights)
+of linear models to optimize class separability. To address this, we
+introduce pattern-based CAVs, which focus solely on concept signals, thereby
+providing more accurate concept directions. We evaluate various CAV methods
+in terms of their alignment with the true concept direction and their impact
+on CAV applications, including concept sensitivity testing and model
+correction for shortcut behavior caused by data artifacts. We demonstrate
+the benefits of pattern-based CAVs using the Pediatric Bone Age, ISIC2019,
+and FunnyBirds datasets with VGG, ResNet, and EfficientNet model
+architectures.
+
+</p>
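To make the filter-vs-pattern distinction concrete, here is a hedged sketch: the classic CAV is a linear classifier's weight vector (a filter), while a pattern-based direction can be estimated from the covariance between activations and the concept label. The covariance estimator below follows the general filter-vs-pattern argument; whether it matches the paper's exact recipe is an assumption.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def filter_cav(acts, labels):
    # Separability-based CAV: weights of a linear classifier, which may
    # absorb distractor directions that help class separation.
    w = LogisticRegression(max_iter=1000).fit(acts, labels).coef_[0]
    return w / np.linalg.norm(w)

def pattern_cav(acts, labels):
    # Pattern-based direction: covariance of each activation dimension with
    # the concept label, focusing on the concept signal itself.
    a = np.cov(acts.T, labels)[:-1, -1]
    return a / np.linalg.norm(a)
```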
+
+
+
+
+ + ♻ ☆ Matrix Information Theory for Self-Supervised Learning + + +
+ Contrastive learning often relies on comparing positive anchor samples with +multiple negative samples to perform Self-Supervised Learning (SSL). However, +non-contrastive approaches like BYOL, SimSiam, and Barlow Twins achieve SSL +without explicit negative samples. In this paper, we introduce a unified matrix +information-theoretic framework that explains many contrastive and +non-contrastive learning methods. We then propose a novel method Matrix-SSL +based on matrix information theory. Experimental results reveal that Matrix-SSL +significantly outperforms state-of-the-art methods on the ImageNet dataset +under linear evaluation settings and on MS-COCO for transfer learning tasks. +Specifically, when performing 100 epochs pre-training, our method outperforms +SimCLR by 4.6%, and when performing transfer learning tasks on MS-COCO, our +method outperforms previous SOTA methods such as MoCo v2 and BYOL up to 3.3% +with only 400 epochs compared to 800 epochs pre-training. Code available at +https://github.com/yifanzhang-pro/Matrix-SSL. + +
+
+
+
+
+ + ♻ ☆ CLADE: Cycle Loss Augmented Degradation Enhancement for Unpaired + Super-Resolution of Anisotropic Medical Images + + +
+ Three-dimensional (3D) imaging is popular in medical applications; however,
+anisotropic 3D volumes with thick, low-spatial-resolution slices are often
+acquired to reduce scan times. Deep learning (DL) offers a solution to
+recover high-resolution features through super-resolution reconstruction
+(SRR). Unfortunately, paired training data is unavailable in many 3D medical
+applications, and we therefore propose a novel unpaired approach: CLADE
+(Cycle Loss Augmented Degradation Enhancement). CLADE uses a modified
+CycleGAN architecture with a cycle-consistent gradient mapping loss to learn
+SRR of the low-resolution dimension from disjoint patches of the
+high-resolution plane within the anisotropic 3D volume data itself. We show
+the feasibility of CLADE in abdominal MRI and abdominal CT and demonstrate
+significant improvements in CLADE image quality over low-resolution volumes
+and state-of-the-art self-supervised SRR, SMORE (Synthetic Multi-Orientation
+Resolution Enhancement). Quantitative PIQUE (perception-based image quality
+evaluator) scores and quantitative edge sharpness (ES, calculated as the
+maximum gradient of pixel intensities over a border of interest) showed
+superior performance for CLADE in both MRI and CT. Qualitatively, CLADE had
+the best overall image quality and the highest perceptual ES compared to the
+low-resolution volumes and SMORE. This paper demonstrates the potential of
+CLADE for super-resolution reconstruction of anisotropic 3D medical imaging
+data without the need for paired 3D training data.
+
+</p>
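A plausible reading of the cycle-consistent gradient mapping loss is an L1 match between spatial gradients of the input and its cycle reconstruction, encouraging edge sharpness to survive the cycle. The exact operator and weighting in CLADE are not specified here, so treat this as an assumption:

```python
import torch

def spatial_gradients(x):                  # x: (B, 1, H, W) image/slice
    gx = x[..., :, 1:] - x[..., :, :-1]    # horizontal finite differences
    gy = x[..., 1:, :] - x[..., :-1, :]    # vertical finite differences
    return gx, gy

def cycle_gradient_loss(x, x_cycled):
    # Match edge maps of the original and the cycle-reconstructed volume.
    gx, gy = spatial_gradients(x)
    cx, cy = spatial_gradients(x_cycled)
    return (gx - cx).abs().mean() + (gy - cy).abs().mean()
```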
+
+
+
+
+ + ♻ ☆ Fusion of Single and Integral Multispectral Aerial Images + + +
+ An adequate fusion of the most significant salient information from
+multiple input channels is essential for many aerial imaging tasks. While
+multispectral recordings reveal features in various spectral ranges,
+synthetic aperture sensing makes occluded features visible. We present the
+first hybrid (model- and learning-based) architecture for fusing the most
+significant features from conventional aerial images with those from
+integral aerial images that result from synthetic aperture sensing for
+occlusion removal. It combines the environment's spatial references with
+features of unoccluded targets that would normally be hidden by dense
+vegetation. Our method outperforms state-of-the-art two-channel and
+multi-channel fusion approaches visually and quantitatively in common
+metrics, such as mutual information, visual information fidelity, and peak
+signal-to-noise ratio. The proposed model does not require manually tuned
+parameters, can be extended to an arbitrary number and combination of
+spectral channels, and is reconfigurable for addressing different use cases.
+We demonstrate examples for search-and-rescue, wildfire detection, and
+wildlife observation.
+
+</p>
+
+
+
+
+ + ♻ ☆ An annotated instance segmentation XXL-CT data-set from a historic + airplane + + +
+ The Me 163 was a Second World War fighter airplane and a result of secret
+developments by the German air force. One of these airplanes is currently
+owned and displayed in the historic aircraft exhibition of the Deutsches
+Museum in Munich, Germany. To gain insights with respect to its history,
+design, and state of preservation, a complete CT scan was obtained using an
+industrial XXL computed tomography scanner. Using the CT data from the Me
+163, all its details can be visually examined at various levels, ranging
+from the complete hull down to single sprockets and rivets. However, while a
+trained human observer can identify and interpret the volumetric data with
+all its parts and connections, a virtual dissection of the airplane into all
+its different parts would be quite desirable. This requires an instance
+segmentation of all components and objects of interest into disjoint
+entities from the CT data. Since no adequate computer-assisted tools for
+automated or semi-automated segmentation of such XXL airplane data are
+currently available, as a first step, an interactive data annotation and
+object labelling process has been established. So far, seven 512 x 512 x 512
+voxel sub-volumes from the Me 163 airplane have been annotated and labelled,
+and the results can potentially be used for various new applications in the
+fields of digital heritage, non-destructive testing, and machine learning.
+This work describes the data acquisition process of the airplane using an
+industrial XXL-CT scanner, outlines the interactive segmentation and
+labelling scheme used to annotate sub-volumes of the airplane's CT data, and
+describes and discusses various challenges with respect to interpreting and
+handling the annotated and labelled data.
+
+</p>
+
+
+
+
+ + ♻ ☆ The Machine Vision Iceberg Explained: Advancing Dynamic Testing by + Considering Holistic Environmental Circumstances + + +
+ Are we heading for an iceberg with the current testing of machine vision?
+This work delves into the landscape of Machine Vision (MV) testing, which is
+heavily required in Highly Automated Driving (HAD) systems. Using the
+metaphor of navigating towards an iceberg, we discuss the potential
+shortcomings concealed within current testing strategies. We emphasize the
+urgent need for a deeper understanding of how to deal with the opaque
+functions of MV in development processes, as overlooked considerations can
+cost lives. Our main contribution is a hierarchical level model, which we
+call Granularity Grades. The model encourages a refined exploration of the
+multi-scaled depths of understanding about the circumstances of environments
+in which MV is intended to operate. This model aims to provide a holistic
+overview of all entities that may impact MV functions, ranging from
+relations of individual entities, such as object attributes, to entire
+environmental scenes. Applying our model delivers a structured exploration
+of the entities in a specific domain and their relationships, and assigns
+the results of an MV-under-test to construct an entity-relationship graph.
+By clustering patterns of relations in the graph, general MV deficits can be
+identified. In summary, our work contributes to a more nuanced and
+systematized identification of deficits of an MV test object in correlation
+to holistic circumstances in HAD operating domains.
+
+</p>
+
+ comment: Submitted at IEEE IV 2024 +
+
+
+
+
+ + ♻ ☆ Sneaky Spikes: Uncovering Stealthy Backdoor Attacks in Spiking Neural + Networks with Neuromorphic Data NDSS + + +
+ Deep neural networks (DNNs) have demonstrated remarkable performance across
+various tasks, including image and speech recognition. However, maximizing
+the effectiveness of DNNs requires meticulous optimization of numerous
+hyperparameters and network parameters through training. Moreover,
+high-performance DNNs entail many parameters, which consume significant
+energy during training. To overcome these challenges, researchers have
+turned to spiking neural networks (SNNs), which offer enhanced energy
+efficiency and biologically plausible data processing capabilities,
+rendering them highly suitable for sensory data tasks, particularly in
+neuromorphic data. Despite their advantages, SNNs, like DNNs, are
+susceptible to various threats, including adversarial examples and backdoor
+attacks. Yet, understanding and countering these attacks in SNNs remains
+largely unexplored.
+ This paper delves into backdoor attacks in SNNs using neuromorphic datasets
+and diverse triggers. Specifically, we explore backdoor triggers within
+neuromorphic data that can manipulate their position and color, providing a
+broader scope of possibilities than conventional triggers in domains like
+images. We present various attack strategies, achieving an attack success
+rate of up to 100% while maintaining a negligible impact on clean accuracy.
+Furthermore, we assess the stealthiness of these attacks, revealing that our
+most potent attacks possess significant stealth capabilities. Lastly, we
+adapt several state-of-the-art defenses from the image domain, evaluate
+their efficacy on neuromorphic data, and uncover instances where they fall
+short, leading to compromised performance.
+
+</p>
+
+ comment: To appear in Network and Distributed System Security (NDSS) Symposium + 2024 +
+
+
+
+
+ + ♻ ☆ Generative Adversarial Networks for Spatio-Spectral Compression of + Hyperspectral Images + + +
+ The development of deep learning-based models for the compression of
+hyperspectral images (HSIs) has recently attracted great attention in remote
+sensing due to the sharp growth of hyperspectral data archives. Most of the
+existing models achieve either spectral or spatial compression, and do not
+jointly consider the spatio-spectral redundancies present in HSIs. To
+address this problem, in this paper we focus our attention on the High
+Fidelity Compression (HiFiC) model (which is proven to be highly effective
+for spatial compression problems) and adapt it to perform spatio-spectral
+compression of HSIs. In detail, we introduce two new models: i) HiFiC using
+Squeeze and Excitation (SE) blocks (denoted as HiFiC$_{SE}$); and ii) HiFiC
+with 3D convolutions (denoted as HiFiC$_{3D}$) within the framework of HSI
+compression. We analyze the effectiveness of HiFiC$_{SE}$ and HiFiC$_{3D}$
+in compressing the spatio-spectral redundancies with channel attention and
+inter-dependency analysis. Experimental results show the efficacy of the
+proposed models in performing spatio-spectral compression, while
+reconstructing images at reduced bitrates with higher reconstruction
+quality. The code of the proposed models is publicly available at
+https://git.tu-berlin.de/rsim/HSI-SSC .
+
+</p>
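For reference, the squeeze-and-excitation block that HiFiC$_{SE}$ builds on reweights channels (here, spectral bands) from a global summary. This is the standard SE formulation; the reduction ratio of 16 is a common default assumed for illustration.

```python
import torch.nn as nn

class SEBlock(nn.Module):
    """Standard squeeze-and-excitation channel attention."""
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)   # squeeze: global spatial context
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(),
            nn.Linear(channels // reduction, channels), nn.Sigmoid())

    def forward(self, x):                      # x: (B, C, H, W)
        b, c = x.shape[:2]
        w = self.fc(self.pool(x).view(b, c)).view(b, c, 1, 1)
        return x * w                           # excite: reweight spectral channels
```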
+
+
+
+
+ + ♻ ☆ AutoGCN -- Towards Generic Human Activity Recognition with Neural + Architecture Search + + +
+ This paper introduces AutoGCN, a generic Neural Architecture Search (NAS) +algorithm for Human Activity Recognition (HAR) using Graph Convolution Networks +(GCNs). HAR has gained attention due to advances in deep learning, increased +data availability, and enhanced computational capabilities. At the same time, +GCNs have shown promising results in modeling relationships between body key +points in a skeletal graph. While domain experts often craft dataset-specific +GCN-based methods, their applicability beyond this specific context is severely +limited. AutoGCN seeks to address this limitation by simultaneously searching +for the ideal hyperparameters and architecture combination within a versatile +search space using a reinforcement controller while balancing optimal +exploration and exploitation behavior with a knowledge reservoir during the +search process. We conduct extensive experiments on two large-scale datasets +focused on skeleton-based action recognition to assess the proposed algorithm's +performance. Our experimental results underscore the effectiveness of AutoGCN +in constructing optimal GCN architectures for HAR, outperforming conventional +NAS and GCN methods, as well as random search. These findings highlight the +significance of a diverse search space and an expressive input representation +to enhance the network performance and generalizability. + +
+
+
+
+
+ + ♻ ☆ Context-self contrastive pretraining for crop type semantic segmentation + + +
+ In this paper, we propose a fully supervised pre-training scheme based on
+contrastive learning, particularly tailored to dense classification tasks.
+The proposed Context-Self Contrastive Loss (CSCL) learns an embedding space
+that makes semantic boundaries pop up through a similarity metric between
+every location in a training sample and its local context. For crop type
+semantic segmentation from Satellite Image Time Series (SITS), we find
+performance at parcel boundaries to be a critical bottleneck and explain how
+CSCL tackles the underlying cause of that problem, improving the
+state-of-the-art performance in this task. Additionally, using images from
+the Sentinel-2 (S2) satellite missions, we compile the largest, to our
+knowledge, SITS dataset densely annotated by crop type and parcel
+identities, which we make publicly available together with the data
+generation pipeline. Using these data, we find CSCL, even with minimal
+pre-training, to improve all respective baselines, and we present a process
+for semantic segmentation at super-resolution for obtaining crop classes at
+a more granular level. The code and instructions to download the data can be
+found at https://github.com/michaeltrs/DeepSatModels.
+
+</p>
+
+ comment: 15 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ PPT: Token Pruning and Pooling for Efficient Vision Transformers + + +
+ Vision Transformers (ViTs) have emerged as powerful models in the field of +computer vision, delivering superior performance across various vision tasks. +However, the high computational complexity poses a significant barrier to their +practical applications in real-world scenarios. Motivated by the fact that not +all tokens contribute equally to the final predictions and fewer tokens bring +less computational cost, reducing redundant tokens has become a prevailing +paradigm for accelerating vision transformers. However, we argue that it is not +optimal to either only reduce inattentive redundancy by token pruning, or only +reduce duplicative redundancy by token merging. To this end, in this paper we +propose a novel acceleration framework, namely token Pruning & Pooling +Transformers (PPT), to adaptively tackle these two types of redundancy in +different layers. By heuristically integrating both token pruning and token +pooling techniques in ViTs without additional trainable parameters, PPT +effectively reduces the model complexity while maintaining its predictive +accuracy. For example, PPT reduces over 37% FLOPs and improves the throughput +by over 45% for DeiT-S without any accuracy drop on the ImageNet dataset. The +code is available at https://github.com/xjwu1024/PPT and +https://github.com/mindspore-lab/models/ + +
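Schematically, pruning keeps the tokens that receive the most attention from the [CLS] token, and pooling condenses the remainder instead of discarding it. The sketch below is only a schematic under that reading; PPT's per-layer decisions and its exact merging rule differ, so do not take this as the paper's algorithm.

```python
import torch

def prune_and_pool(tokens, cls_attn, keep_ratio=0.7):
    # tokens: (B, N, D); cls_attn: (B, N) attention received from [CLS].
    # Assumes keep_ratio < 1 so at least one token is pooled.
    k = max(1, int(tokens.size(1) * keep_ratio))
    _, idx = cls_attn.topk(k, dim=1)                          # attentive tokens
    keep = torch.gather(
        tokens, 1, idx.unsqueeze(-1).expand(-1, -1, tokens.size(-1)))
    mask = torch.ones_like(cls_attn, dtype=torch.bool).scatter(1, idx, False)
    dropped = tokens[mask].view(tokens.size(0), -1, tokens.size(-1))
    pooled = dropped.mean(dim=1, keepdim=True)                # pool the rest
    return torch.cat([keep, pooled], dim=1)                   # (B, k+1, D)
```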
+
+
+
+
+ + ♻ ☆ Linear Alignment of Vision-language Models for Image Captioning + + +
+ Recently, vision-language models like CLIP have advanced the state of the art +in a variety of multi-modal tasks including image captioning and caption +evaluation. Many approaches adapt CLIP-style models to a downstream task by +training a mapping network between CLIP and a language model. This is costly as +it usually involves calculating gradients for large models. We propose a more +efficient training protocol that fits a linear mapping between image and text +embeddings of CLIP via a closed-form solution. This bypasses the need for +gradient computation and results in a lightweight captioning method called +ReCap, which can be trained up to 1000 times faster than existing lightweight +methods. Moreover, we propose two new learning-based image-captioning metrics +that build on CLIP score along with our linear mapping. Furthermore, we combine +ReCap with our new metrics to design an iterative datastore-augmentation loop +(DAL) based on synthetic captions. We evaluate ReCap on MS-COCO, Flickr30k, +VizWiz, and MSRVTT. ReCap achieves performance comparable to state-of-the-art +lightweight methods on established metrics while outperforming them on our new +metrics, which are better aligned with human ratings on Flickr8k-Expert and +Flickr8k-Crowdflower. Finally, we demonstrate that ReCap transfers well to +other domains and that our DAL leads to a performance boost. + +
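The closed-form fit is the key efficiency claim: a ridge-style least-squares mapping between paired CLIP image and text embeddings needs no gradient computation at all. Whether ReCap adds a bias term or uses a different regularizer is an assumption in this sketch.

```python
import numpy as np

def fit_linear_mapping(img_emb, txt_emb, lam=1e-3):
    # img_emb, txt_emb: (N, d) CLIP embeddings of paired images/captions.
    d = img_emb.shape[1]
    A = img_emb.T @ img_emb + lam * np.eye(d)   # regularized Gram matrix
    B = img_emb.T @ txt_emb
    return np.linalg.solve(A, B)                # W: closed form, no gradients

# Usage: mapped = new_img_emb @ W, then retrieve/decode in the text space.
```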
+
+ comment: 8 pages (+ references and appendix) +
+
+
+
+
+ + ♻ ☆ A Survey on Deep Learning for Polyp Segmentation: Techniques, Challenges + and Future Trends + + +
+ Early detection and assessment of polyps play a crucial role in the +prevention and treatment of colorectal cancer (CRC). Polyp segmentation +provides an effective solution to assist clinicians in accurately locating and +segmenting polyp regions. In the past, people often relied on manually +extracted lower-level features such as color, texture, and shape, which often +had issues capturing global context and lacked robustness to complex scenarios. +With the advent of deep learning, more and more outstanding medical image +segmentation algorithms based on deep learning networks have emerged, making +significant progress in this field. This paper provides a comprehensive review +of polyp segmentation algorithms. We first review some traditional algorithms +based on manually extracted features and deep segmentation algorithms, then +detail benchmark datasets related to the topic. Specifically, we carry out a +comprehensive evaluation of recent deep learning models and results based on +polyp sizes, considering the pain points of research topics and differences in +network structures. Finally, we discuss the challenges of polyp segmentation +and future trends in this field. The models, benchmark datasets, and source +code links we collected are all published at +https://github.com/taozh2017/Awesome-Polyp-Segmentation. + +
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ HICH Image/Text (HICH-IT): Comprehensive Text and Image Datasets for + Hypertensive Intracerebral Hemorrhage Research + + +
+ In this paper, we introduce a new dataset in the medical field of
+hypertensive intracerebral hemorrhage (HICH), called HICH-IT, which includes
+both electronic medical records (EMRs) and head CT images. This dataset is
+designed to enhance the accuracy of artificial intelligence in the diagnosis
+and treatment of HICH. Built upon standard text and image data, the dataset
+incorporates specific annotations within the EMRs, extracting key content
+from the text information, and categorizes the annotation content of the
+imaging data into four types: brain midline, hematoma, left cerebral
+ventricle, and right cerebral ventricle. HICH-IT aims to be a foundational
+dataset for feature learning in image segmentation tasks and named entity
+recognition. To further understand the dataset, we have trained deep
+learning algorithms to observe their performance. The pretrained models have
+been released at both www.daip.club and
+github.com/Deep-AI-Application-DAIP. The dataset has been uploaded to
+https://github.com/CYBUS123456/HICH-IT-Datasets.
+ Index Terms: HICH, deep learning, intraparenchymal hemorrhage, named entity
+recognition, novel dataset
+
+</p>
+
+
+
+
+ + ♻ ☆ FreDSNet: Joint Monocular Depth and Semantic Segmentation with Fast + Fourier Convolutions + + +
+ In this work we present FreDSNet, a deep learning solution which obtains
+semantic 3D understanding of indoor environments from single panoramas.
+Omnidirectional images offer task-specific advantages for scene
+understanding problems due to the 360-degree contextual information they
+provide about the entire environment. However, the inherent characteristics
+of omnidirectional images add additional problems to obtaining accurate
+detection and segmentation of objects or a good depth estimation. To
+overcome these problems, we exploit convolutions in the frequency domain,
+obtaining a wider receptive field in each convolutional layer. These
+convolutions allow us to leverage the whole context information from
+omnidirectional images. FreDSNet is the first network that jointly provides
+monocular depth estimation and semantic segmentation from a single panoramic
+image exploiting fast Fourier convolutions. Our experiments show that
+FreDSNet performs comparably to task-specific state-of-the-art methods for
+semantic segmentation and depth estimation. The FreDSNet code is publicly
+available at https://github.com/Sbrunoberenguel/FreDSNet
+
+</p>
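A minimal frequency-domain convolution of the kind the abstract refers to: transform features with a real FFT, mix channels with a 1x1 convolution in the frequency domain, and transform back, which yields an image-wide receptive field in a single layer. This is a generic sketch, not FreDSNet's exact block.

```python
import torch
import torch.nn as nn

class SpectralConv(nn.Module):
    """Generic Fourier-domain convolution sketch (assumed layout)."""
    def __init__(self, channels):
        super().__init__()
        # 1x1 conv over stacked real/imaginary parts in the frequency domain.
        self.freq_conv = nn.Conv2d(channels * 2, channels * 2, 1)

    def forward(self, x):                           # x: (B, C, H, W)
        f = torch.fft.rfft2(x, norm="ortho")        # complex spectrum
        z = torch.cat([f.real, f.imag], dim=1)
        z = self.freq_conv(z)                       # global mixing per frequency
        re, im = z.chunk(2, dim=1)
        return torch.fft.irfft2(torch.complex(re, im),
                                s=x.shape[-2:], norm="ortho")
```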
+
+ comment: 7 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Improved Implicit Neural Representation with Fourier Bases + Reparameterized Training + + +
+ Implicit Neural Representation (INR) is a powerful representation paradigm
+that has recently achieved success in various computer vision tasks. Due to
+the low-frequency bias of the vanilla multi-layer perceptron (MLP), existing
+methods have investigated advanced techniques, such as positional encoding
+and periodic activation functions, to improve the accuracy of INR. In this
+paper, we connect the network training bias with the reparameterization
+technique and theoretically prove that weight reparameterization can
+alleviate the spectral bias of MLP. Based on our theoretical analysis, we
+propose a Fourier reparameterization method which learns a coefficient
+matrix of fixed Fourier bases to compose the weights of the MLP. We evaluate
+the proposed Fourier reparameterization method on different INR tasks with
+various MLP architectures, including the vanilla MLP, MLP with positional
+encoding, and MLP with advanced activation functions. The superior
+approximation results on different MLP architectures clearly validate the
+advantage of our proposed method. Armed with our Fourier reparameterization
+method, better INR with more textures and fewer artifacts can be learned
+from the training data.
+
+</p>
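The core idea admits a compact sketch: a linear layer's weights are not free parameters but a learnable coefficient matrix multiplied by a fixed Fourier basis. The basis construction below (frequencies, phases, scaling) is an illustrative assumption, not the paper's exact recipe.

```python
import math
import torch
import torch.nn as nn

class FourierLinear(nn.Module):
    """Linear layer with Fourier-reparameterized weights: W = coef @ basis."""
    def __init__(self, in_dim, out_dim, n_bases=64):
        super().__init__()
        t = torch.linspace(0, 1, in_dim)
        freqs = torch.arange(1, n_bases // 2 + 1, dtype=torch.float32)
        basis = torch.cat([torch.cos(2 * math.pi * freqs[:, None] * t),
                           torch.sin(2 * math.pi * freqs[:, None] * t)])
        self.register_buffer("basis", basis)       # fixed (n_bases, in_dim)
        self.coef = nn.Parameter(torch.randn(out_dim, n_bases) / n_bases)
        self.bias = nn.Parameter(torch.zeros(out_dim))

    def forward(self, x):
        W = self.coef @ self.basis                  # compose the MLP weights
        return x @ W.T + self.bias
```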
+
+
+
+
+ + ♻ ☆ Physics-informed Deep Diffusion MRI Reconstruction with Synthetic Data: + Break Training Data Bottleneck in Artificial Intelligence + + +
+ Diffusion magnetic resonance imaging (MRI) is the only imaging modality for +non-invasive movement detection of in vivo water molecules, with significant +clinical and research applications. Diffusion MRI (DWI) acquired by multi-shot +techniques can achieve higher resolution, better signal-to-noise ratio, and +lower geometric distortion than single-shot, but suffers from inter-shot +motion-induced artifacts. These artifacts cannot be removed prospectively, +leading to the absence of artifact-free training labels. Thus, the potential of +deep learning in multi-shot DWI reconstruction remains largely untapped. To +break the training data bottleneck, here, we propose a Physics-Informed Deep +DWI reconstruction method (PIDD) to synthesize high-quality paired training +data by leveraging the physical diffusion model (magnitude synthesis) and +inter-shot motion-induced phase model (motion phase synthesis). The network is +trained only once with 100,000 synthetic samples, achieving encouraging results +on multiple realistic in vivo data reconstructions. Advantages over +conventional methods include: (a) Better motion artifact suppression and +reconstruction stability; (b) Outstanding generalization to multi-scenario +reconstructions, including multi-resolution, multi-b-value, +multi-undersampling, multi-vendor, and multi-center; (c) Excellent clinical +adaptability to patients with verifications by seven experienced doctors +(p<0.001). In conclusion, PIDD presents a novel deep learning framework by +exploiting the power of MRI physics, providing a cost-effective and explainable +way to break the data bottleneck in deep learning medical imaging. + +
+
+ comment: 23 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Guiding Instruction-based Image Editing via Multimodal Large Language + Models ICLR'24 + + +
+ Instruction-based image editing improves the controllability and flexibility +of image manipulation via natural commands without elaborate descriptions or +regional masks. However, human instructions are sometimes too brief for current +methods to capture and follow. Multimodal large language models (MLLMs) show +promising capabilities in cross-modal understanding and visual-aware response +generation via LMs. We investigate how MLLMs facilitate edit instructions and +present MLLM-Guided Image Editing (MGIE). MGIE learns to derive expressive +instructions and provides explicit guidance. The editing model jointly captures +this visual imagination and performs manipulation through end-to-end training. +We evaluate various aspects of Photoshop-style modification, global photo +optimization, and local editing. Extensive experimental results demonstrate +that expressive instructions are crucial to instruction-based image editing, +and our MGIE can lead to a notable improvement in automatic metrics and human +evaluation while maintaining competitive inference efficiency. + +
+
+ comment: ICLR'24 (Spotlight) ; Project at https://mllm-ie.github.io ; Code at + https://github.com/tsujuifu/pytorch_mgie +
+
+
+
+
+ + ♻ ☆ RCM-Fusion: Radar-Camera Multi-Level Fusion for 3D Object Detection ICRA 2024 + + +
+ While LiDAR sensors have been successfully applied to 3D object detection, +the affordability of radar and camera sensors has led to a growing interest in +fusing radars and cameras for 3D object detection. However, previous +radar-camera fusion models were unable to fully utilize the potential of radar +information. In this paper, we propose Radar-Camera Multi-level fusion +(RCM-Fusion), which attempts to fuse both modalities at both feature and +instance levels. For feature-level fusion, we propose a Radar Guided BEV +Encoder which transforms camera features into precise BEV representations using +the guidance of radar Bird's-Eye-View (BEV) features and combines the radar and +camera BEV features. For instance-level fusion, we propose a Radar Grid Point +Refinement module that reduces localization error by accounting for the +characteristics of the radar point clouds. The experiments conducted on the +public nuScenes dataset demonstrate that our proposed RCM-Fusion achieves +state-of-the-art performances among single frame-based radar-camera fusion +methods in the nuScenes 3D object detection benchmark. Code will be made +publicly available. + +
+
+ comment: Accepted by IEEE International Conference on Robotics and Automation + (ICRA 2024), 7 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Deep Learning Techniques for In-Crop Weed Identification: A Review + + +
+ Weeds are a significant threat to agricultural productivity and the
+environment. The increasing demand for sustainable agriculture has driven
+innovations in accurate weed control technologies aimed at reducing the
+reliance on herbicides. With the great success of deep learning in various
+vision tasks, many promising image-based weed detection algorithms have been
+developed. This paper reviews recent developments of deep learning
+techniques in the field of image-based weed detection. The review begins
+with an introduction to the fundamentals of deep learning related to weed
+detection. Next, recent progress on deep weed detection is reviewed with a
+discussion of the research materials, including public weed datasets.
+Finally, the challenges of developing practically deployable weed detection
+methods are summarized, together with a discussion of the opportunities for
+future research. We hope that this review will provide a timely survey of
+the field and attract more researchers to address this inter-disciplinary
+research problem.
+
+</p>
+
+
+
+
+ + ♻ ☆ SSTFormer: Bridging Spiking Neural Network and Memory Support + Transformer for Frame-Event based Recognition + + +
+ Event camera-based pattern recognition is a newly arising research topic in
+recent years. Current researchers usually transform the event streams into
+images, graphs, or voxels, and adopt deep neural networks for event-based
+classification. Although good performance can be achieved on simple event
+recognition datasets, their results may still be limited by the following
+two issues. Firstly, they adopt spatially sparse event streams for
+recognition only, which may fail to capture color and detailed texture
+information well. Secondly, they adopt either Spiking Neural Networks (SNN)
+for energy-efficient recognition with suboptimal results, or Artificial
+Neural Networks (ANN) for energy-intensive, high-performance recognition;
+few of them consider achieving a balance between these two aspects. In this
+paper, we formally propose to recognize patterns by fusing RGB frames and
+event streams simultaneously and propose a new RGB frame-event recognition
+framework to address the aforementioned issues. The proposed method contains
+four main modules, i.e., a memory support Transformer network for RGB frame
+encoding, a spiking neural network for raw event stream encoding, a
+multi-modal bottleneck fusion module for RGB-Event feature aggregation, and
+a prediction head. Due to the scarcity of RGB-Event based classification
+datasets, we also propose a large-scale PokerEvent dataset which contains
+114 classes and 27102 frame-event pairs recorded using a DVS346 event
+camera. Extensive experiments on two RGB-Event based classification datasets
+fully validate the effectiveness of our proposed framework. We hope this
+work will boost the development of pattern recognition by fusing RGB frames
+and event streams. Both our dataset and the source code of this work will be
+released at https://github.com/Event-AHU/SSTFormer.
+
+</p>
+
+ comment: In Peer Review +
+
+
+
+
+ + ♻ ☆ Phrase Grounding-based Style Transfer for Single-Domain Generalized + Object Detection + + +
+ Single-domain generalized object detection aims to enhance a model's
+generalizability to multiple unseen target domains using only data from a
+single source domain during training. This is a practical yet challenging
+task, as it requires the model to address domain shift without incorporating
+target domain data into training. In this paper, we propose a novel phrase
+grounding-based style transfer (PGST) approach for the task. Specifically,
+we first define textual prompts to describe potential objects for each
+unseen target domain. Then, we leverage the grounded language-image
+pre-training (GLIP) model to learn the style of these target domains and
+achieve style transfer from the source to the target domain. The
+style-transferred source visual features are semantically rich and close to
+imaginary counterparts in the target domain. Finally, we employ these
+style-transferred visual features to fine-tune GLIP. By introducing
+imaginary counterparts, the detector can be effectively generalized to
+unseen target domains using only a single source domain for training.
+Extensive experimental results on five diverse weather driving benchmarks
+demonstrate that our proposed approach achieves state-of-the-art
+performance, even surpassing some domain adaptation methods that incorporate
+target domain images into the training process. The source code and
+pre-trained models will be made available.
+
+</p>
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Generating by Understanding: Neural Visual Generation with Logical + Symbol Groundings + + +
+ Despite the great success of neural visual generative models in recent
+years, integrating them with strong symbolic reasoning systems remains a
+challenging task. Two levels of symbol grounding problems are among the core
+challenges: the first is symbol assignment, i.e., mapping latent factors of
+neural visual generators to semantically meaningful symbolic factors from
+the reasoning systems by learning from limited labeled data. The second is
+rule learning, i.e., learning new rules that govern the generative process
+to enhance the symbolic reasoning systems. To deal with these two problems,
+we propose a neurosymbolic learning approach, Abductive visual Generation
+(AbdGen), for integrating logic programming systems with neural visual
+generative models based on the abductive learning framework. To achieve
+reliable and efficient symbol grounding, a quantized abduction method is
+introduced, generating abduction proposals by nearest-neighbor lookup within
+semantic codebooks. To achieve precise rule learning, a contrastive
+meta-abduction method is proposed to eliminate wrong rules with positive
+cases while avoiding less informative rules with negative cases.
+Experimental results show that, compared to the baseline approaches, AbdGen
+requires significantly less labeled data for symbol assignment. Furthermore,
+AbdGen can effectively learn underlying logical generative rules from data,
+which is beyond the capability of existing approaches. The code is released
+at https://github.com/candytalking/AbdGen.
+
+</p>
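The quantized abduction step reduces, at its core, to a nearest-neighbor lookup in a semantic codebook. A minimal sketch follows; the codebook contents and the choice of L2 distance are assumptions for illustration.

```python
import torch

def quantize_to_codebook(latents, codebook):
    # latents: (B, D) generator factors; codebook: (K, D) symbol embeddings.
    dists = torch.cdist(latents, codebook)   # pairwise L2 distances
    idx = dists.argmin(dim=1)                # nearest semantic symbol per latent
    return codebook[idx], idx                # quantized proposal + symbol index
```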
+
+
+
+
+ + ♻ ☆ An extended asymmetric sigmoid with Perceptron (SIGTRON) for imbalanced + linear classification + + +
+ This article presents a new polynomial parameterized sigmoid called
+SIGTRON, which is an extended asymmetric sigmoid with Perceptron, and its
+companion convex model called the SIGTRON-imbalanced classification (SIC)
+model, which employs a virtual SIGTRON-induced convex loss function. In
+contrast to the conventional $\pi$-weighted cost-sensitive learning model,
+the SIC model does not have an external $\pi$-weight on the loss function
+but instead has internal parameters in the virtual SIGTRON-induced loss
+function. As a consequence, when the given training dataset is close to the
+well-balanced condition, we show that the proposed SIC model is more
+adaptive to variations of the dataset, such as the inconsistency of the
+scale-class-imbalance ratio between the training and test datasets. This
+adaptation is achieved by creating a skewed hyperplane equation.
+Additionally, we present a quasi-Newton optimization (L-BFGS) framework for
+the virtual convex loss by developing an interval-based bisection line
+search. Empirically, we have observed that the proposed approach outperforms
+$\pi$-weighted convex focal loss and the balanced classifier LIBLINEAR
+(logistic regression, SVM, and L2SVM) in terms of test classification
+accuracy on $51$ two-class and $67$ multi-class datasets. In binary
+classification problems where the scale-class-imbalance ratio of the
+training dataset is not significant but the inconsistency exists, a group of
+SIC models with the best test accuracy for each dataset (TOP$1$) outperforms
+LIBSVM (C-SVC with RBF kernel), a well-known kernel-based classifier.
+
+</p>
+
+ comment: 24 pages, 9 figures, a typo is corrected +
+
+
+
+
+ + ♻ ☆ Lightweight, Pre-trained Transformers for Remote Sensing Timeseries + + +
+ Machine learning methods for satellite data have a range of societally +relevant applications, but labels used to train models can be difficult or +impossible to acquire. Self-supervision is a natural solution in settings with +limited labeled data, but current self-supervised models for satellite data +fail to take advantage of the characteristics of that data, including the +temporal dimension (which is critical for many applications, such as monitoring +crop growth) and availability of data from many complementary sensors (which +can significantly improve a model's predictive performance). We present Presto +(the Pretrained Remote Sensing Transformer), a model pre-trained on remote +sensing pixel-timeseries data. By designing Presto specifically for remote +sensing data, we can create a significantly smaller but performant model. +Presto excels at a wide variety of globally distributed remote sensing tasks +and performs competitively with much larger models while requiring far less +compute. Presto can be used for transfer learning or as a feature extractor for +simple models, enabling efficient deployment at scale. + +
+
+
+
+
+ + ♻ ☆ EraseDiff: Erasing Data Influence in Diffusion Models + + +
+ In this work, we introduce an unlearning algorithm for diffusion models.
+Our algorithm equips a diffusion model with a mechanism to mitigate the
+concerns related to data memorization. To achieve this, we formulate the
+unlearning problem as a constrained optimization problem, aiming to preserve
+the utility of the diffusion model on the remaining data while scrubbing the
+information associated with the forgotten data by deviating the learnable
+generative process from the ground-truth denoising procedure. To solve the
+resulting problem, we adopt a first-order method, which has superior
+practical performance while remaining faithful to the diffusion process.
+Empirically, we demonstrate that our algorithm preserves model utility,
+effectiveness, and efficiency while removing the influence of the forgotten
+data across widely-used diffusion models, in both conditional and
+unconditional image generation scenarios.
+
+</p>
+
+ comment: Diffusion Model, Machine Unlearning +
+
+
+
+
+ + ♻ ☆ Distilling Out-of-Distribution Robustness from Vision-Language + Foundation Models NeurIPS 2023 + + +
+ We propose a conceptually simple and lightweight framework for improving the +robustness of vision models through the combination of knowledge distillation +and data augmentation. We address the conjecture that larger models do not make +for better teachers by showing strong gains in out-of-distribution robustness +when distilling from pretrained foundation models. Following this finding, we +propose Discrete Adversarial Distillation (DAD), which leverages a robust +teacher to generate adversarial examples and a VQGAN to discretize them, +creating more informative samples than standard data augmentation techniques. +We provide a theoretical framework for the use of a robust teacher in the +knowledge distillation with data augmentation setting and demonstrate strong +gains in out-of-distribution robustness and clean accuracy across different +student architectures. Notably, our method adds minor computational overhead +compared to similar techniques and can be easily combined with other data +augmentations for further improvements. + +
+
+ comment: Published in NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ LaCViT: A Label-aware Contrastive Fine-tuning Framework for Vision + Transformers + + +
+ Vision Transformers (ViTs) have emerged as popular models in computer
+vision, demonstrating state-of-the-art performance across various tasks.
+This success typically follows a two-stage strategy involving pre-training
+on large-scale datasets using self-supervised signals, such as masked random
+patches, followed by fine-tuning on task-specific labeled datasets with
+cross-entropy loss. However, this reliance on cross-entropy loss has been
+identified as a limiting factor in ViTs, affecting their generalization and
+transferability to downstream tasks. Addressing this critical challenge, we
+introduce a novel Label-aware Contrastive Training framework, LaCViT, which
+significantly enhances the quality of embeddings in ViTs. LaCViT not only
+addresses the limitations of cross-entropy loss but also facilitates more
+effective transfer learning across diverse image classification tasks. Our
+comprehensive experiments on eight standard image classification datasets
+reveal that LaCViT statistically significantly enhances the performance of
+the three evaluated ViTs by up to 10.78% under Top-1 Accuracy.
+
+</p>
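A label-aware contrastive objective of the kind LaCViT fine-tunes with can be sketched as a supervised contrastive (SupCon-style) loss, where embeddings sharing a label are positives. LaCViT's exact formulation may differ, so read this as a generic sketch.

```python
import torch
import torch.nn.functional as F

def label_aware_contrastive_loss(z, labels, tau=0.1):
    z = F.normalize(z, dim=1)                            # (B, D) embeddings
    sim = z @ z.T / tau
    eye = torch.eye(len(z), dtype=torch.bool, device=z.device)
    sim = sim.masked_fill(eye, float("-inf"))            # drop self-pairs
    pos = (labels[:, None] == labels[None, :]) & ~eye    # same-label pairs
    log_prob = sim - sim.logsumexp(dim=1, keepdim=True)
    per_anchor = log_prob.masked_fill(~pos, 0.0).sum(1) / pos.sum(1).clamp(min=1)
    return -per_anchor.mean()                            # pull same-label pairs together
```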
+
+
+
+
+
+ ♻ ☆ MultiWay-Adapter: Adapting large-scale multi-modal models for scalable
+ image-text retrieval
+
+
+
+ As Multimodal Large Language Models (MLLMs) grow in size, adapting them to
+specialized tasks becomes increasingly challenging due to high computational
+and memory demands. Indeed, traditional fine-tuning methods are costly, due to
+the need for extensive, task-specific training. While efficient adaptation
+methods exist that aim to reduce these costs, in practice they suffer from
+shallow inter-modal alignment, which severely hurts model effectiveness. To
+tackle these computational challenges and improve inter-modal alignment, we
+introduce the MultiWay-Adapter (MWA), a novel framework featuring an 'Alignment
+Enhancer'. This enhancer deepens inter-modal alignment, enabling high
+transferability with minimal tuning effort. Our experiments show that unlike
+prior efficient tuning approaches, MWA maintains model effectiveness, while
+reducing training time by up to 57%. MWA is also lightweight, increasing model
+size by only 2-3% (in terms of parameters) for state-of-the-art foundation
+models like BEiT-3 Large. These results demonstrate that MWA provides an
+efficient and effective adaptation method for MLLMs, significantly broadening
+their applicability.
+
+
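+
+ A rough sketch of a bottleneck adapter with an extra cross-modal attention
+step, which is the general shape the abstract suggests; the real 'Alignment
+Enhancer' is not specified here, so this module and its sizes are assumptions:
+
+import torch
+import torch.nn as nn
+
+class MultiWayAdapterSketch(nn.Module):
+    def __init__(self, d_model=1024, r=64, n_heads=8):
+        super().__init__()
+        self.down = nn.Linear(d_model, r)            # bottleneck keeps the added
+        self.up = nn.Linear(r, d_model)              # parameters to a few percent
+        self.act = nn.GELU()
+        self.align = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
+
+    def forward(self, x, other_modality):
+        h = self.up(self.act(self.down(x)))          # cheap per-modality update
+        a, _ = self.align(h, other_modality, other_modality)  # deepen alignment
+        return x + h + a                             # residual; backbone stays frozen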
+
+
+
+
+ + ♻ ☆ The University of California San Francisco, Brain Metastases + Stereotactic Radiosurgery (UCSF-BMSR) MRI Dataset + + +
+ The University of California San Francisco Brain Metastases Stereotactic
+Radiosurgery (UCSF-BMSR) dataset is a public, clinical, multimodal brain MRI
+dataset consisting of 560 brain MRIs from 412 patients with expert annotations
+of 5136 brain metastases. Data consists of registered and skull stripped T1
+post-contrast, T1 pre-contrast, FLAIR and subtraction (T1 pre-contrast - T1
+post-contrast) images and voxelwise segmentations of enhancing brain metastases
+in NIfTI format. The dataset also includes patient demographics, surgical
+status and primary cancer types. The UCSF-BMSR has been made publicly available
+in the hopes that researchers will use these data to push the boundaries of AI
+applications for brain metastases.
+
+
+
+ comment: 15 pages, 2 tables, 2 figures +
+
+
+
+
+ + ♻ ☆ PathMMU: A Massive Multimodal Expert-Level Benchmark for Understanding + and Reasoning in Pathology + + +
+ The emergence of large multimodal models has unlocked remarkable potential in
+AI, particularly in pathology. However, the lack of specialized, high-quality
+benchmarks has impeded their development and precise evaluation. To address
+this, we introduce PathMMU, the largest and highest-quality expert-validated
+pathology benchmark for LMMs. It comprises 33,573 multimodal multi-choice
+questions and 21,599 images from various sources, and an explanation for the
+correct answer accompanies each question. The construction of PathMMU
+capitalizes on the robust capabilities of GPT-4V, utilizing approximately
+30,000 gathered image-caption pairs to generate Q&As. Significantly, to
+maximize PathMMU's authority, we invite six pathologists to scrutinize each
+question under strict standards in PathMMU's validation and test sets, while
+simultaneously setting an expert-level performance benchmark for PathMMU. We
+conduct extensive evaluations, including zero-shot assessments of 14
+open-sourced and three closed-sourced LMMs and their robustness to image
+corruption. We also fine-tune representative LMMs to assess their adaptability
+to PathMMU. The empirical findings indicate that advanced LMMs struggle with
+the challenging PathMMU benchmark, with the top-performing LMM, GPT-4V,
+achieving only a 51.7% zero-shot performance, significantly lower than the
+71.4% demonstrated by human pathologists. After fine-tuning, even open-sourced
+LMMs can surpass GPT-4V with a performance of over 60%, but still fall short
+of the expertise shown by pathologists. We hope that PathMMU will offer
+valuable insights and foster the development of more specialized,
+next-generation LMMs for pathology.
+
+
+
+ comment: make source and method updates before resubmission +
+
+
+
+
+ + ♻ ☆ MGTR: Multi-Granular Transformer for Motion Prediction with LiDAR ICRA 2024 + + +
+ Motion prediction has been an essential component of autonomous driving
+systems since it handles highly uncertain and complex scenarios involving
+moving agents of different types. In this paper, we propose a Multi-Granular
+TRansformer (MGTR) framework, an encoder-decoder network that exploits context
+features in different granularities for different kinds of traffic agents. To
+further enhance MGTR's capabilities, we leverage LiDAR point cloud data by
+incorporating LiDAR semantic features from an off-the-shelf LiDAR feature
+extractor. We evaluate MGTR on the Waymo Open Dataset motion prediction
+benchmark and show that the proposed method achieves state-of-the-art
+performance, ranking 1st on its leaderboard
+(https://waymo.com/open/challenges/2023/motion-prediction/).
+
+
+
+ comment: Accepted to ICRA 2024 +
+
+
+
+
+ + ♻ ☆ VLATTACK: Multimodal Adversarial Attacks on Vision-Language Tasks via + Pre-trained Models NeurIPS 2023 + + +
+ Vision-Language (VL) pre-trained models have shown their superiority on many +multimodal tasks. However, the adversarial robustness of such models has not +been fully explored. Existing approaches mainly focus on exploring the +adversarial robustness under the white-box setting, which is unrealistic. In +this paper, we aim to investigate a new yet practical task to craft image and +text perturbations using pre-trained VL models to attack black-box fine-tuned +models on different downstream tasks. Towards this end, we propose VLATTACK to +generate adversarial samples by fusing perturbations of images and texts from +both single-modal and multimodal levels. At the single-modal level, we propose +a new block-wise similarity attack (BSA) strategy to learn image perturbations +for disrupting universal representations. Besides, we adopt an existing text +attack strategy to generate text perturbations independent of the image-modal +attack. At the multimodal level, we design a novel iterative cross-search +attack (ICSA) method to update adversarial image-text pairs periodically, +starting with the outputs from the single-modal level. We conduct extensive +experiments to attack five widely-used VL pre-trained models for six tasks. +Experimental results show that VLATTACK achieves the highest attack success +rates on all tasks compared with state-of-the-art baselines, which reveals a +blind spot in the deployment of pre-trained VL models. Source codes can be +found at https://github.com/ericyinyzy/VLAttack. + +
+
+ comment: Accepted by NeurIPS 2023, 21 pages +
+
+
+
+
+
+
+
+ + Information Retrieval 22 + +
+
+
+ + ☆ Event-based Product Carousel Recommendation with Query-Click Graph + + +
+ Many current recommender systems focus on product-to-product and
+user-to-product recommendations, even during events, rather than modeling
+recommendations tailored to a target event (e.g., festivals, seasonal
+activities, or social activities), and thus fail to address the multiple
+aspects of the shopping demands such an event creates. Product recommendations
+for the multiple aspects of a target event are usually generated by human
+curators who manually identify the aspects and select a list of aspect-related
+products (i.e., a product carousel) for each aspect as recommendations.
+However, building such a recommender system with machine learning is
+non-trivial due to the lack of ground truth for both the event-related aspects
+and the aspect-related products. To fill this gap, we define the novel problem
+of event-based product carousel recommendation in e-commerce and propose an
+effective recommender system based on the query-click bipartite graph. We
+apply an iterative clustering algorithm over the query-click bipartite graph
+and infer the event-related aspects from the clusters of queries. The
+aspect-related recommendations are powered by the click-through rate of
+products regarding each aspect. We show through experiments that this approach
+effectively mines product carousels for the target event.
+
+
+
+ comment: 7 pages, 2 figures, 2021 IEEE International Conference on Big Data + (Big Data) +
+
+
+
+
+ + ☆ Unified Hallucination Detection for Multimodal Large Language Models + + +
+ Despite significant strides in multimodal tasks, Multimodal Large Language +Models (MLLMs) are plagued by the critical issue of hallucination. The reliable +detection of such hallucinations in MLLMs has, therefore, become a vital aspect +of model evaluation and the safeguarding of practical application deployment. +Prior research in this domain has been constrained by a narrow focus on +singular tasks, an inadequate range of hallucination categories addressed, and +a lack of detailed granularity. In response to these challenges, our work +expands the investigative horizons of hallucination detection. We present a +novel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate +the evaluation of advancements in hallucination detection methods. +Additionally, we unveil a novel unified multimodal hallucination detection +framework, UNIHD, which leverages a suite of auxiliary tools to validate the +occurrence of hallucinations robustly. We demonstrate the effectiveness of +UNIHD through meticulous evaluation and comprehensive analysis. We also provide +strategic insights on the application of specific tools for addressing various +categories of hallucinations. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Comparison of Topic Modelling Approaches in the Banking Context + + +
+ Topic modelling is a prominent task for automatic topic extraction in many
+applications such as sentiment analysis and recommendation systems. The
+approach is vital for service industries to monitor their customer discussions.
+The use of traditional approaches such as Latent Dirichlet Allocation (LDA) for
+topic discovery has shown strong performance; however, these approaches are not
+consistent in their results, as they suffer from data sparseness and an
+inability to model word order in a document. Thus, this study presents the use
+of Kernel Principal Component Analysis (KernelPCA) and K-means clustering in
+the BERTopic architecture. We have prepared a new dataset using tweets from
+customers of Nigerian banks, and we use this to compare the topic modelling
+approaches. Our findings show that KernelPCA and K-means in the BERTopic
+architecture produce coherent topics, with a coherence score of 0.8463.
+
+
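+
+ BERTopic exposes its dimensionality-reduction and clustering components as
+swappable arguments, so the configuration the study describes can be sketched
+as follows (the documents and hyperparameters below are placeholders standing
+in for the tweet corpus and the paper's settings):
+
+from bertopic import BERTopic
+from sklearn.decomposition import KernelPCA
+from sklearn.cluster import KMeans
+
+docs = ["the app keeps failing during transfers",
+        "transfer delayed for two days now",
+        "great customer service at my branch"]
+
+topic_model = BERTopic(
+    umap_model=KernelPCA(n_components=2, kernel="rbf"),  # replaces UMAP
+    hdbscan_model=KMeans(n_clusters=2),                  # replaces HDBSCAN
+)
+topics, _ = topic_model.fit_transform(docs)
+print(topic_model.get_topic_info())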
+
+ comment: 14 pages, Journal of Applied Science +
+
+
+
+
+ + ☆ Linguistic features for sentence difficulty prediction in ABSA + + +
+ One of the challenges of natural language understanding is to deal with the +subjectivity of sentences, which may express opinions and emotions that add +layers of complexity and nuance. Sentiment analysis is a field that aims to +extract and analyze these subjective elements from text, and it can be applied +at different levels of granularity, such as document, paragraph, sentence, or +aspect. Aspect-based sentiment analysis is a well-studied topic with many +available data sets and models. However, there is no clear definition of what +makes a sentence difficult for aspect-based sentiment analysis. In this paper, +we explore this question by conducting an experiment with three data sets: +"Laptops", "Restaurants", and "MTSC" (Multi-Target-dependent Sentiment +Classification), and a merged version of these three datasets. We study the +impact of domain diversity and syntactic diversity on difficulty. We use a +combination of classifiers to identify the most difficult sentences and analyze +their characteristics. We employ two ways of defining sentence difficulty. The +first one is binary and labels a sentence as difficult if the classifiers fail +to correctly predict the sentiment polarity. The second one is a six-level +scale based on how many of the top five best-performing classifiers can +correctly predict the sentiment polarity. We also define 9 linguistic features +that, combined, aim at estimating the difficulty at sentence level. + +
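+
+ The two difficulty definitions reduce to a few lines; here is a toy sketch,
+where top5_correct holds, for one sentence, whether each of the five
+best-performing classifiers predicted the sentiment polarity correctly. The
+exact aggregation behind the binary label is an assumption on our part:
+
+def binary_difficulty(top5_correct):
+    # difficult if the classifiers all fail to predict the polarity
+    return not any(top5_correct)
+
+def six_level_difficulty(top5_correct):
+    # 0 (all five correct) .. 5 (none correct)
+    return 5 - sum(top5_correct)
+
+print(six_level_difficulty([True, True, False, False, False]))  # -> 3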
+
+
+
+
+ + ☆ EasyInstruct: An Easy-to-use Instruction Processing Framework for Large + Language Models + + +
+ In recent years, instruction tuning has gained increasing attention and
+emerged as a crucial technique to enhance the capabilities of Large Language
+Models (LLMs). To construct high-quality instruction datasets, many instruction
+processing approaches have been proposed, aiming to achieve a delicate balance
+between data quantity and data quality. Nevertheless, due to inconsistencies
+that persist among various instruction processing methods, there is no standard
+open-source instruction processing implementation framework available for the
+community, which hinders practitioners from further developing and advancing
+the field. To facilitate instruction processing research and development, we
+present EasyInstruct, an easy-to-use instruction processing framework for
+LLMs, which modularizes instruction generation, selection, and prompting,
+while also considering their combination and interaction. EasyInstruct is
+publicly released and actively maintained at
+https://github.com/zjunlp/EasyInstruct, along with a running demo App at
+https://huggingface.co/spaces/zjunlp/EasyInstruct for a quick start, calling
+for broader research centered on instruction data.
+
+
+
+ comment: Ongoing work; the project website is at + https://zjunlp.github.io/project/EasyInstruct, code is at + https://github.com/zjunlp/EasyInstruct, demo is at + https://huggingface.co/spaces/zjunlp/EasyInstruct +
+
+
+
+
+ + ☆ Understanding and Guiding Weakly Supervised Entity Alignment with + Potential Isomorphism Propagation + + +
+ Weakly Supervised Entity Alignment (EA) is the task of identifying equivalent +entities across diverse knowledge graphs (KGs) using only a limited number of +seed alignments. Despite substantial advances in aggregation-based weakly +supervised EA, the underlying mechanisms in this setting remain unexplored. In +this paper, we present a propagation perspective to analyze weakly supervised +EA and explain the existing aggregation-based EA models. Our theoretical +analysis reveals that these models essentially seek propagation operators for +pairwise entity similarities. We further prove that, despite the structural +heterogeneity of different KGs, the potentially aligned entities within +aggregation-based EA models have isomorphic subgraphs, which is the core +premise of EA but has not been investigated. Leveraging this insight, we +introduce a potential isomorphism propagation operator to enhance the +propagation of neighborhood information across KGs. We develop a general EA +framework, PipEA, incorporating this operator to improve the accuracy of every +type of aggregation-based model without altering the learning process. +Extensive experiments substantiate our theoretical findings and demonstrate +PipEA's significant performance gains over state-of-the-art weakly supervised +EA methods. Our work not only advances the field but also enhances our +comprehension of aggregation-based weakly supervised EA. + +
+
+
+
+
+ + ☆ Domain Adaptation of Multilingual Semantic Search -- Literature Review + + +
+ This literature review gives an overview of current approaches to domain
+adaptation in a low-resource setting and of approaches to multilingual
+semantic search in a low-resource setting. We developed a new typology to
+cluster domain adaptation approaches based on which part of a dense textual
+information retrieval system they adapt, focusing on how to combine them
+efficiently. We also explore the possibilities of combining multilingual
+semantic search with domain adaptation approaches for dense retrievers in a
+low-resource setting.
+
+
+
+
+
+
+ + ☆ Dynamic Sparse Learning: A Novel Paradigm for Efficient Recommendation WSDM 2024 + + +
+ In the realm of deep learning-based recommendation systems, the increasing +computational demands, driven by the growing number of users and items, pose a +significant challenge to practical deployment. This challenge is primarily +twofold: reducing the model size while effectively learning user and item +representations for efficient recommendations. Despite considerable +advancements in model compression and architecture search, prevalent approaches +face notable constraints. These include substantial additional computational +costs from pre-training/re-training in model compression and an extensive +search space in architecture design. Additionally, managing complexity and +adhering to memory constraints is problematic, especially in scenarios with +strict time or space limitations. Addressing these issues, this paper +introduces a novel learning paradigm, Dynamic Sparse Learning (DSL), tailored +for recommendation models. DSL innovatively trains a lightweight sparse model +from scratch, periodically evaluating and dynamically adjusting each weight's +significance and the model's sparsity distribution during the training. This +approach ensures a consistent and minimal parameter budget throughout the full +learning lifecycle, paving the way for "end-to-end" efficiency from training to +inference. Our extensive experimental results underline DSL's effectiveness, +significantly reducing training and inference costs while delivering comparable +recommendation performance. + +
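+
+ A sketch of one dynamic-sparse-training update in the spirit of DSL: hold the
+parameter budget fixed, periodically drop the weakest active weights, and
+regrow elsewhere. The drop and grow criteria below are the simplest possible
+choices, not necessarily the paper's exact ones:
+
+import torch
+
+def update_mask(weight, mask, drop_frac=0.1):
+    flat_w, flat_m = weight.view(-1), mask.view(-1)   # views share storage
+    k = int(flat_m.sum().item() * drop_frac)
+    if k == 0:
+        return mask
+    # drop the k weakest currently-active weights
+    scores = flat_w.abs().masked_fill(~flat_m, float("inf"))
+    flat_m[torch.topk(scores, k, largest=False).indices] = False
+    # regrow k random inactive connections, keeping the budget constant
+    inactive = (~flat_m).nonzero().squeeze(1)
+    flat_m[inactive[torch.randperm(len(inactive))[:k]]] = True
+    return mask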
+
+ comment: 10 pages, 5 figures, 4 tables. Accecpted by WSDM 2024 +
+
+
+
+
+ + ☆ Comparing Knowledge Sources for Open-Domain Scientific Claim + Verification EACL 2024 + + +
+ The increasing rate at which scientific knowledge is discovered and health
+claims shared online has highlighted the importance of developing efficient
+fact-checking systems for scientific claims. The usual setting for this task in
+the literature assumes that the documents containing the evidence for claims
+are already provided and annotated or contained in a limited corpus. This
+renders the systems unrealistic for real-world settings where knowledge sources
+with potentially millions of documents need to be queried to find relevant
+evidence. In this paper, we perform an array of experiments to test the
+performance of open-domain claim verification systems. We test the final
+verdict prediction of systems on four datasets of biomedical and health claims
+in different settings. While keeping the pipeline's evidence selection and
+verdict prediction parts constant, document retrieval is performed over three
+common knowledge sources (PubMed, Wikipedia, Google) and using two different
+information retrieval techniques. We show that PubMed works better with
+specialized biomedical claims, while Wikipedia is more suited for everyday
+health concerns. Likewise, BM25 excels in retrieval precision, while semantic
+search excels in recall of relevant evidence. We discuss the results, outline
+frequent retrieval patterns and challenges, and provide promising future
+directions.
+
+
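+
+ The two retrieval techniques being compared can be sketched in a few lines
+(the corpus, claim, and model choice are placeholders for the paper's setup):
+
+from rank_bm25 import BM25Okapi
+from sentence_transformers import SentenceTransformer, util
+
+corpus = ["Vitamin D supplementation reduces respiratory infection risk.",
+          "Masks lower the transmission of airborne viruses."]
+claim = "Does vitamin D help prevent infections?"
+
+# Lexical retrieval: exact-term matching, tends toward high precision
+bm25 = BM25Okapi([d.lower().split() for d in corpus])
+print(bm25.get_scores(claim.lower().split()))
+
+# Semantic retrieval: embedding similarity, tends toward high recall
+model = SentenceTransformer("all-MiniLM-L6-v2")
+emb = model.encode(corpus + [claim], convert_to_tensor=True)
+print(util.cos_sim(emb[-1], emb[:-1]))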
+
+ comment: Accepted to EACL 2024 +
+
+
+
+
+ + ☆ Trinity: Syncretizing Multi-/Long-tail/Long-term Interests All in One + + +
+ Interest modeling in recommender system has been a constant topic for +improving user experience, and typical interest modeling tasks (e.g. +multi-interest, long-tail interest and long-term interest) have been +investigated in many existing works. However, most of them only consider one +interest in isolation, while neglecting their interrelationships. In this +paper, we argue that these tasks suffer from a common "interest amnesia" +problem, and a solution exists to mitigate it simultaneously. We figure that +long-term cues can be the cornerstone since they reveal multi-interest and +clarify long-tail interest. Inspired by the observation, we propose a novel and +unified framework in the retrieval stage, "Trinity", to solve interest amnesia +problem and improve multiple interest modeling tasks. We construct a real-time +clustering system that enables us to project items into enumerable clusters, +and calculate statistical interest histograms over these clusters. Based on +these histograms, Trinity recognizes underdelivered themes and remains stable +when facing emerging hot topics. Trinity is more appropriate for large-scale +industry scenarios because of its modest computational overheads. Its derived +retrievers have been deployed on the recommender system of Douyin, +significantly improving user experience and retention. We believe that such +practical experience can be well generalized to other scenarios. + +
+
+
+
+
+ + ☆ Intersectional Two-sided Fairness in Recommendation + + +
+ Fairness of recommender systems (RS) has attracted increasing attention +recently. Based on the involved stakeholders, the fairness of RS can be divided +into user fairness, item fairness, and two-sided fairness which considers both +user and item fairness simultaneously. However, we argue that the +intersectional two-sided unfairness may still exist even if the RS is two-sided +fair, which is observed and shown by empirical studies on real-world data in +this paper, and has not been well-studied previously. To mitigate this problem, +we propose a novel approach called Intersectional Two-sided Fairness +Recommendation (ITFR). Our method utilizes a sharpness-aware loss to perceive +disadvantaged groups, and then uses collaborative loss balance to develop +consistent distinguishing abilities for different intersectional groups. +Additionally, predicted score normalization is leveraged to align positive +predicted scores to fairly treat positives in different intersectional groups. +Extensive experiments and analyses on three public datasets show that our +proposed approach effectively alleviates the intersectional two-sided +unfairness and consistently outperforms previous state-of-the-art methods. + +
+
+
+
+
+ + ☆ Large Language Model Distilling Medication Recommendation Model + + +
+ The recommendation of medication is a vital aspect of intelligent healthcare
+systems, as it involves prescribing the most suitable drugs based on a
+patient's specific health needs. Unfortunately, many sophisticated models
+currently in use tend to overlook the nuanced semantics of medical data,
+relying heavily on identity information instead. Furthermore, these models face
+significant challenges in handling cases involving patients who are visiting
+the hospital for the first time, as they lack prior prescription histories to
+draw upon. To tackle these issues, we harness the powerful semantic
+comprehension and input-agnostic characteristics of Large Language Models
+(LLMs). Our research aims to transform existing medication recommendation
+methodologies using LLMs. In this paper, we introduce a novel approach called
+Large Language Model Distilling Medication Recommendation (LEADER). We begin by
+creating appropriate prompt templates that enable LLMs to suggest medications
+effectively. However, the straightforward integration of LLMs into recommender
+systems leads to an out-of-corpus issue specific to drugs. We handle it by
+adapting the LLMs with a novel output layer and a refined tuning loss function.
+Although LLM-based models exhibit remarkable capabilities, they are plagued by
+high computational costs during inference, which is impractical for the
+healthcare sector. To mitigate this, we have developed a feature-level
+knowledge distillation technique, which transfers the LLM's proficiency to a
+more compact model. Extensive experiments conducted on two real-world datasets,
+MIMIC-III and MIMIC-IV, demonstrate that our proposed model not only delivers
+effective results but also is efficient. To ease the reproducibility of our
+experiments, we release the implementation code online.
+
+
+
+
+
+
+ + ☆ List-aware Reranking-Truncation Joint Model for Search and + Retrieval-augmented Generation WWW 2024 + + +
+ The results of information retrieval (IR) are usually presented in the form
+of a ranked list of candidate documents, such as web search for humans and
+retrieval-augmented generation for large language models (LLMs). List-aware
+retrieval aims to capture the list-level contextual features to return a better
+list, mainly including reranking and truncation. Reranking finely re-scores the
+documents in the list. Truncation dynamically determines the cut-off point of
+the ranked list to achieve the trade-off between overall relevance and avoiding
+misinformation from irrelevant documents. Previous studies treat them as two
+separate tasks and model them separately. However, the separation is not
+optimal. First, it is hard to share the contextual information of the ranking
+list between the two tasks. Second, the separate pipeline usually meets the
+error accumulation problem, where the small error from the reranking stage can
+largely affect the truncation stage. To solve these problems, we propose a
+Reranking-Truncation joint model (GenRT) that can perform the two tasks
+concurrently. GenRT integrates reranking and truncation via a generative
+paradigm based on an encoder-decoder architecture. We also design novel loss
+functions for joint optimization to make the model learn both tasks. Sharing
+parameters by the joint model is conducive to making full use of the common
+modeling information of the two tasks. Besides, the two tasks are performed
+concurrently and co-optimized to solve the error accumulation problem between
+separate stages. Experiments on public learning-to-rank benchmarks and
+open-domain Q&A tasks show that our method achieves SOTA performance on both
+reranking and truncation tasks for web search and retrieval-augmented LLMs.
+
+
+
+ comment: Accepted by WWW 2024 +
+
+
+
+
+ + ☆ Denoising Time Cycle Modeling for Recommendation + + +
+ Recently, modeling temporal patterns of user-item interactions has attracted
+much attention in recommender systems. We argue that existing methods ignore
+the variety of temporal patterns in user behaviors. We define the subset of
+user behaviors that are irrelevant to the target item as noise, which limits
+the performance of target-related time cycle modeling and affects the
+recommendation performance. In this paper, we propose Denoising Time Cycle
+Modeling (DiCycle), a novel approach to denoise user behaviors and select the
+subset of user behaviors that are highly related to the target item. DiCycle is
+able to explicitly model diverse time cycle patterns for recommendation.
+Extensive experiments are conducted on both public benchmarks and a real-world
+dataset, demonstrating the superior performance of DiCycle over
+state-of-the-art recommendation methods.
+
+
+
+
+
+
+ + ☆ Early prediction of onset of sepsis in Clinical Setting + + +
+ This study proposes the use of Machine Learning models to predict the early
+onset of sepsis using deidentified clinical data from Montefiore Medical Center
+in Bronx, NY, USA. A supervised learning approach was adopted, wherein an
+XGBoost model was trained on 80% of the data, encompassing 107 features
+(including the original and derived features), and subsequently evaluated on
+the remaining 20% held out as test data. The model was also validated on
+prospective data that was entirely unseen during the training phase. To assess
+the model's performance at the individual patient level and the timeliness of
+the prediction, a normalized utility score was employed, a widely recognized
+scoring methodology for sepsis detection, as outlined in the PhysioNet Sepsis
+Challenge paper. Metrics such as F1 Score, Sensitivity, Specificity, and Flag
+Rate were also computed. The model achieved a normalized utility score of
+0.494 on test data and 0.378 on prospective data at a threshold of 0.3. The F1
+scores were 80.8% and 67.1% respectively for the test data and the prospective
+data at the same threshold, highlighting its potential to be integrated
+effectively into clinical workflows. These results bear testament to the
+model's robust predictive capabilities and its potential to substantially
+impact clinical decision-making processes.
+
+
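+
+ A hedged sketch of the training setup described above, with synthetic
+stand-in data; the study's actual features, labels, and utility score are not
+reproduced here:
+
+import numpy as np
+from xgboost import XGBClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score
+
+X = np.random.randn(1000, 107)                # stand-in for the 107 features
+y = (np.random.rand(1000) < 0.1).astype(int)  # stand-in sepsis labels
+
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.8, random_state=0)
+clf = XGBClassifier(n_estimators=200, max_depth=4).fit(X_tr, y_tr)
+
+pred = (clf.predict_proba(X_te)[:, 1] >= 0.3).astype(int)  # threshold 0.3, as above
+print("F1:", f1_score(y_te, pred))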
+
+ comment: 16 pages, 6 figures and 7 tables +
+
+
+
+
+ + ☆ Harnessing PubMed User Query Logs for Post Hoc Explanations of + Recommended Similar Articles + + +
+ Searching for a related article based on a reference article is an integral
+part of scientific research. PubMed, like many academic search engines, has a
+"similar articles" feature that recommends articles relevant to the current
+article viewed by a user. Explaining recommended items can be of great utility
+to users, particularly in the literature search process. With more than a
+million biomedical papers being published each year, explaining the recommended
+similar articles would facilitate researchers and clinicians in searching for
+related articles. Nonetheless, the majority of current literature
+recommendation systems lack explanations for their suggestions. We employ a
+post hoc approach to explaining recommendations by identifying relevant tokens
+in the titles of similar articles. Our major contribution is building PubCLogs
+by repurposing 5.6 million pairs of coclicked articles from PubMed's user query
+logs. Using our PubCLogs dataset, we train the Highlight Similar Article Title
+(HSAT) model, a transformer-based model designed to select the most relevant
+parts of the title of a similar article, based on the title and abstract of a
+seed article. HSAT demonstrates strong performance in our empirical
+evaluations, achieving an F1 score of 91.72 percent on the PubCLogs test set,
+considerably outperforming several baselines including BM25 (70.62), MPNet
+(67.11), MedCPT (62.22), GPT-3.5 (46.00), and GPT-4 (64.89). Additional
+evaluations on a separate, manually annotated test set further verify HSAT's
+performance. Moreover, participants of our user study indicate a preference
+for HSAT, due to its superior balance between conciseness and
+comprehensiveness. Our study suggests that repurposing the user query logs of
+academic search engines can be a promising way to train state-of-the-art
+models for explaining literature recommendation.
+
+
+
+
+
+
+ + ☆ FINEST: Stabilizing Recommendations by Rank-Preserving Fine-Tuning RecSys 2023 + + +
+ Modern recommender systems may output considerably different recommendations +due to small perturbations in the training data. Changes in the data from a +single user will alter the recommendations as well as the recommendations of +other users. In applications like healthcare, housing, and finance, this +sensitivity can have adverse effects on user experience. We propose a method to +stabilize a given recommender system against such perturbations. This is a +challenging task due to (1) the lack of a ``reference'' rank list that can be +used to anchor the outputs; and (2) the computational challenges in ensuring +the stability of rank lists with respect to all possible perturbations of +training data. Our method, FINEST, overcomes these challenges by obtaining +reference rank lists from a given recommendation model and then fine-tuning the +model under simulated perturbation scenarios with rank-preserving +regularization on sampled items. Our experiments on real-world datasets +demonstrate that FINEST can ensure that recommender models output stable +recommendations under a wide range of different perturbations without +compromising next-item prediction accuracy. + +
+
+ comment: Accepted at the 6th FAccTRec Workshop on Responsible Recommendation @ + ACM RecSys 2023 +
+
+
+
+
+ + ☆ A Fuzzy Approach to Record Linkages + + +
+ Record Linkage is the process of identifying and unifying records from +various independent data sources. Existing strategies, which can be either +deterministic or probabilistic, often fail to link records satisfactorily under +uncertainty. This paper describes an indigenously (locally) developed fuzzy +linkage method, based on fuzzy set techniques, which can effectively account +for this uncertainty prevalent in the disparate data sources and address the +shortcomings of the existing approaches. Extensive testing, evaluation and +comparisons have demonstrated the efficacy of this fuzzy approach for record +linkages. + +
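+
+ A toy illustration of the fuzzy-set view of linkage: per-field string
+similarities act as membership degrees and are combined with a fuzzy AND
+(minimum). The paper's actual membership functions are not given in the
+abstract, so this shows only the general pattern:
+
+from difflib import SequenceMatcher
+
+def membership(a, b):
+    # degree in [0, 1] to which two field values "are the same"
+    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
+
+def match_degree(rec1, rec2, fields):
+    return min(membership(rec1[f], rec2[f]) for f in fields)  # fuzzy AND
+
+r1 = {"name": "Jon Smith", "city": "Springfield"}
+r2 = {"name": "John Smith", "city": "Springfeld"}
+print(match_degree(r1, r2, ["name", "city"]))  # high degree -> likely one entity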
+
+ comment: Journal Paper (9 pages, 6 Figures) +
+
+
+
+
+ + ☆ Recommendation Fairness in Social Networks Over Time + + +
+ In social recommender systems, it is crucial that the recommendation models +provide equitable visibility for different demographic groups, such as gender +or race. Most existing research has addressed this problem by only studying +individual static snapshots of networks that typically change over time. To +address this gap, we study the evolution of recommendation fairness over time +and its relation to dynamic network properties. We examine three real-world +dynamic networks by evaluating the fairness of six recommendation algorithms +and analyzing the association between fairness and network properties over +time. We further study how interventions on network properties influence +fairness by examining counterfactual scenarios with alternative evolution +outcomes and differing network properties. Our results on empirical datasets +suggest that recommendation fairness improves over time, regardless of the +recommendation method. We also find that two network properties, minority +ratio, and homophily ratio, exhibit stable correlations with fairness over +time. Our counterfactual study further suggests that an extreme homophily ratio +potentially contributes to unfair recommendations even with a balanced minority +ratio. Our work provides insights into the evolution of fairness within dynamic +networks in social science. We believe that our findings will help system +operators and policymakers to better comprehend the implications of temporal +changes and interventions targeting fairness in social networks. + +
+
+
+
+
+ + ♻ ☆ Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature + + +
+ As Distributed Ledger Technologies (DLTs) rapidly evolve, their impacts +extend beyond technology, influencing environmental and societal aspects. This +evolution has increased publications, making manual literature analysis +increasingly challenging. We address this with a Natural Language Processing +(NLP)-based systematic literature review method to explore the intersection of +Distributed Ledger Technology (DLT) with its Environmental, Social, and +Governance (ESG) aspects. Our approach involves building and refining a +directed citation network from 107 seed papers to a corpus of 24,539 +publications and fine-tuning a transformer-based language model for Named +Entity Recognition (NER) on DLT and ESG domains. Applying this model, we +distilled the corpus to 505 key publications, enabling an inaugural literature +review and temporal graph analysis of DLT's evolution in ESG contexts. Our +contributions include an adaptable and scalable NLP-driven systematic +literature review methodology and a unique NER dataset of 54,808 entities, +tailored for DLT and ESG research. Our inaugural literature review demonstrates +their applicability and effectiveness in analyzing DLT's evolution and impacts, +proving invaluable for stakeholders in the DLT domain. + +
+
+
+
+
+ + ♻ ☆ RimiRec: Modeling Refined Multi-interest in Hierarchical Structure for + Recommendation + + +
+ Industrial recommender systems usually consist of the retrieval stage and the
+ranking stage, in order to handle billions of users and items. The retrieval
+stage retrieves candidate items relevant to user interests for recommendations
+and has attracted much attention. Frequently, a user shows refined
+multi-interests in a hierarchical structure. For example, a user likes Conan
+and Kuroba Kaito, characters that fall under the hierarchy "Animation,
+Japanese Animation, Detective Conan". However, most existing methods ignore
+this hierarchical nature, and simply average the fine-grained interest
+information. Therefore, we propose a novel two-stage approach to explicitly
+model refined multi-interest in a hierarchical structure for recommendation.
+In the first hierarchical multi-interest mining stage, hierarchical clustering
+and a transformer-based model adaptively generate circles or sub-circles that
+users are interested in. In the second stage, the partition of the retrieval
+space allows the EBR models to deal only with items within each circle and
+accurately capture users' refined interests. Experimental results show that
+the proposed approach achieves state-of-the-art performance. Our framework has
+also been deployed at Lofter.
+
+
+
+ comment: 4 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Spectral-Based Graph Neural Networks for Complementary Item + Recommendation AAAI-24 + + +
+ Modeling complementary relationships greatly helps recommender systems to +accurately and promptly recommend the subsequent items when one item is +purchased. Unlike traditional similar relationships, items with complementary +relationships may be purchased successively (such as iPhone and Airpods Pro), +and they not only share relevance but also exhibit dissimilarity. Since the two +attributes are opposites, modeling complementary relationships is challenging. +Previous attempts to exploit these relationships have either ignored or +oversimplified the dissimilarity attribute, resulting in ineffective modeling +and an inability to balance the two attributes. Since Graph Neural Networks +(GNNs) can capture the relevance and dissimilarity between nodes in the +spectral domain, we can leverage spectral-based GNNs to effectively understand +and model complementary relationships. In this study, we present a novel +approach called Spectral-based Complementary Graph Neural Networks (SComGNN) +that utilizes the spectral properties of complementary item graphs. We make the +first observation that complementary relationships consist of low-frequency and +mid-frequency components, corresponding to the relevance and dissimilarity +attributes, respectively. Based on this spectral observation, we design +spectral graph convolutional networks with low-pass and mid-pass filters to +capture the low-frequency and mid-frequency components. Additionally, we +propose a two-stage attention mechanism to adaptively integrate and balance the +two attributes. Experimental results on four e-commerce datasets demonstrate +the effectiveness of our model, with SComGNN significantly outperforming +existing baseline models. + +
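+
+ The low-/mid-frequency split can be illustrated with two simple spectral
+filters on a toy item graph; the filter polynomials below are illustrative,
+not the paper's exact designs:
+
+import numpy as np
+
+A = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 0]], dtype=float)  # tiny item graph
+d = A.sum(1)
+L = np.eye(3) - A / np.sqrt(np.outer(d, d))  # normalized Laplacian, spectrum in [0, 2]
+
+X = np.random.randn(3, 4)                    # item features
+low_pass = (np.eye(3) - 0.5 * L) @ X         # g(lam) = 1 - lam/2: keeps the smooth
+                                             # signal (relevance)
+mid_pass = (L - 0.5 * L @ L) @ X             # g(lam) = lam - lam^2/2: peaks at
+                                             # lam = 1 (dissimilarity)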
+
+ comment: Accepted by AAAI-24 +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Test-Time Adaptation for Depth Completion + + +
+ It is common to observe performance degradation when transferring models +trained on some (source) datasets to target testing data due to a domain gap +between them. Existing methods for bridging this gap, such as domain adaptation +(DA), may require the source data on which the model was trained (often not +available), while others, i.e., source-free DA, require many passes through the +testing data. We propose an online test-time adaptation method for depth +completion, the task of inferring a dense depth map from a single image and +associated sparse depth map, that closes the performance gap in a single pass. +We first present a study on how the domain shift in each data modality affects +model performance. Based on our observations that the sparse depth modality +exhibits a much smaller covariate shift than the image, we design an embedding +module trained in the source domain that preserves a mapping from features +encoding only sparse depth to those encoding image and sparse depth. During +test time, sparse depth features are projected using this map as a proxy for +source domain features and are used as guidance to train a set of auxiliary +parameters (i.e., adaptation layer) to align image and sparse depth features +from the target test domain to that of the source domain. We evaluate our +method on indoor and outdoor scenarios and show that it improves over baselines +by an average of 21.1%. + +
+
+
+
+
+ + ☆ HASSOD: Hierarchical Adaptive Self-Supervised Object Detection NeurIPS 2023 + + +
+ The human visual perception system demonstrates exceptional capabilities in +learning without explicit supervision and understanding the part-to-whole +composition of objects. Drawing inspiration from these two abilities, we +propose Hierarchical Adaptive Self-Supervised Object Detection (HASSOD), a +novel approach that learns to detect objects and understand their compositions +without human supervision. HASSOD employs a hierarchical adaptive clustering +strategy to group regions into object masks based on self-supervised visual +representations, adaptively determining the number of objects per image. +Furthermore, HASSOD identifies the hierarchical levels of objects in terms of +composition, by analyzing coverage relations between masks and constructing +tree structures. This additional self-supervised learning task leads to +improved detection performance and enhanced interpretability. Lastly, we +abandon the inefficient multi-round self-training process utilized in prior +methods and instead adapt the Mean Teacher framework from semi-supervised +learning, which leads to a smoother and more efficient training process. +Through extensive experiments on prevalent image datasets, we demonstrate the +superiority of HASSOD over existing methods, thereby advancing the state of the +art in self-supervised object detection. Notably, we improve Mask AR from 20.2 +to 22.5 on LVIS, and from 17.0 to 26.0 on SA-1B. Project page: +https://HASSOD-NeurIPS23.github.io. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ AONeuS: A Neural Rendering Framework for Acoustic-Optical Sensor Fusion + + +
+ Underwater perception and 3D surface reconstruction are challenging problems
+with broad applications in construction, security, marine archaeology, and
+environmental monitoring. Treacherous operating conditions, fragile
+surroundings, and limited navigation control often dictate that submersibles
+restrict their range of motion and, thus, the baseline over which they can
+capture measurements. In the context of 3D scene reconstruction, it is
+well-known that smaller baselines make reconstruction more challenging. Our
+work develops a physics-based multimodal acoustic-optical neural surface
+reconstruction framework (AONeuS) capable of effectively integrating
+high-resolution RGB measurements with low-resolution depth-resolved imaging
+sonar measurements. By fusing these complementary modalities, our framework can
+reconstruct accurate high-resolution 3D surfaces from measurements captured
+over heavily-restricted baselines. Through extensive simulations and in-lab
+experiments, we demonstrate that AONeuS dramatically outperforms recent
+RGB-only and sonar-only inverse-differentiable-rendering-based surface
+reconstruction methods. A website visualizing the results of our paper is
+located at this address: https://aoneus.github.io/
+
+
+
+ comment: First two authors contributed equally. Paper website: + https://aoneus.github.io/ +
+
+
+
+
+ + ☆ Do Diffusion Models Learn Semantically Meaningful and Efficient + Representations? + + +
+ Diffusion models are capable of impressive feats of image generation with
+uncommon juxtapositions such as astronauts riding horses on the moon with
+properly placed shadows. These outputs indicate the ability to perform
+compositional generalization, but how do the models do so? We perform
+controlled experiments on conditional DDPMs learning to generate 2D spherical
+Gaussian bumps centered at specified $x$- and $y$-positions. Our results show
+that the emergence of semantically meaningful latent representations is key to
+achieving high performance. Over the course of learning, the model traverses
+three distinct phases of latent representations: (phase A) no latent structure,
+(phase B) a 2D manifold of disordered states, and (phase C) a 2D ordered
+manifold. Corresponding to each of these phases, we identify qualitatively
+different generation behaviors: 1) multiple bumps are generated, 2) one bump is
+generated but at inaccurate $x$ and $y$ locations, 3) a bump is generated at
+the correct $x$ and $y$ location. Furthermore, we show that even under
+imbalanced datasets where features ($x$- versus $y$-positions) are represented
+with skewed frequencies, the learning process for $x$ and $y$ is coupled rather
+than factorized, demonstrating that simple vanilla-flavored diffusion models
+cannot learn efficient representations in which localization in $x$ and $y$
+are factorized into separate 1D tasks. These findings suggest the need for
+future work to find inductive biases that will push generative models to
+discover and exploit factorizable independent structures in their inputs,
+which will be required to vault these models into more data-efficient regimes.
+
+
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Nevermind: Instruction Override and Moderation in Large Language Models + + +
+ Given the impressive capabilities of recent Large Language Models (LLMs), we
+investigate and benchmark the most popular proprietary models and open-source
+models of different sizes on the task of explicit instruction following in
+conflicting situations, e.g. overrides. These include the ability of the model
+to override the knowledge within the weights of the model, the ability to
+override (or moderate) extracted knowledge in the prompt, and lastly the
+ability to perform a full jailbreak. Our experiments suggest several key
+findings for improving instruction following: larger models perform best at
+following instructions that override internal and contextual instructions, and
+are obedient, even to a fault. When scaling to longer contexts via rope
+scaling, a significant buffer needs to be maintained from the edge of the
+perplexity cliff in order to maintain instruction following capabilities.
+Finally, we observe that improving instruction following, and subsequently
+instruction overrides/jailbreaks, is fundamentally at odds with the ability of
+a language model to follow given safety filters or guidelines. Thus, we
+postulate that the most effective approach for safe, trustworthy AI should be
+handled externally to the LLM itself.
+
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Swin-UMamba: Mamba-based UNet with ImageNet-based pretraining + + +
+ Accurate medical image segmentation demands the integration of multi-scale
+information, spanning from local features to global dependencies. However, it
+is challenging for existing methods to model long-range global information,
+where convolutional neural networks (CNNs) are constrained by their local
+receptive fields, and vision transformers (ViTs) suffer from the high quadratic
+complexity of their attention mechanism. Recently, Mamba-based models have
+gained great attention for their impressive ability in long sequence modeling.
+Several studies have demonstrated that these models can outperform popular
+vision models in various tasks, offering higher accuracy, lower memory
+consumption, and less computational burden. However, existing Mamba-based
+models are mostly trained from scratch and do not explore the power of
+pretraining, which has been proven to be quite effective for data-efficient
+medical image analysis. This paper introduces a novel Mamba-based model,
+Swin-UMamba, designed specifically for medical image segmentation tasks,
+leveraging the advantages of ImageNet-based pretraining. Our experimental
+results reveal the vital role of ImageNet-based pretraining in enhancing the
+performance of Mamba-based models. Swin-UMamba demonstrates superior
+performance with a large margin compared to CNNs, ViTs, and the latest
+Mamba-based models. Notably, on the AbdomenMRI, Endoscopy, and Microscopy
+datasets, Swin-UMamba outperforms its closest counterpart U-Mamba by an average
+score of 3.58%. The code and models of Swin-UMamba are publicly available at:
+https://github.com/JiarunLiu/Swin-UMamba
+
+
+
+ comment: Technical report +
+
+
+
+
+ + ☆ DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open + Language Models + + +
+ Mathematical reasoning poses a significant challenge for language models due +to its complex and structured nature. In this paper, we introduce DeepSeekMath +7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B +math-related tokens sourced from Common Crawl, together with natural language +and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the +competition-level MATH benchmark without relying on external toolkits and +voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. +Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. +The mathematical reasoning capability of DeepSeekMath is attributed to two key +factors: First, we harness the significant potential of publicly available web +data through a meticulously engineered data selection pipeline. Second, we +introduce Group Relative Policy Optimization (GRPO), a variant of Proximal +Policy Optimization (PPO), that enhances mathematical reasoning abilities while +concurrently optimizing the memory usage of PPO. + +
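+
+ The core of GRPO is a group-relative advantage: sample several responses per
+prompt and normalize each response's reward by the group's own statistics, so
+no separate value model is needed. A minimal sketch of that step (the
+PPO-style clipped policy update around it is omitted):
+
+import torch
+
+def group_relative_advantages(rewards, eps=1e-8):
+    # rewards: (G,) scalar rewards for G sampled responses to one prompt
+    return (rewards - rewards.mean()) / (rewards.std() + eps)
+
+rewards = torch.tensor([0.0, 1.0, 0.5, 1.0])
+print(group_relative_advantages(rewards))  # above-average responses get
+                                           # positive advantage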
+
+
+
+
+ + ☆ GUARD: Role-playing to Generate Natural-language Jailbreakings to Test + Guideline Adherence of Large Language Models + + +
+ The discovery of "jailbreaks" that bypass the safety filters of Large
+Language Models (LLMs) and elicit harmful responses has encouraged the
+community to implement safety measures. One major safety measure is to
+proactively test the LLMs with jailbreaks prior to the release. Therefore, such
+testing will require a method that can generate jailbreaks massively and
+efficiently. In this paper, we follow a novel yet intuitive strategy to
+generate jailbreaks in the style of human-written ones. We propose a
+role-playing system that assigns four different roles to the user LLMs to
+collaborate on new jailbreaks. Furthermore, we collect existing jailbreaks and
+split them into different independent characteristics using clustering over
+frequency and semantic patterns sentence by sentence. We organize these
+characteristics into a knowledge graph, making them more accessible and easier
+to retrieve. Our system of different roles will leverage this knowledge graph
+to generate new jailbreaks, which have proved effective in inducing LLMs to
+generate unethical or guideline-violating responses. In addition, we also
+pioneer a setting in our system that will automatically follow the
+government-issued guidelines to generate jailbreaks to test whether LLMs
+follow the guidelines accordingly. We refer to our system as GUARD (Guideline
+Upholding through Adaptive Role-play Diagnostics). We have empirically
+validated the effectiveness of GUARD on three cutting-edge open-sourced LLMs
+(Vicuna-13B, LongChat-7B, and Llama-2-7B), as well as a widely-utilized
+commercial LLM (ChatGPT). Moreover, our work extends to the realm of vision
+language models (MiniGPT-v2 and Gemini Vision Pro), showcasing GUARD's
+versatility and contributing valuable insights for the development of safer,
+more reliable LLM-based applications across diverse modalities.
+
+
+
+ comment: 22 pages
+
+
+
+
+
+ + ☆ Ginger: An Efficient Curvature Approximation with Linear Complexity for + General Neural Networks + + +
+ Second-order optimization approaches like the generalized Gauss-Newton method
+are considered more powerful as they utilize the curvature information of the
+objective function with preconditioning matrices. Albeit offering tempting
+theoretical benefits, they are not easily applicable to modern deep learning.
+The major reason is the quadratic memory and cubic time complexity required to
+compute the inverse of the matrix. These requirements are infeasible even with
+state-of-the-art hardware. In this work, we propose Ginger, an
+eigendecomposition for the inverse of the generalized Gauss-Newton matrix. Our
+method enjoys efficient linear memory and time complexity for each iteration.
+Instead of approximating the conditioning matrix, we directly maintain its
+inverse to make the approximation more accurate. We provide the convergence
+result of Ginger for non-convex objectives. Our experiments on different tasks
+with different model architectures verify the effectiveness of our method. Our
+code is publicly available.
+
+
+
+
+
+
+ + ☆ Flora: Low-Rank Adapters Are Secretly Gradient Compressors + + +
+ Despite large neural networks demonstrating remarkable abilities to complete +different tasks, they require excessive memory usage to store the optimization +states for training. To alleviate this, the low-rank adaptation (LoRA) is +proposed to reduce the optimization states by training fewer parameters. +However, LoRA restricts overall weight update matrices to be low-rank, limiting +the model performance. In this work, we investigate the dynamics of LoRA and +identify that it can be approximated by a random projection. Based on this +observation, we propose Flora, which is able to achieve high-rank updates by +resampling the projection matrices while enjoying the sublinear space +complexity of optimization states. We conduct experiments across different +tasks and model architectures to verify the effectiveness of our approach. + +
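+
+ The random-projection view can be shown in a few lines: compressing a
+gradient through a random matrix keeps only an r-dimensional state, and
+resampling the projection over time lets accumulated updates exceed rank r.
+Shapes and scaling here are illustrative, not Flora's exact implementation:
+
+import torch
+
+m, n, r = 64, 64, 8
+G = torch.randn(m, n)              # full gradient of one weight matrix
+
+P = torch.randn(n, r) / r ** 0.5   # random projection, resampled periodically
+C = G @ P                          # stored state is m x r, not m x n
+G_hat = C @ P.t()                  # decompressed low-rank estimate of G
+
+print(torch.linalg.matrix_rank(G_hat))  # <= r for a single projection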
+
+
+
+
+ + ☆ Zero-shot Object-Level OOD Detection with Context-Aware Inpainting + + +
+ Machine learning algorithms are increasingly provided as black-box cloud
+services or pre-trained models, without access to their training data. This
+motivates the problem of zero-shot out-of-distribution (OOD) detection.
+Concretely, we aim to detect OOD objects that do not belong to the classifier's
+label set but are erroneously classified as in-distribution (ID) objects. Our
+approach, RONIN, uses an off-the-shelf diffusion model to replace detected
+objects with inpainting. RONIN conditions the inpainting process on the
+predicted ID label, drawing the input object closer to the in-distribution
+domain. As a result, the reconstructed object is very close to the original in
+the ID cases and far in the OOD cases, allowing RONIN to effectively
+distinguish ID and OOD samples. Through extensive experiments, we demonstrate
+that RONIN achieves competitive results compared to previous approaches across
+several datasets, both in zero-shot and non-zero-shot settings.
+
+
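+
+ A hedged sketch of the inpaint-and-compare loop using an off-the-shelf
+inpainting pipeline; the checkpoint, prompt template, and pixel-distance score
+are placeholder choices, not necessarily RONIN's:
+
+import numpy as np
+import torch
+from diffusers import StableDiffusionInpaintPipeline
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
+).to("cuda")
+
+def ood_score(image, object_mask, predicted_label):
+    # repaint the detected object conditioned on its predicted ID label
+    recon = pipe(prompt=f"a photo of a {predicted_label}",
+                 image=image, mask_image=object_mask).images[0]
+    recon = recon.resize(image.size)
+    a, b = np.asarray(image, float), np.asarray(recon, float)
+    return float(((a - b) ** 2).mean())  # far from the original -> likely OOD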
+
+
+
+
+ + ☆ InstanceDiffusion: Instance-level Control for Image Generation + + +
+ Text-to-image diffusion models produce high quality images but do not offer +control over individual instances in the image. We introduce InstanceDiffusion +that adds precise instance-level control to text-to-image diffusion models. +InstanceDiffusion supports free-form language conditions per instance and +allows flexible ways to specify instance locations such as simple single +points, scribbles, bounding boxes or intricate instance segmentation masks, and +combinations thereof. We propose three major changes to text-to-image models +that enable precise instance-level control. Our UniFusion block enables +instance-level conditions for text-to-image models, the ScaleU block improves +image fidelity, and our Multi-instance Sampler improves generations for +multiple instances. InstanceDiffusion significantly surpasses specialized +state-of-the-art models for each location condition. Notably, on the COCO +dataset, we outperform previous state-of-the-art by 20.4% AP$_{50}^\text{box}$ +for box inputs, and 25.4% IoU for mask inputs. + +
+
+ comment: Preprint; Project page: + https://people.eecs.berkeley.edu/~xdwang/projects/InstDiff/ +
+
+
+
+
+ + ☆ Make Every Move Count: LLM-based High-Quality RTL Code Generation Using + MCTS + + +
+ Existing large language models (LLMs) for register transfer level code +generation face challenges like compilation failures and suboptimal power, +performance, and area (PPA) efficiency. This is due to the lack of PPA +awareness in conventional transformer decoding algorithms. In response, we +present an automated transformer decoding algorithm that integrates Monte Carlo +tree-search for lookahead, guiding the transformer to produce compilable, +functionally correct, and PPA-optimized code. Empirical evaluation with a +fine-tuned language model on RTL codesets shows that our proposed technique +consistently generates functionally correct code compared to prompting-only +methods and effectively addresses the PPA-unawareness drawback of naive large +language models. For the largest design generated by the state-of-the-art LLM +(16-bit adder), our technique can achieve a 31.8% improvement in the area-delay +product. + +
+
+
+
+
+ + ☆ A Lennard-Jones Layer for Distribution Normalization + + +
+ We introduce the Lennard-Jones layer (LJL) for the equalization of the +density of 2D and 3D point clouds through systematically rearranging points +without destroying their overall structure (distribution normalization). LJL +simulates a dissipative process of repulsive and weakly attractive interactions +between individual points by considering the nearest neighbor of each point at +a given moment in time. This pushes the particles into a potential valley, +reaching a well-defined stable configuration that approximates an equidistant +sampling after the stabilization process. We apply LJLs to redistribute +randomly generated point clouds into a randomized uniform distribution. +Moreover, LJLs are embedded in the generation process of point cloud networks +by adding them at later stages of the inference process. The improvements in 3D +point cloud generation utilizing LJLs are evaluated qualitatively and +quantitatively. Finally, we apply LJLs to improve the point distribution of a +score-based 3D point cloud denoising network. In general, we demonstrate that +LJLs are effective for distribution normalization which can be applied at +negligible cost without retraining the given neural network. + +
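The dynamics described above are easy to prototype: each point feels a Lennard-Jones force from its current nearest neighbor and moves under damped (dissipative) integration. A rough numpy sketch follows; the step size, damping, force clipping, and the choice of the mean nearest-neighbor distance as the LJ length scale are our assumptions, not the paper's settings.

```python
import numpy as np
from scipy.spatial import cKDTree

def lj_layer(points, n_iters=50, eps=1.0, dt=0.02, damping=0.5):
    """Rough sketch of a Lennard-Jones layer: dissipative nearest-neighbor
    LJ dynamics that spread points toward an equidistant sampling without
    destroying the cloud's overall structure."""
    pts, vel = points.copy(), np.zeros_like(points)
    d, _ = cKDTree(pts).query(pts, k=2)
    sigma = d[:, 1].mean()                      # LJ length scale ~ current spacing
    for _ in range(n_iters):
        dists, idx = cKDTree(pts).query(pts, k=2)
        nn = pts[idx[:, 1]]                     # each point's nearest neighbor
        r = np.maximum(dists[:, 1], 1e-8)[:, None]
        direction = (pts - nn) / r
        # -dV/dr for V(r) = 4*eps*((sigma/r)^12 - (sigma/r)^6): strongly
        # repulsive up close, weakly attractive farther out.
        f = 24 * eps * (2 * (sigma / r) ** 12 - (sigma / r) ** 6) / r
        f = np.clip(f, -1e3, 1e3)               # keep the integration stable
        vel = damping * vel + dt * f * direction
        pts = pts + dt * vel
    return pts

normalized = lj_layer(np.random.rand(500, 2))   # clumpy samples -> more even spread
```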
+
+ comment: Upon request, we are happy to share the source code to generate the + results presented in this paper. Please contact the first or the last author + of this manuscript +
+
+
+
+
+ + ☆ Training-Free Consistent Text-to-Image Generation + + +
+ Text-to-image models offer a new level of creative flexibility by allowing +users to guide the image generation process through natural language. However, +using these models to consistently portray the same subject across diverse +prompts remains challenging. Existing approaches fine-tune the model to teach +it new words that describe specific user-provided subjects or add image +conditioning to the model. These methods require lengthy per-subject +optimization or large-scale pre-training. Moreover, they struggle to align +generated images with text prompts and face difficulties in portraying multiple +subjects. Here, we present ConsiStory, a training-free approach that enables +consistent subject generation by sharing the internal activations of the +pretrained model. We introduce a subject-driven shared attention block and +correspondence-based feature injection to promote subject consistency between +images. Additionally, we develop strategies to encourage layout diversity while +maintaining subject consistency. We compare ConsiStory to a range of baselines, +and demonstrate state-of-the-art performance on subject consistency and text +alignment, without requiring a single optimization step. Finally, ConsiStory +can naturally extend to multi-subject scenarios, and even enable training-free +personalization for common objects. + +
+
+ comment: Project page is in https://consistory-paper.github.io +
+
+
+
+
+ + ☆ Deal, or no deal (or who knows)? Forecasting Uncertainty in + Conversations using Large Language Models + + +
+ Effective interlocutors account for the uncertain goals, beliefs, and emotions of others. But even the best human conversationalist cannot perfectly anticipate the trajectory of a dialogue. How well can language models represent inherent uncertainty in conversations? We propose FortUne Dial, an expansion of the long-standing "conversation forecasting" task: instead of just accuracy, evaluation is conducted with uncertainty-aware metrics, effectively enabling abstention on individual instances. We study two ways in which language models potentially represent outcome uncertainty (internally, using scores, and directly, using tokens) and propose fine-tuning strategies to improve calibration of both representations. Experiments on eight difficult negotiation corpora demonstrate that our proposed fine-tuning strategies (a traditional supervision strategy and an off-policy reinforcement learning strategy) can calibrate smaller open-source models to compete with pre-trained models 10x their size. + +
+
+ comment: 2 Figures; 7 Tables; 27 pages +
+
+
+
+
+ + ☆ A Framework for Partially Observed Reward-States in RLHF + + +
+ The study of reinforcement learning from human feedback (RLHF) has gained prominence in recent years due to its role in the development of LLMs. Neuroscience research shows that human responses to stimuli depend on partially-observed "internal states." Unfortunately, current models of RLHF do not take this into consideration. Moreover, most RLHF models do not account for intermediate feedback, which is gaining importance in empirical work and can help improve both sample complexity and alignment. To address these limitations, we model RLHF as reinforcement learning with partially observed reward-states (PORRL). We show reductions from the two dominant forms of human feedback in RLHF - cardinal and dueling feedback - to PORRL. For cardinal feedback, we develop generic statistically efficient algorithms and instantiate them to present POR-UCRL and POR-UCBVI. For dueling feedback, we show that a naive reduction to cardinal feedback fails to achieve sublinear dueling regret. We then present the first explicit reduction that converts guarantees for cardinal regret to dueling regret. We show that our models and guarantees in both settings generalize and extend existing ones. Finally, we identify a recursive structure on our model that could improve the statistical and computational tractability of PORRL, giving examples from past work on RLHF as well as learning perfect reward machines, which PORRL subsumes. + +
+
+ comment: 47 pages. 13 pages for the main paper, 34 pages for the references + and appendix +
+
+
+
+
+ + ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information + Seeking in Large Language Models + + +
+ In the face of uncertainty, the ability to seek information is of fundamental +importance. In many practical applications, such as medical diagnosis and +troubleshooting, the information needed to solve the task is not initially +given, and has to be actively sought by asking follow-up questions (for +example, a doctor asking a patient for more details about their symptoms). In +this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to augment +large language models with the ability to actively seek information by asking +effective questions. UoT combines 1) an uncertainty-aware simulation approach +which enables the model to simulate possible future scenarios and how likely +they are to occur, 2) uncertainty-based rewards motivated by information gain +which incentivizes the model to seek information, and 3) a reward propagation +scheme to select the optimal question to ask in a way that maximizes the +expected reward. In experiments on medical diagnosis, troubleshooting and the +'20 Questions' game, UoT achieves an average performance improvement of 57.8% +in the rate of successful task completion across multiple LLMs compared with +direct prompting, and also improves efficiency (i.e., the number of questions +needed to complete the task). + +
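The question-selection step above is, at its core, expected information gain over simulated answers. A toy sketch of that one-step scoring in a '20 Questions' setting (the full UoT method simulates multi-step futures with an LLM and propagates rewards through a tree; the lookup-table answer model and hypothesis prior below are ours):

```python
import math

def entropy(probs):
    return -sum(p * math.log2(p) for p in probs if p > 0)

def expected_info_gain(prior, answer_model, question):
    """Score a question by the expected entropy reduction over the
    hypothesis prior, averaging over simulated 'yes'/'no' answers."""
    h0 = entropy(prior.values())
    gain = 0.0
    for ans in ("yes", "no"):
        kept = {h: p for h, p in prior.items() if answer_model(question, h) == ans}
        p_ans = sum(kept.values())
        if p_ans > 0:
            gain += p_ans * (h0 - entropy([p / p_ans for p in kept.values()]))
    return gain

prior = {"cat": 0.25, "dog": 0.25, "eagle": 0.25, "shark": 0.25}
facts = {"cat": {"fur"}, "dog": {"fur"}, "eagle": {"flies"}, "shark": {"swims"}}
model = lambda q, h: "yes" if q in facts[h] else "no"   # stands in for the LLM
best = max(["fur", "flies", "swims"], key=lambda q: expected_info_gain(prior, model, q))
print(best)  # 'fur' splits the hypotheses 50/50, maximizing expected gain
```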
+
+ comment: Under review +
+
+
+
+
+ + ☆ Multiclass Classification Procedure for Detecting Attacks on MQTT-IoT + Protocol + + +
+ The large number of sensors and actuators that make up the Internet of Things obliges these systems to use diverse technologies and protocols. This means that IoT networks are more heterogeneous than traditional networks. This gives rise to new challenges in cybersecurity to protect these systems and devices, which are characterized by being continuously connected to the Internet. Intrusion detection systems (IDS) are used to protect IoT systems from the various anomalies and attacks at the network level, and they can be improved through machine learning techniques. Our work focuses on creating classification models that can feed an IDS, using a dataset containing frames of an IoT system that uses the MQTT protocol while under attack. We have addressed two types of methods for classifying the attacks - ensemble methods and deep learning models, more specifically recurrent networks - with very satisfactory results. + +
+
+
+
+
+ + ☆ ISPA: Inter-Species Phonetic Alphabet for Transcribing Animal Sounds ICASSP 2024 + + +
+ Traditionally, bioacoustics has relied on spectrograms and continuous, +per-frame audio representations for the analysis of animal sounds, also serving +as input to machine learning models. Meanwhile, the International Phonetic +Alphabet (IPA) system has provided an interpretable, language-independent +method for transcribing human speech sounds. In this paper, we introduce ISPA +(Inter-Species Phonetic Alphabet), a precise, concise, and interpretable system +designed for transcribing animal sounds into text. We compare acoustics-based +and feature-based methods for transcribing and classifying animal sounds, +demonstrating their comparable performance with baseline methods utilizing +continuous, dense audio representations. By representing animal sounds with +text, we effectively treat them as a "foreign language," and we show that +established human language ML paradigms and models, such as language models, +can be successfully applied to improve performance. + +
+
+ comment: Accepted at XAI-AI Workshop (IEEEXplore track) @ ICASSP 2024 +
+
+
+
+
+ + ☆ Understanding the Reasoning Ability of Language Models From the + Perspective of Reasoning Paths Aggregation + + +
+ Pre-trained language models (LMs) are able to perform complex reasoning +without explicit fine-tuning. To understand how pre-training with a next-token +prediction objective contributes to the emergence of such reasoning capability, +we propose that we can view an LM as deriving new conclusions by aggregating +indirect reasoning paths seen at pre-training time. We found this perspective +effective in two important cases of reasoning: logic reasoning with knowledge +graphs (KGs) and math reasoning with math word problems (MWPs). More +specifically, we formalize the reasoning paths as random walk paths on the +knowledge/reasoning graphs. Analyses of learned LM distributions suggest that a +weighted sum of relevant random walk path probabilities is a reasonable way to +explain how LMs reason. Experiments and analysis on multiple KG and MWP +datasets reveal the effect of training on random walk paths and suggest that +augmenting unlabeled random walk reasoning paths can improve real-world +multi-step reasoning performance. + +
+
+
+
+
+ + ☆ MobilityGPT: Enhanced Human Mobility Modeling with a GPT model + + +
+ Generative models have shown promising results in capturing human mobility characteristics and generating synthetic trajectories. However, it remains challenging to ensure that the generated geospatial mobility data is semantically realistic, including consistent location sequences, and reflects real-world characteristics, such as respecting geospatial limits. To address these issues, we reformulate human mobility modeling as an autoregressive generation task, leveraging the Generative Pre-trained Transformer (GPT). To ensure controllable generation that alleviates the above challenges, we propose a geospatially-aware generative model, MobilityGPT. We propose a gravity-based sampling method to train a transformer for semantic sequence similarity. Then, we constrain the training process via a road connectivity matrix that provides the connectivity of sequences in trajectory generation, thereby keeping generated trajectories within geospatial limits. Lastly, we construct a Reinforcement Learning from Trajectory Feedback (RLTF) mechanism to minimize the travel-distance discrepancy between training trajectories and the synthetically generated ones. Our experiments on real-world datasets demonstrate that MobilityGPT outperforms state-of-the-art methods in generating high-quality mobility trajectories that are closest to real data in terms of origin-destination similarity, trip length, travel radius, link, and gravity distributions. + +
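The abstract applies the road connectivity matrix as a training-time constraint; the same idea is easiest to see as a hard mask at decoding time, which the sketch below illustrates (the chain-graph adjacency and uniform logits are toy stand-ins):

```python
import numpy as np

def next_link(logits, adjacency, current_link, rng=np.random.default_rng(0)):
    """Connectivity-constrained sampling: road links not reachable from the
    current one get zero probability, so trajectories stay on the road graph."""
    masked = np.where(adjacency[current_link], logits, -np.inf)
    probs = np.exp(masked - masked.max())   # exp(-inf) == 0 forbids those links
    probs /= probs.sum()
    return rng.choice(len(logits), p=probs)

n = 5
adj = np.eye(n, dtype=bool) | np.eye(n, k=1, dtype=bool)   # toy chain of road links
print(next_link(np.zeros(n), adj, current_link=2))          # only link 2 or 3 possible
```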
+
+
+
+
+ + ☆ Learning Best-in-Class Policies for the Predict-then-Optimize Framework + + +
+ We propose a novel family of decision-aware surrogate losses, called +Perturbation Gradient (PG) losses, for the predict-then-optimize framework. +These losses directly approximate the downstream decision loss and can be +optimized using off-the-shelf gradient-based methods. Importantly, unlike +existing surrogate losses, the approximation error of our PG losses vanishes as +the number of samples grows. This implies that optimizing our surrogate loss +yields a best-in-class policy asymptotically, even in misspecified settings. +This is the first such result in misspecified settings and we provide numerical +evidence confirming our PG losses substantively outperform existing proposals +when the underlying model is misspecified and the noise is not centrally +symmetric. Insofar as misspecification is commonplace in practice -- especially +when we might prefer a simpler, more interpretable model -- PG losses offer a +novel, theoretically justified, method for computationally tractable +decision-aware learning. + +
+
+
+
+
+ + ☆ Minimum Description Length and Generalization Guarantees for + Representation Learning NeurIPS 2023 + + +
+ A major challenge in designing efficient statistical supervised learning +algorithms is finding representations that perform well not only on available +training samples but also on unseen data. While the study of representation +learning has spurred much interest, most existing such approaches are +heuristic; and very little is known about theoretical generalization +guarantees. + In this paper, we establish a compressibility framework that allows us to +derive upper bounds on the generalization error of a representation learning +algorithm in terms of the "Minimum Description Length" (MDL) of the labels or +the latent variables (representations). Rather than the mutual information +between the encoder's input and the representation, which is often believed to +reflect the algorithm's generalization capability in the related literature but +in fact, falls short of doing so, our new bounds involve the "multi-letter" +relative entropy between the distribution of the representations (or labels) of +the training and test sets and a fixed prior. In particular, these new bounds +reflect the structure of the encoder and are not vacuous for deterministic +algorithms. Our compressibility approach, which is information-theoretic in +nature, builds upon that of Blum-Langford for PAC-MDL bounds and introduces two +essential ingredients: block-coding and lossy-compression. The latter allows +our approach to subsume the so-called geometrical compressibility as a special +case. To the best knowledge of the authors, the established generalization +bounds are the first of their kind for Information Bottleneck (IB) type +encoders and representation learning. Finally, we partly exploit the +theoretical results by introducing a new data-dependent prior. Numerical +simulations illustrate the advantages of well-chosen such priors over classical +priors used in IB. + +
+
+ comment: Accepted and presented at NeurIPS 2023 +
+
+
+
+
+ + ☆ Fair Active Ranking from Pairwise Preferences + + +
+ We investigate the problem of probably approximately correct and fair (PACF) ranking of items by adaptively invoking pairwise comparisons. Given a set of $n$ items that belong to disjoint groups, our goal is to find an $(\epsilon, \delta)$-PACF-Ranking according to a fair objective function that we propose. We assume access to an oracle, wherein, for each query, the learner can choose a pair of items and receive stochastic winner feedback from the oracle. Our proposed objective function asks to minimize the $\ell_q$ norm of the error of the groups, where the error of a group is the $\ell_p$ norm of the error of all the items within that group, for $p, q \geq 1$. This generalizes the objective function of $\epsilon$-Best-Ranking, proposed by Saha & Gopalan (2019). By adopting our objective function, we gain the flexibility to explore fundamental fairness concepts like equal or proportionate errors within a unified framework. Adjusting parameters $p$ and $q$ allows tailoring to specific fairness preferences. We present both group-blind and group-aware algorithms and analyze their sample complexity. We provide matching lower bounds up to certain logarithmic factors for group-blind algorithms. For a restricted class of group-aware algorithms, we show that we can get reasonable lower bounds. We conduct comprehensive experiments on both real-world and synthetic datasets to complement our theoretical findings. + +
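The proposed objective is concrete enough to write down directly: a nested norm over per-item errors. A small numpy sketch, where the grouping and error values are toy inputs:

```python
import numpy as np

def fair_group_error(errors_by_group, p=2, q=2):
    """The abstract's objective as we read it: the l_q norm across groups of
    per-group l_p norms of item errors. p = q = 1 gives a total-error flavor;
    large q pushes toward equalizing the worst-off group."""
    group_errs = [np.linalg.norm(np.asarray(e), ord=p) for e in errors_by_group]
    return np.linalg.norm(np.array(group_errs), ord=q)

errs = [[0.1, 0.2], [0.05, 0.4, 0.3]]          # two disjoint groups of items
score = fair_group_error(errs, p=1, q=np.inf)  # minimize the worst group's total error
print(score)
```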
+
+ comment: 39 pages, 3.1 MB +
+
+
+
+
+ + ☆ CLIP Can Understand Depth + + +
+ Recent studies on generalizing CLIP for monocular depth estimation reveal that CLIP pre-trained on web-crawled data is inefficient for deriving proper similarities between image patches and depth-related prompts. In this paper, we adapt CLIP to achieve meaningful-quality monocular depth estimation with dense prediction, without fine-tuning its original vision-language alignment. By jointly training a compact deconvolutional decoder with a tiny learnable embedding matrix named mirror, as a static prompt for its text encoder, CLIP is enabled to understand depth. With this approach, our model exhibits impressive performance matching several previous state-of-the-art vision-only models on the NYU Depth v2 and KITTI datasets, outperforming every CLIP-based depth estimation model by a large margin. Experiments on temporal depth consistency and spatial continuity demonstrate that the prior knowledge of CLIP can be effectively refined by our proposed framework. Furthermore, an ablation study on mirror proves that the resulting model estimates depth utilizing knowledge not only from the image encoder but also from the text encoder, despite not being given any prompt written in a human way. This research demonstrates that through minimal adjustments, the prior knowledge of vision-language foundation models, such as CLIP, can be generalized even to domains where learning during pretraining is challenging. We hope to facilitate future work focused on methods to adjust the suboptimal prior knowledge of vision-language models using non-human language prompts, achieving performance on par with task-specific state-of-the-art methodologies. + +
+
+
+
+
+ + ☆ Skill Set Optimization: Reinforcing Language Model Behavior via + Transferable Skills + + +
+ Large language models (LLMs) have recently been used for sequential decision +making in interactive environments. However, leveraging environment reward +signals for continual LLM actor improvement is not straightforward. We propose +Skill Set Optimization (SSO) for improving LLM actor performance through +constructing and refining sets of transferable skills. SSO constructs skills by +extracting common subtrajectories with high rewards and generating subgoals and +instructions to represent each skill. These skills are provided to the LLM +actor in-context to reinforce behaviors with high rewards. Then, SSO further +refines the skill set by pruning skills that do not continue to result in high +rewards. We evaluate our method in the classic videogame NetHack and the text +environment ScienceWorld to demonstrate SSO's ability to optimize a set of +skills and perform in-context policy improvement. SSO outperforms baselines by +40% in our custom NetHack task and outperforms the previous state-of-the-art in +ScienceWorld by 35%. + +
+
+ comment: 8 pages, preprint +
+
+
+
+
+ + ☆ PINN-BO: A Black-box Optimization Algorithm using Physics-Informed + Neural Networks + + +
+ Black-box optimization is a powerful approach for discovering global optima in noisy and expensive black-box functions, a problem widely encountered in real-world scenarios. Recently, there has been a growing interest in leveraging domain knowledge to enhance the efficacy of machine learning methods. Partial Differential Equations (PDEs) often provide an effective means for elucidating the fundamental principles governing the black-box functions. In this paper, we propose PINN-BO, a black-box optimization algorithm employing Physics-Informed Neural Networks that integrates the knowledge from Partial Differential Equations (PDEs) to improve the sample efficiency of the optimization. We analyze the theoretical behavior of our algorithm in terms of a regret bound using advances in NTK theory and prove that, by using the PDE alongside the black-box function evaluations, PINN-BO achieves a tighter regret bound. We perform several experiments on a variety of optimization tasks and show that our algorithm is more sample-efficient compared to existing methods. + +
+
+
+
+
+ + ☆ FROSTER: Frozen CLIP Is A Strong Teacher for Open-Vocabulary Action + Recognition ICLR 2024 + + +
+ In this paper, we introduce FROSTER, an effective framework for open-vocabulary action recognition. The CLIP model has achieved remarkable success in a range of image-based tasks, benefiting from its strong generalization capability stemming from pretraining on massive image-text pairs. However, applying CLIP directly to the open-vocabulary action recognition task is challenging due to the absence of temporal information in CLIP's pretraining. Further, fine-tuning CLIP on action recognition datasets may lead to overfitting and hinder its generalizability, resulting in unsatisfactory results when dealing with unseen actions. To address these issues, FROSTER employs a residual feature distillation approach to ensure that CLIP retains its generalization capability while effectively adapting to the action recognition task. Specifically, the residual feature distillation treats the frozen CLIP model as a teacher to maintain the generalizability exhibited by the original CLIP and supervises the feature learning for the extraction of video-specific features to bridge the gap between images and videos. Meanwhile, it uses a residual sub-network for feature distillation to reach a balance between the two distinct objectives of learning generalizable and video-specific features. We extensively evaluate FROSTER on open-vocabulary action recognition benchmarks under both base-to-novel and cross-dataset settings. FROSTER consistently achieves state-of-the-art performance on all datasets across the board. Project page: https://visual-ai.github.io/froster. + +
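A plausible shape of residual feature distillation, sketched in PyTorch: a learned residual branch adapts frozen-CLIP features to video while a distillation term anchors them to the frozen teacher. The adapter architecture, the cosine distillation loss, and the weight `alpha` are our illustrative guesses, not the paper's exact design.

```python
import torch
import torch.nn as nn

class ResidualAdapter(nn.Module):
    """Residual sub-network: the identity path preserves the frozen CLIP
    feature while the branch learns the video-specific correction."""
    def __init__(self, dim=512):
        super().__init__()
        self.branch = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))

    def forward(self, clip_feat):
        return clip_feat + self.branch(clip_feat)

def froster_style_loss(student_feat, teacher_feat, task_loss, alpha=1.0):
    # Distillation pulls the adapted feature back toward the frozen teacher,
    # balancing generalizable vs. video-specific features.
    distill = 1 - nn.functional.cosine_similarity(student_feat, teacher_feat).mean()
    return task_loss + alpha * distill

adapter = ResidualAdapter()
frozen = torch.randn(8, 512)                 # frozen CLIP features (teacher)
adapted = adapter(frozen)                    # video-specific student features
loss = froster_style_loss(adapted, frozen.detach(), task_loss=torch.tensor(0.7))
```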
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ ActiveAnno3D -- An Active Learning Framework for Multi-Modal 3D Object + Detection + + +
+ The curation of large-scale datasets is still costly and requires significant time and resources. Data is often manually labeled, and the challenge of creating high-quality datasets remains. In this work, we fill this research gap by applying active learning to multi-modal 3D object detection. We propose ActiveAnno3D, an active learning framework to select data samples for labeling that are maximally informative for training. We explore various continuous training methods and integrate the most efficient method in terms of computational demand and detection performance. Furthermore, we perform extensive experiments and ablation studies with BEVFusion and PV-RCNN on the nuScenes and TUM Traffic Intersection datasets. We show that we can achieve almost the same performance with PV-RCNN and the entropy-based query strategy when using only half of the training data (77.25 mAP compared to 83.50 mAP) of the TUM Traffic Intersection dataset. BEVFusion achieved an mAP of 64.31 when using half of the training data and 75.0 mAP when using the complete nuScenes dataset. We integrate our active learning framework into the proAnno labeling tool to enable AI-assisted data selection and labeling and minimize the labeling costs. Finally, we provide code, weights, and visualization results on our website: https://active3d-framework.github.io/active3d-framework. + +
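The entropy-based query strategy mentioned above is simple to state: label the samples whose predicted class distribution has the highest entropy. A sketch (the scores are toy values; a detector would aggregate per-object scores per frame):

```python
import numpy as np

def entropy_query(softmax_scores, budget):
    """Rank unlabeled samples by predictive entropy and pick the `budget`
    most uncertain ones for labeling."""
    p = np.clip(softmax_scores, 1e-12, 1.0)
    ent = -(p * np.log(p)).sum(axis=1)     # per-sample predictive entropy
    return np.argsort(-ent)[:budget]       # highest-entropy indices first

scores = np.array([[0.9, 0.05, 0.05],
                   [0.4, 0.3, 0.3],
                   [0.6, 0.2, 0.2]])
print(entropy_query(scores, budget=1))     # -> [1], the near-uniform sample
```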
+
+
+
+
+ + ☆ Smart Flow Matching: On The Theory of Flow Matching Algorithms with + Applications + + +
+ The paper presents the exact formula for the vector field that minimizes the loss for the standard flow. This formula depends analytically on a given distribution $\rho_0$ and an unknown one $\rho_1$. Based on the presented formula, a new loss and algorithm for training a vector-field model in the style of Conditional Flow Matching are provided. Our loss, in comparison to the standard Conditional Flow Matching approach, exhibits smaller variance when evaluated through Monte Carlo sampling methods. Numerical experiments on synthetic models and models of high-dimensional tabular data demonstrate better learning results with the use of the presented algorithm. + +
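For context, the standard Conditional Flow Matching loss that the paper builds on looks like the sketch below (linear interpolation path); the paper's contribution is an exact-formula-based loss with the same minimizer but lower Monte Carlo variance, which we do not reproduce here.

```python
import torch

def cfm_loss(v_model, x0, x1):
    """Standard Conditional Flow Matching: regress the model's vector field
    onto the conditional target x1 - x0 along the straight path
    x_t = (1 - t) * x0 + t * x1."""
    t = torch.rand(x0.shape[0], 1)          # one random time per sample
    xt = (1 - t) * x0 + t * x1
    target = x1 - x0
    return ((v_model(xt, t) - target) ** 2).mean()

v = lambda x, t: torch.zeros_like(x)        # stand-in vector-field model
loss = cfm_loss(v, torch.randn(64, 2), torch.randn(64, 2))
```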
+
+
+
+
+ + ☆ Improved prediction of future user activity in online A/B testing + + +
+ In online randomized experiments or A/B tests, accurate predictions of participant inclusion rates are of paramount importance. These predictions not only guide experimenters in optimizing the experiment's duration but also enhance the precision of treatment effect estimates. In this paper, we present a novel, straightforward, and scalable Bayesian nonparametric approach for predicting the rate at which individuals will be exposed to interventions within the realm of online A/B testing. Our approach stands out by offering dual prediction capabilities: it forecasts both the quantity of new customers expected in future time windows and, unlike available alternative methods, the number of times they will be observed. We derive closed-form expressions for the posterior distributions of the quantities needed to form predictions about future user activity, thereby bypassing the need for numerical algorithms such as Markov chain Monte Carlo. After a comprehensive exposition of our model, we evaluate it in experiments on real and simulated data, where we show its superior performance with respect to existing alternatives in the literature. + +
+
+
+
+
+ + ☆ CT-based Anatomical Segmentation for Thoracic Surgical Planning: A + Benchmark Study for 3D U-shaped Deep Learning Models + + +
+ Recent rising interests in patient-specific thoracic surgical planning and +simulation require efficient and robust creation of digital anatomical models +from automatic medical image segmentation algorithms. Deep learning (DL) is now +state-of-the-art in various radiological tasks, and U-shaped DL models have +particularly excelled in medical image segmentation since the inception of the +2D UNet. To date, many variants of U-shaped models have been proposed by the +integration of different attention mechanisms and network configurations. +Leveraging the recent development of large multi-label databases, systematic +benchmark studies for these models can provide valuable insights for clinical +deployment and future model designs, but such studies are still rare. We +conduct the first benchmark study for variants of 3D U-shaped models (3DUNet, +STUNet, AttentionUNet, SwinUNETR, FocalSegNet, and a novel 3D SwinUnet with +four variants) with a focus on CT-based anatomical segmentation for thoracic +surgery. Our study systematically examines the impact of different attention +mechanisms, number of resolution stages, and network configurations on +segmentation accuracy and computational complexity. To allow cross-reference +with other recent benchmarking studies, we also included a performance +assessment of the BTCV abdominal structural segmentation. With the STUNet +ranking at the top, our study demonstrated the value of CNN-based U-shaped +models for the investigated tasks and the benefit of residual blocks in network +configuration designs to boost segmentation performance. + +
+
+
+
+
+ + ☆ IGUANe: a 3D generalizable CycleGAN for multicenter harmonization of + brain MR images + + +
+ In MRI studies, the aggregation of imaging data from multiple acquisition sites enhances sample size but may introduce site-related variabilities that hinder consistency in subsequent analyses. Deep learning methods for image translation have emerged as a solution for harmonizing MR images across sites. In this study, we introduce IGUANe (Image Generation with Unified Adversarial Networks), an original 3D model that leverages the strengths of domain translation and straightforward application of style transfer methods for multicenter brain MR image harmonization. IGUANe extends the CycleGAN architecture by integrating an arbitrary number of domains for training through a many-to-one strategy. During inference, the model can be applied to any image, even from an unknown acquisition site, making it a universal generator for harmonization. Trained on a dataset comprising T1-weighted images from 11 different scanners, IGUANe was evaluated on data from unseen sites. The assessments included the transformation of MR images with traveling subjects, the preservation of pairwise distances between MR images within domains, the evolution of volumetric patterns related to age and Alzheimer's disease (AD), and the performance in age regression and patient classification tasks. Comparisons with other harmonization and normalization methods suggest that IGUANe better preserves individual information in MR images and is more suitable for maintaining and reinforcing variabilities related to age and AD. Future studies may further assess IGUANe in other multicenter contexts, either using the same model or retraining it for applications to different image modalities. + +
+
+ comment: 23 pages, 8 figures +
+
+
+
+
+ + ☆ FuseMoE: Mixture-of-Experts Transformers for Fleximodal Fusion + + +
+ As machine learning models in critical fields increasingly grapple with multimodal data, they face the dual challenges of handling a wide array of modalities, often incomplete due to missing elements, and the temporal irregularity and sparsity of collected samples. Successfully leveraging this complex data, while overcoming the scarcity of high-quality training samples, is key to improving these models' predictive performance. We introduce ``FuseMoE'', a mixture-of-experts framework incorporating an innovative gating function. Designed to integrate a diverse set of modalities, FuseMoE is effective in managing scenarios with missing modalities and irregularly sampled data trajectories. Theoretically, our unique gating function contributes to enhanced convergence rates, leading to better performance in multiple downstream tasks. The practical utility of FuseMoE in the real world is validated by a challenging set of clinical risk prediction tasks. + +
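One way to picture a fusion MoE that tolerates missing modalities: route only over experts whose modality is present. The masked-softmax gate below is our illustration; the paper's innovative gating function is different and is what drives its convergence results.

```python
import torch
import torch.nn as nn

class MissingAwareMoE(nn.Module):
    """Sketch of a mixture-of-experts fusion layer that never routes weight
    to experts of absent modalities (gate logits masked to -inf)."""
    def __init__(self, dims, hidden=32, out=16):
        super().__init__()
        self.experts = nn.ModuleList(
            [nn.Sequential(nn.Linear(d, hidden), nn.ReLU(), nn.Linear(hidden, out))
             for d in dims])
        self.gate = nn.Linear(sum(dims), len(dims))

    def forward(self, xs, present):
        # xs: per-modality tensors (zeros where missing); present: bool (B, M)
        logits = self.gate(torch.cat(xs, dim=-1))
        logits = logits.masked_fill(~present, float("-inf"))
        w = torch.softmax(logits, dim=-1)                      # (B, M)
        outs = torch.stack([e(x) for e, x in zip(self.experts, xs)], dim=1)
        return (w.unsqueeze(-1) * outs).sum(dim=1)             # weighted fusion

moe = MissingAwareMoE(dims=[8, 4])
x = [torch.randn(2, 8), torch.zeros(2, 4)]                     # modality 2 missing in row 0
mask = torch.tensor([[True, False], [True, True]])
fused = moe(x, mask)
```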
+
+ comment: 35 pages, 8 tables, 5 figures +
+
+
+
+
+ + ☆ The Benefits of Reusing Batches for Gradient Descent in Two-Layer + Networks: Breaking the Curse of Information and Leap Exponents + + +
+ We investigate the training dynamics of two-layer neural networks when +learning multi-index target functions. We focus on multi-pass gradient descent +(GD) that reuses the batches multiple times and show that it significantly +changes the conclusion about which functions are learnable compared to +single-pass gradient descent. In particular, multi-pass GD with finite stepsize +is found to overcome the limitations of gradient flow and single-pass GD given +by the information exponent (Ben Arous et al., 2021) and leap exponent (Abbe et +al., 2023) of the target function. We show that upon re-using batches, the +network achieves in just two time steps an overlap with the target subspace +even for functions not satisfying the staircase property (Abbe et al., 2021). +We characterize the (broad) class of functions efficiently learned in finite +time. The proof of our results is based on the analysis of the Dynamical +Mean-Field Theory (DMFT). We further provide a closed-form description of the +dynamical process of the low-dimensional projections of the weights, and +numerical experiments illustrating the theory. + +
+
+
+
+
+ + ☆ BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity + Text Embeddings Through Self-Knowledge Distillation + + +
+ In this paper, we present a new embedding model, called M3-Embedding, which is distinguished for its versatility in Multi-Linguality, Multi-Functionality, and Multi-Granularity. It can support more than 100 working languages, leading to new state-of-the-art performances on multi-lingual and cross-lingual retrieval tasks. It can simultaneously perform the three common retrieval functionalities of embedding models - dense retrieval, multi-vector retrieval, and sparse retrieval - providing a unified model foundation for real-world IR applications. It is able to process inputs of different granularities, spanning from short sentences to long documents of up to 8192 tokens. The effective training of M3-Embedding involves the following technical contributions. We propose a novel self-knowledge distillation approach, where the relevance scores from different retrieval functionalities can be integrated as the teacher signal to enhance the training quality. We also optimize the batching strategy, enabling a large batch size and high training throughput to ensure the discriminativeness of embeddings. To the best of our knowledge, M3-Embedding is the first embedding model which realizes such a strong versatility. The model and code will be publicly available at https://github.com/FlagOpen/FlagEmbedding. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Organic or Diffused: Can We Distinguish Human Art from AI-generated + Images? + + +
+ The advent of generative AI images has completely disrupted the art world. Distinguishing AI-generated images from human art is a challenging problem whose impact is growing over time. The failure to address this problem allows bad actors to defraud individuals paying a premium for human art, and companies whose stated policies forbid AI imagery. This is also critical for AI model trainers, who need to filter training data to avoid potential model collapse. There are several different approaches to distinguishing human art from AI images, including classifiers trained by supervised learning, research tools targeting diffusion models, and identification by professional artists using their knowledge of artistic techniques. In this paper, we seek to understand how well these approaches can perform against today's modern generative models in both benign and adversarial settings. We curate real human art across 7 styles, generate matching images from 5 generative models, and apply 8 detectors (5 automated detectors and 3 different human groups including 180 crowdworkers, 4000+ professional artists, and 13 expert artists experienced at detecting AI). Both Hive and expert artists do very well, but make mistakes in different ways (Hive is weaker against adversarial perturbations while expert artists produce higher false positives). We believe these weaknesses will remain as models continue to evolve, and use our data to demonstrate why a combined team of human and automated detectors provides the best combination of accuracy and robustness. + +
+
+
+
+
+ + ☆ Light and Optimal Schrödinger Bridge Matching + + +
+ Schr\"odinger Bridges (SB) have recently gained the attention of the ML +community as a promising extension of classic diffusion models which is also +interconnected to the Entropic Optimal Transport (EOT). Recent solvers for SB +exploit the pervasive bridge matching procedures. Such procedures aim to +recover a stochastic process transporting the mass between distributions given +only a transport plan between them. In particular, given the EOT plan, these +procedures can be adapted to solve SB. This fact is heavily exploited by recent +works giving rives to matching-based SB solvers. The cornerstone here is +recovering the EOT plan: recent works either use heuristical approximations +(e.g., the minibatch OT) or establish iterative matching procedures which by +the design accumulate the error during the training. We address these +limitations and propose a novel procedure to learn SB which we call the +\textbf{optimal Schr\"odinger bridge matching}. It exploits the optimal +parameterization of the diffusion process and provably recovers the SB process +\textbf{(a)} with a single bridge matching step and \textbf{(b)} with arbitrary +transport plan as the input. Furthermore, we show that the optimal bridge +matching objective coincides with the recently discovered energy-based modeling +(EBM) objectives to learn EOT/SB. Inspired by this observation, we develop a +light solver (which we call LightSB-M) to implement optimal matching in +practice using the Gaussian mixture parameterization of the Schr\"odinger +potential. We experimentally showcase the performance of our solver in a range +of practical tasks. The code for the LightSB-M solver can be found at +\url{https://github.com/SKholkin/LightSB-Matching}. + +
+
+
+
+
+ + ☆ Multi-agent Reinforcement Learning for Energy Saving in Multi-Cell + Massive MIMO Systems + + +
+ We develop a multi-agent reinforcement learning (MARL) algorithm to minimize the total energy consumption of multiple massive MIMO (multiple-input multiple-output) base stations (BSs) in a multi-cell network while preserving the overall quality-of-service (QoS) by making decisions on the multi-level advanced sleep modes (ASMs) and antenna switching of these BSs. The problem is modeled as a decentralized partially observable Markov decision process (DEC-POMDP) to enable collaboration between individual BSs, which is necessary to tackle inter-cell interference. A multi-agent proximal policy optimization (MAPPO) algorithm is designed to learn a collaborative BS control policy. To enhance its scalability, a modified version called MAPPO-neighbor policy is further proposed. Simulation results demonstrate that the trained MAPPO agent achieves better performance compared to baseline policies. Specifically, compared to the auto sleep mode 1 (symbol-level sleeping) algorithm, the MAPPO-neighbor policy reduces power consumption by approximately 8.7% during low-traffic hours and improves energy efficiency by approximately 19% during high-traffic hours. + +
+
+
+
+
+ + ☆ Guidance with Spherical Gaussian Constraint for Conditional Diffusion + + +
+ Recent advances in diffusion models attempt to handle conditional generative +tasks by utilizing a differentiable loss function for guidance without the need +for additional training. While these methods achieved certain success, they +often compromise on sample quality and require small guidance step sizes, +leading to longer sampling processes. This paper reveals that the fundamental +issue lies in the manifold deviation during the sampling process when loss +guidance is employed. We theoretically show the existence of manifold deviation +by establishing a certain lower bound for the estimation error of the loss +guidance. To mitigate this problem, we propose Diffusion with Spherical +Gaussian constraint (DSG), drawing inspiration from the concentration +phenomenon in high-dimensional Gaussian distributions. DSG effectively +constrains the guidance step within the intermediate data manifold through +optimization and enables the use of larger guidance steps. Furthermore, we +present a closed-form solution for DSG denoising with the Spherical Gaussian +constraint. Notably, DSG can seamlessly integrate as a plugin module within +existing training-free conditional diffusion methods. Implementing DSG merely +involves a few lines of additional code with almost no extra computational +overhead, yet it leads to significant performance improvements. Comprehensive +experimental results in various conditional generation tasks validate the +superiority and adaptability of DSG in terms of both sample quality and time +efficiency. + +
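The core trick here is geometric: a high-dimensional Gaussian sample concentrates on a sphere of radius sqrt(d) * sigma around the mean, so the guided denoising step can be kept on that sphere rather than taking a small unconstrained gradient step. A loose sketch follows; blending the noise and guidance directions via a `mix` knob is our simplification of the paper's closed-form solution.

```python
import torch

def dsg_like_step(mu, sigma, guidance_grad, mix=0.1):
    """Loose sketch of a Spherical-Gaussian-constrained guidance step:
    steer the sampling direction with the guidance gradient, then rescale
    onto the Gaussian concentration sphere of radius sqrt(d) * sigma."""
    d = mu.numel()
    noise = torch.randn_like(mu)
    steer = -guidance_grad / (guidance_grad.norm() + 1e-12)   # descend the guidance loss
    direction = (1 - mix) * noise / (noise.norm() + 1e-12) + mix * steer
    direction = direction / (direction.norm() + 1e-12)
    return mu + (d ** 0.5) * sigma * direction                # stay on the sphere

x = dsg_like_step(torch.zeros(3, 64, 64), sigma=0.5,
                  guidance_grad=torch.randn(3, 64, 64))
```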
+
+
+
+
+ + ☆ Isotropy, Clusters, and Classifiers + + +
+ Whether embedding spaces use all their dimensions equally, i.e., whether they +are isotropic, has been a recent subject of discussion. Evidence has been +accrued both for and against enforcing isotropy in embedding spaces. In the +present paper, we stress that isotropy imposes requirements on the embedding +space that are not compatible with the presence of clusters -- which also +negatively impacts linear classification objectives. We demonstrate this fact +empirically and use it to shed light on previous results from the literature. + +
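The tension the paper highlights is easy to reproduce numerically: inject cluster structure into an embedding cloud and watch an isotropy measure drop. The proxy below (normalized entropy of the PCA spectrum) is one of several isotropy scores in the literature, chosen here for brevity.

```python
import numpy as np

def isotropy_proxy(X):
    """How evenly variance spreads across principal directions: 1.0 means
    perfectly isotropic; values near 0 mean a few directions dominate."""
    Xc = X - X.mean(axis=0)
    var = np.linalg.svd(Xc, compute_uv=False) ** 2
    p = var / var.sum()
    return float(-(p * np.log(p + 1e-12)).sum() / np.log(len(p)))

iso = np.random.randn(1000, 32)                        # isotropic cloud: score near 1
clustered = np.concatenate([np.random.randn(500, 32) + 8,
                            np.random.randn(500, 32) - 8])
print(isotropy_proxy(iso), isotropy_proxy(clustered))  # cluster offset eats the variance
```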
+
+
+
+
+ + ☆ Unified Hallucination Detection for Multimodal Large Language Models + + +
+ Despite significant strides in multimodal tasks, Multimodal Large Language +Models (MLLMs) are plagued by the critical issue of hallucination. The reliable +detection of such hallucinations in MLLMs has, therefore, become a vital aspect +of model evaluation and the safeguarding of practical application deployment. +Prior research in this domain has been constrained by a narrow focus on +singular tasks, an inadequate range of hallucination categories addressed, and +a lack of detailed granularity. In response to these challenges, our work +expands the investigative horizons of hallucination detection. We present a +novel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate +the evaluation of advancements in hallucination detection methods. +Additionally, we unveil a novel unified multimodal hallucination detection +framework, UNIHD, which leverages a suite of auxiliary tools to validate the +occurrence of hallucinations robustly. We demonstrate the effectiveness of +UNIHD through meticulous evaluation and comprehensive analysis. We also provide +strategic insights on the application of specific tools for addressing various +categories of hallucinations. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ How Good is a Single Basin? + + +
+ The multi-modal nature of neural loss landscapes is often considered to be +the main driver behind the empirical success of deep ensembles. In this work, +we probe this belief by constructing various "connected" ensembles which are +restricted to lie in the same basin. Through our experiments, we demonstrate +that increased connectivity indeed negatively impacts performance. However, +when incorporating the knowledge from other basins implicitly through +distillation, we show that the gap in performance can be mitigated by +re-discovering (multi-basin) deep ensembles within a single basin. Thus, we +conjecture that while the extra-basin knowledge is at least partially present +in any given basin, it cannot be easily harnessed without learning it from +other basins. + +
+
+
+
+
+ + ☆ Predicting Configuration Performance in Multiple Environments with + Sequential Meta-learning + + +
+ Learning and predicting the performance of given software configurations are +of high importance to many software engineering activities. While configurable +software systems will almost certainly face diverse running environments (e.g., +version, hardware, and workload), current work often either builds performance +models under a single environment or fails to properly handle data from diverse +settings, hence restricting their accuracy for new environments. In this paper, +we target configuration performance learning under multiple environments. We do +so by designing SeMPL - a meta-learning framework that learns the common +understanding from configurations measured in distinct (meta) environments and +generalizes them to the unforeseen, target environment. What makes it unique is +that unlike common meta-learning frameworks (e.g., MAML and MetaSGD) that train +the meta environments in parallel, we train them sequentially, one at a time. +The order of training naturally allows discriminating the contributions among +meta environments in the meta-model built, which fits better with the +characteristic of configuration data that is known to dramatically differ +between different environments. Through comparing with 15 state-of-the-art +models under nine systems, our extensive experimental results demonstrate that +SeMPL performs considerably better on 89% of the systems with up to 99% +accuracy improvement, while being data-efficient, leading to a maximum of 3.86x +speedup. All code and data can be found at our repository: +https://github.com/ideas-labo/SeMPL. + +
+
+ comment: This paper has been accepted by FSE'24 +
+
+
+
+
+ + ☆ Empowering Time Series Analysis with Large Language Models: A Survey + + +
+ Recently, remarkable progress has been made in large language models (LLMs), demonstrating their unprecedented capability in a variety of natural language tasks. However, completely training a large general-purpose model from scratch is challenging for time series analysis, due to the large volumes and varieties of time series data, as well as the non-stationarity that leads to concept drift impeding continuous model adaptation and re-training. Recent advances have shown that pre-trained LLMs can be exploited to capture complex dependencies in time series data and facilitate various applications. In this survey, we provide a systematic overview of existing methods that leverage LLMs for time series analysis. Specifically, we first state the challenges and motivations of applying language models in the context of time series as well as brief preliminaries of LLMs. Next, we summarize the general pipeline for LLM-based time series analysis, categorize existing methods into different groups (i.e., direct query, tokenization, prompt design, fine-tuning, and model integration), and highlight the key ideas within each group. We also discuss the applications of LLMs for both general and spatial-temporal time series data, tailored to specific domains. Finally, we thoroughly discuss future research opportunities to empower time series analysis with LLMs. + +
+
+
+
+
+ + ☆ Cool-chic video: Learned video coding with 800 parameters + + +
+ We propose a lightweight learned video codec with 900 multiplications per decoded pixel and 800 parameters overall. To the best of our knowledge, this is one of the neural video codecs with the lowest decoding complexity. It is built upon the overfitted image codec Cool-chic and supplements it with an inter coding module to leverage the video's temporal redundancies. The proposed model is able to compress videos using both low-delay and random access configurations and achieves rate-distortion performance close to AVC while outperforming other overfitted codecs such as FFNeRV. The system is made open-source: orange-opensource.github.io/Cool-Chic. + +
+
+ comment: 10 pages, published in Data Compression Conference 2024 +
+
+
+
+
+ + ☆ CIDAR: Culturally Relevant Instruction Dataset For Arabic + + +
+ Instruction tuning has emerged as a prominent methodology for teaching Large Language Models (LLMs) to follow instructions. However, current instruction datasets predominantly cater to English or are derived from English-dominated LLMs, resulting in inherent biases toward Western culture. This bias significantly impacts the linguistic structures of non-English languages such as Arabic, which has a distinct grammar reflective of the diverse cultures across the Arab region. This paper addresses this limitation by introducing CIDAR: https://hf.co/datasets/arbml/CIDAR, the first open Arabic instruction-tuning dataset culturally aligned by human reviewers. CIDAR contains 10,000 instruction and output pairs that represent the Arab region. We discuss the cultural relevance of CIDAR through analysis of, and comparison with, models fine-tuned on other datasets. Our experiments show that CIDAR can help enrich research efforts in aligning LLMs with Arabic culture. All the code is available at https://github.com/ARBML/CIDAR. + +
+
+
+
+
+ + ☆ Comparison of Topic Modelling Approaches in the Banking Context + + +
+ Topic modelling is a prominent task for automatic topic extraction in many applications such as sentiment analysis and recommendation systems. The approach is vital for service industries to monitor their customer discussions. The use of traditional approaches such as Latent Dirichlet Allocation (LDA) for topic discovery has shown great performance; however, these approaches are not consistent in their results, as they suffer from data sparseness and an inability to model word order in a document. Thus, this study presents the use of Kernel Principal Component Analysis (KernelPCA) and K-means Clustering in the BERTopic architecture. We have prepared a new dataset using tweets from customers of Nigerian banks and we use this to compare the topic modelling approaches. Our findings showed that KernelPCA and K-means in the BERTopic architecture produced coherent topics with a coherence score of 0.8463. + +
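BERTopic's components are swappable, so the study's configuration can be assembled by passing KernelPCA as the dimensionality-reduction model and K-means as the clustering model. The component and cluster counts below, and the toy tweets, are illustrative stand-ins, not the study's settings or data.

```python
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans
from bertopic import BERTopic

# BERTopic accepts any reducer exposing fit/transform and any clusterer
# exposing fit/predict, so UMAP and HDBSCAN can be swapped out.
dim_model = KernelPCA(n_components=2, kernel="rbf")
cluster_model = KMeans(n_clusters=3, n_init=10, random_state=42)
topic_model = BERTopic(umap_model=dim_model, hdbscan_model=cluster_model)

docs = [  # stand-ins for the study's Nigerian-bank customer tweets
    "the app keeps failing my transfers", "transfer stuck for two days",
    "great customer service at the branch", "thanks for resolving my issue",
    "atm charged me twice", "unexpected card maintenance fee",
] * 5
topics, _ = topic_model.fit_transform(docs)
print(topic_model.get_topic_info())
```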
+
+ comment: 14 pages, Journal of Applied Science +
+
+
+
+
+ + ☆ The Matrix: A Bayesian learning model for LLMs + + +
+ In this paper, we introduce a Bayesian learning model to understand the +behavior of Large Language Models (LLMs). We explore the optimization metric of +LLMs, which is based on predicting the next token, and develop a novel model +grounded in this principle. Our approach involves constructing an ideal +generative text model represented by a multinomial transition probability +matrix with a prior, and we examine how LLMs approximate this matrix. We +discuss the continuity of the mapping between embeddings and multinomial +distributions, and present the Dirichlet approximation theorem to approximate +any prior. Additionally, we demonstrate how text generation by LLMs aligns with +Bayesian learning principles and delve into the implications for in-context +learning, specifically explaining why in-context learning emerges in larger +models where prompts are considered as samples to be updated. Our findings +indicate that the behavior of LLMs is consistent with Bayesian Learning, +offering new insights into their functioning and potential applications. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Decentralized Event-Triggered Online Learning for Safe Consensus of + Multi-Agent Systems with Gaussian Process Regression + + +
+ Consensus control in multi-agent systems has received significant attention and practical implementation across various domains. However, managing consensus control under unknown dynamics remains a significant challenge for control design due to system uncertainties and environmental disturbances. This paper presents a novel learning-based distributed control law, augmented by auxiliary dynamics. Gaussian processes are harnessed to compensate for the unknown components of the multi-agent system. To continuously enhance the predictive performance of the Gaussian process model, a data-efficient online learning strategy with a decentralized event-triggered mechanism is proposed. Furthermore, the control performance of the proposed approach is ensured via Lyapunov theory, based on a probabilistic guarantee for prediction error bounds. To demonstrate the efficacy of the proposed learning-based controller, a comparative analysis is conducted, contrasting it with both conventional distributed control laws and offline learning methodologies. + +
+
+
+
+
+ + ☆ Homograph Attacks on Maghreb Sentiment Analyzers NeurIPS + + +
+ We examine the impact of homograph attacks on the Sentiment Analysis (SA) task for different Arabic dialects from the Maghreb North-African countries. Homograph attacks result in a 65.3% decrease in transformer classification performance, from an F1-score of 0.95 to 0.33, when data is written in "Arabizi". The goal of this study is to highlight LLMs' weaknesses and to prioritize ethical and responsible Machine Learning. + +
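A homograph attack of the kind studied here swaps Latin characters in Arabizi text for visually identical Unicode code points, which fragments the tokenizer's subwords while remaining readable to humans. A toy perturbation function follows; the glyph map is a small illustrative subset, not the paper's attack set.

```python
import random

# Cyrillic look-alikes for common Latin letters (a small illustrative subset).
HOMOGLYPHS = {"a": "\u0430", "e": "\u0435", "o": "\u043e", "p": "\u0440", "c": "\u0441"}

def homograph_attack(text: str, rate: float = 1.0) -> str:
    """Replace attackable characters with visually identical homoglyphs."""
    out = []
    for ch in text:
        swap = HOMOGLYPHS.get(ch.lower())
        out.append(swap if swap and random.random() < rate else ch)
    return "".join(out)

print(homograph_attack("hadchi mzyan bzf"))  # Arabizi (Moroccan), roughly 'this is very good'
```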
+
+ comment: NAML, North Africans in Machine Learning, NeurIPS, Neural Information Processing Systems +
+
+
+
+
+ + ☆ Is Mamba Capable of In-Context Learning? + + +
+ This work provides empirical evidence that Mamba, a newly proposed selective +structured state space model, has similar in-context learning (ICL) +capabilities as transformers. We evaluated Mamba on tasks involving simple +function approximation as well as more complex natural language processing +problems. Our results demonstrate that across both categories of tasks, Mamba +matches the performance of transformer models for ICL. Further analysis reveals +that like transformers, Mamba appears to solve ICL problems by incrementally +optimizing its internal representations. Overall, our work suggests that Mamba +can be an efficient alternative to transformers for ICL tasks involving longer +input sequences. + +
+
+
+
+
+ + ☆ A Random Matrix Approach to Low-Multilinear-Rank Tensor Approximation + + +
+ This work presents a comprehensive understanding of the estimation of a planted low-rank signal from a general spiked tensor model near the computational threshold. Relying on standard tools from the theory of large random matrices, we characterize the large-dimensional spectral behavior of the unfoldings of the data tensor and exhibit relevant signal-to-noise ratios governing the detectability of the principal directions of the signal. These results allow us to accurately predict the reconstruction performance of truncated multilinear SVD (MLSVD) in the non-trivial regime. This is particularly important since it serves as an initialization of the higher-order orthogonal iteration (HOOI) scheme, whose convergence to the best low-multilinear-rank approximation depends entirely on its initialization. We give a sufficient condition for the convergence of HOOI and show that the number of iterations before convergence tends to $1$ in the large-dimensional limit. + +
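For readers unfamiliar with the pipeline being analyzed: truncated MLSVD takes the leading singular vectors of each mode unfolding and HOOI then refines them. A bare numpy sketch of that pipeline (the paper's contribution is the random-matrix analysis of it, not this algorithm):

```python
import numpy as np

def unfold(T, mode):
    return np.moveaxis(T, mode, 0).reshape(T.shape[mode], -1)

def mlsvd_init(T, ranks):
    """Truncated MLSVD: leading left singular vectors of each unfolding."""
    return [np.linalg.svd(unfold(T, m))[0][:, :r] for m, r in enumerate(ranks)]

def hooi(T, ranks, n_iter=10):
    """Higher-order orthogonal iteration from the MLSVD initialization; the
    paper's analysis predicts ~1 iteration suffices at high dimension."""
    U = mlsvd_init(T, ranks)
    for _ in range(n_iter):
        for m in range(T.ndim):
            G = T
            for j in range(T.ndim):        # project all modes except m
                if j != m:
                    G = np.moveaxis(np.tensordot(U[j].T, G, axes=(1, j)), 0, j)
            U[m] = np.linalg.svd(unfold(G, m))[0][:, :ranks[m]]
    return U

factors = hooi(np.random.randn(20, 20, 20), ranks=(2, 2, 2))
```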
+
+
+
+
+ + ☆ Decentralized Bilevel Optimization over Graphs: Loopless Algorithmic + Update and Transient Iteration Complexity + + +
+ Stochastic bilevel optimization (SBO) is becoming increasingly essential in +machine learning due to its versatility in handling nested structures. To +address large-scale SBO, decentralized approaches have emerged as effective +paradigms in which nodes communicate with immediate neighbors without a central +server, thereby improving communication efficiency and enhancing algorithmic +robustness. However, current decentralized SBO algorithms face challenges, +including expensive inner-loop updates and unclear understanding of the +influence of network topology, data heterogeneity, and the nested bilevel +algorithmic structures. In this paper, we introduce a single-loop decentralized +SBO (D-SOBA) algorithm and establish its transient iteration complexity, which, +for the first time, clarifies the joint influence of network topology and data +heterogeneity on decentralized bilevel algorithms. D-SOBA achieves the +state-of-the-art asymptotic rate, asymptotic gradient/Hessian complexity, and +transient iteration complexity under more relaxed assumptions compared to +existing methods. Numerical experiments validate our theoretical findings. + +
+
+ comment: 37 pages, 6 figures +
+
+
+
+
+ + ☆ Optimal and Near-Optimal Adaptive Vector Quantization + + +
+ Quantization is a fundamental optimization for many machine-learning use +cases, including compressing gradients, model weights and activations, and +datasets. The most accurate form of quantization is \emph{adaptive}, where the +error is minimized with respect to a given input, rather than optimizing for +the worst case. However, optimal adaptive quantization methods are considered +infeasible in terms of both their runtime and memory requirements. + We revisit the Adaptive Vector Quantization (AVQ) problem and present +algorithms that find optimal solutions with asymptotically improved time and +space complexity. We also present an even faster near-optimal algorithm for +large inputs. Our experiments show our algorithms may open the door to using +AVQ more extensively in a variety of machine learning applications. + +
+
+
+
+
+ + ☆ DogSurf: Quadruped Robot Capable of GRU-based Surface Recognition for + Blind Person Navigation + + +
+ This paper introduces DogSurf - a new approach to using quadruped robots to help visually impaired people navigate the real world. The presented method allows the quadruped robot to detect slippery surfaces, and to use audio and haptic feedback to inform the user when to stop. A state-of-the-art GRU-based neural network architecture with a mean accuracy of 99.925% was proposed for the task of multiclass surface classification for quadruped robots. A dataset was collected on a Unitree Go1 Edu robot. The dataset and code have been posted to the public domain. + +
+
+ comment: This paper has been accepted for publication at the HRI2024 + conference +
+
+
+
+
+ + ☆ Learning solutions of parametric Navier-Stokes with physics-informed + neural networks + + +
+ We leverage Physics-Informed Neural Networks (PINNs) to learn solution
+functions of the parametric Navier-Stokes Equations (NSE). Our proposed
+approach results in a feasible optimization problem setup that bypasses PINNs'
+limitations in converging to solutions of highly nonlinear parametric PDEs like
+the NSE. We consider the parameter(s) of interest as inputs of PINNs along with
+spatio-temporal coordinates, and train PINNs on generated numerical solutions
+of the parametric PDEs for instances of the parameters. We perform experiments
+on the classical 2D flow past a cylinder problem, aiming to learn velocity and
+pressure functions over a range of Reynolds numbers as the parameter of
+interest. Providing training data from generated numerical simulations allows
+for interpolation of the solution functions over a range of parameters. We
+therefore compare PINNs with unconstrained conventional Neural Networks (NN) on
+this problem setup to investigate the effectiveness of including the PDE
+regularization in the loss function. We show that our proposed approach yields
+optimized PINN models that learn the solution functions while ensuring that
+flow predictions are in line with the conservation laws of mass and momentum.
+Our results show that the PINN yields accurate predictions of gradients
+compared to the NN model; this is clearly visible in the predicted vorticity
+fields, given that neither of these models was trained on vorticity labels.
+
+
+
+
+
+
+ + ☆ A Comparative Analysis of Microrings Based Incoherent Photonic GEMM + Accelerators + + +
+ Several microring resonator (MRR) based analog photonic architectures have
+been proposed to accelerate general matrix-matrix multiplications (GEMMs) in
+deep neural networks with exceptional throughput and energy efficiency. To
+implement GEMM functions, these MRR-based architectures, in general, manipulate
+optical signals in five different ways: (i) Splitting (copying) of multiple
+optical signals to achieve a certain fan-out, (ii) Aggregation (multiplexing)
+of multiple optical signals to achieve a certain fan-in, (iii) Modulation of
+optical signals to imprint input values onto analog signal amplitude, (iv)
+Weighting of modulated optical signals to achieve analog input-weight
+multiplication, (v) Summation of optical signals. The MRR-based GEMM
+accelerators undertake the first four ways of signal manipulation in an
+arbitrary order, ignoring the possible impact of the order of these
+manipulations on their performance. In this paper, we conduct a detailed
+analysis of accelerator organizations with three different orders of these
+manipulations: (1) Modulation-Aggregation-Splitting-Weighting (MASW), (2)
+Aggregation-Splitting-Modulation-Weighting (ASMW), and (3)
+Splitting-Modulation-Weighting-Aggregation (SMWA). We show that these
+organizations affect the crosstalk noise and optical signal losses to different
+degrees, giving the organizations different levels of processing parallelism at
+the circuit level, and different magnitudes of throughput and energy-area
+efficiency at the system level. Our evaluation results for four CNN models show
+that the SMWA organization achieves up to 4.4$\times$, 5$\times$, and
+5.2$\times$ better throughput, energy efficiency, and area-energy efficiency,
+respectively, compared to the ASMW and MASW organizations on average.
+
+
+
+ comment: Accepted at ISQED 2024 +
+
+
+
+
+ + ☆ A Multi-step Loss Function for Robust Learning of the Dynamics in + Model-based Reinforcement Learning + + +
+ In model-based reinforcement learning, most algorithms rely on simulating
+trajectories from one-step models of the dynamics learned on data. A critical
+challenge of this approach is the compounding of one-step prediction errors as
+the length of the trajectory grows. In this paper, we tackle this issue by
+using a multi-step objective to train one-step models. Our objective is a
+weighted sum of the mean squared error (MSE) loss at various future horizons.
+We find that this new loss is particularly useful when the data is noisy
+(additive Gaussian noise in the observations), which is often the case in
+real-life environments. To support the multi-step loss, we first study its
+properties in two tractable cases: (i) a one-dimensional linear system, and
+(ii) a two-parameter non-linear system. Second, we show in a variety of tasks
+(environments or datasets) that the models learned with this loss achieve a
+significant improvement in terms of the averaged R2 score on future prediction
+horizons. Finally, in the pure batch reinforcement learning setting, we
+demonstrate that one-step models serve as strong baselines when dynamics are
+deterministic, while multi-step models are more advantageous in the presence
+of noise, highlighting the potential of our approach in real-world
+applications.
+
+
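+ A short sketch of the objective described above: the one-step model is
+rolled out on its own predictions and penalized with a weighted MSE at each
+horizon. The geometric weighting suggested in the comment below is an
+assumption, not the paper's exact scheme.
+
+```python
+import torch
+
+def multi_step_loss(model, s0, actions, targets, weights):
+    # Re-feed the model its own predictions and accumulate weighted MSE
+    # against the observed future states, e.g. weights = [0.5**h for h ...].
+    loss, s = 0.0, s0
+    for a, target, w in zip(actions, targets, weights):
+        s = model(s, a)  # one-step prediction, fed back in at the next step
+        loss = loss + w * torch.mean((s - target) ** 2)
+    return loss
+```
+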
+
+
+
+
+ + ☆ SafEDMD: A certified learning architecture tailored to data-driven + control of nonlinear dynamical systems + + +
+ The Koopman operator serves as the theoretical backbone for machine learning +of dynamical control systems, where the operator is heuristically approximated +by extended dynamic mode decomposition (EDMD). In this paper, we propose +Stability- and certificate-oriented EDMD (SafEDMD): a novel EDMD-based learning +architecture which comes along with rigorous certificates, resulting in a +reliable surrogate model generated in a data-driven fashion. To ensure +trustworthiness of SafEDMD, we derive proportional error bounds, which vanish +at the origin and are tailored for control tasks, leading to certified +controller design based on semi-definite programming. We illustrate the +developed machinery by means of several benchmark examples and highlight the +advantages over state-of-the-art methods. + +
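+ For context, a minimal numpy sketch of plain EDMD on snapshot pairs
+(x_t, x_{t+1}); SafEDMD's certificates and control-oriented error bounds go
+well beyond this, and the dictionary below is an arbitrary choice:
+
+```python
+import numpy as np
+
+def dictionary(X):
+    # Observables for 2-D states stacked column-wise, X of shape (2, m).
+    x1, x2 = X
+    return np.vstack([x1, x2, x1**2, x1 * x2, x2**2])
+
+def edmd(X, Y):
+    # Least-squares Koopman approximation K with Psi(Y) ~= K Psi(X).
+    PX, PY = dictionary(X), dictionary(Y)
+    return PY @ np.linalg.pinv(PX)
+```
+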
+
+
+
+
+ + ☆ Less is KEN: a Universal and Simple Non-Parametric Pruning Algorithm for + Large Language Models + + +
+ Neural network pruning has become increasingly crucial due to the complexity
+of neural network models and their widespread use in various fields. Existing
+pruning algorithms often suffer from limitations such as architecture
+specificity, excessive complexity, and reliance on intricate calculations,
+rendering them impractical for real-world applications. In this paper, we
+propose KEN: a straightforward, universal and unstructured pruning algorithm
+based on Kernel Density Estimation (KDE). KEN aims to construct optimized
+transformer models by selectively preserving the most significant parameters
+while restoring others to their pre-training state. This approach maintains
+model performance while allowing storage of only the optimized subnetwork,
+leading to significant memory savings. Extensive evaluations on seven
+transformer models demonstrate that KEN achieves equal or better performance
+than the original models with a minimum parameter reduction of 25%. In-depth
+comparisons against other pruning and PEFT algorithms confirm KEN's
+effectiveness. Furthermore, we introduce KEN_viz, an explainable tool that
+visualizes the optimized model composition and the subnetwork selected by KEN.
+
+
+
+
+
+
+ + ☆ Boosting Long-Delayed Reinforcement Learning with Auxiliary + Short-Delayed Task + + +
+ Reinforcement learning is challenging in delayed scenarios, a common
+real-world situation where observations and interactions occur with delays.
+State-of-the-art (SOTA) state-augmentation techniques suffer either from
+state-space explosion as the number of delayed steps grows, or from performance
+degradation in stochastic environments. To address these challenges, our novel
+Auxiliary-Delayed Reinforcement Learning (AD-RL) leverages an auxiliary
+short-delayed task to accelerate learning on a long-delayed task without
+compromising performance in stochastic environments. Specifically, AD-RL
+learns the value function in the short-delayed task and then employs it with
+bootstrapping and policy improvement techniques in the long-delayed task. We
+theoretically show that this can greatly reduce the sample complexity compared
+to directly learning on the original long-delayed task. On deterministic and
+stochastic benchmarks, our method remarkably outperforms the SOTA methods in
+both sample efficiency and policy performance.
+
+
+
+
+
+
+ + ☆ Enhancing Neural Subset Selection: Integrating Background Information + into Set Representations + + +
+ Learning neural subset selection tasks, such as compound selection in
+AI-aided drug discovery, has become increasingly pivotal across diverse
+applications. The existing methodologies in the field primarily concentrate on
+constructing models that capture the relationship between utility function
+values and subsets within their respective supersets. However, these approaches
+tend to overlook the valuable information contained within the superset when
+utilizing neural networks to model set functions. In this work, we address this
+oversight by adopting a probabilistic perspective. Our theoretical findings
+demonstrate that when the target value is conditioned on both the input set and
+subset, it is essential to incorporate an \textit{invariant sufficient
+statistic} of the superset into the subset of interest for effective learning.
+This ensures that the output value remains invariant to permutations of the
+subset and its corresponding superset, enabling identification of the specific
+superset from which the subset originated. Motivated by these insights, we
+propose a simple yet effective information aggregation module designed to merge
+the representations of subsets and supersets from a permutation invariance
+perspective. Comprehensive empirical evaluations across diverse tasks and
+datasets validate the enhanced efficacy of our approach over conventional
+methods, underscoring the practicality and potency of our proposed strategies
+in real-world contexts.
+
+
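+ A minimal PyTorch sketch of an aggregation module of this kind: sum-pooling
+yields a permutation-invariant statistic of the superset, which is then
+concatenated with the pooled subset representation (layer sizes and layout
+are assumptions):
+
+```python
+import torch
+import torch.nn as nn
+
+class SubsetWithSupersetContext(nn.Module):
+    def __init__(self, d_in=16, d_hid=64):
+        super().__init__()
+        self.phi = nn.Sequential(nn.Linear(d_in, d_hid), nn.ReLU())
+        self.rho = nn.Sequential(nn.Linear(2 * d_hid, d_hid), nn.ReLU(),
+                                 nn.Linear(d_hid, 1))
+
+    def forward(self, superset, subset_mask):
+        # superset: (batch, n, d_in); subset_mask: (batch, n) in {0, 1}
+        h = self.phi(superset)
+        z_super = h.sum(dim=1)  # invariant statistic of the whole superset
+        z_sub = (h * subset_mask.unsqueeze(-1)).sum(dim=1)
+        return self.rho(torch.cat([z_super, z_sub], dim=-1))
+```
+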
+
+
+
+
+ + ☆ Just Cluster It: An Approach for Exploration in High-Dimensions using + Clustering and Pre-Trained Representations + + +
+ In this paper, we adopt a representation-centric perspective on exploration
+in reinforcement learning, viewing exploration fundamentally as a density
+estimation problem. We investigate the effectiveness of clustering
+representations for exploration in 3-D environments, based on the observation
+that the importance of pixel changes between transitions is less pronounced in
+3-D environments compared to 2-D environments, where pixel changes between
+transitions are typically distinct and significant. We propose a method that
+performs episodic and global clustering on random representations and on
+pre-trained DINO representations to count states, i.e., to estimate
+pseudo-counts. Surprisingly, even random features can be clustered effectively
+to count states in 3-D environments; however, when these environments become
+visually more complex, pre-trained DINO representations are more effective,
+thanks to the inductive biases in the pre-trained representations. Overall,
+this presents a pathway for integrating pre-trained biases into exploration. We
+evaluate our approach on the VizDoom and Habitat environments, demonstrating
+that our method surpasses other well-known exploration methods in these
+settings.
+
+
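+ A compact sketch of the counting idea, assuming feature vectors (random or
+DINO) have already been extracted; the cluster granularity and the 1/sqrt(N)
+bonus form are assumptions:
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def pseudo_count_bonus(embeddings, n_clusters=64, scale=1.0):
+    # Cluster the visited states' features, count visits per cluster,
+    # and hand out a count-based exploration bonus per state.
+    km = KMeans(n_clusters=n_clusters, n_init=4).fit(embeddings)
+    counts = np.bincount(km.labels_, minlength=n_clusters)
+    return scale / np.sqrt(counts[km.labels_])
+```
+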
+
+
+
+
+ + ☆ Sociolinguistically Informed Interpretability: A Case Study on Hinglish + Emotion Classification EACL + + +
+ Emotion classification is a challenging task in NLP due to the inherent
+idiosyncratic and subjective nature of linguistic expression, especially with
+code-mixed data. Pre-trained language models (PLMs) have achieved high
+performance for many tasks and languages, but it remains to be seen whether
+these models learn and are robust to the differences in emotional expression
+across languages. Sociolinguistic studies have shown that Hinglish speakers
+switch to Hindi when expressing negative emotions and to English when
+expressing positive emotions. To understand whether language models can learn
+these associations, we study the effect of language on emotion prediction
+across 3 PLMs on a Hinglish emotion classification dataset. Using LIME and
+token-level language ID, we find that models do learn these associations
+between language choice and emotional expression. Moreover, having code-mixed
+data present in the pre-training can augment that learning when task-specific
+data is scarce. We also conclude from the misclassifications that the models
+may overgeneralise this heuristic to other infrequent examples where this
+sociolinguistic phenomenon does not apply.
+
+
+
+ comment: 5 pages, Accepted to SIGTYP 2024 @ EACL +
+
+
+
+
+ + ☆ Constrained Decoding for Cross-lingual Label Projection ICLR 2024 + + +
+ Zero-shot cross-lingual transfer utilizing multilingual LLMs has become a
+popular learning paradigm for low-resource languages with no labeled training
+data. However, for NLP tasks that involve fine-grained predictions on words and
+phrases, the performance of zero-shot cross-lingual transfer learning lags far
+behind supervised fine-tuning methods. Therefore, it is common to exploit
+translation and label projection to further improve the performance by (1)
+translating training data that is available in a high-resource language (e.g.,
+English) together with the gold labels into low-resource languages, and/or (2)
+translating test data in low-resource languages to a high-resource language to
+run inference on, then projecting the predicted span-level labels back onto the
+original test data. However, state-of-the-art marker-based label projection
+methods suffer from translation quality degradation due to the extra label
+markers injected in the input to the translation model. In this work, we
+explore a new direction that leverages constrained decoding for label
+projection to overcome the aforementioned issues. Our new method can not only
+preserve the quality of translated texts but also has the versatility of being
+applicable to both the translate-training and translate-test strategies. This
+versatility is crucial, as our experiments reveal that translating test data
+can lead to a considerable boost in performance compared to translating only
+training data. We evaluate on two cross-lingual transfer tasks, namely Named
+Entity Recognition and Event Argument Extraction, spanning 20 languages. The
+results demonstrate that our approach outperforms the state-of-the-art
+marker-based method by a large margin and also shows better performance than
+other label projection methods that rely on external word alignment.
+
+
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ☆ How Free is Parameter-Free Stochastic Optimization? + + +
+ We study the problem of parameter-free stochastic optimization, inquiring
+whether, and under what conditions, fully parameter-free methods exist: these
+are methods that achieve convergence rates competitive with optimally tuned
+methods, without requiring significant knowledge of the true problem
+parameters. Existing parameter-free methods can only be considered
+``partially'' parameter-free, as they require some non-trivial knowledge of the
+true problem parameters, such as a bound on the stochastic gradient norms, a
+bound on the distance to a minimizer, etc. In the non-convex setting, we
+demonstrate that a simple hyperparameter search technique results in a fully
+parameter-free method that outperforms more sophisticated state-of-the-art
+algorithms. We also provide a similar result in the convex setting with access
+to noisy function values under mild noise assumptions. Finally, assuming only
+access to stochastic gradients, we establish a lower bound that renders fully
+parameter-free stochastic convex optimization infeasible, and provide a method
+which is (partially) parameter-free up to the limit indicated by our lower
+bound.
+
+
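+ An illustration of the flavor of search referred to above (not the paper's
+exact procedure): run SGD once per log-spaced step size and keep the iterate
+with the best observed function value, so no problem constants need to be
+known in advance:
+
+```python
+import numpy as np
+
+def sgd(f_grad, x0, lr, n_steps, rng):
+    x = x0.copy()
+    for _ in range(n_steps):
+        x -= lr * f_grad(x, rng)  # stochastic gradient oracle
+    return x
+
+def grid_search_sgd(f, f_grad, x0, n_steps, rng):
+    # Log-spaced candidate step sizes; selection uses (possibly noisy)
+    # function values, as in the convex setting discussed above.
+    best_val, best_x = np.inf, x0
+    for lr in np.logspace(-4, 1, 11):
+        x = sgd(f_grad, x0, lr, n_steps, rng)
+        val = f(x)
+        if val < best_val:
+            best_val, best_x = val, x
+    return best_x
+```
+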
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ Towards Eliminating Hard Label Constraints in Gradient Inversion Attacks ICLR2024 + + +
+ Gradient inversion attacks aim to reconstruct local training data from
+intermediate gradients exposed in the federated learning framework. Despite
+successful attacks, all previous methods, starting from reconstructing a single
+data point and then relaxing the single-image limit to the batch level, are
+only tested under hard label constraints. Even for single-image reconstruction,
+we still lack an analysis-based algorithm to recover augmented soft labels. In
+this work, we shift the focus from enlarging the batch size to investigating
+the hard label constraints, considering the more realistic circumstance where
+label smoothing and mixup techniques are used in the training process. In
+particular, we are the first to initiate a novel algorithm to simultaneously
+recover the ground-truth augmented label and the input feature of the last
+fully-connected layer from single-input gradients, and we provide a necessary
+condition for any analytical label recovery method. Extensive experiments
+testify to the label recovery accuracy, as well as the benefits for subsequent
+image reconstruction. We believe soft labels in classification tasks are worth
+further attention in gradient inversion attacks.
+
+
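+ For intuition, the classical identity behind analytic label recovery: with
+softmax cross-entropy, the last-layer bias (equivalently, logit) gradient of
+a single example equals p - y, so a smoothed or mixed-up soft label can be
+read off once the softmax output p is reconstructed. A runnable check of the
+identity (the paper's algorithm goes further and also recovers the last
+layer's input feature):
+
+```python
+import torch
+import torch.nn.functional as F
+
+logits = torch.randn(5, requires_grad=True)
+y = torch.tensor([0.05, 0.05, 0.80, 0.05, 0.05])  # label-smoothed target
+loss = -(y * F.log_softmax(logits, dim=0)).sum()  # cross-entropy
+loss.backward()
+
+p = F.softmax(logits.detach(), dim=0)
+y_recovered = p - logits.grad                     # equals y exactly
+```
+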
+
+
 comment: ICLR2024 poster. The prior submission version had a bug in the
+ image reconstruction implementation, which has been corrected without harm
+ to the main conclusions.
+
+
+
+
+
+ + ☆ Good Teachers Explain: Explanation-Enhanced Knowledge Distillation + + +
+ Knowledge Distillation (KD) has proven effective for compressing large +teacher models into smaller student models. While it is well known that student +models can achieve similar accuracies as the teachers, it has also been shown +that they nonetheless often do not learn the same function. It is, however, +often highly desirable that the student's and teacher's functions share similar +properties such as basing the prediction on the same input features, as this +ensures that students learn the 'right features' from the teachers. In this +work, we explore whether this can be achieved by not only optimizing the +classic KD loss but also the similarity of the explanations generated by the +teacher and the student. Despite the idea being simple and intuitive, we find +that our proposed 'explanation-enhanced' KD (e$^2$KD) (1) consistently provides +large gains in terms of accuracy and student-teacher agreement, (2) ensures +that the student learns from the teacher to be right for the right reasons and +to give similar explanations, and (3) is robust with respect to the model +architectures, the amount of training data, and even works with 'approximate', +pre-computed explanations. + +
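+ A minimal sketch of an explanation-enhanced KD objective of this kind: the
+usual temperature-softened KD term plus a similarity term between teacher and
+student explanation maps. Which explanation method feeds expl_s / expl_t
+(e.g. Grad-CAM) and the cosine similarity below are assumptions, not the
+paper's exact choices.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def e2kd_loss(student_logits, teacher_logits, expl_s, expl_t,
+              T=4.0, lam=1.0):
+    # Standard KD on softened logits ...
+    kd = F.kl_div(F.log_softmax(student_logits / T, dim=1),
+                  F.softmax(teacher_logits / T, dim=1),
+                  reduction="batchmean") * T * T
+    # ... plus agreement of the (batch, H, W) explanation maps.
+    sim = F.cosine_similarity(expl_s.flatten(1), expl_t.flatten(1), dim=1)
+    return kd + lam * (1.0 - sim).mean()
+```
+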
+
+ comment: 21 pages, 12 figures +
+
+
+
+
+ + ☆ Feature-Action Design Patterns for Storytelling Visualizations with Time + Series Data + + +
+ We present a method to create storytelling visualizations with time series
+data. Many personal decisions nowadays rely on regular access to dynamic data,
+as we have seen during the COVID-19 pandemic. It is thus desirable to construct
+storytelling visualizations for dynamic data that is selected by an individual
+for a specific context. Because of the need to tell data-dependent stories,
+predefined storyboards based on known data cannot easily accommodate dynamic
+data, nor scale up to many different individuals and contexts. Motivated
+initially by the need to communicate time series data during the COVID-19
+pandemic, we developed a novel computer-assisted method for meta-authoring of
+stories, which enables the design of storyboards that include feature-action
+patterns in anticipation of potential features that may appear in dynamically
+arriving or selected data. In addition to meta-storyboards involving COVID-19
+data, we also present storyboards for telling stories about progress in a
+machine learning workflow. Our approach is complementary to traditional methods
+for authoring storytelling visualizations, and provides an efficient means to
+construct data-dependent storyboards for different data streams in similar
+contexts.
+
+
+
+
+
+
+ + ☆ Discovering interpretable models of scientific image data with deep + learning + + +
+ How can we find interpretable, domain-appropriate models of natural phenomena +given some complex, raw data such as images? Can we use such models to derive +scientific insight from the data? In this paper, we propose some methods for +achieving this. In particular, we implement disentangled representation +learning, sparse deep neural network training and symbolic regression, and +assess their usefulness in forming interpretable models of complex image data. +We demonstrate their relevance to the field of bioimaging using a well-studied +test problem of classifying cell states in microscopy data. We find that such +methods can produce highly parsimonious models that achieve $\sim98\%$ of the +accuracy of black-box benchmark models, with a tiny fraction of the complexity. +We explore the utility of such interpretable models in producing scientific +explanations of the underlying biological phenomenon. + +
+
+ comment: 33 pages (including appendices), 27 figures +
+
+
+
+
+ + ☆ Infrared Spectra Prediction for Diazo Groups Utilizing a Machine + Learning Approach with Structural Attention Mechanism + + +
+ Infrared (IR) spectroscopy is a pivotal technique in chemical research for
+elucidating molecular structures and dynamics through vibrational and
+rotational transitions. However, the intricate molecular fingerprints
+characterized by unique vibrational and rotational patterns present substantial
+analytical challenges. Here, we present a machine learning approach employing a
+Structural Attention Mechanism tailored to enhance the prediction and
+interpretation of infrared spectra, particularly for diazo compounds. Our model
+distinguishes itself by homing in on chemical information proximal to
+functional groups, thereby significantly bolstering the accuracy, robustness,
+and interpretability of spectral predictions. This method not only demystifies
+the correlations between infrared spectral features and molecular structures
+but also offers a scalable and efficient paradigm for dissecting complex
+molecular interactions.
+
+
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ Non-Stationary Latent Auto-Regressive Bandits + + +
+ We consider the stochastic multi-armed bandit problem with non-stationary +rewards. We present a novel formulation of non-stationarity in the environment +where changes in the mean reward of the arms over time are due to some unknown, +latent, auto-regressive (AR) state of order $k$. We call this new environment +the latent AR bandit. Different forms of the latent AR bandit appear in many +real-world settings, especially in emerging scientific fields such as +behavioral health or education where there are few mechanistic models of the +environment. If the AR order $k$ is known, we propose an algorithm that +achieves $\tilde{O}(k\sqrt{T})$ regret in this setting. Empirically, our +algorithm outperforms standard UCB across multiple non-stationary environments, +even if $k$ is mis-specified. + +
+
+
+
+
+ + ☆ High-dimensional Bayesian Optimization via Covariance Matrix Adaptation + Strategy + + +
+ Bayesian Optimization (BO) is an effective method for finding the global +optimum of expensive black-box functions. However, it is well known that +applying BO to high-dimensional optimization problems is challenging. To +address this issue, a promising solution is to use a local search strategy that +partitions the search domain into local regions with high likelihood of +containing the global optimum, and then use BO to optimize the objective +function within these regions. In this paper, we propose a novel technique for +defining the local regions using the Covariance Matrix Adaptation (CMA) +strategy. Specifically, we use CMA to learn a search distribution that can +estimate the probabilities of data points being the global optimum of the +objective function. Based on this search distribution, we then define the local +regions consisting of data points with high probabilities of being the global +optimum. Our approach serves as a meta-algorithm as it can incorporate existing +black-box BO optimizers, such as BO, TuRBO, and BAxUS, to find the global +optimum of the objective function within our derived local regions. We evaluate +our proposed method on various benchmark synthetic and real-world problems. The +results demonstrate that our method outperforms existing state-of-the-art +techniques. + +
+
+ comment: 31 pages, 17 figures +
+
+
+
+
+ + ☆ Intent-based Prompt Calibration: Enhancing prompt optimization with + synthetic boundary cases + + +
+ Prompt engineering is a challenging and important task due to the high
+sensitivity of Large Language Models (LLMs) to the given prompt and the
+inherent ambiguity of a textual task instruction. Automatic prompt engineering
+is essential to achieve optimized performance from LLMs. Recent studies have
+demonstrated the capabilities of LLMs to automatically conduct prompt
+engineering by employing a meta-prompt that incorporates the outcomes of the
+last trials and proposes an improved prompt. However, this requires a
+high-quality benchmark to compare different prompts, which is difficult and
+expensive to acquire in many real-world use cases. In this work, we introduce a
+new method for automatic prompt engineering, using a calibration process that
+iteratively refines the prompt to match the user's intent. During the
+optimization process, the system jointly generates synthetic data of boundary
+use cases and optimizes the prompt according to the generated dataset. We
+demonstrate the effectiveness of our method with respect to strong proprietary
+models on real-world tasks such as moderation and generation. Our method
+outperforms state-of-the-art methods with a limited number of annotated
+samples. Furthermore, we validate the advantages of each of the system's key
+components. Our system is built in a modular way, facilitating easy adaptation
+to other tasks. The code is available
+$\href{https://github.com/Eladlev/AutoPrompt}{here}$.
+
+
+
+
+
+
+ + ☆ Transcending Adversarial Perturbations: Manifold-Aided Adversarial + Examples with Legitimate Semantics + + +
+ Deep neural networks are significantly vulnerable to adversarial examples
+manipulated by malicious tiny perturbations. Although most conventional
+adversarial attacks ensure the visual imperceptibility between adversarial
+examples and corresponding raw images by minimizing their geometric distance,
+these constraints on geometric distance lead to limited attack
+transferability, inferior visual quality, and human-imperceptible
+interpretability. In this paper, we propose a supervised
+semantic-transformation generative model to generate adversarial examples with
+real and legitimate semantics, wherein an unrestricted adversarial manifold
+containing continuous semantic variations is constructed for the first time to
+realize a legitimate transition from non-adversarial examples to adversarial
+ones. Comprehensive experiments on MNIST and industrial defect datasets show
+that our adversarial examples not only exhibit better visual quality but also
+achieve superior attack transferability and more effective explanations of
+model vulnerabilities, indicating their great potential as generic adversarial
+examples. The code and pre-trained models are available at
+https://github.com/shuaili1027/MAELS.git.
+
+
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Cross-Domain Few-Shot Object Detection via Enhanced Open-Set Object + Detector + + +
+ This paper addresses the challenge of cross-domain few-shot object detection
+(CD-FSOD), aiming to develop an accurate object detector for novel domains with
+minimal labeled examples. Transformer-based open-set detectors, e.g.,
+DE-ViT~\cite{zhang2023detect}, have excelled in both open-vocabulary object
+detection and traditional few-shot object detection, detecting categories
+beyond those seen during training. This naturally raises two key questions: 1)
+can such open-set detection methods easily generalize to CD-FSOD? 2) If not,
+how can the results of open-set methods be enhanced when faced with significant
+domain gaps? To address the first question, we introduce several metrics to
+quantify domain variances and establish a new CD-FSOD benchmark with diverse
+domain metric values. Some State-Of-The-Art (SOTA) open-set object detection
+methods are evaluated on this benchmark, with evident performance degradation
+observed across out-of-domain datasets. This indicates the failure of adopting
+open-set detectors directly for CD-FSOD. Subsequently, to overcome the
+performance degradation issue and to answer the second question, we endeavor to
+enhance the vanilla DE-ViT. With several novel components, including
+finetuning, a learnable prototype module, and a lightweight attention module,
+we present an improved Cross-Domain Vision Transformer for CD-FSOD (CD-ViTO).
+Experiments show that our CD-ViTO achieves impressive results on both
+out-of-domain and in-domain target datasets, establishing new SOTAs for both
+CD-FSOD and FSOD. All the datasets, codes, and models will be released to the
+community.
+
+
+
+
+
+
+ + ☆ Dual Lagrangian Learning for Conic Optimization + + +
+ This paper presents Dual Lagrangian Learning (DLL), a principled learning +methodology that combines conic duality theory with the representation power of +ML models. DLL leverages conic duality to provide dual-feasible solutions, and +therefore valid Lagrangian dual bounds, for parametric linear and nonlinear +conic optimization problems. The paper introduces differentiable conic +projection layers, a systematic dual completion procedure, and a +self-supervised learning framework. The effectiveness of DLL is demonstrated on +linear and nonlinear parametric optimization problems for which DLL provides +valid dual bounds within 0.5% of optimality. + +
+
+
+
+
+ + ☆ Visual Text Meets Low-level Vision: A Comprehensive Survey on Visual + Text Processing + + +
+ Visual text, a pivotal element in both document and scene images, speaks +volumes and attracts significant attention in the computer vision domain. +Beyond visual text detection and recognition, the field of visual text +processing has experienced a surge in research, driven by the advent of +fundamental generative models. However, challenges persist due to the unique +properties and features that distinguish text from general objects. Effectively +leveraging these unique textual characteristics is crucial in visual text +processing, as observed in our study. In this survey, we present a +comprehensive, multi-perspective analysis of recent advancements in this field. +Initially, we introduce a hierarchical taxonomy encompassing areas ranging from +text image enhancement and restoration to text image manipulation, followed by +different learning paradigms. Subsequently, we conduct an in-depth discussion +of how specific textual features such as structure, stroke, semantics, style, +and spatial context are seamlessly integrated into various tasks. Furthermore, +we explore available public datasets and benchmark the reviewed methods on +several widely-used datasets. Finally, we identify principal challenges and +potential avenues for future research. Our aim is to establish this survey as a +fundamental resource, fostering continued exploration and innovation in the +dynamic area of visual text processing. + +
+
+
+
+
+ + ☆ Preference-Conditioned Language-Guided Abstraction + + +
+ Learning from demonstrations is a common way for users to teach robots, but +it is prone to spurious feature correlations. Recent work constructs state +abstractions, i.e. visual representations containing task-relevant features, +from language as a way to perform more generalizable learning. However, these +abstractions also depend on a user's preference for what matters in a task, +which may be hard to describe or infeasible to exhaustively specify using +language alone. How do we construct abstractions to capture these latent +preferences? We observe that how humans behave reveals how they see the world. +Our key insight is that changes in human behavior inform us that there are +differences in preferences for how humans see the world, i.e. their state +abstractions. In this work, we propose using language models (LMs) to query for +those preferences directly given knowledge that a change in behavior has +occurred. In our framework, we use the LM in two ways: first, given a text +description of the task and knowledge of behavioral change between states, we +query the LM for possible hidden preferences; second, given the most likely +preference, we query the LM to construct the state abstraction. In this +framework, the LM is also able to ask the human directly when uncertain about +its own estimate. We demonstrate our framework's ability to construct effective +preference-conditioned abstractions in simulated experiments, a user study, as +well as on a real Spot robot performing mobile manipulation tasks. + +
+
+ comment: HRI 2024 +
+
+
+
+
+ + ☆ Markov Persuasion Processes: Learning to Persuade from Scratch + + +
+ In Bayesian persuasion, an informed sender strategically discloses
+information to a receiver so as to persuade them to undertake desirable
+actions. Recently, growing attention has been devoted to settings in which
+the sender and receivers interact sequentially. In particular, Markov
+persuasion processes (MPPs) have been introduced to capture sequential
+scenarios where a sender faces a stream of myopic receivers in a Markovian
+environment. The MPPs studied so far in the literature suffer from issues that
+prevent them from being fully operational in practice, e.g., they assume that
+the sender knows the receivers' rewards. We fix such issues by addressing MPPs
+where the sender has no knowledge about the environment. We design a learning
+algorithm for the sender, working with partial feedback. We prove that its
+regret with respect to an optimal information-disclosure policy grows
+sublinearly in the number of episodes, as does the loss in persuasiveness
+accumulated while learning. Moreover, we provide a lower bound for our setting
+matching the guarantees of our algorithm.
+
+
+
+
+
+
+ + ☆ Learning to Abstract Visuomotor Mappings using Meta-Reinforcement + Learning + + +
+ We investigated the human capacity to acquire multiple visuomotor mappings
+for de novo skills. Using a grid navigation paradigm, we tested whether
+contextual cues, implemented as different "grid worlds", allow participants to
+learn two distinct key-mappings more efficiently. Our results indicate that
+when contextual information is provided, task performance is significantly
+better. The same held true for meta-reinforcement learning agents that differed
+in whether or not they received contextual information when performing the
+task. We evaluated their accuracy in predicting human performance in the task
+and analyzed their internal representations. The results indicate that
+contextual cues allow the formation of separate representations in space and
+time when using different visuomotor mappings, whereas their absence favors
+sharing a single representation. While both strategies can allow learning of
+multiple visuomotor mappings, we showed that contextual cues provide a
+computational advantage in terms of how many mappings can be learned.
+
+
+
+
+
+
+ + ☆ Probabilistic Actor-Critic: Learning to Explore with PAC-Bayes + Uncertainty + + +
+ We introduce Probabilistic Actor-Critic (PAC), a novel reinforcement learning +algorithm with improved continuous control performance thanks to its ability to +mitigate the exploration-exploitation trade-off. PAC achieves this by +seamlessly integrating stochastic policies and critics, creating a dynamic +synergy between the estimation of critic uncertainty and actor training. The +key contribution of our PAC algorithm is that it explicitly models and infers +epistemic uncertainty in the critic through Probably Approximately +Correct-Bayesian (PAC-Bayes) analysis. This incorporation of critic uncertainty +enables PAC to adapt its exploration strategy as it learns, guiding the actor's +decision-making process. PAC compares favorably against fixed or pre-scheduled +exploration schemes of the prior art. The synergy between stochastic policies +and critics, guided by PAC-Bayes analysis, represents a fundamental step +towards a more adaptive and effective exploration strategy in deep +reinforcement learning. We report empirical evaluations demonstrating PAC's +enhanced stability and improved performance over the state of the art in +diverse continuous control problems. + +
+
+ comment: 18 pages, 4 figures, 7 tables +
+
+
+
+
+ + ☆ Multi-Lingual Malaysian Embedding: Leveraging Large Language Models for + Semantic Representations + + +
+ In this work, we present a comprehensive exploration of finetuning Malaysian +language models, specifically Llama2 and Mistral, on embedding tasks involving +negative and positive pairs. We release two distinct models tailored for +Semantic Similarity and Retrieval-Augmented Generation (RAG). + For Semantic Similarity, our 600 million parameter Llama2 model outperforms +OpenAI text-embedding-ada-002 across all recall@k metrics for b.cari.com.my, +c.cari.com.my, Malay news, and Malaysian Twitter test sets. + In the realm of RAG models, our approach proves competitive with OpenAI +text-embedding-ada-002 in the Malaysian context. Notably, our 2 billion +parameter Llama2 model achieves superior Recall@5, Recall@10 for the "Melayu" +keyword research papers dataset and excels in Recall@3, Recall@5, and Recall@10 +for the lom.agc.gov.my dataset. + These findings underscore the effectiveness of our finetuning strategy and +highlight the performance gains in both Semantic Similarity and RAG tasks. + All models released at +https://huggingface.co/collections/mesolitica/malaysian-embedding-6523612bfe5881ad35f81b99 + +
+
+
+
+
+ + ☆ EasyInstruct: An Easy-to-use Instruction Processing Framework for Large + Language Models + + +
+ In recent years, instruction tuning has gained increasing attention and +emerged as a crucial technique to enhance the capabilities of Large Language +Models (LLMs). To construct high-quality instruction datasets, many instruction +processing approaches have been proposed, aiming to achieve a delicate balance +between data quantity and data quality. Nevertheless, due to inconsistencies +that persist among various instruction processing methods, there is no standard +open-source instruction processing implementation framework available for the +community, which hinders practitioners from further developing and advancing. +To facilitate instruction processing research and development, we present +EasyInstruct, an easy-to-use instruction processing framework for LLMs, which +modularizes instruction generation, selection, and prompting, while also +considering their combination and interaction. EasyInstruct is publicly +released and actively maintained at https://github.com/zjunlp/EasyInstruct, +along with a running demo App at +https://huggingface.co/spaces/zjunlp/EasyInstruct for quick-start, calling for +broader research centered on instruction data. + +
+
+ comment: Ongoing work; the project website is at + https://zjunlp.github.io/project/EasyInstruct, code is at + https://github.com/zjunlp/EasyInstruct, demo is at + https://huggingface.co/spaces/zjunlp/EasyInstruct +
+
+
+
+
+ + ☆ Cooperative Learning with Gaussian Processes for Euler-Lagrange Systems + Tracking Control under Switching Topologies + + +
+ This work presents an innovative learning-based approach to tackle the +tracking control problem of Euler-Lagrange multi-agent systems with partially +unknown dynamics operating under switching communication topologies. The +approach leverages a correlation-aware cooperative algorithm framework built +upon Gaussian process regression, which adeptly captures inter-agent +correlations for uncertainty predictions. A standout feature is its exceptional +efficiency in deriving the aggregation weights achieved by circumventing the +computationally intensive posterior variance calculations. Through Lyapunov +stability analysis, the distributed control law ensures bounded tracking errors +with high probability. Simulation experiments validate the protocol's efficacy +in effectively managing complex scenarios, establishing it as a promising +solution for robust tracking control in multi-agent systems characterized by +uncertain dynamics and dynamic communication structures. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ PFDM: Parser-Free Virtual Try-on via Diffusion Model ICASSP 2024 + + +
+ Virtual try-on can significantly improve the garment shopping experiences in +both online and in-store scenarios, attracting broad interest in computer +vision. However, to achieve high-fidelity try-on performance, most +state-of-the-art methods still rely on accurate segmentation masks, which are +often produced by near-perfect parsers or manual labeling. To overcome the +bottleneck, we propose a parser-free virtual try-on method based on the +diffusion model (PFDM). Given two images, PFDM can "wear" garments on the +target person seamlessly by implicitly warping without any other information. +To learn the model effectively, we synthesize many pseudo-images and construct +sample pairs by wearing various garments on persons. Supervised by the +large-scale expanded dataset, we fuse the person and garment features using a +proposed Garment Fusion Attention (GFA) mechanism. Experiments demonstrate that +our proposed PFDM can successfully handle complex cases, synthesize +high-fidelity images, and outperform both state-of-the-art parser-free and +parser-based models. + +
+
+ comment: Accepted by IEEE ICASSP 2024 +
+
+
+
+
+ + ☆ Open RL Benchmark: Comprehensive Tracked Experiments for Reinforcement + Learning + + +
+ In many Reinforcement Learning (RL) papers, learning curves are useful +indicators to measure the effectiveness of RL algorithms. However, the complete +raw data of the learning curves are rarely available. As a result, it is +usually necessary to reproduce the experiments from scratch, which can be +time-consuming and error-prone. We present Open RL Benchmark, a set of fully +tracked RL experiments, including not only the usual data such as episodic +return, but also all algorithm-specific and system metrics. Open RL Benchmark +is community-driven: anyone can download, use, and contribute to the data. At +the time of writing, more than 25,000 runs have been tracked, for a cumulative +duration of more than 8 years. Open RL Benchmark covers a wide range of RL +libraries and reference implementations. Special care is taken to ensure that +each experiment is precisely reproducible by providing not only the full +parameters, but also the versions of the dependencies used to generate it. In +addition, Open RL Benchmark comes with a command-line interface (CLI) for easy +fetching and generating figures to present the results. In this document, we +include two case studies to demonstrate the usefulness of Open RL Benchmark in +practice. To the best of our knowledge, Open RL Benchmark is the first RL +benchmark of its kind, and the authors hope that it will improve and facilitate +the work of researchers in the field. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ SIDU-TXT: An XAI Algorithm for NLP with a Holistic Assessment Approach + + +
+ Explainable AI (XAI) aids in deciphering 'black-box' models. While several
+methods have been proposed and evaluated primarily in the image domain, the
+exploration of explainability in the text domain remains a growing research
+area. In this paper, we delve into the applicability of XAI methods for the
+text domain. In this context, the 'Similarity Difference and Uniqueness' (SIDU)
+XAI method, recognized for its superior capability in localizing entire salient
+regions in image-based classification, is extended to textual data. The
+extended method, SIDU-TXT, utilizes feature activation maps from 'black-box'
+models to generate heatmaps at a granular, word-based level, thereby providing
+explanations that highlight contextually significant textual elements crucial
+for model predictions. Given the absence of a unified standard for assessing
+XAI methods, this study applies a holistic three-tiered comprehensive
+evaluation framework: Functionally-Grounded, Human-Grounded and
+Application-Grounded, to assess the effectiveness of the proposed SIDU-TXT
+across various experiments. We find that, in the sentiment analysis task on a
+movie review dataset, SIDU-TXT excels in both functionally and human-grounded
+evaluations, demonstrating superior performance through quantitative and
+qualitative analyses compared to benchmarks like Grad-CAM and LIME. In the
+application-grounded evaluation within the sensitive and complex legal domain
+of asylum decision-making, SIDU-TXT and Grad-CAM demonstrate comparable
+performances, each with its own set of strengths and weaknesses. However, both
+methods fall short of entirely fulfilling the sophisticated criteria of expert
+expectations, highlighting the imperative need for additional research in XAI
+methods suitable for such domains.
+
+
+
+ comment: Preprint submitted to Elsevier on Jan 5th, 2024 +
+
+
+
+
+ + ☆ InteractiveVideo: User-Centric Controllable Video Generation with + Synergistic Multimodal Instructions + + +
+ We introduce $\textit{InteractiveVideo}$, a user-centric framework for video +generation. Different from traditional generative approaches that operate based +on user-provided images or text, our framework is designed for dynamic +interaction, allowing users to instruct the generative model through various +intuitive mechanisms during the whole generation process, e.g. text and image +prompts, painting, drag-and-drop, etc. We propose a Synergistic Multimodal +Instruction mechanism, designed to seamlessly integrate users' multimodal +instructions into generative models, thus facilitating a cooperative and +responsive interaction between user inputs and the generative process. This +approach enables iterative and fine-grained refinement of the generation result +through precise and effective user instructions. With +$\textit{InteractiveVideo}$, users are given the flexibility to meticulously +tailor key aspects of a video. They can paint the reference image, edit +semantics, and adjust video motions until their requirements are fully met. +Code, models, and demo are available at +https://github.com/invictus717/InteractiveVideo + +
+
+ comment: Code, models, and demo are available at + https://github.com/invictus717/InteractiveVideo +
+
+
+
+
+ + ☆ Automatic Combination of Sample Selection Strategies for Few-Shot + Learning + + +
+ In few-shot learning, such as meta-learning, few-shot fine-tuning or
+in-context learning, the limited number of samples used to train a model has a
+significant impact on the overall success. Although a large number of sample
+selection strategies exist, their impact on the performance of few-shot
+learning is not extensively known, as most of them have so far been evaluated
+only in typical supervised settings. In this paper, we thoroughly investigate
+the impact of 20 sample selection strategies on the performance of 5 few-shot
+learning approaches over 8 image and 6 text datasets. In addition, we propose a
+new method for automatic combination of sample selection strategies (ACSESS)
+that leverages the strengths and complementary information of the individual
+strategies. The experimental results show that our method consistently
+outperforms the individual selection strategies, as well as the recently
+proposed method for selecting support examples for in-context learning. We also
+show a strong modality, dataset and approach dependence for the majority of
+strategies, as well as their dependence on the number of shots - demonstrating
+that sample selection strategies play a significant role for lower numbers of
+shots, but regress to random selection at higher numbers of shots.
+
+
+
+
+
+
+ + ☆ Functional SDE approximation inspired by a deep operator network + architecture + + +
+ A novel approach to approximate solutions of Stochastic Differential
+Equations (SDEs) by Deep Neural Networks is derived and analysed. The
+architecture is inspired by the notion of Deep Operator Networks (DeepONets),
+which is based on operator learning in function spaces in terms of a reduced
+basis also represented in the network. In our setting, we make use of a
+polynomial chaos expansion (PCE) of stochastic processes and call the
+corresponding architecture SDEONet. The PCE has been used extensively in the
+area of uncertainty quantification (UQ) with parametric partial differential
+equations. This, however, is not the case with SDEs, where classical sampling
+methods dominate and functional approaches are rarely seen. A main challenge
+with truncated PCEs occurs due to the drastic growth of the number of
+components with respect to the maximum polynomial degree and the number of
+basis elements. The proposed SDEONet architecture aims to alleviate the issue
+of exponential complexity by learning an optimal sparse truncation of the
+Wiener chaos expansion. A complete convergence and complexity analysis is
+presented, making use of recent Neural Network approximation results. Numerical
+experiments illustrate the promising performance of the suggested approach in
+1D and higher dimensions.
+
+
+
+
+
+
+ + ☆ Understanding and Guiding Weakly Supervised Entity Alignment with + Potential Isomorphism Propagation + + +
+ Weakly Supervised Entity Alignment (EA) is the task of identifying equivalent +entities across diverse knowledge graphs (KGs) using only a limited number of +seed alignments. Despite substantial advances in aggregation-based weakly +supervised EA, the underlying mechanisms in this setting remain unexplored. In +this paper, we present a propagation perspective to analyze weakly supervised +EA and explain the existing aggregation-based EA models. Our theoretical +analysis reveals that these models essentially seek propagation operators for +pairwise entity similarities. We further prove that, despite the structural +heterogeneity of different KGs, the potentially aligned entities within +aggregation-based EA models have isomorphic subgraphs, which is the core +premise of EA but has not been investigated. Leveraging this insight, we +introduce a potential isomorphism propagation operator to enhance the +propagation of neighborhood information across KGs. We develop a general EA +framework, PipEA, incorporating this operator to improve the accuracy of every +type of aggregation-based model without altering the learning process. +Extensive experiments substantiate our theoretical findings and demonstrate +PipEA's significant performance gains over state-of-the-art weakly supervised +EA methods. Our work not only advances the field but also enhances our +comprehension of aggregation-based weakly supervised EA. + +
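+ The exact operator used by PipEA is not given in the abstract; as a generic
+illustration of propagating pairwise similarities through both graphs, one
+aggregation-style update is:
+
+```python
+import numpy as np
+
+def propagate_similarity(S, A1, A2, alpha=0.5, n_iter=10):
+    # S: (n1, n2) entity-pair similarities; A1, A2: adjacency matrices
+    # of the two KGs. Diffuse S through both row-normalized graphs.
+    def norm(A):
+        return A / np.maximum(A.sum(1, keepdims=True), 1e-12)
+    A1n, A2n = norm(A1), norm(A2)
+    for _ in range(n_iter):
+        S = alpha * (A1n @ S @ A2n.T) + (1 - alpha) * S
+    return S
+```
+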
+
+
+
+
+ + ☆ Data-induced multiscale losses and efficient multirate gradient descent + schemes + + +
+ This paper investigates the impact of multiscale data on machine learning +algorithms, particularly in the context of deep learning. A dataset is +multiscale if its distribution shows large variations in scale across different +directions. This paper reveals multiscale structures in the loss landscape, +including its gradients and Hessians inherited from the data. Correspondingly, +it introduces a novel gradient descent approach, drawing inspiration from +multiscale algorithms used in scientific computing. This approach seeks to +transcend empirical learning rate selection, offering a more systematic, +data-informed strategy to enhance training efficiency, especially in the later +stages. + +
+
+ comment: 28 pages, 4 figures, submitted under review +
+
+
+
+
+ + ☆ Taylor Videos for Action Recognition + + +
+ Effectively extracting motions from video is a critical and long-standing
+problem for action recognition. This problem is very challenging because
+motions (i) do not have an explicit form, (ii) have various concepts such as
+displacement, velocity, and acceleration, and (iii) often contain noise caused
+by unstable pixels. Addressing these challenges, we propose the Taylor video, a
+new video format that highlights the dominant motions (e.g., a waving hand) in
+each of its frames, which we name Taylor frames. Taylor video is named after
+the Taylor series, which approximates a function at a given point using
+important terms. In the scenario of videos, we define an implicit
+motion-extraction function which aims to extract motions from a temporal block
+of video. In this block, using the frames, the difference frames, and
+higher-order difference frames, we perform a Taylor expansion to approximate
+this function at the starting frame. We show that the summation of the
+higher-order terms in the Taylor series gives us dominant motion patterns,
+where static objects and small, unstable motions are removed. Experimentally,
+we show that Taylor videos are effective inputs to popular architectures
+including 2D CNNs, 3D CNNs, and transformers. When used individually, Taylor
+videos yield competitive action recognition accuracy compared to RGB videos and
+optical flow. When fused with RGB or optical flow videos, further accuracy
+improvement is achieved.
+
+
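+ A small numpy sketch of the construction: take a temporal block of grayscale
+frames, form successive difference frames (discrete temporal derivatives),
+and sum the Taylor terms evaluated at the starting frame. The number of
+orders and the 1/n! weighting are assumptions in this sketch.
+
+```python
+import numpy as np
+
+def taylor_frame(block, orders=3):
+    # block: (T, H, W) grayscale frames with T > orders.
+    diffs = block.astype(np.float64)
+    frame = np.zeros_like(diffs[0])
+    fact = 1.0
+    for n in range(1, orders + 1):
+        diffs = np.diff(diffs, axis=0)  # n-th order difference frames
+        fact *= n
+        frame += diffs[0] / fact        # Taylor term at the starting frame
+    return frame  # static content cancels; dominant motion remains
+```
+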
+
+ comment: Research report +
+
+
+
+
+ + ☆ Toward Green and Human-Like Artificial Intelligence: A Complete Survey + on Contemporary Few-Shot Learning Approaches + + +
+ Despite deep learning's widespread success, its data-hungry and +computationally expensive nature makes it impractical for many data-constrained +real-world applications. Few-Shot Learning (FSL) aims to address these +limitations by enabling rapid adaptation to novel learning tasks, seeing +significant growth in recent years. This survey provides a comprehensive +overview of the field's latest advancements. Initially, FSL is formally +defined, and its relationship with different learning fields is presented. A +novel taxonomy is introduced, extending previously proposed ones, and +real-world applications in classic and novel fields are described. Finally, +recent trends shaping the field, outstanding challenges, and promising future +research directions are discussed. + +
+
+ comment: 35 pages, 9 figures. Submitted to ACM Computing Surveys +
+
+
+
+
+ + ☆ Whom to Trust? Elective Learning for Distributed Gaussian Process + Regression + + +
+ This paper introduces an innovative approach to enhance distributed +cooperative learning using Gaussian process (GP) regression in multi-agent +systems (MASs). The key contribution of this work is the development of an +elective learning algorithm, namely prior-aware elective distributed GP +(Pri-GP), which empowers agents with the capability to selectively request +predictions from neighboring agents based on their trustworthiness. The +proposed Pri-GP effectively improves individual prediction accuracy, especially +in cases where the prior knowledge of an agent is incorrect. Moreover, it +eliminates the need for computationally intensive variance calculations for +determining aggregation weights in distributed GP. Furthermore, we establish a +prediction error bound within the Pri-GP framework, ensuring the reliability of +predictions, which is regarded as a crucial property in safety-critical MAS +applications. + +
+
+ comment: 9 pages, conference preprint +
+
+
+
+
+ + ☆ On the Impact of Output Perturbation on Fairness in Binary Linear + Classification + + +
+ We theoretically study how differential privacy interacts with both
+individual and group fairness in binary linear classification. More precisely,
+we focus on the output perturbation mechanism, a classic approach in
+privacy-preserving machine learning. We derive high-probability bounds on the
+level of individual and group fairness that the perturbed models can achieve
+compared to the original model. For individual fairness, we prove that the
+impact of output perturbation on the level of fairness is bounded but grows
+with the dimension of the model. For group fairness, we show that this impact
+is determined by the distribution of so-called angular margins, that is, the
+signed margins of the non-private model rescaled by the norm of each example.
+
+
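+ To make the two objects concrete, a small numpy sketch: output perturbation
+releases w + noise, and the angular margin is the signed margin rescaled by
+the example's norm. The noise scale below is a placeholder, not the properly
+calibrated Gaussian-mechanism scale.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+w = rng.standard_normal(10)      # non-private linear classifier weights
+sigma = 1.0                      # placeholder for the calibrated scale
+w_priv = w + rng.normal(0.0, sigma, size=w.shape)  # output perturbation
+
+def angular_margin(w, x, y):     # y in {-1, +1}
+    return y * (w @ x) / np.linalg.norm(x)
+```
+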
+
+
+
+
+ + ☆ Diffusive Gibbs Sampling + + +
+ The inadequate mixing of conventional Markov Chain Monte Carlo (MCMC) methods +for multi-modal distributions presents a significant challenge in practical +applications such as Bayesian inference and molecular dynamics. Addressing +this, we propose Diffusive Gibbs Sampling (DiGS), an innovative family of +sampling methods designed for effective sampling from distributions +characterized by distant and disconnected modes. DiGS integrates recent +developments in diffusion models, leveraging Gaussian convolution to create an +auxiliary noisy distribution that bridges isolated modes in the original space +and applying Gibbs sampling to alternately draw samples from both spaces. Our +approach exhibits a better mixing property for sampling multi-modal +distributions than state-of-the-art methods such as parallel tempering. We +demonstrate that our sampler attains substantially improved results across +various tasks, including mixtures of Gaussians, Bayesian neural networks and +molecular dynamics. + +
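A 1D toy sketch of the alternating scheme may help: the auxiliary variable is a Gaussian-convolved copy of the current state, and the conditional sampling step is approximated here with a few unadjusted Langevin updates (the paper's denoising step is more sophisticated); the mixture target, noise scale, and step sizes are all assumptions for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)

# Target: a 1D Gaussian mixture with distant, disconnected modes.
def log_p(x):
    return np.logaddexp(-0.5 * (x - 6.0) ** 2, -0.5 * (x + 6.0) ** 2)

def grad_log_p(x):
    w = np.exp(-0.5 * (x - 6.0) ** 2 - log_p(x))  # responsibility of mode +6
    return -(x - 6.0) * w - (x + 6.0) * (1.0 - w)

alpha, sigma, step = 1.0, 3.0, 0.05
x, samples = 6.0, []
for _ in range(5000):
    # Gibbs step 1: noisy auxiliary variable bridging the modes.
    x_tilde = alpha * x + sigma * rng.normal()
    # Gibbs step 2: a few Langevin updates targeting p(x | x_tilde).
    for _ in range(10):
        grad = grad_log_p(x) + alpha * (x_tilde - alpha * x) / sigma ** 2
        x = x + step * grad + np.sqrt(2 * step) * rng.normal()
    samples.append(x)
```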
+
+ comment: 15 pages, 11 figures, 4 tables, 1 algorithm +
+
+
+
+
+ + ☆ On the development of a practical Bayesian optimisation algorithm for + expensive experiments and simulations with changing environmental conditions + + +
+ Experiments in engineering are typically conducted in controlled environments where parameters can be set to any desired value. This assumes that the same applies in a real-world setting -- an assumption that is often incorrect, as many experiments are influenced by uncontrollable environmental conditions such as temperature, humidity and wind speed. When optimising such experiments, the focus should lie on finding optimal values conditionally on these uncontrollable variables. This article extends Bayesian optimisation to the optimisation of systems in changing environments that include controllable and uncontrollable parameters. The extension fits a global surrogate model over all controllable and environmental variables but optimises only the controllable parameters conditional on measurements of the uncontrollable variables. The method is validated on two synthetic test functions, and the effects of the noise level, the number of environmental parameters, the parameter fluctuation, the variability of the uncontrollable parameters, and the effective domain size are investigated. ENVBO, the proposed algorithm resulting from this investigation, is applied to a wind farm simulator with eight controllable and one environmental parameter. ENVBO finds solutions for the full domain of the environmental variable that outperform results from optimisation algorithms that focus only on a fixed environmental value in all but one case, while using a fraction of their evaluation budget. This makes the proposed approach very sample-efficient and cost-effective. An off-the-shelf open-source version of ENVBO is available via the NUBO Python package. + +
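A generic sketch of the conditional idea, not the NUBO/ENVBO API: fit one surrogate over (controllable, environmental) inputs, then maximise an acquisition over the controllable variable with the environmental input pinned to its current measurement. It uses scikit-learn's GP for illustration; the synthetic objective and the UCB constant are assumptions.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

rng = np.random.default_rng(1)

def experiment(x_ctrl, x_env):       # expensive black box (here: synthetic)
    return -(x_ctrl - 0.3 * x_env) ** 2 + 0.1 * rng.normal()

# Fit one global surrogate over (controllable, environmental) inputs.
X = rng.uniform(0, 1, size=(15, 2))  # columns: [controllable, environmental]
y = np.array([experiment(c, e) for c, e in X])
gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True).fit(X, y)

# At decision time, the environment is measured, not chosen.
env_now = 0.7
grid = np.linspace(0, 1, 200)
cand = np.column_stack([grid, np.full_like(grid, env_now)])
mu, sd = gp.predict(cand, return_std=True)
x_next = grid[np.argmax(mu + 2.0 * sd)]  # UCB over the controllable only
```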
+
+ comment: 23 pages, 10 figures +
+
+
+
+
+ + ☆ Careful with that Scalpel: Improving Gradient Surgery with an EMA + + +
+ Beyond minimizing a single training loss, many deep learning estimation pipelines rely on an auxiliary objective to quantify and encourage desirable properties of the model (e.g. performance on another dataset, robustness, agreement with a prior). Although the simplest approach to incorporating an auxiliary loss is to sum it with the training loss as a regularizer, recent works have shown that one can improve performance by blending the gradients beyond a simple sum; this is known as gradient surgery. We cast the problem as a constrained minimization problem in which the auxiliary objective is minimized among the set of minimizers of the training loss. To solve this bilevel problem, we follow a parameter update direction that combines the training loss gradient and the orthogonal projection of the auxiliary gradient to the training gradient. In a setting where gradients come from mini-batches, we explain how, using a moving average of the training loss gradients, we can carefully maintain this critical orthogonality property. We demonstrate that our method, Bloop, can lead to much better performance in NLP and vision experiments than other gradient surgery methods without EMA. + +
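A minimal NumPy sketch of the update as described, assuming the step adds the component of the auxiliary gradient orthogonal to an exponential moving average of training gradients; the hyperparameters and the exact combination rule are illustrative assumptions, not the paper's specification.

```python
import numpy as np

def bloop_like_step(theta, grad_train, grad_aux, ema,
                    beta=0.99, lam=0.1, lr=1e-2):
    """One update combining the training gradient with the part of the
    auxiliary gradient orthogonal to an EMA of training gradients."""
    ema = beta * ema + (1.0 - beta) * grad_train          # smoothed train grad
    proj = (grad_aux @ ema) / (ema @ ema + 1e-12) * ema   # parallel component
    direction = grad_train + lam * (grad_aux - proj)      # keep orthogonal part
    return theta - lr * direction, ema
```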
+
+
+
+
+ + ☆ Text-Guided Image Clustering + + +
+ Image clustering divides a collection of images into meaningful groups, typically interpreted post-hoc via human-given annotations. Those are usually in the form of text, raising the question of whether text can serve as an abstraction for image clustering. Current image clustering methods, however, neglect the use of generated textual descriptions. We therefore propose Text-Guided Image Clustering, i.e., generating text using image captioning and visual question-answering (VQA) models and subsequently clustering the generated text. Further, we introduce a novel approach to inject task- or domain-specific knowledge for clustering by prompting VQA models. Across eight diverse image clustering datasets, our results show that the obtained text representations often outperform image features. Additionally, we propose a counting-based cluster explainability method. Our evaluations show that the derived keyword-based explanations describe clusters better than the respective cluster accuracy suggests. Overall, this research challenges traditional approaches and paves the way for a paradigm shift in image clustering, using generated text. + +
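A toy sketch of the clustering stage, assuming captions have already been produced by an upstream captioning/VQA model (the hard-coded captions below stand in for that output): embed the generated text, here with TF-IDF for simplicity, and cluster it.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Captions assumed to come from an image captioning / VQA model upstream.
captions = [
    "a tabby cat sleeping on a sofa",
    "a small dog running on the beach",
    "a cat playing with a ball of yarn",
    "a golden retriever catching a frisbee",
]
X = TfidfVectorizer().fit_transform(captions)
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
```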
+
+
+
+
+ + ☆ Decoding-time Realignment of Language Models + + +
+ Aligning language models with human preferences is crucial for reducing +errors and biases in these models. Alignment techniques, such as reinforcement +learning from human feedback (RLHF), are typically cast as optimizing a +tradeoff between human preference rewards and a proximity regularization term +that encourages staying close to the unaligned model. Selecting an appropriate +level of regularization is critical: insufficient regularization can lead to +reduced model capabilities due to reward hacking, whereas excessive +regularization hinders alignment. Traditional methods for finding the optimal +regularization level require retraining multiple models with varying +regularization strengths. This process, however, is resource-intensive, +especially for large models. To address this challenge, we propose +decoding-time realignment (DeRa), a simple method to explore and evaluate +different regularization strengths in aligned models without retraining. DeRa +enables control over the degree of alignment, allowing users to smoothly +transition between unaligned and aligned models. It also enhances the +efficiency of hyperparameter tuning by enabling the identification of effective +regularization strengths using a validation dataset. + +
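A minimal sketch of the core idea, under the assumption that realignment can be emulated per decoding step by linearly blending the logits of the unaligned and aligned models (equivalently, a geometric mixture of their next-token distributions); the toy logits are illustrative.

```python
import numpy as np

def dera_like_logits(logits_ref, logits_aligned, lam):
    """Blend reference and aligned next-token logits.

    lam = 0 recovers the unaligned model, lam = 1 the aligned one; other
    values emulate a different regularization strength at decoding time.
    """
    return (1.0 - lam) * logits_ref + lam * logits_aligned

def sample_token(logits, rng):
    p = np.exp(logits - logits.max())
    p /= p.sum()
    return rng.choice(len(p), p=p)

rng = np.random.default_rng(0)
tok = sample_token(dera_like_logits(np.array([2.0, 0.5, -1.0]),
                                    np.array([0.1, 1.8, -0.5]), lam=0.6), rng)
```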
+
+
+
+
+ + ☆ DexDiffuser: Generating Dexterous Grasps with Diffusion Models + + +
+ We introduce DexDiffuser, a novel dexterous grasping method that generates, evaluates, and refines grasps on partial object point clouds. DexDiffuser includes the conditional diffusion-based grasp sampler DexSampler and the dexterous grasp evaluator DexEvaluator. DexSampler generates high-quality grasps conditioned on object point clouds by iteratively denoising randomly sampled grasps. We also introduce two grasp refinement strategies: Evaluator-Guided Diffusion (EGD) and Evaluator-based Sampling Refinement (ESR). Our simulation and real-world experiments on the Allegro Hand consistently demonstrate that DexDiffuser outperforms the state-of-the-art multi-finger grasp generation method FFHNet with a grasp success rate that is, on average, 21.71--22.20\% higher. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ A Safety-Adapted Loss for Pedestrian Detection in Automated Driving + + +
+ In safety-critical domains like automated driving (AD), errors by the object detector may endanger pedestrians and other vulnerable road users (VRU). As common evaluation metrics are not an adequate safety indicator, recent works employ approaches to identify safety-critical VRU and back-annotate the risk to the object detector. However, those approaches do not consider the safety factor in the deep neural network (DNN) training process. Thus, state-of-the-art DNNs penalize all misdetections equally, irrespective of their criticality. To mitigate the occurrence of critical failure cases, i.e., false negatives, a safety-aware training strategy might be required to enhance the detection performance for critical pedestrians. In this paper, we propose a novel safety-aware loss variation that leverages estimated per-pedestrian criticality scores during training. We exploit the reachability set-based time-to-collision (TTC-RSB) metric from the motion domain along with distance information to account for the worst-case threat when quantifying criticality. Our evaluation results using RetinaNet and FCOS on the nuScenes dataset demonstrate that training the models with our safety-aware loss function mitigates the misdetection of critical pedestrians without sacrificing performance for the general case, i.e., pedestrians outside the safety-critical zone. + +
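As an illustration of criticality-weighted training (the specific weighting form below is an assumption, not the paper's exact loss variation), one can up-weight the classification loss on positives in proportion to a per-pedestrian criticality score:

```python
import numpy as np

def safety_weighted_bce(p_pred, y_true, criticality, alpha=2.0):
    """Binary cross-entropy with per-pedestrian criticality weights.

    criticality in [0, 1] (e.g., derived from a TTC-based metric); false
    negatives on critical pedestrians are penalized more heavily.
    """
    eps = 1e-7
    bce = -(y_true * np.log(p_pred + eps)
            + (1 - y_true) * np.log(1 - p_pred + eps))
    return np.mean((1.0 + alpha * criticality * y_true) * bce)

loss = safety_weighted_bce(np.array([0.9, 0.2, 0.6]),
                           np.array([1.0, 0.0, 1.0]),
                           np.array([0.8, 0.0, 0.1]))
```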
+
+
+
+
+ + ☆ Unsupervised semantic segmentation of high-resolution UAV imagery for + road scene parsing + + +
+ Two challenges are presented when parsing road scenes in UAV images. First, the high resolution of UAV images makes processing difficult. Second, supervised deep learning methods require a large amount of manual annotation to train robust and accurate models. In this paper, an unsupervised road parsing framework that leverages recent advances in vision language models and foundation computer vision models is introduced. Initially, a vision language model is employed to efficiently process ultra-large-resolution UAV images and quickly detect road regions of interest. Subsequently, the vision foundation model SAM is utilized to generate masks for the road regions without category information. Following that, a self-supervised representation learning network extracts feature representations from all masked regions. Finally, an unsupervised clustering algorithm is applied to cluster these feature representations and assign IDs to each cluster. The masked regions are combined with the corresponding IDs to generate initial pseudo-labels, which initiate an iterative self-training process for regular semantic segmentation. The proposed method achieves an impressive 89.96% mIoU on the development dataset without relying on any manual annotation. Particularly noteworthy is the extraordinary flexibility of the proposed method, which goes beyond the limitations of human-defined categories and is able to acquire knowledge of new categories from the dataset itself. + +
+
+
+
+
+ + ☆ Review on Fault Diagnosis and Fault-Tolerant Control Scheme for Robotic + Manipulators: Recent Advances in AI, Machine Learning, and Digital Twin + + +
+ This comprehensive review article delves into the intricate realm of +fault-tolerant control (FTC) schemes tailored for robotic manipulators. Our +exploration spans the historical evolution of FTC, tracing its development over +time, and meticulously examines the recent breakthroughs fueled by the +synergistic integration of cutting-edge technologies such as artificial +intelligence (AI), machine learning (ML), and digital twin technologies (DTT). +The article places a particular emphasis on the transformative influence these +contemporary trends exert on the landscape of robotic manipulator control and +fault tolerance. + By delving into the historical context, our aim is to provide a comprehensive +understanding of the evolution of FTC schemes. This journey encompasses the +transition from model-based and signal-based schemes to the role of sensors, +setting the stage for an exploration of the present-day paradigm shift enabled +by AI, ML, and DTT. The narrative unfolds as we dissect the intricate interplay +between these advanced technologies and their applications in enhancing fault +tolerance within the domain of robotic manipulators. Our review critically +evaluates the impact of these advancements, shedding light on the novel +methodologies, techniques, and applications that have emerged in recent times. + The overarching goal of this article is to present a comprehensive +perspective on the current state of fault diagnosis and fault-tolerant control +within the context of robotic manipulators, positioning our exploration within +the broader framework of AI, ML, and DTT advancements. Through a meticulous +examination of both historical foundations and contemporary innovations, this +review significantly contributes to the existing body of knowledge, offering +valuable insights for researchers, practitioners, and enthusiasts navigating +the dynamic landscape of robotic manipulator control. + +
+
+ comment: 24 pages, 6 figures +
+
+
+
+
+ + ☆ Variational Flow Models: Flowing in Your Style + + +
+ We introduce a variational inference interpretation for models of "posterior flows" - generalizations of "probability flows" to a broader class of stochastic processes that are not necessarily diffusion processes. We coin the resulting models "Variational Flow Models". Additionally, we propose a systematic training-free method to transform the posterior flow of a "linear" stochastic process characterized by the equation $X_t = a_t X_0 + s_t X_1$ into a straight constant-speed (SC) flow, reminiscent of Rectified Flow. This transformation facilitates fast sampling along the original posterior flow without training a new model of the SC flow. The flexibility of our approach allows us to extend the transformation to inter-convert two posterior flows from distinct "linear" stochastic processes. Moreover, we can easily integrate high-order numerical solvers into the transformed SC flow, further enhancing sampling accuracy and efficiency. Rigorous theoretical analysis and extensive experimental results substantiate the advantages of our framework. + +
+
+
+
+
+ + ☆ Boosting, Voting Classifiers and Randomized Sample Compression Schemes + + +
+ In boosting, we aim to leverage multiple weak learners to produce a strong learner. At the center of this paradigm lies the concept of building the strong learner as a voting classifier, which outputs a weighted majority vote of the weak learners. While many successful boosting algorithms, such as the iconic AdaBoost, produce voting classifiers, their theoretical performance has long remained sub-optimal: the best known bounds on the number of training examples necessary for a voting classifier to obtain a given accuracy have so far always contained at least two logarithmic factors above what is known to be achievable by general weak-to-strong learners. In this work, we break this barrier by proposing a randomized boosting algorithm that outputs voting classifiers whose generalization error contains a single logarithmic dependency on the sample size. We obtain this result by building a general framework that extends sample compression methods to support randomized learning algorithms based on sub-sampling. + +
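For readers unfamiliar with the object under study, a voting classifier is simply a weighted majority vote over weak learners; a short sketch (the example weights are arbitrary, e.g. as AdaBoost would produce them):

```python
import numpy as np

def voting_classifier(weak_preds, weights):
    """Weighted majority vote over weak learners.

    weak_preds: (m, n) array of +/-1 predictions from m weak learners.
    weights:    (m,) nonnegative learner weights; ties map to 0 here.
    """
    return np.sign(weights @ weak_preds)

preds = np.array([[1, -1, 1], [1, 1, -1], [-1, 1, 1]])
print(voting_classifier(preds, np.array([0.5, 0.3, 0.2])))
```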
+
+
+
+
+ + ☆ Retrieval-Augmented Score Distillation for Text-to-3D Generation + + +
+ Text-to-3D generation has achieved significant success by incorporating powerful 2D diffusion models, but insufficient 3D prior knowledge also leads to inconsistency in 3D geometry. Recently, since large-scale multi-view datasets have been released, fine-tuning the diffusion model on multi-view datasets has become a mainstream approach to solving the 3D inconsistency problem. However, it is confronted with fundamental difficulties regarding the limited quality and diversity of 3D data compared with 2D data. To sidestep these trade-offs, we explore a retrieval-augmented approach tailored for score distillation, dubbed RetDream. We postulate that both the expressiveness of 2D diffusion models and the geometric consistency of 3D assets can be fully leveraged by employing semantically relevant assets directly within the optimization process. To this end, we introduce a novel framework for retrieval-based quality enhancement in text-to-3D generation. We leverage the retrieved asset to incorporate its geometric prior in the variational objective and adapt the diffusion model's 2D prior toward view consistency, achieving drastic improvements in both geometry and fidelity of generated scenes. We conduct extensive experiments to demonstrate that RetDream exhibits superior quality with increased geometric consistency. Project page is available at https://ku-cvlab.github.io/RetDream/. + +
+
+ comment: Project Page: https://ku-cvlab.github.io/RetDream/ +
+
+
+
+
+ + ☆ Towards Understanding the Word Sensitivity of Attention Layers: A Study + via Random Features + + +
+ Unveiling the reasons behind the exceptional success of transformers requires a better understanding of why attention layers are suitable for NLP tasks. In particular, such tasks require predictive models to capture contextual meaning, which often depends on one or a few words, even if the sentence is long. Our work studies this key property, dubbed word sensitivity (WS), in the prototypical setting of random features. We show that attention layers enjoy high WS, namely, there exists a vector in the space of embeddings that largely perturbs the random attention features map. The argument critically exploits the role of the softmax in the attention layer, highlighting its benefit compared to other activations (e.g., ReLU). In contrast, the WS of standard random features is of order $1/\sqrt{n}$, $n$ being the number of words in the textual sample, and thus it decays with the length of the context. We then translate these results on word sensitivity into generalization bounds: due to their low WS, random features provably cannot learn to distinguish between two sentences that differ only in a single word; in contrast, due to their high WS, random attention features have higher generalization capabilities. We validate our theoretical results with experimental evidence over the BERT-Base word embeddings of the IMDB review dataset. + +
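A toy probe of the phenomenon (not the paper's formal definition or experimental setup): perturb a single word embedding and compare the relative change of mean-pooled random features against mean-pooled attention features; all dimensions and scales below are assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d = 64, 32                       # words per sample, embedding dim
X = rng.normal(size=(n, d)) / np.sqrt(d)

def random_features(X, W):          # mean-pooled ReLU random features
    return np.maximum(W @ X.mean(axis=0), 0.0)

def attention_features(X, Wq, Wk):  # mean-pooled softmax attention output
    scores = (X @ Wq) @ (X @ Wk).T / np.sqrt(d)
    A = np.exp(scores - scores.max(axis=1, keepdims=True))
    A /= A.sum(axis=1, keepdims=True)
    return (A @ X).mean(axis=0)

W, Wq, Wk = (rng.normal(size=(d, d)) / np.sqrt(d) for _ in range(3))
Xp = X.copy()
Xp[0] = 5.0 * rng.normal(size=d) / np.sqrt(d)   # perturb one single word

for f, args in [(random_features, (W,)), (attention_features, (Wq, Wk))]:
    rel = (np.linalg.norm(f(Xp, *args) - f(X, *args))
           / np.linalg.norm(f(X, *args)))
    print(f.__name__, rel)          # attention reacts far more strongly
```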
+
+
+
+
+ + ☆ Delving into Multi-modal Multi-task Foundation Models for Road Scene + Understanding: From Learning Paradigm Perspectives + + +
+ Foundation models have indeed made a profound impact on various fields, emerging as pivotal components that significantly shape the capabilities of intelligent systems. In the context of intelligent vehicles, leveraging the power of foundation models has proven to be transformative, offering notable advancements in visual understanding. Equipped with multi-modal and multi-task learning capabilities, multi-modal multi-task visual understanding foundation models (MM-VUFMs) effectively process and fuse data from diverse modalities and simultaneously handle various driving-related tasks with powerful adaptability, contributing to a more holistic understanding of the surrounding scene. In this survey, we present a systematic analysis of MM-VUFMs specifically designed for road scenes. Our objective is not only to provide a comprehensive overview of common practices, referring to task-specific models, unified multi-modal models, unified multi-task models, and foundation model prompting techniques, but also to highlight their advanced capabilities in diverse learning paradigms. These paradigms include open-world understanding, efficient transfer for road scenes, continual learning, and interactive and generative capabilities. Moreover, we provide insights into key challenges and future trends, such as closed-loop driving systems, interpretability, embodied driving agents, and world models. To facilitate researchers in staying abreast of the latest developments in MM-VUFMs for road scenes, we have established a continuously updated repository at https://github.com/rolsheng/MM-VUFM4DS. + +
+
+ comment: 24 pages, 9 figures, 1 table +
+
+
+
+
+ + ☆ Mixed Noise and Posterior Estimation with Conditional DeepGEM + + +
+ Motivated by indirect measurements and applications from nanometrology with a +mixed noise model, we develop a novel algorithm for jointly estimating the +posterior and the noise parameters in Bayesian inverse problems. We propose to +solve the problem by an expectation maximization (EM) algorithm. Based on the +current noise parameters, we learn in the E-step a conditional normalizing flow +that approximates the posterior. In the M-step, we propose to find the noise +parameter updates again by an EM algorithm, which has analytical formulas. We +compare the training of the conditional normalizing flow with the forward and +reverse KL, and show that our model is able to incorporate information from +many measurements, unlike previous approaches. + +
+
+
+
+
+ + ♻ ☆ Regularization and Optimization in Model-Based Clustering + + +
+ Due to their conceptual simplicity, k-means algorithm variants have been +extensively used for unsupervised cluster analysis. However, one main +shortcoming of these algorithms is that they essentially fit a mixture of +identical spherical Gaussians to data that vastly deviates from such a +distribution. In comparison, general Gaussian Mixture Models (GMMs) can fit +richer structures but require estimating a quadratic number of parameters per +cluster to represent the covariance matrices. This poses two main issues: (i) +the underlying optimization problems are challenging due to their larger number +of local minima, and (ii) their solutions can overfit the data. In this work, +we design search strategies that circumvent both issues. We develop more +effective optimization algorithms for general GMMs, and we combine these +algorithms with regularization strategies that avoid overfitting. Through +extensive computational analyses, we observe that optimization or +regularization in isolation does not substantially improve cluster recovery. +However, combining these techniques permits a completely new level of +performance previously unachieved by k-means algorithm variants, unraveling +vastly different cluster structures. These results shed new light on the +current status quo between GMM and k-means methods and suggest the more +frequent use of general GMMs for data exploration. To facilitate such +applications, we provide open-source code as well as Julia packages +(UnsupervisedClustering.jl and RegularizedCovarianceMatrices.jl) implementing +the proposed techniques. + +
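As a small illustration of combining the two ingredients with off-the-shelf tools (the paper itself provides Julia packages; scikit-learn is used here only for illustration), a full-covariance GMM can pair multiple restarts on the optimization side with a covariance ridge on the regularization side:

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = np.vstack([rng.multivariate_normal([0, 0], [[1.0, 0.8], [0.8, 1.0]], 200),
               rng.multivariate_normal([4, 1], [[0.5, -0.3], [-0.3, 0.7]], 200)])

# Full-covariance GMM with (i) many restarts to fight local minima and
# (ii) a covariance ridge (reg_covar) to guard against overfitting.
gmm = GaussianMixture(n_components=2, covariance_type="full",
                      n_init=20, reg_covar=1e-3, random_state=0).fit(X)
labels = gmm.predict(X)
```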
+
+
+
+
+ + ♻ ☆ Mixed Traffic Control and Coordination from Pixels ICRA + + +
+ Traffic congestion is a persistent problem in our society. Previous methods for traffic control have proven futile in alleviating current congestion levels, leading researchers to explore ideas with robot vehicles given the increased emergence of vehicles with different levels of autonomy on our roads. This gives rise to mixed traffic control, where robot vehicles regulate human-driven vehicles through reinforcement learning (RL). However, most existing studies use precise observations that require domain expertise and hand engineering for each road network's observation space. Additionally, precise observations use global information, such as environment outflow, and local information, i.e., vehicle positions and velocities. Obtaining this information requires updating existing road infrastructure with vast sensor environments and communication to potentially unwilling human drivers. We consider image observations, a modality that has not been extensively explored for mixed traffic control via RL, as the alternative: 1) images do not require a complete re-imagination of the observation space from environment to environment; 2) images are ubiquitous through satellite imagery, in-car camera systems, and traffic monitoring systems; and 3) images only require communication to equipment. In this work, we show that robot vehicles using image observations can achieve performance competitive with using precise information on environments including ring, figure eight, intersection, merge, and bottleneck. In certain scenarios, our approach even outperforms using precise observations, e.g., up to an 8% increase in average vehicle velocity in the merge environment, despite only using local traffic information as opposed to global traffic information. + +
+
+ comment: Accepted to IEEE International Conference on Robotics and Automation + (ICRA), 2024 +
+
+
+
+
+ + ♻ ☆ Guiding Language Model Math Reasoning with Planning Tokens + + +
+ Large language models (LLMs) have recently attracted considerable interest +for their ability to perform complex reasoning tasks, such as chain-of-thought +reasoning. However, most of the existing approaches to enhance this ability +rely heavily on data-driven methods, while neglecting the structural aspects of +the model's reasoning capacity. We find that while LLMs can manage individual +reasoning steps well, they struggle with maintaining consistency across an +entire reasoning chain. To solve this, we introduce planning tokens at the +start of each reasoning step, serving as a guide for the model, and add their +embeddings to the model parameters. Our approach requires a negligible increase +in trainable parameters (just 0.001%) and can be applied through either full +fine-tuning or a more parameter-efficient scheme. We demonstrate our method's +effectiveness by applying it to three different LLMs, showing notable accuracy +improvements across three math word problem datasets w.r.t. standard +fine-tuning baselines. + +
+
+
+
+
+ + ♻ ☆ One Pass Streaming Algorithm for Super Long Token Attention + Approximation in Sublinear Space + + +
+ Attention computation has both time complexity $O(n^2)$ and space complexity $O(n^2)$, which makes deploying Large Language Models (LLMs) in streaming applications that involve long contexts require substantial computational resources. At its recent DevDay (Nov 6, 2023), OpenAI released a new model able to support a 128K-long document; in our paper, we focus on the memory-efficiency issue when the context length $n$ is much greater than 128K ($n \gg 2^d$). Considering a single-layer self-attention with Query, Key, and Value matrices $Q, K, V \in \mathbb{R}^{n \times d}$, the polynomial method approximates the attention output $T \in \mathbb{R}^{n \times d}$. It accomplishes this by constructing $U_1, U_2 \in \mathbb{R}^{n \times t}$ to expedite the computation of the attention ${\sf Attn}(Q, K, V)$ within $n^{1+o(1)}$ time. Despite this, computing the approximated attention matrix $U_1U_2^\top \in \mathbb{R}^{n \times n}$ still necessitates $O(n^2)$ space, leading to significant memory usage. In response to these challenges, we introduce a new algorithm that reads the data in a single streaming pass. This method employs sublinear space $o(n)$ to store three sketch matrices, alleviating the need for exact $K, V$ storage. Notably, our algorithm exhibits exceptional memory-efficient performance with super-long tokens. As the token length $n$ increases, our error guarantee diminishes while the memory usage remains nearly constant. This unique attribute underscores the potential of our technique in efficiently handling LLMs in streaming applications. + +
+
+
+
+
+ + ♻ ☆ Comparative Analysis of LLaMA and ChatGPT Embeddings for Molecule + Embedding + + +
+ Purpose: Large Language Models (LLMs) like ChatGPT and LLaMA are increasingly +recognized for their potential in the field of cheminformatics, particularly in +interpreting Simplified Molecular Input Line Entry System (SMILES), a standard +method for representing chemical structures. These LLMs can decode SMILES +strings into vector representations, providing a novel approach to +understanding chemical graphs. + Methods: We investigate the performance of ChatGPT and LLaMA in embedding +SMILES strings. Our evaluation focuses on two key applications: molecular +property (MP) prediction and drug-drug interaction (DDI) prediction, both +essential in drug development and healthcare. + Results: We find that SMILES embeddings generated using LLaMA outperform +those from ChatGPT in both MP and DDI prediction tasks. Notably, LLaMA-based +SMILES embeddings show results comparable to existing methods in both +prediction tasks. + Conclusion: The application of LLMs in cheminformatics, particularly in +utilizing SMILES embeddings, shows significant promise for advancing drug +development. This includes improving the prediction of chemical properties and +facilitating the drug discovery process. GitHub: +https://github.com/sshaghayeghs/LLaMA-VS-ChatGPT + +
+
+
+
+
+ + ♻ ☆ Multimodal Speech Enhancement Using Burst Propagation + + +
+ This paper proposes MBURST, a novel multimodal solution for audio-visual speech enhancement that draws on recent neurological discoveries regarding pyramidal cells of the prefrontal cortex and other brain regions. The so-called burst propagation implements several criteria to address the credit assignment problem in a more biologically plausible manner: steering the sign and magnitude of plasticity through feedback, multiplexing the feedback and feedforward information across layers through different weight connections, approximating feedback and feedforward connections, and linearizing the feedback signals. MBURST benefits from these capabilities to learn correlations between the noisy signal and the visual stimuli, thus attributing meaning to the speech by amplifying relevant information and suppressing noise. Experiments conducted on the GRID corpus and a CHiME3-based dataset show that MBURST can reproduce mask reconstructions similar to those of the multimodal backpropagation-based baseline while demonstrating outstanding energy-efficiency management, reducing neuron firing rates to values up to 70\% lower. Such a feature implies more sustainable implementations, suitable and desirable for hearing aids or any other similar embedded systems. + +
+
+
+
+
+ + ♻ ☆ Transfer Learning for the Prediction of Entity Modifiers in Clinical + Text: Application to Opioid Use Disorder Case Detection + + +
+ Background: The semantics of entities extracted from a clinical text can be dramatically altered by modifiers, including entity negation, uncertainty, conditionality, severity, and subject. Existing models for determining modifiers of clinical entities involve regular expressions or feature weights that are trained independently for each modifier. Methods: We develop and evaluate a multi-task transformer architecture design where modifiers are learned and predicted jointly using the publicly available SemEval 2015 Task 14 corpus and a new Opioid Use Disorder (OUD) data set that contains modifiers shared with SemEval as well as novel modifiers specific to OUD. We evaluate the effectiveness of our multi-task learning approach versus previously published systems and assess the feasibility of transfer learning for clinical entity modifiers when only a portion of clinical modifiers are shared. Results: Our approach achieved state-of-the-art results on the ShARe corpus from SemEval 2015 Task 14, showing an increase of 1.1% on weighted accuracy, 1.7% on unweighted accuracy, and 10% on micro F1 scores. Conclusions: We show that learned weights from our shared model can be effectively transferred to a new partially matched data set, validating the use of transfer learning for clinical text modifiers. + +
+
+ comment: 18 pages, 2 figures, 6 tables. To be submitted to the Journal of + Biomedical Semantics +
+
+
+
+
+ + ♻ ☆ Faster Rates for Switchback Experiments + + +
+ Switchback experimental design, wherein a single unit (e.g., a whole system) +is exposed to a single random treatment for interspersed blocks of time, +tackles both cross-unit and temporal interference. Hu and Wager (2022) recently +proposed a treatment-effect estimator that truncates the beginnings of blocks +and established a $T^{-1/3}$ rate for estimating the global average treatment +effect (GATE) in a Markov setting with rapid mixing. They claim this rate is +optimal and suggest focusing instead on a different (and design-dependent) +estimand so as to enjoy a faster rate. For the same design we propose an +alternative estimator that uses the whole block and surprisingly show that it +in fact achieves an estimation rate of $\sqrt{\log T/T}$ for the original +design-independent GATE estimand under the same assumptions. + +
+
+
+
+
+ + ♻ ☆ Data Diversity Matters for Robust Instruction Tuning + + +
+ Recent works have shown that by curating high-quality and diverse instruction tuning datasets, we can significantly improve instruction-following capabilities. However, creating such datasets is difficult, and most works rely on manual curation or proprietary language models. Automatic data curation is difficult, as it is still not clear how we can define diversity for instruction tuning, how diversity and quality depend on one another, and how we can optimize dataset quality and diversity. To resolve these issues, we propose a new algorithm, Quality-Diversity Instruction Tuning (QDIT). QDIT provides a simple method to simultaneously control dataset diversity and quality, allowing us to conduct an in-depth study on the effect of diversity and quality on instruction tuning performance. From this study we draw two key insights: (1) there is a natural tradeoff between data diversity and quality, and (2) increasing data diversity significantly improves worst-case instruction-following performance, thereby improving robustness. We validate the performance of QDIT on several large-scale instruction tuning datasets, where we find it can substantially improve worst- and average-case performance compared to quality-driven data selection. + +
+
+ comment: 22 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Rethinking Semi-Supervised Imbalanced Node Classification from + Bias-Variance Decomposition NeurIPS 2023 + + +
+ This paper introduces a new approach to address the issue of class imbalance +in graph neural networks (GNNs) for learning on graph-structured data. Our +approach integrates imbalanced node classification and Bias-Variance +Decomposition, establishing a theoretical framework that closely relates data +imbalance to model variance. We also leverage graph augmentation technique to +estimate the variance, and design a regularization term to alleviate the impact +of imbalance. Exhaustive tests are conducted on multiple benchmarks, including +naturally imbalanced datasets and public-split class-imbalanced datasets, +demonstrating that our approach outperforms state-of-the-art methods in various +imbalanced scenarios. This work provides a novel theoretical perspective for +addressing the problem of imbalanced node classification in GNNs. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Equivariant Deep Weight Space Alignment + + +
+ Permutation symmetries of deep networks make basic operations like model +merging and similarity estimation challenging. In many cases, aligning the +weights of the networks, i.e., finding optimal permutations between their +weights, is necessary. Unfortunately, weight alignment is an NP-hard problem. +Prior research has mainly focused on solving relaxed versions of the alignment +problem, leading to either time-consuming methods or sub-optimal solutions. To +accelerate the alignment process and improve its quality, we propose a novel +framework aimed at learning to solve the weight alignment problem, which we +name Deep-Align. To that end, we first prove that weight alignment adheres to +two fundamental symmetries and then, propose a deep architecture that respects +these symmetries. Notably, our framework does not require any labeled data. We +provide a theoretical analysis of our approach and evaluate Deep-Align on +several types of network architectures and learning setups. Our experimental +results indicate that a feed-forward pass with Deep-Align produces better or +equivalent alignments compared to those produced by current optimization +algorithms. Additionally, our alignments can be used as an effective +initialization for other methods, leading to improved solutions with a +significant speedup in convergence. + +
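For context, the classic relaxation-style baseline that learned approaches such as Deep-Align aim to outperform solves a per-layer assignment problem; a sketch for a single hidden layer follows (this illustrates the baseline, not Deep-Align itself, and the matrix sizes are arbitrary):

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_hidden_units(W_a, W_b):
    """Assignment-based alignment of one hidden layer.

    Finds the permutation of W_b's rows (hidden units) that best matches
    W_a by maximizing row-wise inner products via the Hungarian algorithm.
    """
    cost = -W_a @ W_b.T                     # negative similarity to minimize
    _, perm = linear_sum_assignment(cost)
    return perm

rng = np.random.default_rng(0)
W_a = rng.normal(size=(8, 16))
W_b = W_a[rng.permutation(8)] + 0.01 * rng.normal(size=(8, 16))
perm = match_hidden_units(W_a, W_b)         # recovers the hidden permutation
```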
+
+
+
+
+ + ♻ ☆ Piecewise Polynomial Regression of Tame Functions via Integer + Programming + + +
+ We consider approximating so-called tame functions, a class of nonsmooth, +nonconvex functions, with piecewise polynomial functions. Tame functions appear +in a wide range of applications: functions encountered in the training of deep +neural networks with all common activations, value functions of mixed-integer +programs, or wave functions of small molecules. We bound the quality of +approximation of a tame function by a piecewise polynomial function with a +given number of segments on any full-dimensional cube. We also present the +first ever mixed-integer programming formulation of piecewise polynomial +regression. Together, these can be used to estimate tame functions. We +demonstrate promising computational results. + +
+
+
+
+
+ + ♻ ☆ DoGE: Domain Reweighting with Generalization Estimation + + +
+ The coverage and composition of the pretraining data significantly impacts +the generalization ability of Large Language Models (LLMs). Despite its +importance, recent LLMs still rely on heuristics and trial and error to +increase or reduce the influence of data-domains. We propose DOmain reweighting +with Generalization Estimation (DoGE), which optimizes the probability of +sampling from each domain (domain weights) in a principled way. Our approach is +a two-stage process consisting of (i) training a proxy model to obtain domain +weights using a bi-level optimization algorithm; (ii) training a larger base +model by sampling training domains according to the learned domain weights. In +our experiments, we extensively show how DoGE improves the generalization of +the base model to any target data mixture. On the SlimPajama dataset, our base +model gets better perplexity and few-shot reasoning accuracies across $6$ tasks +compared to baseline methods. Moreover, aiming to generalize to out-of-domain +target tasks, which is unseen in the pretraining corpus (OOD domain), DoGE can +effectively identify inter-domain dependencies, and consistently achieves +better test perplexity on the target domain. + +
+
+
+
+
+ + ♻ ☆ DiffusionWorldViewer: Exposing and Broadening the Worldview Reflected by + Generative Text-to-Image Models + + +
+ Generative text-to-image (TTI) models produce high-quality images from short +textual descriptions and are widely used in academic and creative domains. Like +humans, TTI models have a worldview, a conception of the world learned from +their training data and task that influences the images they generate for a +given prompt. However, the worldviews of TTI models are often hidden from +users, making it challenging for users to build intuition about TTI outputs, +and they are often misaligned with users' worldviews, resulting in output +images that do not match user expectations. In response, we introduce +DiffusionWorldViewer, an interactive interface that exposes a TTI model's +worldview across output demographics and provides editing tools for aligning +output images with user perspectives. In a user study with 18 diverse TTI +users, we find that DiffusionWorldViewer helps users represent their varied +viewpoints in generated images and challenge the limited worldview reflected in +current TTI models. + +
+
+ comment: 20 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Improving Neural Additive Models with Bayesian Principles + + +
+ Neural additive models (NAMs) enhance the transparency of deep neural +networks by handling input features in separate additive sub-networks. However, +they lack inherent mechanisms that provide calibrated uncertainties and enable +selection of relevant features and interactions. Approaching NAMs from a +Bayesian perspective, we augment them in three primary ways, namely by a) +providing credible intervals for the individual additive sub-networks; b) +estimating the marginal likelihood to perform an implicit selection of features +via an empirical Bayes procedure; and c) facilitating the ranking of feature +pairs as candidates for second-order interaction in fine-tuned models. In +particular, we develop Laplace-approximated NAMs (LA-NAMs), which show improved +empirical performance on tabular datasets and challenging real-world medical +tasks. + +
+
+
+
+
+ + ♻ ☆ Neural incomplete factorization: learning preconditioners for the + conjugate gradient method + + +
+ Finding suitable preconditioners to accelerate iterative solution methods, such as the conjugate gradient method, is an active area of research. In this paper, we develop a computationally efficient data-driven approach to replace the typically hand-engineered algorithms with neural networks. Since optimizing the condition number of the linear system directly is computationally infeasible, our method instead generates an incomplete factorization of the matrix and is therefore referred to as neural incomplete factorization (NeuralIF). For efficient training, we utilize a stochastic approximation of the Frobenius loss which only requires matrix-vector multiplications. At the core of our method is a novel message-passing block, inspired by sparse matrix theory, that aligns with the objective of finding a sparse factorization of the matrix. By replacing the conventional preconditioners used within the conjugate gradient method with data-driven models based on graph neural networks, we accelerate the iterative solving procedure. We evaluate our proposed method on both a synthetic and a real-world problem arising from scientific computing and show its ability to reduce the solving time while remaining computationally efficient. + +
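The stochastic Frobenius loss mentioned above can be illustrated with a Hutchinson-style probe estimator that needs only matrix-vector products; the function name and probe count are assumptions, and the identity $\mathbb{E}_v \|Mv\|^2 = \|M\|_F^2$ for Rademacher probes $v$ is what makes it unbiased.

```python
import numpy as np

def frobenius_loss_estimate(A, L, num_probes=16, rng=None):
    """Stochastic estimate of ||A - L L^T||_F^2 using only mat-vecs.

    E_v ||(A - L L^T) v||^2 equals the squared Frobenius norm for
    Rademacher probes v, so the residual matrix is never formed.
    """
    rng = rng or np.random.default_rng()
    n = A.shape[0]
    total = 0.0
    for _ in range(num_probes):
        v = rng.choice([-1.0, 1.0], size=n)
        r = A @ v - L @ (L.T @ v)          # two sparse-friendly mat-vecs
        total += r @ r
    return total / num_probes
```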
+
+ comment: Under review. 18 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ GAD-NR: Graph Anomaly Detection via Neighborhood Reconstruction WSDM-2024 + + +
+ Graph Anomaly Detection (GAD) is a technique used to identify abnormal nodes +within graphs, finding applications in network security, fraud detection, +social media spam detection, and various other domains. A common method for GAD +is Graph Auto-Encoders (GAEs), which encode graph data into node +representations and identify anomalies by assessing the reconstruction quality +of the graphs based on these representations. However, existing GAE models are +primarily optimized for direct link reconstruction, resulting in nodes +connected in the graph being clustered in the latent space. As a result, they +excel at detecting cluster-type structural anomalies but struggle with more +complex structural anomalies that do not conform to clusters. To address this +limitation, we propose a novel solution called GAD-NR, a new variant of GAE +that incorporates neighborhood reconstruction for graph anomaly detection. +GAD-NR aims to reconstruct the entire neighborhood of a node, encompassing the +local structure, self-attributes, and neighbor attributes, based on the +corresponding node representation. By comparing the neighborhood reconstruction +loss between anomalous nodes and normal nodes, GAD-NR can effectively detect +any anomalies. Extensive experimentation conducted on six real-world datasets +validates the effectiveness of GAD-NR, showcasing significant improvements (by +up to 30% in AUC) over state-of-the-art competitors. The source code for GAD-NR +is openly available. Importantly, the comparative analysis reveals that the +existing methods perform well only in detecting one or two types of anomalies +out of the three types studied. In contrast, GAD-NR excels at detecting all +three types of anomalies across the datasets, demonstrating its comprehensive +anomaly detection capabilities. + +
+
+ comment: Accepted at the 17th ACM International Conference on Web Search and + Data Mining (WSDM-2024) +
+
+
+
+
+ + ♻ ☆ Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature + + +
+ As Distributed Ledger Technologies (DLTs) rapidly evolve, their impacts +extend beyond technology, influencing environmental and societal aspects. This +evolution has increased publications, making manual literature analysis +increasingly challenging. We address this with a Natural Language Processing +(NLP)-based systematic literature review method to explore the intersection of +Distributed Ledger Technology (DLT) with its Environmental, Social, and +Governance (ESG) aspects. Our approach involves building and refining a +directed citation network from 107 seed papers to a corpus of 24,539 +publications and fine-tuning a transformer-based language model for Named +Entity Recognition (NER) on DLT and ESG domains. Applying this model, we +distilled the corpus to 505 key publications, enabling an inaugural literature +review and temporal graph analysis of DLT's evolution in ESG contexts. Our +contributions include an adaptable and scalable NLP-driven systematic +literature review methodology and a unique NER dataset of 54,808 entities, +tailored for DLT and ESG research. Our inaugural literature review demonstrates +their applicability and effectiveness in analyzing DLT's evolution and impacts, +proving invaluable for stakeholders in the DLT domain. + +
+
+
+
+
+ + ♻ ☆ Extending Path-Dependent NJ-ODEs to Noisy Observations and a Dependent + Observation Framework + + +
+ The Path-Dependent Neural Jump Ordinary Differential Equation (PD-NJ-ODE) is +a model for predicting continuous-time stochastic processes with irregular and +incomplete observations. In particular, the method learns optimal forecasts +given irregularly sampled time series of incomplete past observations. So far +the process itself and the coordinate-wise observation times were assumed to be +independent and observations were assumed to be noiseless. In this work we +discuss two extensions to lift these restrictions and provide theoretical +guarantees as well as empirical examples for them. In particular, we can lift +the assumption of independence by extending the theory to much more realistic +settings of conditional independence without any need to change the algorithm. +Moreover, we introduce a new loss function, which allows us to deal with noisy +observations and explain why the previously used loss function did not lead to +a consistent estimator. + +
+
+
+
+
+ + ♻ ☆ Applications of artificial intelligence in the analysis of + histopathology images of gliomas: a review + + +
+ In recent years, the diagnosis of gliomas has become increasingly complex. +Analysis of glioma histopathology images using artificial intelligence (AI) +offers new opportunities to support diagnosis and outcome prediction. To give +an overview of the current state of research, this review examines 70 publicly +available research studies that have proposed AI-based methods for whole-slide +histopathology images of human gliomas, covering the diagnostic tasks of +subtyping (16/70), grading (23/70), molecular marker prediction (13/70), and +survival prediction (27/70). All studies were reviewed with regard to +methodological aspects as well as clinical applicability. It was found that the +focus of current research is the assessment of hematoxylin and eosin-stained +tissue sections of adult-type diffuse gliomas. The majority of studies (49/70) +are based on the publicly available glioblastoma and low-grade glioma datasets +from The Cancer Genome Atlas (TCGA) and only a few studies employed other +datasets in isolation (10/70) or in addition to the TCGA datasets (11/70). +Current approaches mostly rely on convolutional neural networks (53/70) for +analyzing tissue at 20x magnification (30/70). A new field of research is the +integration of clinical data, omics data, or magnetic resonance imaging +(27/70). So far, AI-based methods have achieved promising results, but are not +yet used in real clinical settings. Future work should focus on the independent +validation of methods on larger, multi-site datasets with high-quality and +up-to-date clinical and molecular pathology annotations to demonstrate routine +applicability. + +
+
+
+
+
+ + ♻ ☆ Improved Sample Complexity Analysis of Natural Policy Gradient Algorithm + with General Parameterization for Infinite Horizon Discounted Reward Markov + Decision Processes + + +
+ We consider the problem of designing sample efficient learning algorithms for +infinite horizon discounted reward Markov Decision Process. Specifically, we +propose the Accelerated Natural Policy Gradient (ANPG) algorithm that utilizes +an accelerated stochastic gradient descent process to obtain the natural policy +gradient. ANPG achieves $\mathcal{O}({\epsilon^{-2}})$ sample complexity and +$\mathcal{O}(\epsilon^{-1})$ iteration complexity with general parameterization +where $\epsilon$ defines the optimality error. This improves the +state-of-the-art sample complexity by a $\log(\frac{1}{\epsilon})$ factor. ANPG +is a first-order algorithm and unlike some existing literature, does not +require the unverifiable assumption that the variance of importance sampling +(IS) weights is upper bounded. In the class of Hessian-free and IS-free +algorithms, ANPG beats the best-known sample complexity by a factor of +$\mathcal{O}(\epsilon^{-\frac{1}{2}})$ and simultaneously matches their +state-of-the-art iteration complexity. + +
+
+
+
+
+ + ♻ ☆ PowerFlowNet: Power Flow Approximation Using Message Passing Graph + Neural Networks + + +
+ Accurate and efficient power flow (PF) analysis is crucial in the operation and planning of modern electrical networks. Therefore, there is a need for scalable algorithms that can provide accurate and fast solutions for both small- and large-scale power networks. As a power network can be interpreted as a graph, Graph Neural Networks (GNNs) have emerged as a promising approach for improving the accuracy and speed of PF approximations by exploiting information sharing via the underlying graph structure. In this study, we introduce PowerFlowNet, a novel GNN architecture for PF approximation that showcases performance similar to that of the traditional Newton-Raphson method but achieves it 4 times faster in the simple IEEE 14-bus system and 145 times faster in the realistic case of the French high voltage network (6470rte). Meanwhile, it significantly outperforms other traditional approximation methods, such as the DC relaxation method, in terms of performance and execution time, therefore making PowerFlowNet a highly promising solution for real-world PF analysis. Furthermore, we verify the efficacy of our approach by conducting an in-depth experimental evaluation, thoroughly examining the performance, scalability, interpretability, and architectural dependability of PowerFlowNet. The evaluation provides insights into the behavior and potential applications of GNNs in power system analysis. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Fast Empirical Scenarios + + +
+ We seek to extract a small number of representative scenarios from large and high-dimensional panel data that are consistent with sample moments. We propose two novel algorithms: the first identifies scenarios that have not been observed before and comes with a scenario-based representation of covariance matrices; the second picks important data points from states of the world that have already been realized and that are consistent with higher-order sample moment information. Both algorithms are efficient to compute and lend themselves to consistent scenario-based modeling and high-dimensional numerical integration. Extensive numerical benchmarking studies and an application in portfolio optimization favor the proposed algorithms. + +
+
+ comment: 22 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Zero-Level-Set Encoder for Neural Distance Fields + + +
+ Neural shape representation generally refers to representing 3D geometry +using neural networks, e.g., to compute a signed distance or occupancy value at +a specific spatial position. In this paper, we present a novel encoder-decoder +neural network for embedding 3D shapes in a single forward pass. Our +architecture is based on a multi-scale hybrid system incorporating graph-based +and voxel-based components, as well as a continuously differentiable decoder. +Furthermore, the network is trained to solve the Eikonal equation and only +requires knowledge of the zero-level set for training and inference. This means +that in contrast to most previous work, our network is able to output valid +signed distance fields without explicit prior knowledge of non-zero distance +values or shape occupancy. We further propose a modification of the loss +function in case that surface normals are not well defined, e.g., in the +context of non-watertight surfaces and non-manifold geometry. Overall, this can +help reduce the computational overhead of training and evaluating neural +distance fields, as well as enabling the application to difficult shapes. We +finally demonstrate the efficacy, generalizability and scalability of our +method on datasets consisting of deforming shapes, both based on simulated data +and raw 3D scans. We further show single-class and multi-class encoding, on +both fixed and variable vertex-count inputs, showcasing a wide range of +possible applications. + +
+
+
+
+
+ + ♻ ☆ Improving Protein Optimization with Smoothed Fitness Landscapes ICLR 2024 + + +
+ The ability to engineer novel proteins with higher fitness for a desired property would be revolutionary for biotechnology and medicine. Modeling the combinatorially large space of sequences is infeasible; prior methods often constrain optimization to a small mutational radius, but this drastically limits the design space. Instead of such heuristics, we propose smoothing the fitness landscape to facilitate protein optimization. First, we formulate protein fitness as a graph signal and then use Tikhonov regularization to smooth the fitness landscape. We find that optimizing in this smoothed landscape leads to improved performance across multiple methods in the GFP and AAV benchmarks. Second, we achieve state-of-the-art results utilizing discrete energy-based models and MCMC in the smoothed landscape. Our method, called Gibbs sampling with Graph-based Smoothing (GGS), demonstrates a unique ability to achieve a 2.5-fold fitness improvement (with in-silico evaluation) over its training set. GGS demonstrates potential to optimize proteins in the limited-data regime. Code: https://github.com/kirjner/GGS + +
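A minimal sketch of Tikhonov smoothing of a fitness signal on a sequence graph (the tiny path graph and the regularization strength gamma are assumptions; the paper's pipeline is more involved):

```python
import numpy as np

def smooth_fitness(fitness, adjacency, gamma=1.0):
    """Tikhonov smoothing of a fitness signal on a sequence graph.

    Solves (I + gamma * L) f_s = f, where L is the combinatorial graph
    Laplacian; larger gamma yields a smoother landscape.
    """
    degree = np.diag(adjacency.sum(axis=1))
    laplacian = degree - adjacency
    n = len(fitness)
    return np.linalg.solve(np.eye(n) + gamma * laplacian, fitness)

# Tiny example: 4 sequences on a path graph with one noisy fitness spike.
A = np.array([[0, 1, 0, 0], [1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0]], float)
f = np.array([1.0, 1.1, 5.0, 1.2])   # spike gets pulled toward its neighbors
print(smooth_fitness(f, A, gamma=2.0))
```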
+
+ comment: ICLR 2024. Code: https://github.com/kirjner/GGS +
+
+
+
+
+ + ♻ ☆ Translating Subgraphs to Nodes Makes Simple GNNs Strong and Efficient + for Subgraph Representation Learning + + +
+ Subgraph representation learning has emerged as an important problem, but it is by default approached with specialized graph neural networks on a large global graph. These models demand extensive memory and computational resources and struggle to model the hierarchical structures of subgraphs. In this paper, we propose Subgraph-To-Node (S2N) translation, a novel formulation for learning representations of subgraphs. Specifically, given a set of subgraphs in the global graph, we construct a new graph by coarsely transforming the subgraphs into nodes. With both theoretical and empirical evidence, we show that S2N not only significantly reduces memory and computational costs compared to state-of-the-art models but also outperforms them by capturing both local and global structures of the subgraph. By leveraging graph coarsening methods, our method outperforms baselines even in a data-scarce setting with insufficient subgraphs. Our experiments on eight benchmarks demonstrate that fine-tuned models with S2N translation can process 183 -- 711 times more subgraph samples than state-of-the-art models at a better or similar performance level. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Attention-Refined Unrolling for Sparse Sequential micro-Doppler + Reconstruction + + +
+ The reconstruction of micro-Doppler signatures of human movements is a key enabler for fine-grained activity recognition in wireless sensing. In Joint Communication and Sensing (JCS) systems, unlike in dedicated radar sensing systems, a suitable trade-off between sensing accuracy and communication overhead has to be attained. It follows that the micro-Doppler has to be reconstructed from incomplete windows of channel estimates obtained from communication packets. Existing approaches exploit compressed sensing, but produce very poor reconstructions when only a few channel measurements are available, which is often the case with real communication patterns. In addition, the large number of iterations they need to converge hinders their use in real-time systems. In this work, we propose and validate STAR, a neural network that reconstructs micro-Doppler sequences of human movement even from highly incomplete channel measurements. STAR is based upon a new architectural design that combines a single unrolled iterative hard-thresholding layer with an attention mechanism at its output. This results in an interpretable and lightweight architecture that reaps the benefits of both model-based and data-driven solutions. STAR is evaluated on a public JCS dataset of 60 GHz channel measurements of human activity traces. Experimental results show that it substantially outperforms state-of-the-art techniques in terms of reconstructed micro-Doppler quality. Remarkably, STAR enables human activity recognition with satisfactory accuracy even with 90% of channel measurements missing, where existing techniques fail. + +
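For background, the unrolled layer at STAR's core is based on iterative hard thresholding (IHT); below is a textbook NumPy sketch of plain IHT (not STAR's learned, attention-refined variant), with problem sizes chosen arbitrarily.

```python
import numpy as np

def iht(Phi, y, k, iters=200):
    """Textbook iterative hard thresholding for y ≈ Phi @ x, x k-sparse."""
    mu = 1.0 / np.linalg.norm(Phi, 2) ** 2   # conservative step size
    x = np.zeros(Phi.shape[1])
    for _ in range(iters):
        x = x + mu * Phi.T @ (y - Phi @ x)   # gradient step on the residual
        x[np.argsort(np.abs(x))[:-k]] = 0.0  # keep only the k largest entries
    return x

rng = np.random.default_rng(0)
Phi = rng.normal(size=(40, 128)) / np.sqrt(40)
x_true = np.zeros(128)
x_true[[3, 50, 99]] = [1.0, -2.0, 0.5]
x_hat = iht(Phi, Phi @ x_true, k=3)
```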
+
+ comment: 16 pages, 10 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Towards Understanding Clean Generalization and Robust Overfitting in + Adversarial Training + + +
+ Mirroring the surprising generalization of standard deep learning, deep nets
+trained by adversarial training also generalize well to $\textit{unseen clean
+data (natural data)}$. However, although adversarial training can achieve low
+robust training error, there exists a significant $\textit{robust
+generalization gap}$. We call this phenomenon $\textit{Clean Generalization
+and Robust Overfitting (CGRO)}$. In this work, we study the CGRO phenomenon in
+adversarial training from two views: $\textit{representation complexity}$ and
+$\textit{training dynamics}$. Specifically, we consider a binary
+classification setting with $N$ separated training data points.
+$\textit{First}$, we prove that, under the assumption that there exists a
+$\operatorname{poly}(D)$-size clean classifier (where $D$ is the data
+dimension), a ReLU net with only $O(N D)$ extra parameters is able to leverage
+robust memorization to achieve CGRO, while a robust classifier still requires
+exponential representation complexity in the worst case. $\textit{Next}$, we
+focus on a structured-data case to analyze training dynamics, where we train a
+two-layer convolutional network with $O(N D)$ width against adversarial
+perturbation. We then show that a three-stage phase transition occurs during
+the learning process and the network provably converges to a robust
+memorization regime, which thereby results in CGRO. $\textit{Besides}$, we
+also empirically verify our theoretical analysis with experiments on
+real-world image recognition datasets.
+
+
+
+ comment: 28 pages, comments welcome +
+
+
+
+
+ + ♻ ☆ Cascaded Scaling Classifier: class incremental learning with probability + scaling + + +
+ Humans are capable of acquiring new knowledge and transferring learned
+knowledge into different domains, incurring only minor forgetting. The same
+ability, called Continual Learning, is challenging to achieve with neural
+networks, due to forgetting that affects previously learned tasks when new
+ones are learned. This forgetting can be mitigated by replaying stored samples
+from past tasks, but a large memory size may be needed for long sequences of
+tasks; moreover, this could lead to overfitting on saved samples. In this
+paper, we propose a novel regularisation approach and a novel incremental
+classifier called, respectively, Margin Dampening and Cascaded Scaling
+Classifier. The former combines a soft constraint and a knowledge distillation
+approach to preserve past learned knowledge while allowing the model to learn
+new patterns effectively. The latter is a gated incremental classifier,
+helping the model modify past predictions without directly interfering with
+them. This is achieved by modifying the output of the model with auxiliary
+scaling functions. We empirically show that our approach performs well on
+multiple benchmarks against well-established baselines, and we also study each
+component of our proposal and how their combinations affect the final
+results.
+
+
+
+ comment: Paper under review. The official code is available at
+ https://github.com/jaryP/Cascaded-Scaling-Classifier
+
+
+
+
+ + ♻ ☆ Cost-Efficient Online Decision Making: A Combinatorial Multi-Armed + Bandit Approach + + +
+ Online decision making plays a crucial role in numerous real-world +applications. In many scenarios, the decision is made based on performing a +sequence of tests on the incoming data points. However, performing all tests +can be expensive and is not always possible. In this paper, we provide a novel +formulation of the online decision making problem based on combinatorial +multi-armed bandits and take the (possibly stochastic) cost of performing tests +into account. Based on this formulation, we provide a new framework for +cost-efficient online decision making which can utilize posterior sampling or +BayesUCB for exploration. We provide a theoretical analysis of Thompson +Sampling for cost-efficient online decision making, and present various +experimental results that demonstrate the applicability of our framework to +real-world problems. + +
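+ A toy sketch of the exploration loop (illustrative only; the success rates,
+costs, and the "run every test with positive sampled net utility" rule are
+assumptions standing in for the paper's combinatorial setup). Thompson
+Sampling draws success rates from Beta posteriors and runs only the tests
+whose sampled value exceeds their cost:
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+true_p = np.array([0.9, 0.6, 0.8, 0.3, 0.7])     # unknown success rates
+cost = np.array([0.05, 0.02, 0.10, 0.01, 0.04])  # cost of running each test
+alpha = np.ones(5)   # Beta posterior parameters, one pair per test
+beta = np.ones(5)
+
+for t in range(1000):
+    theta = rng.beta(alpha, beta)              # posterior sample per test
+    chosen = np.flatnonzero(theta - cost > 0)  # tests worth their cost
+    outcomes = rng.random(len(chosen)) < true_p[chosen]
+    alpha[chosen] += outcomes                  # Bayesian update
+    beta[chosen] += ~outcomes
+
+print(np.round(alpha / (alpha + beta), 2))  # posterior means approach true_p
+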
+
+
+
+
+ + ♻ ☆ A Rational Model of Dimension-reduced Human Categorization + + +
+ Humans tend to categorize objects based on a few key features. We propose a
+rational model of categorization that utilizes a mixture of probabilistic
+principal component analyzers (mPPCA). This model represents each category
+with reduced feature dimensions and allows local features to be shared across
+categories to facilitate few-shot learning. Theoretically, we identify the
+necessary and sufficient condition for dimension-reduced representation to
+outperform full-dimension representation. We then show the superior
+performance of mPPCA in predicting human categorization over exemplar and
+prototype models in a behavioral experiment. When combined with a
+convolutional neural network, the mPPCA classifier with a single principal
+component dimension for each category achieves comparable performance to
+ResNet with a linear classifier on the ${\tt CIFAR-10H}$ human categorization
+dataset.
+
+
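+ A small sketch of the classifier family (not the paper's implementation;
+dataset and component count are illustrative): fit one probabilistic PCA per
+category and label a sample by the highest PPCA log-likelihood.
+
+import numpy as np
+from sklearn.datasets import load_digits
+from sklearn.decomposition import PCA
+
+X, y = load_digits(return_X_y=True)
+# One PPCA per category (sklearn's PCA.score_samples is the PPCA
+# log-likelihood), each with a single principal component.
+models = {c: PCA(n_components=1).fit(X[y == c]) for c in np.unique(y)}
+
+def predict(x):
+    scores = {c: m.score_samples(x[None, :])[0] for c, m in models.items()}
+    return max(scores, key=scores.get)
+
+acc = np.mean([predict(x) == t for x, t in zip(X[:200], y[:200])])
+print(f"accuracy on 200 digits: {acc:.2f}")
+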
+
+
+
+
+ + ♻ ☆ Quantum Neural Estimation of Entropies + + +
+ Entropy measures quantify the amount of information and correlation present
+in a quantum system. In practice, when the quantum state is unknown and only
+copies thereof are available, one must resort to the estimation of such
+entropy measures. Here we propose a variational quantum algorithm for
+estimating the von Neumann and R\'enyi entropies, as well as the measured
+relative entropy and measured R\'enyi relative entropy. Our approach first
+parameterizes a variational formula for the measure of interest by a quantum
+circuit and a classical neural network, and then optimizes the resulting
+objective over parameter space. Numerical simulations of our quantum algorithm
+are provided, using a noiseless quantum simulator. The algorithm provides
+accurate estimates of the various entropy measures for the examples tested,
+which renders it a promising approach for usage in downstream tasks.
+
+
+
+ comment: 14 pages, 2 figures; see also independent works of Shin, Lee, and + Jeong at arXiv:2306.14566v1 and Lee, Kwon, and Lee at arXiv:2307.13511v2 +
+
+
+
+
+ + ♻ ☆ Almost Tight Error Bounds on Differentially Private Continual Counting + + +
+ The first large-scale deployment of private federated learning uses
+differentially private counting in the continual release model as a subroutine
+(Google AI blog titled "Federated Learning with Formal Differential Privacy
+Guarantees"). In this case, a concrete bound on the error is very relevant to
+reduce the privacy parameter. The standard mechanism for continual counting is
+the binary mechanism. We present a novel mechanism and show that its mean
+squared error is both asymptotically optimal and a factor 10 smaller than the
+error of the binary mechanism. We also show that the constants in our analysis
+are almost tight by giving non-asymptotic lower and upper bounds that differ
+only in the constants of lower-order terms. Our algorithm is a matrix mechanism
+for the counting matrix and takes constant time per release. We also use our
+explicit factorization of the counting matrix to give an upper bound on the
+excess risk of the private learning algorithm of Denisov et al. (NeurIPS 2022).
+Our lower bound for any continual counting mechanism is the first tight lower
+bound on continual counting under approximate differential privacy. It is
+achieved using a new lower bound on a certain factorization norm, denoted by
+$\gamma_F(\cdot)$, in terms of the singular values of the matrix. In
+particular, we show that for any complex matrix, $A \in \mathbb{C}^{m \times
+n}$, \[ \gamma_F(A) \geq \frac{1}{\sqrt{m}}\|A\|_1, \] where $\|\cdot\|_1$
+denotes the Schatten-1 norm.
+ We believe this technique will be useful in proving lower bounds for a larger
+class of linear queries. To illustrate the power of this technique, we show the
+first lower bound on the mean squared error for answering parity queries.
+
+
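+ For reference, the standard binary (tree) mechanism that the paper improves
+upon, in compact form (a generic sketch, not the paper's new matrix
+mechanism; the even budget split across levels is one common choice):
+
+import numpy as np
+
+def binary_mechanism(stream, epsilon, seed=0):
+    # DP prefix sums: every release combines O(log T) noisy dyadic sums,
+    # giving polylog(T) error instead of error linear in T.
+    T = len(stream)
+    L = T.bit_length() + 1
+    scale = L / epsilon               # split the privacy budget per level
+    rng = np.random.default_rng(seed)
+    alpha = [0.0] * L                 # exact dyadic partial sums
+    noisy = [0.0] * L                 # their noisy counterparts
+    out = []
+    for t, x in enumerate(stream, start=1):
+        i = (t & -t).bit_length() - 1     # lowest set bit of t
+        alpha[i] = sum(alpha[:i]) + x     # merge closed lower intervals
+        for j in range(i):
+            alpha[j] = noisy[j] = 0.0
+        noisy[i] = alpha[i] + rng.laplace(0.0, scale)
+        out.append(sum(noisy[j] for j in range(L) if (t >> j) & 1))
+    return out
+
+print(binary_mechanism([1, 0, 1, 1, 0, 1, 1, 1], epsilon=1.0))
+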
+
+ comment: Updated the citations to include two papers we learned about since + version 01 +
+
+
+
+
+ + ♻ ☆ Navigating Neural Space: Revisiting Concept Activation Vectors to + Overcome Directional Divergence + + +
+ With a growing interest in understanding neural network prediction
+strategies, Concept Activation Vectors (CAVs) have emerged as a popular tool
+for modeling human-understandable concepts in the latent space. Commonly, CAVs
+are computed by leveraging linear classifiers optimizing the separability of
+latent representations of samples with and without a given concept. However,
+in this paper we show that such a separability-oriented computation leads to
+solutions which may diverge from the actual goal of precisely modeling the
+concept direction. This discrepancy can be attributed to the significant
+influence of distractor directions, i.e., signals unrelated to the concept,
+which are picked up by filters (i.e., weights) of linear models to optimize
+class-separability. To address this, we introduce pattern-based CAVs, focusing
+solely on concept signals, thereby providing more accurate concept
+directions. We evaluate various CAV methods in terms of their alignment with
+the true concept direction and their impact on CAV applications, including
+concept sensitivity testing and model correction for shortcut behavior caused
+by data artifacts. We demonstrate the benefits of pattern-based CAVs using the
+Pediatric Bone Age, ISIC2019, and FunnyBirds datasets with VGG, ResNet, and
+EfficientNet model architectures.
+
+
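+ A toy numeric illustration of the filter-vs-pattern distinction (synthetic
+data; the two-dimensional setup and the class-mean-difference pattern
+estimator are illustrative assumptions): a zero-mean distractor direction
+tilts the classifier weights (the filter) away from the true concept
+direction, while the pattern stays aligned with it.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+rng = np.random.default_rng(0)
+concept = rng.random(500) > 0.5                    # binary concept labels
+signal = np.outer(concept, [1.0, 0.0])             # true direction: [1, 0]
+distractor = np.outer(rng.normal(size=500), [1.0, 3.0])
+acts = signal + distractor + rng.normal(scale=0.1, size=(500, 2))
+
+clf = LogisticRegression().fit(acts, concept)
+filter_cav = clf.coef_.ravel() / np.linalg.norm(clf.coef_)
+
+pattern_cav = acts[concept].mean(0) - acts[~concept].mean(0)
+pattern_cav /= np.linalg.norm(pattern_cav)
+
+print("filter :", np.round(filter_cav, 2))   # tilted by the distractor
+print("pattern:", np.round(pattern_cav, 2))  # close to [1, 0]
+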
+
+
+
+
+ + ♻ ☆ Efficient Subseasonal Weather Forecast using Teleconnection-informed + Transformers + + +
+ Subseasonal forecasting, which is pivotal for agriculture, water resource
+management, and early warning of disasters, faces challenges due to the
+chaotic nature of the atmosphere. Recent advances in machine learning (ML)
+have revolutionized weather forecasting by achieving predictive skill
+competitive with numerical models. However, training such foundation models
+requires thousands of GPU days, which causes substantial carbon emissions and
+limits their broader applicability. Moreover, ML models tend to game
+pixel-wise error scores by producing smoothed results which lack physical
+consistency and meteorological meaning. To deal with the aforementioned
+problems, we propose a teleconnection-informed transformer. Our architecture
+leverages the pretrained Pangu model to achieve good initial weights and
+integrates a teleconnection-informed temporal module to improve predictability
+in an extended temporal range. Remarkably, by adjusting 1.1% of the Pangu
+model's parameters, our method enhances predictability on four surface and
+five upper-level atmospheric variables at a two-week lead time. Furthermore,
+the teleconnection-filtered features improve the spatial granularity of
+outputs significantly, indicating their potential physical consistency. Our
+research underscores the importance of atmospheric and oceanic teleconnections
+in driving future weather conditions. Besides, it presents a
+resource-efficient pathway for researchers to leverage existing foundation
+models on versatile downstream tasks.
+
+
+
+ comment: Submitted to IGARSS 2024 +
+
+
+
+
+ + ♻ ☆ LoTR: Low Tensor Rank Weight Adaptation + + +
+ In this paper we generalize and extend an idea of low-rank adaptation (LoRA)
+of large language models (LLMs) based on the Transformer architecture. Widely
+used LoRA-like methods of fine-tuning LLMs are based on matrix factorization
+of the gradient update. We introduce LoTR, a novel approach for
+parameter-efficient fine-tuning of LLMs which represents a gradient update to
+parameters in the form of a tensor decomposition. The low-rank adapter for
+each layer is constructed as a product of three matrices, and the tensor
+structure arises from sharing the left and right multipliers of this product
+among layers. Simultaneous compression of a sequence of layers with a low-rank
+tensor representation allows LoTR to achieve even better parameter efficiency
+than LoRA, especially for deep models. Moreover, the core tensor does not
+depend on the original weight dimension and can be made arbitrarily small,
+which allows for extremely cheap and fast downstream fine-tuning.
+
+
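+ A minimal sketch of the sharing scheme (an illustrative reading of the
+abstract, not the released code; all dimensions are arbitrary): the per-layer
+update is Delta W_l = B @ G_l @ A.T, with B and A shared across layers and
+only a tiny r x r core G_l per layer.
+
+import numpy as np
+
+d, r, n_layers = 768, 8, 12
+rng = np.random.default_rng(0)
+B = rng.normal(scale=0.02, size=(d, r))   # shared left factor
+A = rng.normal(scale=0.02, size=(d, r))   # shared right factor
+cores = [np.zeros((r, r)) for _ in range(n_layers)]  # per-layer cores
+
+def adapted_forward(x, W_frozen, layer):
+    # y = (W + Delta W) x, with Delta W = B @ G_l @ A.T never materialized.
+    return W_frozen @ x + B @ (cores[layer] @ (A.T @ x))
+
+x = rng.normal(size=d)
+y = adapted_forward(x, np.eye(d), layer=0)    # zero cores -> y == x here
+
+lora_params = n_layers * 2 * d * r            # per-layer B_l, A_l
+lotr_params = 2 * d * r + n_layers * r * r    # shared B, A plus tiny cores
+print(lora_params, lotr_params)               # 147456 vs 13056
+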
+
+ comment: Submitted; missing author and sections were added; +
+
+
+
+
+
+
+
+ + Multimedia 10 + +
+
+
+ + ☆ Unified Hallucination Detection for Multimodal Large Language Models + + +
+ Despite significant strides in multimodal tasks, Multimodal Large Language +Models (MLLMs) are plagued by the critical issue of hallucination. The reliable +detection of such hallucinations in MLLMs has, therefore, become a vital aspect +of model evaluation and the safeguarding of practical application deployment. +Prior research in this domain has been constrained by a narrow focus on +singular tasks, an inadequate range of hallucination categories addressed, and +a lack of detailed granularity. In response to these challenges, our work +expands the investigative horizons of hallucination detection. We present a +novel meta-evaluation benchmark, MHaluBench, meticulously crafted to facilitate +the evaluation of advancements in hallucination detection methods. +Additionally, we unveil a novel unified multimodal hallucination detection +framework, UNIHD, which leverages a suite of auxiliary tools to validate the +occurrence of hallucinations robustly. We demonstrate the effectiveness of +UNIHD through meticulous evaluation and comprehensive analysis. We also provide +strategic insights on the application of specific tools for addressing various +categories of hallucinations. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ InteractiveVideo: User-Centric Controllable Video Generation with + Synergistic Multimodal Instructions + + +
+ We introduce $\textit{InteractiveVideo}$, a user-centric framework for video +generation. Different from traditional generative approaches that operate based +on user-provided images or text, our framework is designed for dynamic +interaction, allowing users to instruct the generative model through various +intuitive mechanisms during the whole generation process, e.g. text and image +prompts, painting, drag-and-drop, etc. We propose a Synergistic Multimodal +Instruction mechanism, designed to seamlessly integrate users' multimodal +instructions into generative models, thus facilitating a cooperative and +responsive interaction between user inputs and the generative process. This +approach enables iterative and fine-grained refinement of the generation result +through precise and effective user instructions. With +$\textit{InteractiveVideo}$, users are given the flexibility to meticulously +tailor key aspects of a video. They can paint the reference image, edit +semantics, and adjust video motions until their requirements are fully met. +Code, models, and demo are available at +https://github.com/invictus717/InteractiveVideo + +
+
+ comment: Code, models, and demo are available at + https://github.com/invictus717/InteractiveVideo +
+
+
+
+
+ + ☆ Panoramic Image Inpainting With Gated Convolution And Contextual + Reconstruction Loss ICASSP 2024 + + +
+ Deep learning-based methods have demonstrated encouraging results in tackling
+the task of panoramic image inpainting. However, it is challenging for existing
+methods to distinguish valid pixels from invalid pixels and find suitable
+references for corrupted areas, thus leading to artifacts in the inpainted
+results. In response to these challenges, we propose a panoramic image
+inpainting framework that consists of a Face Generator, a Cube Generator, a
+side branch, and two discriminators. We use the Cubemap Projection (CMP) format
+as network input. The generator employs gated convolutions to distinguish valid
+pixels from invalid ones, while a side branch is designed utilizing contextual
+reconstruction (CR) loss to guide the generators to find the most suitable
+reference patch for inpainting the missing region. The proposed method is
+compared with state-of-the-art (SOTA) methods on the SUN360 Street View
+dataset in terms of PSNR and SSIM. Experimental results and an ablation study
+demonstrate that the proposed method outperforms SOTA both quantitatively and
+qualitatively.
+
+
+
+ comment: Copyright 2024 IEEE - to appear in IEEE ICASSP 2024 +
+
+
+
+
+ + ☆ Perceptual Learned Image Compression via End-to-End JND-Based + Optimization ICIP 2024 + + +
+ Emerging Learned Image Compression (LC) achieves significant improvements in
+coding efficiency by end-to-end training of neural networks for compression. An
+important benefit of this approach over traditional codecs is that any
+optimization criteria can be directly applied to the encoder-decoder networks
+during training. Perceptual optimization of LC to comply with the Human Visual
+System (HVS) is among such criteria, which has not been fully explored yet.
+This paper addresses this gap by proposing a novel framework to integrate Just
+Noticeable Distortion (JND) principles into LC. Leveraging existing JND
+datasets, three perceptual optimization methods are proposed to integrate JND
+into the LC training process: (1) Pixel-Wise JND Loss (PWL) prioritizes
+pixel-by-pixel fidelity in reproducing JND characteristics, (2) Image-Wise JND
+Loss (IWL) emphasizes overall imperceptible degradation levels, and (3)
+Feature-Wise JND Loss (FWL) aligns the reconstructed image features with
+perceptually significant features. Experimental evaluations demonstrate the
+effectiveness of JND integration, highlighting improvements in rate-distortion
+performance and visual quality, compared to baseline methods. The proposed
+methods add no extra complexity after training.
+
+
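+ A small sketch of the pixel-wise variant (one plausible instantiation of the
+idea, not the paper's exact loss; the constant JND map is a stand-in for
+thresholds taken from a JND dataset or model): only the error exceeding the
+per-pixel visibility threshold is penalized.
+
+import numpy as np
+
+def pixel_wise_jnd_loss(original, reconstructed, jnd_map):
+    # Distortion below the JND threshold is treated as perceptually free.
+    error = np.abs(original - reconstructed)
+    visible = np.maximum(error - jnd_map, 0.0)
+    return float(np.mean(visible ** 2))
+
+rng = np.random.default_rng(0)
+img = rng.random((64, 64))
+rec = img + rng.normal(scale=0.02, size=img.shape)
+jnd = np.full(img.shape, 0.03)  # per-pixel thresholds (illustrative)
+print(pixel_wise_jnd_loss(img, rec, jnd))
+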
+
+ comment: Copyright 2024 IEEE - Submitted to IEEE ICIP 2024 +
+
+
+
+
+ + ☆ ToonAging: Face Re-Aging upon Artistic Portrait Style Transfer + + +
+ Face re-aging is a prominent field in computer vision and graphics, with +significant applications in photorealistic domains such as movies, advertising, +and live streaming. Recently, the need to apply face re-aging to +non-photorealistic images, like comics, illustrations, and animations, has +emerged as an extension in various entertainment sectors. However, the absence +of a network capable of seamlessly editing the apparent age on NPR images means +that these tasks have been confined to a naive approach, applying each task +sequentially. This often results in unpleasant artifacts and a loss of facial +attributes due to domain discrepancies. In this paper, we introduce a novel +one-stage method for face re-aging combined with portrait style transfer, +executed in a single generative step. We leverage existing face re-aging and +style transfer networks, both trained within the same PR domain. Our method +uniquely fuses distinct latent vectors, each responsible for managing +aging-related attributes and NPR appearance. Adopting an exemplar-based +approach, our method offers greater flexibility than domain-level fine-tuning +approaches, which typically require separate training or fine-tuning for each +domain. This effectively addresses the limitation of requiring paired datasets +for re-aging and domain-level, data-driven approaches for stylization. Our +experiments show that our model can effortlessly generate re-aged images while +simultaneously transferring the style of examples, maintaining both natural +appearance and controllability. + +
+
+ comment: 8 pages, 9 figures, 1 table +
+
+
+
+
+ + ☆ Video Super-Resolution for Optimized Bitrate and Green Online Streaming + + +
+ Conventional per-title encoding schemes strive to optimize encoding
+resolutions to deliver the utmost perceptual quality for each bitrate ladder
+representation. Nevertheless, maintaining encoding time within an acceptable
+threshold is equally imperative in online streaming applications. Furthermore,
+modern client devices are equipped with the capability for fast
+deep-learning-based video super-resolution (VSR) techniques, enhancing the
+perceptual quality of the decoded bitstream. This suggests that opting for
+lower resolutions in representations during the encoding process can curtail
+the overall energy consumption without substantially compromising perceptual
+quality. In this context, this paper introduces a video super-resolution-based
+latency-aware optimized bitrate encoding scheme (ViSOR) designed for online
+adaptive streaming applications. ViSOR determines the encoding resolution for
+each target bitrate, ensuring the highest achievable perceptual quality after
+VSR within the bound of a maximum acceptable latency. Random forest-based
+prediction models are trained to predict the perceptual quality after VSR and
+the encoding time for each resolution using the spatiotemporal features
+extracted for each video segment. Experimental results show that ViSOR
+targeting the fast super-resolution convolutional neural network (FSRCNN)
+achieves an overall average bitrate reduction of 24.65 % and 32.70 % to
+maintain the same PSNR and VMAF, compared to the HTTP Live Streaming (HLS)
+bitrate ladder encoding of 4 s segments using the x265 encoder, when the
+maximum acceptable latency for each representation is set as two seconds.
+Considering a just noticeable difference (JND) of six VMAF points, the average
+cumulative storage consumption and encoding energy for each segment are
+reduced by 79.32 % and 68.21 %, respectively, contributing towards greener
+streaming.
+
+
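+ The selection rule reduces to a small constrained search (sketch only;
+predict_vmaf and predict_latency stand in for the paper's random-forest
+predictors, and the numbers are made up):
+
+def pick_resolution(candidates, predict_vmaf, predict_latency,
+                    max_latency=2.0):
+    # Highest predicted post-VSR quality subject to the latency budget.
+    feasible = [r for r in candidates if predict_latency(r) <= max_latency]
+    if not feasible:                       # fall back to the fastest option
+        return min(candidates, key=predict_latency)
+    return max(feasible, key=predict_vmaf)
+
+vmaf = {360: 78.0, 540: 86.5, 720: 91.0, 1080: 93.5}   # toy predictions
+latency = {360: 0.6, 540: 1.1, 720: 1.9, 1080: 3.4}    # seconds
+print(pick_resolution([360, 540, 720, 1080], vmaf.get, latency.get))  # 720
+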
+
+ comment: 2024 Picture Coding Symposium (PCS) +
+
+
+
+
+ + ☆ Perceptual Video Quality Assessment: A Survey + + +
+ Perceptual video quality assessment plays a vital role in the field of video
+processing due to the existence of quality degradations introduced in various
+stages of video signal acquisition, compression, transmission and display. With
+the advancement of internet communication and cloud service technology, video
+content and traffic are growing exponentially, which further emphasizes the
+requirement for accurate and rapid assessment of video quality. Therefore,
+numerous subjective and objective video quality assessment studies have been
+conducted over the past two decades for both generic videos and specific videos
+such as streaming, user-generated content (UGC), 3D, virtual and augmented
+reality (VR and AR), high frame rate (HFR), audio-visual, etc. This survey
+provides an up-to-date and comprehensive review of these video quality
+assessment studies. Specifically, we first review the subjective video quality
+assessment methodologies and databases, which are necessary for validating the
+performance of video quality metrics. Second, the objective video quality
+assessment algorithms for general purposes are surveyed and summarized
+according to the methodologies utilized in the quality measures. Third, we
+overview the objective video quality assessment measures for specific
+applications and emerging topics. Finally, the performances of the
+state-of-the-art video quality assessment measures are compared and analyzed.
+This survey provides a systematic overview of both classical works and recent
+progress in the realm of video quality assessment, which can help other
+researchers quickly access the field and conduct relevant research.
+
+
+
+
+
+
+ + ♻ ☆ Multimodal Speech Enhancement Using Burst Propagation + + +
+ This paper proposes MBURST, a novel multimodal solution for audio-visual
+speech enhancement that considers the most recent neurological discoveries
+regarding pyramidal cells of the prefrontal cortex and other brain regions. The
+so-called burst propagation implements several criteria to address the credit
+assignment problem in a more biologically plausible manner: steering the sign
+and magnitude of plasticity through feedback, multiplexing the feedback and
+feedforward information across layers through different weight connections,
+approximating feedback and feedforward connections, and linearizing the
+feedback signals. MBURST benefits from such capabilities to learn correlations
+between the noisy signal and the visual stimuli, thus attributing meaning to
+the speech by amplifying relevant information and suppressing noise.
+Experiments conducted over a Grid Corpus and CHiME3-based dataset show that
+MBURST can reproduce similar mask reconstructions to the multimodal
+backpropagation-based baseline while demonstrating outstanding energy
+efficiency management, reducing the neuron firing rates to values up to
+\textbf{$70\%$} lower. Such a feature implies more sustainable implementations,
+suitable and desirable for hearing aids or any other similar embedded systems.
+
+
+
+
+
+
+ + ♻ ☆ SSTFormer: Bridging Spiking Neural Network and Memory Support + Transformer for Frame-Event based Recognition + + +
+ Event camera-based pattern recognition is a research topic that has newly
+arisen in recent years. Current researchers usually transform the event
+streams into images, graphs, or voxels, and adopt deep neural networks for
+event-based classification. Although good performance can be achieved on
+simple event recognition datasets, their results may still be limited due to
+the following two issues. Firstly, they adopt spatially sparse event streams
+for recognition only, which may fail to capture the color and detailed texture
+information well. Secondly, they adopt either Spiking Neural Networks (SNN)
+for energy-efficient recognition with suboptimal results, or Artificial Neural
+Networks (ANN) for energy-intensive, high-performance recognition. However,
+few of them consider achieving a balance between these two aspects. In this
+paper, we formally propose to recognize patterns by fusing RGB frames and
+event streams simultaneously and propose a new RGB frame-event recognition
+framework to address the aforementioned issues. The proposed method contains
+four main modules, i.e., a memory support Transformer network for RGB frame
+encoding, a spiking neural network for raw event stream encoding, a
+multi-modal bottleneck fusion module for RGB-Event feature aggregation, and a
+prediction head. Due to the scarcity of RGB-Event based classification
+datasets, we also propose PokerEvent, a large-scale dataset containing 114
+classes and 27102 frame-event pairs recorded using a DVS346 event camera.
+Extensive experiments on two RGB-Event based classification datasets fully
+validate the effectiveness of our proposed framework. We hope this work will
+boost the development of pattern recognition by fusing RGB frames and event
+streams. Both our dataset and the source code of this work will be released at
+https://github.com/Event-AHU/SSTFormer.
+
+
+
+ comment: In Peer Review +
+
+
+
+
+
+ ♻ ☆ MultiWay-Adapter: Adapting large-scale multi-modal models for scalable
+ image-text retrieval
+
+
+
+ As Multimodal Large Language Models (MLLMs) grow in size, adapting them to
+specialized tasks becomes increasingly challenging due to high computational
+and memory demands. Indeed, traditional fine-tuning methods are costly, due to
+the need for extensive, task-specific training. While efficient adaptation
+methods exist that aim to reduce these costs, in practice they suffer from
+shallow inter-modal alignment, which severely hurts model effectiveness. To
+tackle these computational challenges and improve inter-modal alignment, we
+introduce the MultiWay-Adapter (MWA), a novel framework featuring an 'Alignment
+Enhancer'. This enhancer deepens inter-modal alignment, enabling high
+transferability with minimal tuning effort. Our experiments show that unlike
+prior efficient tuning approaches, MWA maintains model effectiveness, while
+reducing training time by up to 57%. MWA is also lightweight, increasing model
+size by only 2-3% (in terms of parameters) for state-of-the-art foundation
+models like BEiT-3 Large. These results demonstrate that MWA provides an
+efficient and effective adaptation method for MLLMs, significantly broadening
+their applicability.
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 56 + +
+
+
+ + ☆ LLM-Enhanced Data Management + + +
+ Machine learning (ML) techniques for optimizing data management problems have
+been extensively studied and widely deployed in the past five years. However,
+traditional ML methods have limitations in generalizability (adapting to
+different scenarios) and inference ability (understanding the context).
+Fortunately, large language models (LLMs) have shown high generalizability and
+human-competitive abilities in understanding context, which are promising for
+data management tasks (e.g., database diagnosis, database tuning). However,
+existing LLMs have several limitations: hallucination, high cost, and low
+accuracy for complicated tasks. To address these challenges, we design LLMDB,
+an LLM-enhanced data management paradigm which has generalizability and high
+inference ability while avoiding hallucination, reducing LLM cost, and
+achieving high accuracy. LLMDB embeds domain-specific knowledge to avoid
+hallucination through LLM fine-tuning and prompt engineering. LLMDB reduces
+the high cost of LLMs by using vector databases, which provide semantic search
+and caching abilities. LLMDB improves task accuracy with an LLM agent which
+provides multi-round inference and pipeline executions. We showcase three
+real-world scenarios that LLMDB can well support, including query rewriting,
+database diagnosis and data analytics. We also summarize the open research
+challenges of LLMDB.
+
+
+
+
+
+
+
+ ☆ "It's how you do things that matters": Attending to Process to Better
+ Serve Indigenous Communities with Language Technologies
+
+
+
+ Indigenous languages are historically under-served by Natural Language +Processing (NLP) technologies, but this is changing for some languages with the +recent scaling of large multilingual models and an increased focus by the NLP +community on endangered languages. This position paper explores ethical +considerations in building NLP technologies for Indigenous languages, based on +the premise that such projects should primarily serve Indigenous communities. +We report on interviews with 17 researchers working in or with Aboriginal +and/or Torres Strait Islander communities on language technology projects in +Australia. Drawing on insights from the interviews, we recommend practices for +NLP researchers to increase attention to the process of engagements with +Indigenous communities, rather than focusing only on decontextualised +artefacts. + +
+
+
+
+
+ + ☆ Can Large Language Models Learn Independent Causal Mechanisms? + + +
+ Despite impressive performance on language modelling and complex reasoning +tasks, Large Language Models (LLMs) fall short on the same tasks in uncommon +settings or with distribution shifts, exhibiting some lack of generalisation +ability. This issue has usually been alleviated by feeding more training data +into the LLM. However, this method is brittle, as the scope of tasks may not be +readily predictable or may evolve, and updating the model with new data +generally requires extensive additional training. By contrast, systems, such as +causal models, that learn abstract variables and causal relationships can +demonstrate increased robustness against changes in the distribution. One +reason for this success is the existence and use of Independent Causal +Mechanisms (ICMs) representing high-level concepts that only sparsely interact. +In this work, we apply two concepts from causality to learn ICMs within LLMs. +We develop a new LLM architecture composed of multiple sparsely interacting +language modelling modules. We introduce a routing scheme to induce +specialisation of the network into domain-specific modules. We also present a +Mutual Information minimisation objective that trains a separate module to +learn abstraction and domain-invariant mechanisms. We show that such causal +constraints can improve out-of-distribution performance on abstract and causal +reasoning tasks. + +
+
+ comment: 17 pages, 8 pages for the main paper and 9 pages for references and + appendices, 12 figures +
+
+
+
+
+ + ☆ Predicting Machine Translation Performance on Low-Resource Languages: + The Role of Domain Similarity EACL 2024 + + +
+ Fine-tuning and testing a multilingual large language model is expensive and +challenging for low-resource languages (LRLs). While previous studies have +predicted the performance of natural language processing (NLP) tasks using +machine learning methods, they primarily focus on high-resource languages, +overlooking LRLs and shifts across domains. Focusing on LRLs, we investigate +three factors: the size of the fine-tuning corpus, the domain similarity +between fine-tuning and testing corpora, and the language similarity between +source and target languages. We employ classical regression models to assess +how these factors impact the model's performance. Our results indicate that +domain similarity has the most critical impact on predicting the performance of +Machine Translation models. + +
+
+ comment: 13 pages, 5 figures, accepted to EACL 2024, findings +
+
+
+
+
+ + ☆ GIRT-Model: Automated Generation of Issue Report Templates + + +
+ Platforms such as GitHub and GitLab introduce Issue Report Templates (IRTs) +to enable more effective issue management and better alignment with developer +expectations. However, these templates are not widely adopted in most +repositories, and there is currently no tool available to aid developers in +generating them. In this work, we introduce GIRT-Model, an assistant language +model that automatically generates IRTs based on the developer's instructions +regarding the structure and necessary fields. We create GIRT-Instruct, a +dataset comprising pairs of instructions and IRTs, with the IRTs sourced from +GitHub repositories. We use GIRT-Instruct to instruction-tune a T5-base model +to create the GIRT-Model. In our experiments, GIRT-Model outperforms general +language models (T5 and Flan-T5 with different parameter sizes) in IRT +generation by achieving significantly higher scores in ROUGE, BLEU, METEOR, and +human evaluation. Additionally, we analyze the effectiveness of GIRT-Model in a +user study in which participants wrote short IRTs with GIRT-Model. Our results +show that the participants find GIRT-Model useful in the automated generation +of templates. We hope that through the use of GIRT-Model, we can encourage more +developers to adopt IRTs in their repositories. We publicly release our code, +dataset, and model at https://github.com/ISE-Research/girt-model. + +
+
+ comment: Accepted to be published at the 21st IEEE/ACM International + Conference on Mining Software Repositories (MSR 2024) +
+
+
+
+
+ + ☆ Enhancing Transformer RNNs with Multiple Temporal Perspectives ICML 2024 + + +
+ We introduce the concept of multiple temporal perspectives, a novel approach
+applicable to Recurrent Neural Network (RNN) architectures for enhancing their
+understanding of sequential data. This method involves maintaining diverse
+temporal views of previously encountered text, significantly enriching the
+language models' capacity to interpret context. To show the efficacy of this
+approach, we incorporate it into the Receptance Weighted Key Value (RWKV)
+architecture, addressing its inherent challenge of retaining all historical
+information within a single hidden state. Notably, this improvement is achieved
+with a minimal increase in the number of parameters -- even as little as
+$0.04\%$ of the original number of parameters. Further, the additional
+parameters necessary for the multiple temporal perspectives are fine-tuned with
+minimal computational overhead, avoiding the need for a full pre-training. The
+resulting model maintains linear computational complexity during prompt
+inference, ensuring consistent efficiency across various sequence lengths. The
+empirical results and ablation studies included in our research validate the
+effectiveness of our approach, showcasing improved performance across multiple
+benchmarks. The code, model weights and datasets are open-sourced at:
+https://github.com/RazvanDu/TemporalRNNs.
+
+
+
+ comment: 11 pages, 8 figures, 4 tables, in review for ICML 2024 +
+
+
+
+
+ + ☆ DenseFormer: Enhancing Information Flow in Transformers via Depth + Weighted Averaging + + +
+ The transformer architecture from Vaswani et al. (2017) is now ubiquitous
+across application domains, from natural language processing to speech
+processing and image understanding. We propose DenseFormer, a simple
+modification to the standard architecture that improves the perplexity of the
+model without increasing its size -- adding a few thousand parameters for
+large-scale models in the 100B parameter range. Our approach relies on an
+additional averaging step after each transformer block, which computes a
+weighted average of current and past representations -- we refer to this
+operation as Depth-Weighted-Average (DWA). The learned DWA weights exhibit
+coherent patterns of information flow, revealing the strong and structured
+reuse of activations from distant layers. Experiments demonstrate that
+DenseFormer is more data efficient, reaching the same perplexity as much
+deeper transformer models, and that for the same perplexity, these new models
+outperform transformer baselines in terms of memory efficiency and inference
+time.
+
+
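+ A minimal sketch of a DWA step (an illustrative reading of the abstract in
+PyTorch, not the authors' code): after block i, the output is a learned
+weighted average over the current and all earlier representations, with
+weights initialized so the module starts as the identity.
+
+import torch
+
+class DepthWeightedAverage(torch.nn.Module):
+    def __init__(self, depth_so_far):
+        super().__init__()
+        w = torch.zeros(depth_so_far + 1)
+        w[-1] = 1.0                      # identity init: keep current block
+        self.weights = torch.nn.Parameter(w)
+
+    def forward(self, history):
+        # history: [x_0, ..., x_i], each of shape (batch, seq, dim)
+        stacked = torch.stack(history, dim=0)
+        return torch.einsum("l...,l->...", stacked, self.weights)
+
+dwa = DepthWeightedAverage(depth_so_far=2)
+h = [torch.randn(4, 16, 64) for _ in range(3)]
+print(dwa(h).shape)  # torch.Size([4, 16, 64])
+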
+
+
+
+
+ + ☆ Layer-Wise Analysis of Self-Supervised Acoustic Word Embeddings: A Study + on Speech Emotion Recognition ICASSP2024 + + +
+ The efficacy of self-supervised speech models has been validated, yet the +optimal utilization of their representations remains challenging across diverse +tasks. In this study, we delve into Acoustic Word Embeddings (AWEs), a +fixed-length feature derived from continuous representations, to explore their +advantages in specific tasks. AWEs have previously shown utility in capturing +acoustic discriminability. In light of this, we propose measuring layer-wise +similarity between AWEs and word embeddings, aiming to further investigate the +inherent context within AWEs. Moreover, we evaluate the contribution of AWEs, +in comparison to other types of speech features, in the context of Speech +Emotion Recognition (SER). Through a comparative experiment and a layer-wise +accuracy analysis on two distinct corpora, IEMOCAP and ESD, we explore +differences between AWEs and raw self-supervised representations, as well as +the proper utilization of AWEs alone and in combination with word embeddings. +Our findings underscore the acoustic context conveyed by AWEs and showcase the +highly competitive SER accuracies by appropriately employing AWEs. + +
+
+ comment: Accepted to ICASSP2024 Self-supervision in Audio, Speech and Beyond + (SASB) workshop. First two authors contributed equally +
+
+
+
+
+ + ☆ PuzzleBench: Can LLMs Solve Challenging First-Order Combinatorial + Reasoning Problems? + + +
+ Recent works have explored the use of LLMs for reasoning tasks, focusing on
+relatively simple problems such as logical question answering. In our work, we
+wish to tackle more complicated problems, significantly expanding the
+capabilities of these models. Particularly, we explore whether LLMs can solve
+challenging first-order combinatorial reasoning problems, an example being the
+popular puzzle Sudoku. These problems have an underlying first-order structure
+described by a general description in natural language and can be instantiated
+to instances of varying sizes. Moreover, these problems are computationally
+intensive, requiring several reasoning steps to reach the solution. We present
+PuzzleBench, a dataset of 31 such challenging puzzles. We observe that LLMs,
+even when aided by symbolic solvers, perform rather poorly on our benchmark. In
+response, we propose a new approach, Puzzle-LM, which combines LLMs with both
+symbolic solvers and program interpreters, enabling them to reason about such
+challenging problems. We also show how feedback from smaller solved instances
+can help improve this reasoning ability.
+
+
+
+
+
+
+ + ☆ On the performance of phonetic algorithms in microtext normalization + + +
+ User-generated content published on microblogging social networks constitutes
+a priceless source of information. However, microtexts usually deviate from the
+standard lexical and grammatical rules of the language, thus making their
+processing by traditional intelligent systems very difficult. As an answer,
+microtext normalization consists in transforming those non-standard microtexts
+into standard well-written texts as a preprocessing step, allowing traditional
+approaches to continue with their usual processing. Given the importance of
+phonetic phenomena in non-standard text formation, an essential element of the
+knowledge base of a normalizer would be the phonetic rules that encode these
+phenomena, which can be found in the so-called phonetic algorithms.
+ In this work we experiment with a wide range of phonetic algorithms for the
+English language. The aim of this study is to determine the best phonetic
+algorithms within the context of candidate generation for microtext
+normalization. In other words, we intend to find those algorithms that, taking
+as input the non-standard terms to be normalized, allow us to obtain as output
+the smallest possible sets of normalization candidates which still contain the
+corresponding target standard words. As we will show, the choice of the
+phonetic algorithm will depend heavily on the capabilities of the candidate
+selection mechanism which we usually find at the end of a microtext
+normalization pipeline. The faster it can make the right choices among big
+enough sets of candidates, the more we can sacrifice on the precision of the
+phonetic algorithms in favour of coverage in order to increase the overall
+performance of the normalization system.
+ KEYWORDS: microtext normalization; phonetic algorithm; fuzzy matching;
+Twitter; texting
+
+
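+ A minimal sketch of phonetic-key candidate generation (illustrative only;
+assumes the third-party jellyfish library for Metaphone, and a toy lexicon):
+
+from collections import defaultdict
+import jellyfish  # assumed available; provides Soundex, Metaphone, etc.
+
+lexicon = ["tonight", "night", "knight", "mate", "meet", "great"]
+
+index = defaultdict(set)          # phonetic key -> standard words
+for word in lexicon:
+    index[jellyfish.metaphone(word)].add(word)
+
+def candidates(nonstandard):
+    # Normalization candidates: lexicon words sharing the phonetic key.
+    return index[jellyfish.metaphone(nonstandard)]
+
+print(candidates("nite"))  # e.g. {'night', 'knight'}
+print(candidates("gr8"))   # digits defeat purely phonetic keys
+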
+
+ comment: Accepted for publication in journal Expert Systems with Applications +
+
+
+
+
+ + ☆ A Quantitative Discourse Analysis of Asian Workers in the US Historical + Newspapers + + +
+ Warning: This paper contains examples of offensive language targeting a
+marginalized population. The digitization of historical texts invites
+researchers to explore the large-scale corpus of historical texts with
+computational methods. In this study, we present a computational text analysis
+on the relatively understudied topic of how Asian workers are represented in
+historical newspapers in the United States. We found that the word "coolie"
+was semantically different in some States (e.g., Massachusetts, Rhode Island,
+Wyoming, Oklahoma, and Arkansas), with different discourses forming around it.
+We also found that then-Confederate newspapers and then-Union newspapers
+formed distinctive discourses, by measuring over-represented words. Newspapers
+from then-Confederate States associated coolie with slavery-related words. In
+addition, we found Asians were perceived to be inferior to European immigrants
+and were made the target of racism. This study contributes to supplementing
+the qualitative analysis of racism in the United States with quantitative
+discourse analysis.
+
+
+
+ comment: 3rd International Conference on Natural Language Processing for + Digital Humanities (NLP4DH) +
+
+
+
+
+ + ☆ A Truly Joint Neural Architecture for Segmentation and Parsing + + +
+ Contemporary multilingual dependency parsers can parse a diverse set of
+languages, but for Morphologically Rich Languages (MRLs), performance is
+attested to be lower than for other languages. The key challenge is that, due
+to high morphological complexity and ambiguity of the space-delimited input
+tokens, the linguistic units that act as nodes in the tree are not known in
+advance. Pre-neural dependency parsers for MRLs subscribed to the joint
+morpho-syntactic hypothesis, stating that morphological segmentation and
+syntactic parsing should be solved jointly, rather than as a pipeline where
+segmentation precedes parsing. However, neural state-of-the-art parsers to date
+use a strict pipeline. In this paper we introduce a joint neural architecture
+where a lattice-based representation preserving all morphological ambiguity of
+the input is provided to an arc-factored model, which then solves the
+morphological segmentation and syntactic parsing tasks at once. Our experiments
+on Hebrew, a rich and highly ambiguous MRL, demonstrate state-of-the-art
+performance on parsing, tagging and segmentation of the Hebrew section of UD,
+using a single model. This proposed architecture is LLM-based and language
+agnostic, providing a solid foundation for MRLs to obtain further performance
+improvements and bridge the gap with other languages.
+
+
+
+
+
+
+ + ☆ DefInt: A Default-interventionist Framework for Efficient Reasoning with + Hybrid Large Language Models + + +
+ Large language models (LLMs) have shown impressive emergent abilities in a
+wide range of tasks, but still face challenges in handling complex reasoning
+problems. Previous works like chain-of-thought (CoT) and tree-of-thoughts
+(ToT) have predominantly focused on enhancing accuracy, but overlook the
+rapidly increasing token cost, which could be particularly problematic for
+open-ended real-world tasks with huge solution spaces. Motivated by the dual
+process theory of human cognition, we propose a Default-Interventionist
+framework (DefInt) to unleash the synergistic potential of hybrid LLMs. By
+default, DefInt uses smaller-scale language models to generate low-cost
+reasoning thoughts, which resemble the fast intuitions produced by System 1.
+If the intuitions are deemed low-confidence, DefInt will invoke the reflective
+reasoning of scaled-up language models as the intervention of System 2, which
+can override the default thoughts and rectify the reasoning process.
+Experiments on five representative reasoning tasks show that DefInt
+consistently achieves state-of-the-art reasoning accuracy and solution
+diversity. More importantly, it substantially reduces the token cost by
+49%-79% compared to the second most accurate baselines. Specifically, the
+open-ended tasks have an average 75% token cost reduction. The code repo with
+all prompts will be released upon publication.
+
+
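+ The control flow is easy to picture (a sketch of the idea only; small_lm,
+large_lm and confidence are hypothetical placeholders, not the paper's API):
+
+def default_interventionist(question, small_lm, large_lm, confidence,
+                            tau=0.7):
+    # System 1: cheap default thought from the smaller model.
+    thought = small_lm(question)
+    if confidence(thought) >= tau:
+        return thought                 # accept the low-cost intuition
+    # System 2: the scaled-up model intervenes and may override.
+    return large_lm(question, draft=thought)
+
+# Toy stand-ins so the sketch runs end to end.
+small = lambda q: "draft answer"
+large = lambda q, draft: f"revised({draft})"
+low_conf = lambda t: 0.4
+print(default_interventionist("2+2?", small, large, low_conf))
+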
+
+ comment: 18 pages, 10 figures, 14 tables +
+
+
+
+
+ + ☆ NavHint: Vision and Language Navigation Agent with a Hint Generator + + +
+ Existing work on vision and language navigation mainly relies on +navigation-related losses to establish the connection between vision and +language modalities, neglecting aspects of helping the navigation agent build a +deep understanding of the visual environment. In our work, we provide indirect +supervision to the navigation agent through a hint generator that provides +detailed visual descriptions. The hint generator assists the navigation agent +in developing a global understanding of the visual environment. It directs the +agent's attention toward related navigation details, including the relevant +sub-instruction, potential challenges in recognition and ambiguities in +grounding, and the targeted viewpoint description. To train the hint generator, +we construct a synthetic dataset based on landmarks in the instructions and +visible and distinctive objects in the visual environment. We evaluate our +method on the R2R and R4R datasets and achieve state-of-the-art on several +metrics. The experimental results demonstrate that generating hints not only +enhances the navigation performance but also helps improve the interpretability +of the agent's actions. + +
+
+
+
+
+ + ☆ Enhancing Robustness in Biomedical NLI Models: A Probing Approach for + Clinical Trials + + +
+ Large Language Models have revolutionized various fields and industries,
+such as Conversational AI, Content Generation, Information Retrieval, Business
+Intelligence, and Medicine, to name a few. One major application in the
+medical field is to analyze and investigate clinical trials for entailment
+tasks. However, it has been observed that Large Language Models are
+susceptible to shortcut learning, factual inconsistency, and performance
+degradation with little variation in context. Adversarial and robust testing
+is performed to ensure the integrity of a model's output, but ambiguity still
+persists. Probing is used to ensure the integrity of the reasoning performed
+and to investigate whether the model has a correct syntactic and semantic
+understanding. Here, I used mnestic probing to investigate the Sci-five model,
+trained on clinical trials. I investigated the features the model learnt with
+respect to natural logic. To achieve this, I trained task-specific probes,
+used these probes to investigate the final layers of the trained model, and
+then fine-tuned the trained model using iterative nullspace projection. The
+results show that model accuracy improved. During experimentation, I observed
+that the size of the probe affects the fine-tuning process.
+
+
+
+
+
+
+ + ☆ Are Large Language Models Table-based Fact-Checkers? SC + + +
+ Table-based Fact Verification (TFV) aims to extract the entailment relation
+between statements and structured tables. Existing TFV methods based on
+small-scale models suffer from insufficient labeled data and weak zero-shot
+ability. Recently, Large Language Models (LLMs) have attracted considerable
+attention across research fields. They have shown powerful zero-shot and
+in-context learning abilities on several NLP tasks, but their potential on TFV
+is still unknown. In this work, we implement a preliminary study about whether
+LLMs are table-based fact-checkers. In detail, we design diverse prompts to
+explore how in-context learning can help LLMs in TFV, i.e., zero-shot and
+few-shot TFV capability. Besides, we carefully design and construct TFV
+instructions to study the performance gain brought by the instruction tuning of
+LLMs. Experimental results demonstrate that LLMs can achieve acceptable results
+on zero-shot and few-shot TFV with prompt engineering, while instruction-tuning
+can stimulate the TFV capability significantly. We also make some valuable
+findings about the format of zero-shot prompts and the number of in-context
+examples. Finally, we analyze some possible directions to promote the accuracy
+of TFV via LLMs, which is beneficial to further research on table reasoning.
+
+
+
+ comment: CSCWD 2024 +
+
+
+
+
+ + ☆ "What's my model inside of?": Exploring the role of environments for + grounded natural language understanding + + +
+ In contrast to classical cognitive science, which studied brains in
+isolation, ecological approaches focused on the role of the body and
+environment in shaping cognition. Similarly, in this thesis we adopt an
+ecological approach to grounded natural language understanding (NLU) research.
+Grounded language understanding studies language understanding systems
+situated in the context of events, actions and percepts in
+naturalistic/simulated virtual environments. Where classic research tends to
+focus on designing new models and optimization methods while treating
+environments as given, we explore the potential of environment design for
+improving data collection and model development. We developed novel training
+and annotation approaches for procedural text understanding based on
+text-based game environments. We also drew upon embodied cognitive linguistics
+literature to propose a roadmap for grounded NLP research, and to inform the
+development of a new benchmark for measuring the progress of large language
+models on challenging commonsense reasoning tasks. We leveraged the richer
+supervision provided by text-based game environments to develop Breakpoint
+Transformers, a novel approach to modeling intermediate semantic information
+in long narrative or procedural texts. Finally, we integrated theories on the
+role of environments in collective human intelligence to propose a design for
+AI-augmented "social thinking environments" for knowledge workers like
+scientists.
+
+
+
+ comment: PhD Thesis +
+
+
+
+
+ + ☆ Knowledge Generation for Zero-shot Knowledge-based VQA EACL 2023 + + +
+ Previous solutions to knowledge-based visual question answering (K-VQA)
+retrieve knowledge from external knowledge bases and use supervised learning to
+train the K-VQA model. Recently, pre-trained LLMs have been used as both a
+knowledge source and a zero-shot QA model for K-VQA and demonstrated promising
+results. However, these recent methods do not explicitly show the knowledge
+needed to answer the questions and thus lack interpretability. Inspired by
+recent work on knowledge generation from LLMs for text-based QA, in this work
+we propose and test a similar knowledge-generation-based K-VQA method, which
+first generates knowledge from an LLM and then incorporates the generated
+knowledge for K-VQA in a zero-shot manner. We evaluate our method on two K-VQA
+benchmarks and find that our method performs better than previous zero-shot
+K-VQA methods and that our generated knowledge is generally relevant and
+helpful.
+
+
+
+ comment: accepted as Findings in EACL 2023; +
+
+
+
+
+ + ☆ Absolute convergence and error thresholds in non-active adaptive + sampling + + +
+ Non-active adaptive sampling is a way of building machine learning models
+from a training database, dynamically and automatically deriving a guaranteed
+sample size. In this context, and regardless of the strategy used for both
+scheduling and generating the weak predictors, a proposal for calculating
+absolute convergence and error thresholds is described. We not only make it
+possible to establish when the quality of the model no longer increases, but
+also supply a proximity condition to estimate in absolute terms how close the
+model is to achieving such a goal, thus supporting decision making for
+fine-tuning learning parameters in model selection. The technique proves its
+correctness and completeness with respect to our working hypotheses, in
+addition to strengthening the robustness of the sampling scheme. Tests meet
+our expectations and illustrate the proposal in the domain of natural language
+processing, taking the generation of part-of-speech taggers as a case study.
+
+
+
+ comment: 27 pages, 10 figures +
+
+
+
+
+ + ☆ Adaptive scheduling for adaptive sampling in POS taggers construction + + +
+ We introduce adaptive scheduling for adaptive sampling as a novel way of
+machine learning in the construction of part-of-speech taggers. The goal is to
+speed up the training on large data sets, without significant loss of
+performance with regard to an optimal configuration. In contrast to previous
+methods using a random, fixed or regularly rising spacing between the
+instances, ours analyzes the shape of the learning curve geometrically in
+conjunction with a functional model to increase or decrease it at any time. The
+algorithm proves to be formally correct regarding our working hypotheses.
+Namely, given one sampling point, the next is the nearest one that ensures a
+net gain in learning ability over the former, and the level of requirement for
+this condition can be modulated. We also improve the robustness of sampling by
+paying greater attention to those regions of the training database subject to
+a temporary inflation in performance, thus preventing the learning from
+stopping prematurely.
+ The proposal has been evaluated on the basis of its reliability in
+identifying the convergence of models, corroborating our expectations. While a
+concrete halting condition is used for testing, users can choose any condition
+whatsoever to suit their own specific needs.
+
+
+
+ comment: 23 pages, 10 figures
+
+
+
+
+ + ☆ Modeling of learning curves with applications to pos tagging + + +
+ An algorithm to estimate the evolution of learning curves on the whole of a
+training database, based on the results obtained from a portion and using a
+functional strategy, is introduced. We iteratively approximate the sought
+value at the desired time, independently of the learning technique used, once
+a point in the process, called the prediction level, has been passed. The
+proposal proves to be formally correct with respect to our working hypotheses
+and includes a reliable proximity condition. This allows the user to fix a
+convergence threshold with respect to the accuracy finally achievable, which
+extends the concept of stopping criterion and seems to be effective even in the
+presence of distorting observations.
+ Our aim is to evaluate the training effort, supporting decision making in
+order to reduce the need for both human and computational resources during the
+learning process. The proposal is of interest in at least three operational
+procedures. The first is the anticipation of accuracy gain, with the purpose of
+measuring how much work is needed to achieve a certain degree of performance.
+The second relates to the comparison of efficiency between systems at training
+time, with the objective of completing this task only for the one that best
+suits our requirements. The prediction of accuracy is also a valuable item of
+information for customizing systems, since we can estimate in advance the
+impact of settings on both the performance and the development costs. Using the
+generation of part-of-speech taggers as an example application, the
+experimental results are consistent with our expectations.
+
+
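+ The functional strategy can be pictured as fitting a saturating curve to the
+observed accuracies and extrapolating it (a generic sketch, not the paper's
+exact model; the inverse power law and the toy measurements are assumptions):
+
+import numpy as np
+from scipy.optimize import curve_fit
+
+sizes = np.array([1e3, 2e3, 5e3, 1e4, 2e4, 5e4])         # training portions
+accs = np.array([0.78, 0.82, 0.86, 0.88, 0.895, 0.905])  # measured accuracy
+
+def power_law(n, a, b, c):
+    # Learning curve with horizontal asymptote a as n -> infinity.
+    return a - b * n ** (-c)
+
+(a, b, c), _ = curve_fit(power_law, sizes, accs,
+                         p0=(0.95, 1.0, 0.5), maxfev=10000)
+print(f"asymptote ~ {a:.3f}; "
+      f"predicted accuracy at n=5e5: {power_law(5e5, a, b, c):.3f}")
+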
+
+ comment: 30 pages, 11 figures +
+
+
+
+
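+ A minimal sketch, assuming an inverse power-law family (one common functional
+model for learning curves), of extrapolating accuracy on the full training
+base from measurements on a portion of it. The data points are invented.
+
+```python
+import numpy as np
+from scipy.optimize import curve_fit
+
+def power_law(n, a, b, c):
+    # a = asymptotic accuracy; b, c control the decay of the residual error
+    return a - b * n ** (-c)
+
+sizes = np.array([5_000, 10_000, 20_000, 40_000, 80_000])
+accs = np.array([0.862, 0.884, 0.899, 0.908, 0.913])
+
+params, _ = curve_fit(power_law, sizes, accs, p0=[0.95, 1.0, 0.5], maxfev=10_000)
+full_corpus = 1_000_000
+print(f"predicted accuracy at n={full_corpus}: {power_law(full_corpus, *params):.4f}")
+print(f"estimated asymptote: {params[0]:.4f}")
+```
+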
+ + ☆ Early stopping by correlating online indicators in neural networks + + +
+ In order to minimize the generalization error in neural networks, a novel
+technique to identify overfitting phenomena when training the learner is
+formally introduced. This supports a reliable and trustworthy early stopping
+condition, thus improving the predictive power of that type of modeling. Our
+proposal exploits the correlation over time in a collection of online
+indicators, namely characteristic functions indicating whether a set of
+hypotheses is met, associated with a range of independent stopping conditions
+built from a canary judgment to evaluate the presence of overfitting. In this
+way, we provide a formal basis for deciding when to interrupt the learning
+process.
+ As opposed to previous approaches focused on a single criterion, we take
+advantage of subsidiarities between independent assessments, thus seeking both
+a wider operating range and greater diagnostic reliability. To illustrate the
+effectiveness of the halting condition described, we choose to work in the
+sphere of natural language processing, an operational continuum increasingly
+based on machine learning. As a case study, we focus on parser generation, one
+of the most demanding and complex tasks in the domain. The selection of
+cross-validation as a canary function enables a direct comparison with the
+most representative early stopping conditions based on overfitting
+identification, pointing to a promising start toward optimal bias and variance
+control.
+
+
+ comment: 26 pages, 6 figures +
+
+
+
+
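+ An illustrative sketch of correlating several online overfitting indicators
+instead of trusting a single one: learning stops only when a quorum of
+independent criteria agree. The three indicators below are standard heuristics
+chosen for the example; the paper's indicator set and canary function differ.
+
+```python
+def gl(val_losses, alpha=5.0):           # generalization loss above alpha%
+    return 100.0 * (val_losses[-1] / min(val_losses) - 1.0) > alpha
+
+def up(val_losses, strips=3):            # loss rose for `strips` checks in a row
+    tail = val_losses[-(strips + 1):]
+    return len(tail) == strips + 1 and all(b > a for a, b in zip(tail, tail[1:]))
+
+def no_improve(val_losses, patience=8):  # classic patience counter
+    return val_losses.index(min(val_losses)) <= len(val_losses) - 1 - patience
+
+def should_stop(val_losses, quorum=2):
+    votes = sum(ind(val_losses) for ind in (gl, up, no_improve))
+    return votes >= quorum               # require agreement, not a single alarm
+
+history = [0.52, 0.44, 0.41, 0.40, 0.405, 0.41, 0.43, 0.44, 0.45, 0.47, 0.48, 0.50]
+print(should_stop(history))  # True: the indicators jointly flag overfitting
+```
+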
+ + ☆ Surfing the modeling of PoS taggers in low-resource scenarios + + +
+ The recent trend towards the application of deep structured techniques has
+revealed the limits of huge models in natural language processing. This has
+reawakened the interest in traditional machine learning algorithms, which have
+proven to remain competitive in certain contexts, in particular low-resource
+settings. In parallel, model selection has become an essential task to boost
+performance at reasonable cost, even more so when we talk about processes
+involving domains where the training and/or computational resources are scarce.
+Against this backdrop, we evaluate the early estimation of learning curves as a
+practical mechanism for selecting the most appropriate model in scenarios
+characterized by the use of non-deep learners in resource-lean settings. On the
+basis of a formal approximation model previously evaluated under conditions of
+wide availability of training and validation resources, we study the
+reliability of such an approach in a different and much more demanding
+operational environment. Using as a case study the generation of PoS taggers
+for Galician, a language belonging to the Western Ibero-Romance group, the
+experimental results are consistent with our expectations.
+
+
+
 comment: 17 pages, 5 figures
+
+
+
+
+ + ☆ Breaking MLPerf Training: A Case Study on Optimizing BERT + + +
+ Speeding up large-scale distributed training is challenging in that it
+requires improving various components of training, including load balancing,
+communication, and optimizers. We present novel approaches for fast
+large-scale training of the BERT model which individually improve each of
+these components, thereby leading to a new level of BERT training performance.
+Load balancing is imperative in distributed BERT training since its training
+datasets are characterized by samples with various lengths. Communication
+cost, which is proportional to the scale of distributed training, needs to be
+hidden by useful computation. In addition, the optimizers, e.g., ADAM, LAMB,
+etc., need to be carefully re-evaluated in the context of large-scale
+distributed training. We propose two new ideas: (1) local presorting based on
+dataset stratification for load balancing and (2) bucket-wise gradient
+clipping before allreduce, which allows us to benefit from the overlap of
+gradient computation and synchronization as well as from fast gradient
+clipping before allreduce. We also re-evaluate existing optimizers via
+hyperparameter optimization and utilize ADAM, which also contributes to fast
+training via larger batches than existing methods. Our proposed methods, all
+combined, give the fastest MLPerf BERT training of 25.1 (22.3) seconds on
+1,024 NVIDIA A100 GPUs, which is 1.33x (1.13x) and 1.57x faster than the other
+top two (one) submissions to MLPerf v1.1 (v2.0). Our implementation and
+evaluation results are available at MLPerf v1.1-v2.1.
+
+
+ comment: Total 15 pages (Appendix 3 pages) +
+
+
+
+
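+ A hedged sketch of the local presorting idea: within each worker's shard,
+samples are sorted by length so that every batch holds sequences of similar
+length, cutting padding waste and straggler time. The bucket size and
+shuffling policy are illustrative assumptions, not the submission's values.
+
+```python
+import random
+
+def presort_local(shard, bucket_size=1024, seed=0):
+    rng = random.Random(seed)
+    rng.shuffle(shard)                   # epoch-level stochasticity first
+    shard.sort(key=len)                  # stable sort: stratify by length
+    buckets = [shard[i:i + bucket_size]
+               for i in range(0, len(shard), bucket_size)]
+    rng.shuffle(buckets)                 # avoid a fixed short-to-long schedule
+    return [sample for bucket in buckets for sample in bucket]
+
+samples = [[0] * random.randint(8, 512) for _ in range(4096)]
+ordered = presort_local(samples)
+print(len(ordered[0]), len(ordered[-1]))  # neighbours now have similar lengths
+```
+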
+ + ☆ Factuality of Large Language Models in the Year 2024 + + +
+ Large language models (LLMs), especially when instruction-tuned for chat,
+have become part of our daily lives, freeing people from the process of
+searching, extracting, and integrating information from multiple sources by
+offering a straightforward answer to a variety of questions in a single place.
+Unfortunately, in many cases, LLM responses are factually incorrect, which
+limits their applicability in real-world scenarios. As a result, evaluating
+and improving the factuality of LLMs has attracted a lot of attention
+recently. In this survey, we critically analyze existing work with the aim of
+identifying the major challenges and their associated causes, pointing out
+potential solutions for improving the factuality of LLMs, and analyzing the
+obstacles to automated factuality evaluation for open-ended text generation.
+We further offer an outlook on where future research should go.
+
+
+ comment: 9 pages, 1 figure and 2 tables +
+
+
+
+
+ + ☆ Aligner: Achieving Efficient Alignment through Weak-to-Strong Correction + + +
+ Efforts to align Large Language Models (LLMs) are mainly conducted via
+Reinforcement Learning from Human Feedback (RLHF) methods. However, RLHF
+encounters major challenges, including training reward models and
+actor-critic engineering, and, importantly, it requires access to LLM
+parameters. Here we introduce Aligner, a new, efficient alignment paradigm
+that bypasses the whole RLHF process by learning the correctional residuals
+between aligned and unaligned answers. Our Aligner offers several key
+advantages. Firstly, it is an autoregressive seq2seq model trained on the
+query-answer-correction dataset via supervised learning; this offers a
+parameter-efficient alignment solution with minimal resources. Secondly, the
+Aligner facilitates weak-to-strong generalization; finetuning large pretrained
+models with Aligner's supervisory signals yields a strong performance boost.
+Thirdly, Aligner functions as a model-agnostic plug-and-play module, allowing
+for its direct application to different open-source and API-based models.
+Remarkably, Aligner-7B improves 11 different LLMs by 18% in helpfulness and
+23% in harmlessness on average (GPT-4 by 26.9% and 17.5%). When finetuning
+(strong) Llama2-70B with (weak) Aligner-7B's supervision, we can improve
+Llama2 by 8.2% in helpfulness and 61.6% in harmlessness. See our dataset and
+code at \url{https://aligner2024.github.io}.
+
+
+ comment: 35 pages +
+
+
+
+
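+ A minimal sketch of the plug-and-play usage pattern described above: an
+upstream (possibly API-only) model drafts an answer and a small corrector
+model rewrites it. `upstream` and `aligner` are hypothetical callables
+standing in for the real models, and the prompt format is invented.
+
+```python
+def align(query, upstream, aligner):
+    draft = upstream(query)  # any black-box LLM, no parameter access needed
+    prompt = f"Question: {query}\nAnswer: {draft}\nCorrected answer:"
+    return aligner(prompt)   # seq2seq correction of the residual
+
+upstream = lambda q: "Mix bleach and ammonia for a stronger cleaner."
+aligner = lambda p: "Never mix bleach and ammonia; the fumes are toxic."
+print(align("How do I make a stronger cleaner?", upstream, aligner))
+```
+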
+ + ☆ GLaPE: Gold Label-agnostic Prompt Evaluation and Optimization for Large + Language Model + + +
+ Despite the rapid progress of large language models (LLMs), their task
+performance remains sensitive to prompt design. Recent studies have explored
+leveraging the LLM itself as an optimizer to identify optimal prompts that
+maximize task accuracy. However, when evaluating prompts, such approaches rely
+heavily on elusive manually annotated gold labels to calculate task accuracy
+for each candidate prompt, which hinders widespread implementation and
+generality. To overcome this limitation, this work proposes gold
+label-agnostic prompt evaluation (GLaPE) to alleviate dependence on gold
+labels. Motivated by the observed correlation between self-consistency and the
+accuracy of the answer, we adopt self-consistency as the initial evaluation
+score. Subsequently, we refine the scores of prompts producing identical
+answers to be mutually consistent. Experimental results show that GLaPE
+provides reliable evaluations consistent with accuracy, even in the absence of
+gold labels. Moreover, on six popular reasoning tasks, our GLaPE-based prompt
+optimization yields effective prompts comparable to accuracy-based ones. The
+code is publicly available at https://github.com/thunderous77/GLaPE.
+
+
+
+
+
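+ A sketch of the self-consistency signal GLaPE starts from: sample several
+answers under a candidate prompt and score the prompt by the agreement of the
+modal answer, with no gold labels involved. `generate` is a hypothetical
+sampling function, not part of the paper's released code.
+
+```python
+import random
+from collections import Counter
+
+def self_consistency(prompt, question, generate, n=8):
+    answers = [generate(prompt, question) for _ in range(n)]
+    top, freq = Counter(answers).most_common(1)[0]
+    return freq / n, top  # agreement ratio and the majority answer
+
+generate = lambda p, q: random.choice(["42", "42", "42", "41"])
+score, answer = self_consistency("Think step by step.", "What is 6*7?", generate)
+print(score, answer)
+```
+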
+ + ☆ DeLLMa: A Framework for Decision Making Under Uncertainty with Large + Language Models + + +
+ Large language models (LLMs) are increasingly used across society, including +in domains like business, engineering, and medicine. These fields often grapple +with decision-making under uncertainty, a critical yet challenging task. In +this paper, we show that directly prompting LLMs on these types of +decision-making problems yields poor results, especially as the problem +complexity increases. To overcome this limitation, we propose DeLLMa +(Decision-making Large Language Model assistant), a framework designed to +enhance decision-making accuracy in uncertain environments. DeLLMa involves a +multi-step scaffolding procedure, drawing upon principles from decision theory +and utility theory, to provide an optimal and human-auditable decision-making +process. We validate our framework on decision-making environments involving +real agriculture and finance data. Our results show that DeLLMa can +significantly improve LLM decision-making performance, achieving up to a 40% +increase in accuracy over competing methods. + +
+
+ comment: 23 pages, 17 figures +
+
+
+
+
+ + ☆ KICGPT: Large Language Model with Knowledge in Context for Knowledge + Graph Completion EMNLP 2023 + + +
+ Knowledge Graph Completion (KGC) is crucial for addressing knowledge graph +incompleteness and supporting downstream applications. Many models have been +proposed for KGC. They can be categorized into two main classes: triple-based +and text-based approaches. Triple-based methods struggle with long-tail +entities due to limited structural information and imbalanced entity +distributions. Text-based methods alleviate this issue but require costly +training for language models and specific finetuning for knowledge graphs, +which limits their efficiency. To alleviate these limitations, in this paper, +we propose KICGPT, a framework that integrates a large language model (LLM) and +a triple-based KGC retriever. It alleviates the long-tail problem without +incurring additional training overhead. KICGPT uses an in-context learning +strategy called Knowledge Prompt, which encodes structural knowledge into +demonstrations to guide the LLM. Empirical results on benchmark datasets +demonstrate the effectiveness of KICGPT with smaller training overhead and no +finetuning. + +
+
+ comment: Accepted to EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Solution-oriented Agent-based Models Generation with Verifier-assisted + Iterative In-context Learning + + +
+ Agent-based models (ABMs) stand as an essential paradigm for proposing and
+validating hypothetical solutions or policies aimed at addressing challenges
+posed by complex systems and achieving various objectives. This process
+demands labor-intensive endeavors and multidisciplinary expertise. Large
+language models (LLMs), encapsulating cross-domain knowledge and programming
+proficiency, could potentially alleviate the difficulty of this process.
+However, LLMs excel at handling sequential information, which makes it
+challenging for them to analyze the intricate interactions and nonlinear
+dynamics inherent in ABMs. Additionally, due to the lack of self-evaluation
+capability in LLMs, relying solely on LLMs is insufficient to effectively
+accomplish this process. In this paper, we present SAGE, a general
+solution-oriented ABM generation framework designed for automatically modeling
+and generating solutions for targeted problems. Unlike approaches reliant on
+expert handcrafting or resource-intensive neural network training, SAGE
+establishes a verifier-assisted iterative in-context learning process that
+employs LLMs to leverage their inherent cross-domain knowledge for tackling
+intricate demands from diverse domain scenarios. In SAGE, we introduce a
+semi-structured conceptual representation that makes explicit the intricate
+structures of ABMs, and an objective representation to guide LLMs in modeling
+scenarios and proposing hypothetical solutions through in-context learning. To
+ensure model executability and solution feasibility, SAGE devises a two-level
+verifier with chain-of-thought prompting tailored to the complex interactions
+and non-linear dynamics of ABMs, driving the iterative generation
+optimization. Moreover, we construct an evaluation dataset of
+solution-oriented ABMs from open sources. It contains practical models across
+various domains.
+
+
+
+
+
+ + ☆ Evaluating Large Language Models in Analysing Classroom Dialogue + + +
+ This study explores the application of Large Language Models (LLMs),
+specifically GPT-4, in the analysis of classroom dialogue, a crucial research
+task for both teaching diagnosis and quality improvement. Recognizing the
+knowledge-intensive and labor-intensive nature of traditional qualitative
+methods in educational research, this study investigates the potential of LLMs
+to streamline and enhance the analysis process. The study involves datasets
+from a middle school, encompassing classroom dialogues across mathematics and
+Chinese classes. These dialogues were manually coded by educational experts
+and then analyzed using a customised GPT-4 model. This study focuses on
+comparing manual annotations with the outputs of GPT-4 to evaluate its
+efficacy in analyzing educational dialogues. Time efficiency, inter-coder
+agreement, and inter-coder reliability between human coders and GPT-4 are
+evaluated. Results indicate substantial time savings with GPT-4 and a high
+degree of consistency in coding between the model and human coders, with some
+discrepancies in specific codes. These findings highlight the strong potential
+of LLMs in teaching evaluation and facilitation.
+
+
+
+
+
+ + ☆ Rethinking the Evaluation of Pre-trained Text-and-Layout Models from an + Entity-Centric Perspective + + +
+ Recently developed pre-trained text-and-layout models (PTLMs) have shown
+remarkable success in multiple information extraction tasks on visually-rich
+documents. However, the prevailing evaluation pipeline may not be sufficiently
+robust for assessing the information extraction ability of PTLMs, due to
+inadequate annotations within the benchmarks. Therefore, we lay out the
+necessary standards for an ideal benchmark to evaluate the information
+extraction ability of PTLMs. We then introduce EC-FUNSD, an entity-centric
+benchmark designed for the evaluation of semantic entity recognition and
+entity linking on visually-rich documents. This dataset contains diverse
+formats of document layouts and annotations of semantic-driven entities and
+their relations. Moreover, this dataset disentangles the falsely coupled
+annotation of segment and entity that arises from the block-level annotation
+of FUNSD. Experiment results demonstrate that state-of-the-art PTLMs exhibit
+overfitting tendencies on the prevailing benchmarks, as their performance
+sharply decreases when the dataset bias is removed.
+
+
+
+
+
+ + ☆ M$^3$Face: A Unified Multi-Modal Multilingual Framework for Human Face + Generation and Editing + + +
+ Human face generation and editing represent an essential task in the era of
+computer vision and the digital world. Recent studies have shown remarkable
+progress in multi-modal face generation and editing, for instance, using face
+segmentation to guide image generation. However, it may be challenging for
+some users to create these conditioning modalities manually. Thus, we
+introduce M3Face, a unified multi-modal multilingual framework for
+controllable face generation and editing. This framework enables users to
+utilize only text input to generate controlling modalities automatically, for
+instance, semantic segmentation or facial landmarks, and subsequently generate
+face images. We conduct extensive qualitative and quantitative experiments to
+showcase our framework's face generation and editing capabilities.
+Additionally, we propose the M3CelebA Dataset, a large-scale multi-modal and
+multilingual face dataset containing high-quality images, semantic
+segmentations, facial landmarks, and different captions for each image in
+multiple languages. The code and the dataset will be released upon
+publication.
+
+
+
+
+
+ + ♻ ☆ Formal-LLM: Integrating Formal Language and Natural Language for + Controllable LLM-based Agents + + +
+ Recent advancements in Large Language Models (LLMs) enable AI agents to
+automatically generate and execute multi-step plans to solve complex tasks.
+However, since the content generation process of LLMs is hardly controllable,
+current LLM-based agents frequently generate invalid or non-executable plans,
+which jeopardizes the performance of the generated plans and corrupts users'
+trust in LLM-based agents. In response, this paper proposes a novel
+"Formal-LLM" framework for LLM-based agents by integrating the expressiveness
+of natural language and the precision of formal language. Specifically, the
+framework allows human users to express their requirements or constraints for
+the planning process as an automaton. A stack-based LLM plan generation
+process is then conducted under the supervision of the automaton to ensure
+that the generated plan satisfies the constraints, making the planning process
+controllable. We conduct experiments on both benchmark tasks and practical
+real-life tasks, and our framework achieves over 50% overall performance
+improvement, which validates the feasibility and effectiveness of employing
+Formal-LLM to guide the plan generation of agents, preventing the agents from
+generating invalid and unsuccessful plans. Furthermore, more controllable
+LLM-based agents can facilitate the broader utilization of LLMs in application
+scenarios where high validity of planning is essential. The work is
+open-sourced at https://github.com/agiresearch/Formal-LLM.
+
+
+ comment: 21 pages, 6 figures; comments and suggestions are welcome +
+
+
+
+
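+ An illustrative sketch of automaton-supervised planning: at each step the
+agent may only pick actions for which the automaton has a transition from its
+current state, so every finished plan is accepted by the constraint. The tool
+names and transitions are invented for the example.
+
+```python
+dfa = {
+    ("start", "search"): "found",
+    ("found", "summarize"): "done",
+    ("found", "search"): "found",
+}
+accepting = {"done"}
+
+def allowed(state):
+    return [action for (s, action) in dfa if s == state]
+
+def generate_plan(choose, state="start", max_steps=10):
+    plan = []
+    for _ in range(max_steps):
+        actions = allowed(state)
+        if not actions:
+            break
+        action = choose(state, actions)  # e.g., ask the LLM, restricted to `actions`
+        plan.append(action)
+        state = dfa[(state, action)]
+        if state in accepting:
+            return plan
+    raise ValueError("no valid plan found under the automaton")
+
+print(generate_plan(lambda s, acts: acts[0]))  # ['search', 'summarize']
+```
+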
+ + ♻ ☆ Language is All a Graph Needs EACL 2024 + + +
+ The emergence of large-scale pre-trained language models has revolutionized
+various AI research domains. Transformer-based Large Language Models (LLMs)
+have gradually replaced CNNs and RNNs, unifying the fields of computer vision
+and natural language processing. Compared with independent data samples such
+as images, videos or texts, graphs usually contain rich structural and
+relational information. Meanwhile, language, especially natural language,
+being one of the most expressive mediums, excels at describing complex
+structures. However, existing work on incorporating graph problems into the
+generative language modeling framework remains very limited. Considering the
+rising prominence of LLMs, it becomes essential to explore whether LLMs can
+also replace GNNs as the foundation model for graphs. In this paper, we
+propose InstructGLM (Instruction-finetuned Graph Language Model) with highly
+scalable prompts based on natural language instructions. We use natural
+language to describe the multi-scale geometric structure of the graph and then
+instruction-finetune an LLM to perform graph tasks, which enables Generative
+Graph Learning. Our method surpasses all GNN baselines on the ogbn-arxiv, Cora
+and PubMed datasets, underscoring its effectiveness and shedding light on
+generative LLMs as a new foundation model for graph machine learning. Our code
+is open-sourced at https://github.com/agiresearch/InstructGLM.
+
+
+ comment: In EACL 2024 +
+
+
+
+
+ + ♻ ☆ Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as + an Alternative to Attention Layers in Transformers AAAI24 + + +
+ This work presents an analysis of the effectiveness of using standard shallow +feed-forward networks to mimic the behavior of the attention mechanism in the +original Transformer model, a state-of-the-art architecture for +sequence-to-sequence tasks. We substitute key elements of the attention +mechanism in the Transformer with simple feed-forward networks, trained using +the original components via knowledge distillation. Our experiments, conducted +on the IWSLT2017 dataset, reveal the capacity of these "attentionless +Transformers" to rival the performance of the original architecture. Through +rigorous ablation studies, and experimenting with various replacement network +types and sizes, we offer insights that support the viability of our approach. +This not only sheds light on the adaptability of shallow feed-forward networks +in emulating attention mechanisms but also underscores their potential to +streamline complex architectures for sequence-to-sequence tasks. + +
+
+ comment: Accepted at AAAI24(https://aaai.org/aaai-conference/) +
+
+
+
+
+ + ♻ ☆ Framework-Based Qualitative Analysis of Free Responses of Large Language + Models: Algorithmic Fidelity + + +
+ Today, using Large-scale generative Language Models (LLMs) it is possible to +simulate free responses to interview questions like those traditionally +analyzed using qualitative research methods. Qualitative methodology +encompasses a broad family of techniques involving manual analysis of +open-ended interviews or conversations conducted freely in natural language. +Here we consider whether artificial "silicon participants" generated by LLMs +may be productively studied using qualitative methods aiming to produce +insights that could generalize to real human populations. The key concept in +our analysis is algorithmic fidelity, a term introduced by Argyle et al. (2023) +capturing the degree to which LLM-generated outputs mirror human +sub-populations' beliefs and attitudes. By definition, high algorithmic +fidelity suggests latent beliefs elicited from LLMs may generalize to real +humans, whereas low algorithmic fidelity renders such research invalid. Here we +used an LLM to generate interviews with silicon participants matching specific +demographic characteristics one-for-one with a set of human participants. Using +framework-based qualitative analysis, we showed the key themes obtained from +both human and silicon participants were strikingly similar. However, when we +analyzed the structure and tone of the interviews we found even more striking +differences. We also found evidence of the hyper-accuracy distortion described +by Aher et al. (2023). We conclude that the LLM we tested (GPT-3.5) does not +have sufficient algorithmic fidelity to expect research on it to generalize to +human populations. However, the rapid pace of LLM research makes it plausible +this could change in the future. Thus we stress the need to establish epistemic +norms now around how to assess validity of LLM-based qualitative research, +especially concerning the need to ensure representation of heterogeneous lived +experiences. + +
+
+ comment: 52 pages, 5 tables, 5 figures +
+
+
+
+
+ + ♻ ☆ Model Editing Can Hurt General Abilities of Large Language Models + + +
+ One critical challenge that has emerged is the presence of hallucinations in
+the output of large language models (LLMs) due to false or outdated knowledge.
+Since retraining LLMs with updated information is resource-intensive, there
+has been a growing interest in model editing. However, current model editing
+methods, while effective in improving editing performance in various
+scenarios, often overlook potential side effects on the general abilities of
+LLMs. In this paper, we raise the concern that while model editing can improve
+the factuality of a model, it may come at the cost of a significant
+degradation of these general abilities. Systematically, we analyze side
+effects by evaluating four popular editing methods on three LLMs across eight
+representative task categories. Extensive empirical research reveals that it
+is difficult for current model editing methods to simultaneously improve
+factuality and maintain general abilities such as reasoning and question
+answering. Strikingly, the use of a specific method to edit LLaMA-1 (7B)
+resulted in a drastic performance degradation to nearly 0 on all selected
+tasks with just a single edit. Therefore, we advocate for more research
+efforts to minimize the loss of general abilities acquired during LLM
+pre-training and to ultimately preserve them during model editing.
+
+
+ comment: Add new results on LLaMA-2 (7B) +
+
+
+
+
+ + ♻ ☆ EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty + + +
+ Autoregressive decoding makes the inference of Large Language Models (LLMs) +time-consuming. In this paper, we reconsider speculative sampling and derive +two key observations. Firstly, autoregression at the feature +(second-to-top-layer) level is more straightforward than at the token level. +Secondly, the inherent uncertainty in feature (second-to-top-layer) level +autoregression constrains its performance. Based on these insights, we +introduce EAGLE (Extrapolation Algorithm for Greater Language-model +Efficiency), a simple yet highly efficient speculative sampling framework. By +incorporating a token sequence advanced by one time step, EAGLE effectively +resolves the uncertainty, enabling precise second-to-top-layer feature +prediction with minimal overhead. We conducted comprehensive evaluations of +EAGLE, including all models from the Vicuna and LLaMA2-Chat series, the MoE +model Mixtral 8x7B Instruct, and tasks in dialogue, code generation, +mathematical reasoning, and instruction following. For LLaMA2-Chat 70B, EAGLE +achieved a latency speedup ratio of 2.7x-3.5x, doubled throughput, while +maintaining the distribution of the generated text. + +
+
+
+
+
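+ A simplified greedy draft-then-verify loop for intuition: a cheap draft model
+proposes k tokens and the target model keeps the longest agreeing prefix.
+EAGLE drafts at the feature level and preserves the sampling distribution;
+this toy version, with invented integer "tokens", only shows the control flow.
+
+```python
+def speculative_step(prefix, draft_next, target_next, k=4):
+    ctx, drafted = list(prefix), []
+    for _ in range(k):                 # cheap sequential proposals
+        token = draft_next(ctx)
+        drafted.append(token)
+        ctx.append(token)
+    ctx, accepted = list(prefix), []
+    for token in drafted:              # one batched target pass in practice
+        expected = target_next(ctx)
+        accepted.append(expected)
+        ctx.append(expected)
+        if expected != token:          # first mismatch ends the accepted run
+            break
+    return prefix + accepted
+
+draft = lambda ctx: ctx[-1] + 1
+target = lambda ctx: ctx[-1] + 1 if len(ctx) < 6 else 0
+print(speculative_step([1, 2, 3], draft, target))  # [1, 2, 3, 4, 5, 6, 0]
+```
+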
+ + ♻ ☆ Language Models are Super Mario: Absorbing Abilities from Homologous + Models as a Free Lunch + + +
+ In this paper, we unveil that Language Models (LMs) can acquire new +capabilities by assimilating parameters from homologous models without +retraining or GPUs. We first introduce DARE to set most delta parameters (i.e., +the disparity between fine-tuned and pre-trained parameters) to zeros without +affecting the abilities of Supervised Fine-Tuning (SFT) LMs, which randomly +Drops delta parameters with a ratio p And REscales the remaining ones by 1/(1 - +p) to approximate the original embeddings. Then, we use DARE as a versatile +plug-and-play technique to sparsify delta parameters of multiple SFT homologous +models for mitigating parameter interference and merge them into a single model +by parameter fusing. We experiment with encoder- and decoder-based LMs, showing +that: (1) SFT delta parameter value ranges are typically small (within 0.005) +with extreme redundancy, and DARE can effortlessly eliminate 90% or even 99% of +them. (2) DARE can merge multiple task-specific LMs into one LM with diverse +capabilities. For instance, the amalgamation of WizardLM and WizardMath +significantly enhances the GSM8K zero-shot accuracy of WizardLM from 2.2 to +66.3, retaining the instruction-following proficiency while surpassing +WizardMath's 64.2 performance. Our merged LM also ranks first among models with +7 billion parameters on the Open LLM Leaderboard. + +
+
+ comment: 24 pages, 21 figures +
+
+
+
+
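+ A direct sketch of the drop-and-rescale operation as the abstract states it:
+zero a fraction p of the delta parameters (fine-tuned minus pre-trained) and
+rescale the survivors by 1/(1 - p), then fuse several sparsified deltas onto
+the base weights. The uniform merge weights are an assumption for the example.
+
+```python
+import torch
+
+def dare(delta: torch.Tensor, p: float) -> torch.Tensor:
+    mask = torch.bernoulli(torch.full_like(delta, 1.0 - p))  # keep w.p. 1 - p
+    return delta * mask / (1.0 - p)                          # unbiased rescale
+
+def merge(base: torch.Tensor, deltas, p=0.9, weights=None):
+    weights = weights or [1.0 / len(deltas)] * len(deltas)
+    merged = base.clone()
+    for w, d in zip(weights, deltas):
+        merged += w * dare(d, p)      # sparsified deltas rarely collide
+    return merged
+
+base = torch.zeros(5)
+d1, d2 = torch.randn(5) * 0.005, torch.randn(5) * 0.005  # small SFT deltas
+print(merge(base, [d1, d2]))
+```
+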
+ + ♻ ☆ A decoder-only foundation model for time-series forecasting + + +
+ Motivated by recent advances in large language models for Natural Language +Processing (NLP), we design a time-series foundation model for forecasting +whose out-of-the-box zero-shot performance on a variety of public datasets +comes close to the accuracy of state-of-the-art supervised forecasting models +for each individual dataset. Our model is based on pretraining a +patched-decoder style attention model on a large time-series corpus, and can +work well across different forecasting history lengths, prediction lengths and +temporal granularities. + +
+
+
+
+
+ + ♻ ☆ Can Language Models Be Tricked by Language Illusions? Easier with + Syntax, Harder with Semantics + + +
+ Language models (LMs) have been argued to overlap substantially with human +beings in grammaticality judgment tasks. But when humans systematically make +errors in language processing, should we expect LMs to behave like cognitive +models of language and mimic human behavior? We answer this question by +investigating LMs' more subtle judgments associated with "language illusions" +-- sentences that are vague in meaning, implausible, or ungrammatical but +receive unexpectedly high acceptability judgments by humans. We looked at three +illusions: the comparative illusion (e.g. "More people have been to Russia than +I have"), the depth-charge illusion (e.g. "No head injury is too trivial to be +ignored"), and the negative polarity item (NPI) illusion (e.g. "The hunter who +no villager believed to be trustworthy will ever shoot a bear"). We found that +probabilities represented by LMs were more likely to align with human judgments +of being "tricked" by the NPI illusion which examines a structural dependency, +compared to the comparative and the depth-charge illusions which require +sophisticated semantic understanding. No single LM or metric yielded results +that are entirely consistent with human behavior. Ultimately, we show that LMs +are limited both in their construal as cognitive models of human language +processing and in their capacity to recognize nuanced but critical information +in complicated language materials. + +
+
+ comment: Accepted by The SIGNLL Conference on Computational Natural Language + Learning 2023 +
+
+
+
+
+ + ♻ ☆ Deductive Beam Search: Decoding Deducible Rationale for Chain-of-Thought + Reasoning + + +
+ Recent advancements have significantly augmented the reasoning capabilities +of Large Language Models (LLMs) through various methodologies, especially +chain-of-thought (CoT) reasoning. However, previous methods fail to address +reasoning errors in intermediate steps, leading to accumulative errors. In this +paper, we propose Deductive Beam Search (DBS), which seamlessly integrates CoT +and deductive reasoning with step-wise beam search for LLMs. Our approach +deploys a verifier, verifying the deducibility of a reasoning step and its +premises, thus alleviating the error accumulation. Furthermore, we introduce a +scalable and labor-free data construction method to amplify our model's +verification capabilities. Extensive experiments demonstrate that our approach +significantly enhances the base performance of LLMs of various scales (7B, 13B, +70B, and ChatGPT) across 8 reasoning datasets from 3 diverse reasoning genres, +including arithmetic, commonsense, and symbolic. Moreover, our analysis proves +DBS's capability of detecting diverse and subtle reasoning errors and +robustness on different model scales. + +
+
+
+
+
+ + ♻ ☆ Gaining Wisdom from Setbacks: Aligning Large Language Models via Mistake + Analysis ICLR 2024 + + +
+ The rapid development of large language models (LLMs) has not only provided
+numerous opportunities but also presented significant challenges. This becomes
+particularly evident when LLMs generate harmful or toxic content, whether
+unintentionally or as a result of intentional inducement. Existing alignment
+methods usually direct LLMs toward favorable outcomes by utilizing
+human-annotated, flawless instruction-response pairs. Conversely, this study
+proposes a novel alignment technique based on mistake analysis, which
+deliberately exposes LLMs to erroneous content so that they learn the reasons
+for mistakes and how to avoid them. In this case, mistakes are repurposed into
+valuable data for alignment, effectively helping to avoid the production of
+erroneous responses. Without external models or human annotations, our method
+leverages a model's intrinsic ability to discern undesirable mistakes and
+improves the safety of its generated responses. Experimental results reveal
+that our method outperforms existing alignment approaches in enhancing model
+safety while maintaining overall utility.
+
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ REFINER: Reasoning Feedback on Intermediate Representations EACL 2024 + + +
+ Language models (LMs) have recently shown remarkable performance on reasoning +tasks by explicitly generating intermediate inferences, e.g., chain-of-thought +prompting. However, these intermediate inference steps may be inappropriate +deductions from the initial context and lead to incorrect final predictions. +Here we introduce REFINER, a framework for finetuning LMs to explicitly +generate intermediate reasoning steps while interacting with a critic model +that provides automated feedback on the reasoning. Specifically, the critic +provides structured feedback that the reasoning LM uses to iteratively improve +its intermediate arguments. Empirical evaluations of REFINER on three diverse +reasoning tasks show significant improvements over baseline LMs of comparable +scale. Furthermore, when using GPT-3.5 or ChatGPT as the reasoner, the trained +critic significantly improves reasoning without finetuning the reasoner. +Finally, our critic model is trained without expensive human-in-the-loop data +but can be substituted with humans at inference time. + +
+
+ comment: Accepted at EACL 2024 +
+
+
+
+
+ + ♻ ☆ Jina Embeddings 2: 8192-Token General-Purpose Text Embeddings for Long + Documents + + +
+ Text embedding models have emerged as powerful tools for transforming +sentences into fixed-sized feature vectors that encapsulate semantic +information. While these models are essential for tasks like information +retrieval, semantic clustering, and text re-ranking, most existing open-source +models, especially those built on architectures like BERT, struggle to +represent lengthy documents and often resort to truncation. One common approach +to mitigate this challenge involves splitting documents into smaller paragraphs +for embedding. However, this strategy results in a much larger set of vectors, +consequently leading to increased memory consumption and computationally +intensive vector searches with elevated latency. + To address these challenges, we introduce Jina Embeddings 2, an open-source +text embedding model capable of accommodating up to 8192 tokens. This model is +designed to transcend the conventional 512-token limit and adeptly process long +documents. Jina Embeddings 2 not only achieves state-of-the-art performance on +a range of embedding-related tasks in the MTEB benchmark but also matches the +performance of OpenAI's proprietary ada-002 model. Additionally, our +experiments indicate that an extended context can enhance performance in tasks +such as NarrativeQA. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Rethinking STS and NLI in Large Language Models + + +
+ Recent years have seen the rise of large language models (LLMs), where
+practitioners use task-specific prompts; this has been shown to be effective
+for a variety of tasks. However, when applied to semantic textual similarity
+(STS) and natural language inference (NLI), the effectiveness of LLMs turns
+out to be limited by low-resource domain accuracy, model overconfidence, and
+difficulty capturing the disagreements between human judgements. With this in
+mind, here we try to rethink STS and NLI in the era of LLMs. We first evaluate
+the performance of STS and NLI in the clinical/biomedical domain, and then we
+assess LLMs' predictive confidence and their capability of capturing
+collective human opinions. We find that these old problems still need to be
+properly addressed in the era of LLMs.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2212.13138 by other authors +
+
+
+
+
+ + ♻ ☆ Does DetectGPT Fully Utilize Perturbation? Selective Perturbation on + Model-Based Contrastive Learning Detector would be Better + + +
+ The burgeoning capabilities of large language models (LLMs) have raised
+growing concerns about abuse. DetectGPT, a zero-shot metric-based unsupervised
+machine-generated text detector, first introduced perturbation and showed
+great performance improvement. However, DetectGPT's random perturbation
+strategy might introduce noise, limiting the distinguishability and further
+performance improvements. Moreover, its logit regression module relies on
+setting a threshold, which harms the generalizability and applicability to
+individual or small-batch inputs. Hence, we propose a novel detector, Pecola,
+which uses a selective perturbation strategy to relieve the information loss
+caused by random masking, and multi-pair contrastive learning to capture the
+implicit pattern information during perturbation, facilitating few-shot
+performance. Experiments show that Pecola outperforms the SOTA method by 1.20%
+in accuracy on average on four public datasets. We further analyze the
+effectiveness, robustness, and generalization of our perturbation method.
+
+
+
+
+
+ + ♻ ☆ Generating Benchmarks for Factuality Evaluation of Language Models + + +
+ Before deploying a language model (LM) within a given domain, it is important +to measure its tendency to generate factually incorrect information in that +domain. Existing methods for factuality evaluation of LLM generation focus on +facts sampled from the LM itself, and thus do not control the set of evaluated +facts and might under-represent domain specific or rare facts. We propose +FACTOR: Factual Assessment via Corpus TransfORmation, a scalable approach for +evaluating LM factuality. FACTOR automatically transforms a factual corpus of +interest into a benchmark evaluating an LM's propensity to generate true facts +from the corpus vs. similar but incorrect statements. We use our framework to +create three benchmarks: Wiki-FACTOR, News-FACTOR and Expert-FACTOR. We show +that: (i) our benchmark scores increase with model size and improve when the LM +is augmented with retrieval; (ii) benchmark score and perplexity do not always +agree on model ranking; (iii) when perplexity and benchmark score disagree, the +latter better reflects factuality in open-ended generation, as measured by +human annotators. We make our data and code publicly available in +https://github.com/AI21Labs/factor. + +
+
+
+
+
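+ A hedged sketch of the scoring idea behind a FACTOR-style benchmark: the
+model is credited when it assigns the true statement a higher likelihood than
+similar but incorrect variants. The model choice and the statements are
+illustrative; the actual benchmarks are corpus-derived.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+def log_likelihood(text):
+    ids = tok(text, return_tensors="pt").input_ids
+    with torch.no_grad():
+        out = model(ids, labels=ids)              # mean NLL over shifted tokens
+    return -out.loss.item() * (ids.shape[1] - 1)  # total log-probability
+
+true = "Paris is the capital of France."
+fakes = ["Lyon is the capital of France.", "Paris is the capital of Spain."]
+print(log_likelihood(true) > max(log_likelihood(f) for f in fakes))
+```
+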
+ + ♻ ☆ SSCFormer: Push the Limit of Chunk-wise Conformer for Streaming ASR + Using Sequentially Sampled Chunks and Chunked Causal Convolution SP + + +
+ Currently, chunk-wise schemes are often used to make Automatic Speech
+Recognition (ASR) models support streaming deployment. However, existing
+approaches are unable to capture the global context, lack support for parallel
+training, or exhibit quadratic complexity for the computation of multi-head
+self-attention (MHSA). On the other side, causal convolution, which uses no
+future context, has become the de facto module in streaming Conformer. In this
+paper, we propose SSCFormer to push the limit of chunk-wise Conformer for
+streaming ASR using the following two techniques: 1) A novel cross-chunk
+context generation method, named the Sequential Sampling Chunk (SSC) scheme,
+which re-partitions regularly partitioned chunks to facilitate efficient
+long-term contextual interaction within local chunks. 2) The Chunked Causal
+Convolution (C2Conv), designed to concurrently capture the left context and
+chunk-wise future context. Evaluations on AISHELL-1 show that an End-to-End
+(E2E) CER of 5.33% can be achieved, which even outperforms a strong
+time-restricted baseline, U2. Moreover, the chunk-wise MHSA computation in our
+model enables it to train with a large batch size and perform inference with
+linear complexity.
+
+
+ comment: This manuscript has been accepted by SPL +
+
+
+
+
+ + ♻ ☆ Detection of ChatGPT Fake Science with the xFakeBibs Learning Algorithm + + +
+ ChatGPT is becoming a new reality. In this paper, we demonstrate a method
+for distinguishing ChatGPT-generated publications from those produced by
+scientists. The objective of this work is to introduce a newly designed
+supervised network-driven algorithm that illustrates how to predict
+machine-generated content. The premise is that ChatGPT content exhibits
+behavior that is distinctive and can be set apart from scientific articles.
+The algorithm was trained and tested on three disease-specific publications,
+with each model constructed from 100 abstracts. Additionally, the algorithm
+underwent k-Folds calibration (depending on the availability of the data) to
+establish a lower-upper bound range of acceptance. The network training model
+of ChatGPT showed a lower number of nodes and a higher number of edges when
+compared with models of real article abstracts. The algorithm was executed in
+single-mode to predict the class of one type of dataset at a time and achieved
+>94%. It was also executed in multi-mode on mixed documents of ChatGPT and
+PubMed abstracts. The algorithm remarkably predicted real articles with a
+precision of 100% and, on rare occasions, 96%-98%. However, ChatGPT content
+was often misclassified as real publications, with up to 88% accuracy in all
+datasets of the three diseases. Our results also showed that the year of
+publication of articles mixed with ChatGPT-generated content may be a factor
+in detecting the correct class, where the older the publication, the better
+the prediction.
+
+
+ comment: 14 pages, 6 figures, 4 tables, 2 algorithms +
+
+
+
+
+ + ♻ ☆ Large Malaysian Language Model Based on Mistral for Enhanced Local + Language Understanding + + +
+ In this paper, we present significant advancements in the pretraining of
+Mistral 7B, a large-scale language model, using a dataset of 32.6 GB,
+equivalent to 1.1 billion tokens. We explore the impact of extending the
+context length, releasing models with context lengths of 4096 and 32768
+tokens, and further refining performance with a specialized instruction-tuned
+model with a 16384-token context length, which we call Malaysian Mistral.
+ Our experiments demonstrate the efficacy of continued pretraining and the
+influence of extended context lengths on Mistral 7B's language understanding
+capabilities. Additionally, we release a model specifically tuned with a
+16384-token context length instruction set, showcasing its potential for
+capturing nuanced language intricacies.
+ Furthermore, our research contributes to the benchmarking of Malaysian
+Mistral against prominent language models, including ChatGPT3.5 and Claude 2.
+We present compelling results indicating Malaysian Mistral's superior
+performance on the Tatabahasa (Malay grammar) test set, particularly when
+fine-tuned with instructions.
+ All models are released at
+https://huggingface.co/collections/mesolitica/malaysian-mistral-7b-6528f2ec825f4bba46c1700c
+
+
+
+
+
+ + ♻ ☆ CharSpan: Utilizing Lexical Similarity to Enable Zero-Shot Machine + Translation for Extremely Low-resource Languages + + +
+ We address the task of machine translation (MT) from an extremely
+low-resource language (ELRL) to English by leveraging cross-lingual transfer
+from a 'closely-related' high-resource language (HRL). The development of an
+MT system for an ELRL is challenging because these languages typically lack
+parallel corpora and monolingual corpora, and their representations are absent
+from large multilingual language models. Many ELRLs share lexical similarities
+with some HRLs, which presents a novel modeling opportunity. However, existing
+subword-based neural MT models do not explicitly harness this lexical
+similarity, as they only implicitly align the HRL and ELRL latent embedding
+spaces. To overcome this limitation, we propose a novel approach, CharSpan,
+based on injecting 'character-span noise augmentation' into the training data
+of the HRL. This serves as a regularization technique, making the model more
+robust to 'lexical divergences' between the HRL and ELRL, thus facilitating
+effective cross-lingual transfer. Our method significantly outperformed strong
+baselines in zero-shot settings on closely related HRL and ELRL pairs from
+three diverse language families, emerging as the state-of-the-art model for
+ELRLs.
+
+
+
+
+
+ + ♻ ☆ Improving Speaker Diarization using Semantic Information: Joint Pairwise + Constraints Propagation + + +
+ Speaker diarization has gained considerable attention within the speech
+processing research community. Mainstream speaker diarization relies primarily
+on speakers' voice characteristics extracted from acoustic signals and often
+overlooks the potential of semantic information. Considering that speech
+signals can efficiently convey the content of a speech, it is in our interest
+to fully exploit these semantic cues using language models. In this work, we
+propose a novel approach to effectively leverage semantic information in
+clustering-based speaker diarization systems. Firstly, we introduce spoken
+language understanding modules to extract speaker-related semantic information
+and use this information to construct pairwise constraints. Secondly, we
+present a novel framework to integrate these constraints into the speaker
+diarization pipeline, enhancing the performance of the entire system.
+Extensive experiments conducted on the public dataset demonstrate the
+consistent superiority of our proposed approach over acoustic-only speaker
+diarization systems.
+
+
+
+
+
+ + ♻ ☆ Language Models Understand Numbers, at Least Partially + + +
+ Large language models (LLMs) have exhibited impressive competence in various +tasks, but their opaque internal mechanisms hinder their use in mathematical +problems. In this paper, we study a fundamental question: whether language +models understand numbers, a basic element in math. Based on an assumption that +LLMs should be capable of compressing numbers in their hidden states to solve +mathematical problems, we construct a synthetic dataset comprising addition +problems and utilize linear probes to read out input numbers from the hidden +states. Experimental results support the existence of compressed numbers in +LLMs. However, it is difficult to precisely reconstruct the original numbers, +indicating that the compression process may not be lossless. Further +experiments show that LLMs can utilize encoded numbers to perform arithmetic +computations, and the computational ability scales up with the model size. Our +preliminary research suggests that LLMs exhibit a partial understanding of +numbers, offering insights for future investigations about the models' +mathematical capability. + +
+
+
+
+
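+ A minimal sketch of the probing setup: fit a linear regressor that reads an
+input number back out of a hidden-state vector. Synthetic random features
+stand in for real LLM hidden states here, so the probe illustrates the method
+only, not the paper's results.
+
+```python
+import numpy as np
+from sklearn.linear_model import Ridge
+
+rng = np.random.default_rng(0)
+numbers = rng.integers(0, 1000, size=2000).astype(float)
+projection = rng.normal(size=(1, 256))
+# fake "hidden states" that linearly encode the number, plus noise
+hidden = numbers[:, None] @ projection + rng.normal(scale=5.0, size=(2000, 256))
+
+probe = Ridge(alpha=1.0).fit(hidden[:1500], numbers[:1500])  # train the probe
+print("held-out R^2:", probe.score(hidden[1500:], numbers[1500:]))
+```
+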
+ + ♻ ☆ Structured Probabilistic Coding AAAI 2024 + + +
+ This paper presents a new supervised representation learning framework,
+namely structured probabilistic coding (SPC), to learn compact and informative
+representations from input related to the target task. SPC is an encoder-only
+probabilistic coding technique with structured regularization from the target
+space. It can enhance the generalization ability of pre-trained language
+models for better language understanding. Specifically, our probabilistic
+coding simultaneously performs information encoding and task prediction in one
+module to more fully utilize the effective information from input data. It
+uses variational inference in the output space to reduce randomness and
+uncertainty. Besides, to better control the learning process of probabilistic
+representations, a structured regularization is proposed to promote uniformity
+across classes in the latent space. With the regularization term, SPC can
+preserve the Gaussian structure of the latent code and achieve better coverage
+of the hidden space with class uniformity. Experimental results on 12 natural
+language understanding tasks demonstrate that SPC effectively improves the
+performance of pre-trained language models for classification and regression.
+Extensive experiments show that SPC can enhance the generalization capability,
+robustness to label noise, and clustering quality of output representations.
+
+
+ comment: 11 pages, accepted by AAAI 2024 (Oral) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 41 + +
+
+
+ + ☆ Key-Graph Transformer for Image Restoration + + +
+ While it is crucial to capture global information for effective image +restoration (IR), integrating such cues into transformer-based methods becomes +computationally expensive, especially with high input resolution. Furthermore, +the self-attention mechanism in transformers is prone to considering +unnecessary global cues from unrelated objects or regions, introducing +computational inefficiencies. In response to these challenges, we introduce the +Key-Graph Transformer (KGT) in this paper. Specifically, KGT views patch +features as graph nodes. The proposed Key-Graph Constructor efficiently forms a +sparse yet representative Key-Graph by selectively connecting essential nodes +instead of all the nodes. Then the proposed Key-Graph Attention is conducted +under the guidance of the Key-Graph only among selected nodes with linear +computational complexity within each window. Extensive experiments across 6 IR +tasks confirm the proposed KGT's state-of-the-art performance, showcasing +advancements both quantitatively and qualitatively. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ DiffEditor: Boosting Accuracy and Flexibility on Diffusion-based Image + Editing + + +
+ Large-scale Text-to-Image (T2I) diffusion models have revolutionized image +generation over the last few years. Although owning diverse and high-quality +generation capabilities, translating these abilities to fine-grained image +editing remains challenging. In this paper, we propose DiffEditor to rectify +two weaknesses in existing diffusion-based image editing: (1) in complex +scenarios, editing results often lack editing accuracy and exhibit unexpected +artifacts; (2) lack of flexibility to harmonize editing operations, e.g., +imagine new content. In our solution, we introduce image prompts in +fine-grained image editing, cooperating with the text prompt to better describe +the editing content. To increase the flexibility while maintaining content +consistency, we locally combine stochastic differential equation (SDE) into the +ordinary differential equation (ODE) sampling. In addition, we incorporate +regional score-based gradient guidance and a time travel strategy into the +diffusion sampling, further improving the editing quality. Extensive +experiments demonstrate that our method can efficiently achieve +state-of-the-art performance on various fine-grained image editing tasks, +including editing within a single image (e.g., object moving, resizing, and +content dragging) and across images (e.g., appearance replacing and object +pasting). Our source code is released at +https://github.com/MC-E/DragonDiffusion. + +
+
+
+
+
+ + ☆ Spatio-temporal Prompting Network for Robust Video Feature Extraction + + +
+ Frame quality deterioration is one of the main challenges in the field of +video understanding. To compensate for the information loss caused by +deteriorated frames, recent approaches exploit transformer-based integration +modules to obtain spatio-temporal information. However, these integration +modules are heavy and complex. Furthermore, each integration module is +specifically tailored for its target task, making it difficult to generalise to +multiple tasks. In this paper, we present a neat and unified framework, called +Spatio-Temporal Prompting Network (STPN). It can efficiently extract robust and +accurate video features by dynamically adjusting the input features in the +backbone network. Specifically, STPN predicts several video prompts containing +spatio-temporal information of neighbour frames. Then, these video prompts are +prepended to the patch embeddings of the current frame as the updated input for +video feature extraction. Moreover, STPN is easy to generalise to various video +tasks because it does not contain task-specific modules. Without bells and +whistles, STPN achieves state-of-the-art performance on three widely-used +datasets for different video understanding tasks, i.e., ImageNetVID for video +object detection, YouTubeVIS for video instance segmentation, and GOT-10k for +visual object tracking. Code is available at +https://github.com/guanxiongsun/vfe.pytorch. + +
+
+
+
+
+ + ☆ Generalizable Entity Grounding via Assistance of Large Language Model + + +
+ In this work, we propose a novel approach to densely ground visual entities +from a long caption. We leverage a large multimodal model (LMM) to extract +semantic nouns, a class-agnostic segmentation model to generate entity-level +segmentation, and the proposed multi-modal feature fusion module to associate +each semantic noun with its corresponding segmentation mask. Additionally, we +introduce a strategy of encoding entity segmentation masks into a colormap, +enabling the preservation of fine-grained predictions from features of +high-resolution masks. This approach allows us to extract visual features from +low-resolution images using the CLIP vision encoder in the LMM, which is more +computationally efficient than existing approaches that use an additional +encoder for high-resolution images. Our comprehensive experiments demonstrate +the superiority of our method, outperforming state-of-the-art techniques on +three tasks, including panoptic narrative grounding, referring expression +segmentation, and panoptic segmentation. + +
+
+
+
+
+ + ☆ DeSparsify: Adversarial Attack Against Token Sparsification Mechanisms + in Vision Transformers + + +
+ Vision transformers have contributed greatly to advancements in the computer
+vision domain, demonstrating state-of-the-art performance in diverse tasks
+(e.g., image classification, object detection). However, their high
+computational requirements grow quadratically with the number of tokens used.
+Token sparsification techniques have been proposed to address this issue.
+These techniques employ an input-dependent strategy, in which uninformative
+tokens are discarded from the computation pipeline, improving the model's
+efficiency. However, their dynamism and average-case assumption make them
+vulnerable to a new threat vector: carefully crafted adversarial examples
+capable of fooling the sparsification mechanism, resulting in worst-case
+performance. In this paper, we present DeSparsify, an attack targeting the
+availability of vision transformers that use token sparsification mechanisms.
+The attack aims to exhaust the operating system's resources while remaining
+stealthy. Our evaluation demonstrates the attack's effectiveness on three
+token sparsification techniques and examines the attack's transferability
+between them and its effect on GPU resources. To mitigate the impact of the
+attack, we propose various countermeasures.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Classification of Tennis Actions Using Deep Learning + + +
+ Recent advances in deep learning make it possible to identify specific
+events in videos with greater precision. This has great relevance in sports
+like tennis, e.g., to automatically collect game statistics or to replay
+actions of specific interest for game strategy or player improvement. In this
+paper, we investigate the potential and the challenges of using deep learning
+to classify tennis actions. Three models of different sizes, all based on the
+deep learning architecture SlowFast, were trained and evaluated on the
+academic tennis dataset THETIS. The best models achieve a generalization
+accuracy of 74%, demonstrating good performance for tennis action
+classification. We provide an error analysis for the best model and pinpoint
+directions for improvement of tennis datasets in general. We discuss the
+limitations of the dataset, general limitations of currently available public
+tennis datasets, and the future steps needed to make progress.
+
+
+
 comment: 5 figures
+
+
+
+
+ + ☆ LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal + Language Model + + +
+ The revolutionary capabilities of large language models (LLMs) have paved the +way for multimodal large language models (MLLMs) and fostered diverse +applications across various specialized domains. In the remote sensing (RS) +field, however, the diverse geographical landscapes and varied objects in RS +imagery are not adequately considered in recent MLLM endeavors. To bridge this +gap, we construct a large-scale RS image-text dataset, LHRS-Align, and an +informative RS-specific instruction dataset, LHRS-Instruct, leveraging the +extensive volunteered geographic information (VGI) and globally available RS +images. Building on this foundation, we introduce LHRS-Bot, an MLLM tailored +for RS image understanding through a novel multi-level vision-language +alignment strategy and a curriculum learning method. Comprehensive experiments +demonstrate that LHRS-Bot exhibits a profound understanding of RS images and +the ability to perform nuanced reasoning within the RS domain. + +
+
+ comment: 32 pages, 8 figures. Github https://github.com/NJU-LHRS/LHRS-Bot +
+
+
+
+
+ + ☆ Knowledge Generation for Zero-shot Knowledge-based VQA EACL 2023 + + +
+ Previous solutions to knowledge-based visual question answering (K-VQA) +retrieve knowledge from external knowledge bases and use supervised learning to +train the K-VQA model. Recently, pre-trained LLMs have been used as both a +knowledge source and a zero-shot QA model for K-VQA and demonstrated promising +results. However, these recent methods do not explicitly show the knowledge +needed to answer the questions and thus lack interpretability. Inspired by +recent work on knowledge generation from LLMs for text-based QA, in this work +we propose and test a similar knowledge-generation-based K-VQA method, which +first generates knowledge from an LLM and then incorporates the generated +knowledge for K-VQA in a zero-shot manner. We evaluate our method on two K-VQA +benchmarks and find that it performs better than previous zero-shot +K-VQA methods and that the generated knowledge is generally relevant and helpful. + +
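A generate-then-answer pipeline of this kind can be sketched in a few lines. The prompts and the `llm(prompt)` completion function below are illustrative assumptions, not the paper's exact setup:

```python
def knowledge_based_vqa(llm, caption: str, question: str) -> str:
    """Two-step zero-shot K-VQA: generate knowledge, then answer with it."""
    # Step 1: make the needed knowledge explicit -- this intermediate text
    # is what gives the method its interpretability.
    knowledge = llm(
        f"Image: {caption}\nQuestion: {question}\n"
        "List the background knowledge needed to answer the question:"
    )
    # Step 2: answer zero-shot, conditioning on the generated knowledge.
    answer = llm(
        f"Knowledge: {knowledge}\nImage: {caption}\n"
        f"Question: {question}\nAnswer briefly:"
    )
    return answer
```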
+
+ comment: accepted as Findings in EACL 2023 +
+
+
+
+
+ + ☆ Embedding Non-Distortive Cancelable Face Template Generation + + +
+ Biometric authentication systems are crucial for security, but developing +them involves various complexities, including privacy, security, and achieving +high accuracy without directly storing raw biometric data. We +introduce an innovative image distortion technique that makes facial images +unrecognizable to the eye but still identifiable by any custom embedding neural +network model. Using the proposed approach, we test the reliability of +biometric recognition networks by determining the maximum image distortion that +does not change the predicted identity. Through experiments on the MNIST and LFW +datasets, we assess its effectiveness and compare it using traditional +comparison metrics. + +
+
+
+
+
+ + ☆ SIMPL: A Simple and Efficient Multi-agent Motion Prediction Baseline for + Autonomous Driving + + +
+ This paper presents a Simple and effIcient Motion Prediction baseLine (SIMPL) +for autonomous vehicles. Unlike conventional agent-centric methods with high +accuracy but repetitive computations and scene-centric methods with compromised +accuracy and generalizability, SIMPL delivers real-time, accurate motion +predictions for all relevant traffic participants. To achieve improvements in +both accuracy and inference speed, we propose a compact and efficient global +feature fusion module that performs directed message passing in a symmetric +manner, enabling the network to forecast future motion for all road users in a +single feed-forward pass and mitigating accuracy loss caused by viewpoint +shifting. Additionally, we investigate the continuous trajectory +parameterization using Bernstein basis polynomials in trajectory decoding, +allowing evaluations of states and their higher-order derivatives at any +desired time point, which is valuable for downstream planning tasks. As a +strong baseline, SIMPL exhibits highly competitive performance on Argoverse 1 & +2 motion forecasting benchmarks compared with other state-of-the-art methods. +Furthermore, its lightweight design and low inference latency make SIMPL highly +extensible and promising for real-world onboard deployment. We open-source the +code at https://github.com/HKUST-Aerial-Robotics/SIMPL. + +
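To illustrate the Bernstein-basis decoding, the sketch below evaluates positions and velocities of a Bezier trajectory at arbitrary time points; the control-point count and horizon are illustrative, not SIMPL's actual configuration:

```python
import numpy as np
from scipy.special import comb

def bezier_eval(ctrl, t, horizon=1.0):
    """ctrl: (n+1, 2) control points; t: time points in [0, horizon]."""
    n = len(ctrl) - 1
    s = np.asarray(t) / horizon  # normalize time to [0, 1]
    basis = np.stack([comb(n, i) * s**i * (1 - s)**(n - i)
                      for i in range(n + 1)], axis=-1)
    return basis @ ctrl  # positions at the requested times

def bezier_velocity(ctrl, t, horizon=1.0):
    """First derivative: a Bezier of degree n-1 over scaled point differences."""
    n = len(ctrl) - 1
    d_ctrl = n * np.diff(ctrl, axis=0) / horizon
    return bezier_eval(d_ctrl, t, horizon)

ctrl = np.array([[0.0, 0.0], [1.0, 0.5], [2.0, 0.5], [3.0, 0.0]])
print(bezier_eval(ctrl, [0.0, 0.5, 1.0]))  # positions along the trajectory
print(bezier_velocity(ctrl, [0.5]))        # velocity at mid-horizon
```

Because the basis is polynomial, any higher-order derivative (acceleration, jerk) follows the same differencing pattern, which is what makes the representation convenient for downstream planners.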
+
+ comment: Code is available at https://github.com/HKUST-Aerial-Robotics/SIMPL +
+
+
+
+
+ + ☆ Deep Supervision by Gaussian Pseudo-label-based Morphological Attention + for Abdominal Aorta Segmentation in Non-Contrast CTs + + +
+ The segmentation of the abdominal aorta in non-contrast CT images is a +non-trivial task for computer-assisted endovascular navigation, particularly in +scenarios where contrast agents are unsuitable. While state-of-the-art deep +learning segmentation models have been proposed recently for this task, they +are trained on manually annotated strong labels. However, the inherent +ambiguity in the boundary of the aorta in non-contrast CT may undermine the +reliability of strong labels, leading to potential overfitting risks. This +paper introduces a Gaussian-based pseudo label, integrated into conventional +deep learning models through deep supervision, to achieve Morphological +Attention (MA) enhancement. As the Gaussian pseudo label retains the +morphological features of the aorta without explicitly representing its +boundary distribution, we suggest that it preserves aortic morphology during +training while mitigating the negative impact of ambiguous boundaries, reducing +the risk of overfitting. It is integrated into various 2D/3D deep learning models +and validated on our local dataset of 30 non-contrast CT volumes comprising +5749 CT slices. The results underscore the effectiveness of MA in preserving +the morphological characteristics of the aorta and addressing overfitting +concerns, thereby enhancing the performance of the models. + +
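One plausible construction of such a Gaussian pseudo-label (a hedged reading of the idea, not necessarily the paper's exact formula) weights each pixel by its distance to the annotated structure, so morphology is preserved while the hard boundary is softened:

```python
import numpy as np
from scipy.ndimage import distance_transform_edt

def gaussian_pseudo_label(mask: np.ndarray, sigma: float = 3.0) -> np.ndarray:
    """mask: binary (H, W) array of the aorta; returns a soft label in [0, 1].
    Pixels inside the mask get 1; outside, the label decays with distance,
    so no hard (possibly unreliable) boundary is imposed on the model."""
    dist_outside = distance_transform_edt(mask == 0)  # 0 everywhere inside
    return np.exp(-(dist_outside ** 2) / (2 * sigma ** 2))
```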
+
+ comment: Accepted by 21st IEEE International Symposium on Biomedical Imaging +
+
+
+
+
+ + ☆ GeReA: Question-Aware Prompt Captions for Knowledge-based Visual + Question Answering + + +
+ Knowledge-based visual question answering (VQA) requires world knowledge +beyond the image for an accurate answer. Recently, instead of extra knowledge +bases, a large language model (LLM) like GPT-3 is activated as an implicit +knowledge engine to jointly acquire and reason over the necessary knowledge for +answering by converting images into textual information (e.g., captions and +answer candidates). However, such conversion may introduce irrelevant +information, which causes the LLM to misinterpret images and ignore visual +details crucial for accurate knowledge. We argue that a multimodal large language +model (MLLM) is a better implicit knowledge engine than the LLM for its +superior capability of visual understanding. Despite this, how to activate the +capacity of an MLLM as the implicit knowledge engine has not been explored yet. +Therefore, we propose GeReA, a generate-reason framework that prompts an MLLM +like InstructBLIP with question-relevant vision and language information to +generate knowledge-relevant descriptions and reasons over those descriptions for +knowledge-based VQA. Specifically, the question-relevant image regions and +question-specific manual prompts are encoded in the MLLM to generate the +knowledge-relevant descriptions, referred to as question-aware prompt captions. +After that, the question-aware prompt captions, image-question pair, and +similar samples are sent into the multi-modal reasoning model to learn a joint +knowledge-image-question representation for answer prediction. GeReA unlocks +the use of an MLLM as the implicit knowledge engine, surpassing all previous +state-of-the-art methods on the OK-VQA and A-OKVQA datasets, with test accuracies +of 66.5% and 63.3%, respectively. Our code will be released at +https://github.com/Upper9527/GeReA. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Point Cloud Matters: Rethinking the Impact of Different Observation + Spaces on Robot Learning + + +
+ In this study, we explore the influence of different observation spaces on +robot learning, focusing on three predominant modalities: RGB, RGB-D, and point +cloud. Through extensive experimentation on over 17 varied contact-rich +manipulation tasks, conducted across two benchmarks and simulators, we have +observed a notable trend: point cloud-based methods, even those with the +simplest designs, frequently surpass their RGB and RGB-D counterparts in +performance. This remains consistent in both scenarios: training from scratch +and utilizing pretraining. Furthermore, our findings indicate that point cloud +observations lead to improved policy zero-shot generalization with respect to +various geometric and visual cues, including camera viewpoints, lighting +conditions, noise levels and background appearance. The outcomes suggest that +the 3D point cloud is a valuable observation modality for intricate robotic tasks. +We will open-source all our code and checkpoints, hoping that our insights can +help design more generalizable and robust robotic models. + +
+
+
+
+
+ + ☆ Fully Differentiable Correlation-driven 2D/3D Registration for X-ray to + CT Image Fusion + + +
+ Image-based rigid 2D/3D registration is a critical technique for fluoroscopic +guided surgical interventions. In recent years, learning-based fully +differentiable methods have produced beneficial outcomes, but their feature +extraction and gradient flow transmission still lack controllability +and interpretability. To alleviate these problems, in this work, we propose a +novel fully differentiable correlation-driven network using a dual-branch +CNN-transformer encoder which enables the network to extract and separate +low-frequency global features from high-frequency local features. A +correlation-driven loss is further proposed for low-frequency feature and +high-frequency feature decomposition based on embedded information. Besides, a +training strategy that learns to approximate a convex-shape similarity function +is applied in our work. We test our approach on an in-house dataset and show that +it outperforms both existing fully differentiable learning-based registration +approaches and the conventional optimization-based baseline. + +
+
+ comment: ISBI 2024 +
+
+
+
+
+ + ☆ VM-UNet: Vision Mamba UNet for Medical Image Segmentation + + +
+ In the realm of medical image segmentation, both CNN-based and +Transformer-based models have been extensively explored. However, CNNs exhibit +limitations in long-range modeling capabilities, whereas Transformers are +hampered by their quadratic computational complexity. Recently, State Space +Models (SSMs), exemplified by Mamba, have emerged as a promising approach. They +not only excel in modeling long-range interactions but also maintain a linear +computational complexity. In this paper, leveraging state space models, we +propose a U-shape architecture model for medical image segmentation, named +Vision Mamba UNet (VM-UNet). Specifically, the Visual State Space (VSS) block +is introduced as the foundation block to capture extensive contextual +information, and an asymmetrical encoder-decoder structure is constructed. We +conduct comprehensive experiments on the ISIC17, ISIC18, and Synapse datasets, +and the results indicate that VM-UNet performs competitively in medical image +segmentation tasks. To the best of our knowledge, this is the first medical image +segmentation model built purely on SSMs. We aim to +establish a baseline and provide valuable insights for the future development +of more efficient and effective SSM-based segmentation systems. Our code is +available at https://github.com/JCruan519/VM-UNet. + +
+
+ comment: 12 pages, 2 figures, 3 tables. Work in progress +
+
+
+
+
+ + ☆ Deep Spectral Improvement for Unsupervised Image Instance Segmentation + + +
+ Deep spectral methods reframe the image decomposition process as a graph +partitioning task by extracting features using self-supervised learning and +utilizing the Laplacian of the affinity matrix to obtain eigensegments. +However, instance segmentation has received less attention compared to other +tasks within the context of deep spectral methods. This paper addresses the +fact that not all channels of the feature map extracted from a self-supervised +backbone contain sufficient information for instance segmentation purposes. In +fact, some channels are noisy and hinder the accuracy of the task. To overcome +this issue, this paper proposes two channel reduction modules: Noise Channel +Reduction (NCR) and Deviation-based Channel Reduction (DCR). The NCR retains +channels with lower entropy, as they are less likely to be noisy, while DCR +prunes channels with low standard deviation, as they lack sufficient +information for effective instance segmentation. Furthermore, the paper +demonstrates that the dot product, commonly used in deep spectral methods, is +not suitable for instance segmentation due to its sensitivity to feature map +values, potentially leading to incorrect instance segments. A new similarity +metric called Bray-Curtis over Chebyshev (BoC) is proposed to address this +issue. It takes into account the distribution of features in addition to their +values, providing a more robust similarity measure for instance segmentation. +Quantitative and qualitative results on the Youtube-VIS2019 dataset highlight +the improvements achieved by the proposed channel reduction methods and the use +of BoC instead of the conventional dot product for creating the affinity +matrix. These improvements are observed in terms of mean Intersection over +Union and extracted instance segments, demonstrating enhanced instance +segmentation performance. The code is available at: +https://github.com/farnooshar/SpecUnIIS + +
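The two channel-reduction ideas can be sketched directly from their descriptions; the entropy binning and keep/prune thresholds below are illustrative choices, not the paper's tuned values:

```python
import numpy as np

def channel_entropy(ch: np.ndarray, bins: int = 32) -> float:
    """Shannon entropy of a single channel's value histogram."""
    hist, _ = np.histogram(ch, bins=bins)
    p = hist / hist.sum()
    p = p[p > 0]
    return float(-(p * np.log(p)).sum())

def reduce_channels(feat: np.ndarray, keep_ratio: float = 0.5) -> np.ndarray:
    """feat: (C, H, W) feature map.
    NCR: keep low-entropy (less noisy) channels;
    DCR: of those, prune low-std (uninformative) channels."""
    C = feat.shape[0]
    ent = np.array([channel_entropy(feat[c]) for c in range(C)])
    ncr_idx = np.argsort(ent)[: int(C * keep_ratio)]   # lowest entropy first
    std = feat[ncr_idx].std(axis=(1, 2))
    dcr_idx = ncr_idx[std > std.mean()]                # drop low-deviation channels
    return feat[dcr_idx]
```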
+
+ comment: 11 pages, 13 figures and 5 tables +
+
+
+
+
+ + ☆ AI Art Neural Constellation: Revealing the Collective and Contrastive + State of AI-Generated and Human Art + + +
+ Discovering how a random signal can be turned into artistic expressions of +aesthetic and conceptual richness is a basis for the recent success of +generative machine learning as a medium of art creation. To understand the new +artistic medium better, we conduct a comprehensive analysis to position +AI-generated art within the context of human art heritage. Our comparative +analysis is based on an extensive dataset, dubbed ``ArtConstellation,'' +consisting of annotations about art principles, likability, and emotions for +6,000 WikiArt and 3,200 AI-generated artworks. After training various +state-of-the-art generative models, art samples are produced and compared with +WikiArt data on the last hidden layer of a deep-CNN trained for style +classification. We examined the various art principles to interpret the +neural representations and used them to derive comparative knowledge +about human and AI-generated art. A key finding in the semantic analysis is +that AI-generated artworks are visually related to the art-principle concepts of +modern period art made in 1800-2000. In addition, through Out-Of-Distribution +(OOD) and In-Distribution (ID) detection in CLIP space, we find that +AI-generated artworks are ID to human art when they depict landscapes and +geometric abstract figures, while detected as OOD when the machine art consists +of deformed and twisted figures. We observe that machine-generated art is +uniquely characterized by incomplete and reduced figuration. Lastly, we +conducted a human survey about emotional experience; color composition and +familiar subjects are the key factors of likability and emotions in art +appreciation. We propose our full methodology and collected dataset as an +analytical framework to contrast human and AI-generated art, which we refer to +as ``ArtNeuralConstellation''. Code is available at: +https://github.com/faixan-khan/ArtNeuralConstellation + +
+
+
+
+
+ + ☆ BECLR: Batch Enhanced Contrastive Few-Shot Learning ICLR 2024 + + +
+ Learning quickly from very few labeled samples is a fundamental attribute +that separates machines and humans in the era of deep representation learning. +Unsupervised few-shot learning (U-FSL) aspires to bridge this gap by discarding +the reliance on annotations at training time. Intrigued by the success of +contrastive learning approaches in the realm of U-FSL, we structurally approach +their shortcomings in both pretraining and downstream inference stages. We +propose a novel Dynamic Clustered mEmory (DyCE) module to promote a highly +separable latent representation space for enhancing positive sampling at the +pretraining phase and infusing implicit class-level insights into unsupervised +contrastive learning. We then tackle the somewhat overlooked yet critical +issue of sample bias at the few-shot inference stage. We propose an iterative +Optimal Transport-based distribution Alignment (OpTA) strategy and demonstrate +that it efficiently addresses the problem, especially in low-shot scenarios +where FSL approaches suffer the most from sample bias. We further discuss how +DyCE and OpTA are two intertwined pieces of a novel end-to-end approach (which we +coin BECLR), constructively magnifying each other's impact. Finally, we present +a suite of extensive quantitative and qualitative experiments to +corroborate that BECLR sets a new state-of-the-art across ALL existing U-FSL +benchmarks (to the best of our knowledge), and significantly outperforms the +best of the current baselines (codebase available at: +https://github.com/stypoumic/BECLR). + +
+
+ comment: ICLR 2024 Spotlight Presentation +
+
+
+
+
+ + ☆ Uncertainty-Aware Perceiver + + +
+ The Perceiver makes few architectural assumptions about the relationship +among its inputs, with quadratic scalability in its memory and computation time. +Indeed, the Perceiver model outpaces or is competitive with ResNet-50 and ViT +in terms of accuracy to some degree. However, the Perceiver does not take +predictive uncertainty and calibration into account. Its performance is also +reported on only three datasets, three models, one evaluation +metric, and one hyper-parameter setting. Worst of all, the Perceiver's relative +performance improvement against other models is marginal. Furthermore, its +reduction of architectural priors is not substantial and is not equivalent to +model quality. I therefore devised five variants of the Perceiver, the +Uncertainty-Aware Perceivers, which obtain uncertainty estimates, and measured +their performance on three metrics. In experiments on CIFAR-10 and CIFAR-100, +the Uncertainty-Aware Perceivers achieve considerable performance improvements +compared to the Perceiver. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Learning Mutual Excitation for Hand-to-Hand and Human-to-Human + Interaction Recognition + + +
+ Recognizing interactive actions, including hand-to-hand interaction and +human-to-human interaction, has attracted increasing attention for various +applications in the field of video analysis and human-robot interaction. +Considering the success of graph convolution in modeling topology-aware +features from skeleton data, recent methods commonly operate graph convolution +on separate entities and use late fusion for interactive action recognition, +which can barely model the mutual semantic relationships between pairwise +entities. To this end, we propose a mutual excitation graph convolutional +network (me-GCN) by stacking mutual excitation graph convolution (me-GC) +layers. Specifically, me-GC uses a mutual topology excitation module to first +extract adjacency matrices from individual entities and then adaptively model +the mutual constraints between them. Moreover, me-GC extends the above idea and +further uses a mutual feature excitation module to extract and merge deep +features from pairwise entities. Compared with graph convolution, our proposed +me-GC gradually learns mutual information in each layer and each stage of graph +convolution operations. Extensive experiments on a challenging hand-to-hand +interaction dataset, i.e., the Assembly101 dataset, and two large-scale +human-to-human interaction datasets, i.e., NTU60-Interaction and +NTU120-Interaction, consistently verify the superiority of our proposed method, +which outperforms the state-of-the-art GCN-based and Transformer-based methods. + +
+
+
+
+
+ + ☆ Exploiting Low-level Representations for Ultra-Fast Road Segmentation + + +
+ Achieving both real-time performance and accuracy on embedded platforms has long +been the goal of road segmentation methods, and many lightweight networks have +been proposed to this end. However, they ignore the fact that roads are "stuff" +(background or environmental elements) rather than "things" (specific +identifiable objects), which inspires us to explore the feasibility of +representing roads with low-level instead of high-level features. Surprisingly, +we find that the primary stage of mainstream network models is sufficient to +represent most pixels of the road for segmentation. Motivated by this, we +propose a Low-level Feature Dominated Road Segmentation network (LFD-RoadSeg). +Specifically, LFD-RoadSeg employs a bilateral structure. The spatial detail +branch is first designed to extract low-level feature representation for the +road by the first stage of ResNet-18. To suppress texture-less regions mistaken +for the road in the low-level feature, the context semantic branch is then +designed to extract the context feature in a fast manner. To this end, in the +second branch, we asymmetrically downsample the input image and design an +aggregation module to achieve comparable receptive fields to the third stage of +ResNet-18 but with less time consumption. Finally, to segment the road from the +low-level feature, a selective fusion module is proposed to calculate +pixel-wise attention between the low-level representation and context feature, +and suppress the non-road low-level response by this attention. On KITTI-Road, +LFD-RoadSeg achieves a maximum F1-measure (MaxF) of 95.21% and an average +precision of 93.71%, while reaching 238 FPS on a single TITAN Xp and 54 FPS on +a Jetson TX2, all with a compact model size of just 936k parameters. The source +code is available at https://github.com/zhouhuan-hust/LFD-RoadSeg. + +
+
+ comment: 11 pages, 7 figures, IEEE TITS +
+
+
+
+
+ + ☆ Physics-Inspired Degradation Models for Hyperspectral Image Fusion + + +
+ The fusion of a low-spatial-resolution hyperspectral image (LR-HSI) with a +high-spatial-resolution multispectral image (HR-MSI) has garnered increasing +research interest. However, most fusion methods solely focus on the fusion +algorithm itself and overlook the degradation models, which results in +unsatisfactory performance in practical scenarios. To fill this gap, we propose +physics-inspired degradation models (PIDM) to model the degradation of LR-HSI +and HR-MSI, which comprises a spatial degradation network (SpaDN) and a +spectral degradation network (SpeDN). SpaDN and SpeDN are designed based on two +insights. First, we employ spatial warping and spectral modulation operations +to simulate lens aberrations, thereby introducing non-uniformity into the +spatial and spectral degradation processes. Second, we utilize asymmetric +downsampling and parallel downsampling operations to separately reduce the +spatial and spectral resolutions of the images, thus ensuring the matching of +spatial and spectral degradation processes with specific physical +characteristics. Once SpaDN and SpeDN are established, we adopt a +self-supervised training strategy to optimize the network parameters and +provide a plug-and-play solution for fusion methods. Comprehensive experiments +demonstrate that our proposed PIDM can boost the fusion performance of existing +fusion methods in practical scenarios. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of YOLO Architectures in Computer Vision: From + YOLOv1 to YOLOv8 and YOLO-NAS + + +
+ YOLO has become a central real-time object detection system for robotics, +driverless cars, and video monitoring applications. We present a comprehensive +analysis of YOLO's evolution, examining the innovations and contributions in +each iteration from the original YOLO up to YOLOv8, YOLO-NAS, and YOLO with +Transformers. We start by describing the standard metrics and postprocessing; +then, we discuss the major changes in network architecture and training tricks +for each model. Finally, we summarize the essential lessons from YOLO's +development and provide a perspective on its future, highlighting potential +research directions to enhance real-time object detection systems. + +
+
+ comment: 36 pages, 21 figures, 4 tables, published in Machine Learning and + Knowledge Extraction. This version contains the last changes made to the + published version +
+
+
+
+
+ + ♻ ☆ Sample as You Infer: Predictive Coding With Langevin Dynamics + + +
+ We present a novel algorithm for parameter learning in generic deep +generative models that builds upon the predictive coding (PC) framework of +computational neuroscience. Our approach modifies the standard PC algorithm to +bring performance on par with, and exceeding, that obtained from standard variational +auto-encoder (VAE) training. By injecting Gaussian noise into the PC inference +procedure we re-envision it as an overdamped Langevin sampling, which +facilitates optimisation with respect to a tight evidence lower bound (ELBO). +We improve the resultant encoder-free training method by incorporating an +encoder network to provide an amortised warm-start to our Langevin sampling and +test three different objectives for doing so. Finally, to increase robustness +to the sampling step size and reduce sensitivity to curvature, we validate a +lightweight and easily computable form of preconditioning, inspired by Riemann +Manifold Langevin and adaptive optimizers from the SGD literature. We compare +against VAEs by training like-for-like generative models using our technique +against those trained with standard reparameterisation-trick-based ELBOs. We +observe our method outperforms or matches performance across a number of +metrics, including sample quality, while converging in a fraction of the number +of SGD training iterations. + +
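The core update is a single overdamped Langevin step on the latents. A minimal sketch, assuming a unit-variance Gaussian decoder and a standard normal prior (the paper additionally uses amortised warm-starts and preconditioning, which this sketch omits):

```python
import torch

def langevin_step(z, x, decoder, eta=1e-3):
    """One overdamped Langevin update:
    z <- z + eta * grad log p(x, z) + sqrt(2 * eta) * noise."""
    z = z.detach().requires_grad_(True)
    x_hat = decoder(z)
    # log p(x|z) under a unit-variance Gaussian, plus log p(z) = N(0, I).
    log_joint = -0.5 * ((x - x_hat) ** 2).sum() - 0.5 * (z ** 2).sum()
    grad = torch.autograd.grad(log_joint, z)[0]
    return (z + eta * grad + (2 * eta) ** 0.5 * torch.randn_like(z)).detach()
```

Without the noise term this reduces to plain gradient-based PC inference; adding it turns inference into sampling, which is what makes the tight-ELBO interpretation possible.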
+
+ comment: FID values updated to use a fixed 50,000 samples for all experiments + - Jeffrey's divergence now consistently best performing. DINOv2-based metrics + removed due to inconsistency of results - and since not industry standard. + Multiple beta values tested in Fig 4. Theta LR for VAEs; beta and inf LR for + LPC now tuned for results. Figure 5B updated; curves now correspond to + results in Table 1 +
+
+
+
+
+ + ♻ ☆ Learning Any-View 6DoF Robotic Grasping in Cluttered Scenes via Neural + Surface Rendering + + +
+ A significant challenge for real-world robotic manipulation is the effective +6DoF grasping of objects in cluttered scenes from any single viewpoint without +the need for additional scene exploration. This work reinterprets grasping as +rendering and introduces NeuGraspNet, a novel method for 6DoF grasp detection +that leverages advances in neural volumetric representations and surface +rendering. It encodes the interaction between a robot's end-effector and an +object's surface by jointly learning to render the local object surface and +learning grasping functions in a shared feature space. The approach uses global +(scene-level) features for grasp generation and local (grasp-level) neural +surface features for grasp evaluation. This enables effective, fully implicit +6DoF grasp quality prediction, even in partially observed scenes. NeuGraspNet +operates on random viewpoints, common in mobile manipulation scenarios, and +outperforms existing implicit and semi-implicit grasping methods. The +real-world applicability of the method has been demonstrated with a mobile +manipulator robot, grasping in open, cluttered spaces. Project website at +https://sites.google.com/view/neugraspnet + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Adaptive aggregation of Monte Carlo augmented decomposed filters for + efficient group-equivariant convolutional neural network + + +
+ Group-equivariant convolutional neural networks (G-CNN) heavily rely on +parameter sharing to increase CNN's data efficiency and performance. However, +the parameter-sharing strategy greatly increases the computational burden for +each added parameter, which hampers its application to deep neural network +models. In this paper, we address these problems by proposing a +non-parameter-sharing approach for group equivariant neural networks. The +proposed methods adaptively aggregate a diverse range of filters by a weighted +sum of stochastically augmented decomposed filters. We give a theoretical proof +of how group equivariance can be achieved by our methods. Our method +applies to both continuous and discrete groups, where the augmentation is +implemented using Monte Carlo sampling and bootstrap resampling, respectively. +We demonstrate that our methods serve as an efficient extension of standard +CNNs. Experiments on group equivariance tests show how our methods can achieve +superior performance to parameter-sharing group equivariant networks. +Experiments on image classification and image denoising tasks show that in +certain scenarios, with a suitable set of filter bases, our method helps +improve the performance of standard CNNs and build efficient lightweight image +denoising networks. The code will be available at +https://github.com/ZhaoWenzhao/MCG_CNN. + +
+
+
+
+
+ + ♻ ☆ Fine-tuning can cripple your foundation model; preserving features may + be the solution + + +
+ Pre-trained foundation models, due to their enormous capacity and exposure to +vast amounts of data during pre-training, are known to have learned plenty of +real-world concepts. An important step in making these pre-trained models +extremely effective on downstream tasks is to fine-tune them on related +datasets. While various fine-tuning methods have been devised and have been +shown to be highly effective, we observe that a fine-tuned model's ability to +recognize concepts on tasks $\textit{different}$ from the downstream one is +reduced significantly compared to its pre-trained counterpart. This is an +undesirable effect of fine-tuning as a substantial amount of resources was used +to learn these pre-trained concepts in the first place. We call this phenomenon +"concept forgetting" and via experiments show that most end-to-end fine-tuning +approaches suffer heavily from this side effect. To this end, we propose a +simple fix to this problem by designing a new fine-tuning method called +$\textit{LDIFS}$ (short for $\ell_2$ distance in feature space) that, while +learning new concepts related to the downstream task, allows a model to +preserve its pre-trained knowledge as well. Through extensive experiments on 10 +fine-tuning tasks we show that LDIFS significantly reduces concept forgetting. +Additionally, we show that LDIFS is highly effective in performing continual +fine-tuning on a sequence of tasks as well, in comparison with both fine-tuning +as well as continual learning baselines. + +
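Reading LDIFS literally as an l2 penalty between current and pre-trained features suggests an objective like the following sketch; the feature layer, the assumed `(features, logits)` model interface, and the weight `lam` are illustrative assumptions, not the paper's exact choices:

```python
import copy
import torch
import torch.nn.functional as F

def ldifs_loss(model, frozen_model, x, y, lam=0.1):
    """Task loss plus l2 distance between current and pre-trained features,
    so new concepts are learned without drifting far from pre-trained ones."""
    feats, logits = model(x)            # assumed: model returns (features, logits)
    with torch.no_grad():
        feats0, _ = frozen_model(x)     # frozen snapshot of the pre-trained model
    return F.cross_entropy(logits, y) + lam * (feats - feats0).pow(2).mean()

# frozen_model = copy.deepcopy(model).eval()  # snapshot taken before fine-tuning
```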
+
+
+
+
+ + ♻ ☆ Towards a Better Understanding of the Computer Vision Research Community + in Africa + + +
+ Computer vision is a broad field of study that encompasses different tasks +(e.g., object detection). Although computer vision is relevant to African +communities in various applications, computer vision research remains +under-explored on the continent, constituting only 0.06% of top-tier +publications in the last ten years. In this paper, our goal is to gain a better +understanding of the computer vision research conducted in Africa and provide +pointers on whether there is equity in research or not. We do this through an +empirical analysis of the African computer vision publications that are Scopus +indexed, where we collect around 63,000 publications over the period 2012-2022. +We first study the opportunities available for African institutions to publish +in top-tier computer vision venues. We show that African publishing trends in +top-tier venues over the years do not exhibit consistent growth, unlike other +continents such as North America or Asia. Moreover, we study all computer +vision publications beyond top-tier venues in different African regions to find +that mainly Northern and Southern Africa are publishing in computer vision, with +68.5% and 15.9% of publications, respectively. Nonetheless, we highlight that both +Eastern and Western Africa are exhibiting a promising increase, with the last +two years closing the gap with Southern Africa. Additionally, we study the +collaboration patterns in these publications to find that most of these exhibit +international collaborations rather than African ones. We also show that most +of these publications include an African author who is a key contributor as +the first or last author. Finally, we present the most recurring keywords in +computer vision publications per African region. + +
+
+ comment: Published in EAAMO'23 under ACM License. This work is part of our + African computer vision grassroots research in Ro'ya - CV4Africa, + https://ro-ya-cv4africa.github.io/homepage/ +
+
+
+
+
+ + ♻ ☆ MobileARLoc: On-device Robust Absolute Localisation for Pervasive + Markerless Mobile AR + + +
+ Recent years have seen significant improvement in absolute camera pose +estimation, paving the way for pervasive markerless Augmented Reality (AR). +However, accurate absolute pose estimation techniques are computation- and +storage-heavy, requiring computation offloading. As such, AR systems rely on +visual-inertial odometry (VIO) to track the device's relative pose between +requests to the server. However, VIO suffers from drift, requiring frequent +absolute repositioning. This paper introduces MobileARLoc, a new framework for +on-device large-scale markerless mobile AR that combines an absolute pose +regressor (APR) with a local VIO tracking system. APRs provide fast on-device +pose estimation at the cost of reduced accuracy. +To address APR accuracy and reduce VIO drift, MobileARLoc creates a feedback +loop where VIO pose estimations refine the APR predictions. The VIO system +identifies reliable APR predictions, which are then used to compensate for +the VIO drift. We comprehensively evaluate MobileARLoc through dataset +simulations. MobileARLoc halves the error compared to the underlying APR and +achieves fast (80 ms) on-device inference speed. + +
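The feedback loop can be pictured as a consistency gate: an APR fix is accepted only when it agrees with the displacement VIO has accumulated since the last accepted pose. A toy sketch with an illustrative threshold (not the paper's actual criterion):

```python
import numpy as np

def fuse_pose(apr_pos, vio_delta, last_accepted_pos, thresh_m=0.5):
    """apr_pos: absolute position from the pose regressor;
    vio_delta: VIO-estimated displacement since the last accepted pose."""
    predicted = last_accepted_pos + vio_delta
    if np.linalg.norm(apr_pos - predicted) < thresh_m:
        return apr_pos   # consistent APR fix: accept it and reset the drift
    return predicted     # reject the APR outlier, keep the (drifting) VIO pose
```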
+
+ comment: Accepted for publication at the 3rd edition of the Pervasive and + Resource-Constrained AI (PerConAI) workshop (co-located with PerCom 2024). + This article supersedes arXiv:2308.05394 +
+
+
+
+
+ + ♻ ☆ Explore Synergistic Interaction Across Frames for Interactive Video + Object Segmentation + + +
+ Interactive Video Object Segmentation (iVOS) is a challenging task that +requires real-time human-computer interaction. To improve the user experience, +it is important to consider the user's input habits, segmentation quality, +running time, and memory consumption. However, existing methods compromise user +experience with a single input mode and slow running speed. Specifically, these +methods only allow the user to interact with one single frame, which limits the +expression of the user's intent. To overcome these limitations and better align +with people's usage habits, we propose a framework that can accept multiple +frames simultaneously and explore synergistic interaction across frames (SIAF). +Concretely, we design the Across-Frame Interaction (AFI) module, which enables users +to annotate different objects freely on multiple frames. The AFI module +migrates scribble information among multiple interactive frames and generates +multi-frame masks. Additionally, we employ the id-queried mechanism to process +multiple objects in batches. Furthermore, for more efficient propagation and a +lightweight model, we design a truncated re-propagation strategy to replace the +previous multi-round fusion module, which employs an across-round memory that +stores important interaction information. Our SwinB-SIAF achieves new +state-of-the-art performance on DAVIS 2017 (89.6%, J&F@60). Moreover, our +R50-SIAF is more than 3x faster than the state-of-the-art competitor under +challenging multi-object scenarios. + +
+
+
+
+
+ + ♻ ☆ A Survey on African Computer Vision Datasets, Topics and Researchers + + +
+ Computer vision encompasses a range of tasks such as object detection, +semantic segmentation, and 3D reconstruction. Despite its relevance to African +communities, research in this field within Africa represents only 0.06% of +top-tier publications over the past decade. This study undertakes a thorough +analysis of 63,000 Scopus-indexed computer vision publications from Africa, +spanning from 2012 to 2022. The aim is to provide a survey of African computer +vision topics, datasets and researchers. A key aspect of our study is the +identification and categorization of African Computer Vision datasets using +large language models that automatically parse abstracts of these publications. +We also provide a compilation of unofficial African Computer Vision datasets +distributed through challenges or data hosting platforms, and provide a full +taxonomy of dataset categories. Our survey also pinpoints computer vision +topic trends specific to different African regions, indicating their unique +focus areas. Additionally, we carried out an extensive survey to capture the +views of African researchers on the current state of computer vision research +in the continent and the structural barriers they believe need urgent +attention. In conclusion, this study catalogs and categorizes Computer Vision +datasets and topics contributed or initiated by African institutions and +identifies barriers to publishing in top-tier Computer Vision venues. This +survey underscores the importance of encouraging African researchers and +institutions in advancing computer vision research in the continent. It also +stresses the need for research topics to be more aligned with the needs of +African communities. + +
+
+ comment: Under Review, Community Work of Ro'ya Grassroots, + https://ro-ya-cv4africa.github.io/homepage/. Journal extension of our + conference paper, arXiv admin note: text overlap with arXiv:2305.06773 +
+
+
+
+
+ + ♻ ☆ BanglaNet: Bangla Handwritten Character Recognition using Ensembling of + Convolutional Neural Network + + +
+ Handwritten character recognition is a crucial task because of its abundant +applications. The recognition task of Bangla handwritten characters is +especially challenging because of the cursive nature of Bangla characters and +the presence of compound characters with more than one way of writing. In this +paper, a classification model based on the ensembling of several Convolutional +Neural Networks (CNNs), namely BanglaNet, is proposed to classify Bangla basic +characters, compound characters, numerals, and modifiers. Three different +models based on the idea of state-of-the-art CNN models like Inception, ResNet, +and DenseNet have been trained with both augmented and non-augmented inputs. +Finally, all these models are averaged or ensembled to obtain the final model. +Rigorous experimentation on three benchmark Bangla handwritten characters +datasets, namely, CMATERdb, BanglaLekha-Isolated, and Ekush has exhibited +significant recognition accuracies compared to some recent CNN-based research. +The top-1 recognition accuracies obtained are 98.40%, 97.65%, and 97.32%, and +the top-3 accuracies are 99.79%, 99.74%, and 99.56% for the CMATERdb, +BanglaLekha-Isolated, and Ekush datasets respectively. + +
+
+
+
+
+ + ♻ ☆ TransMed: Large Language Models Enhance Vision Transformer for + Biomedical Image Classification + + +
+ Few-shot learning has been studied to adapt models to tasks with very few +samples. It holds profound significance, particularly in clinical tasks, due to +the high annotation cost of medical images. Several works have explored +few-shot learning on medical images, yet they still require a large number of +medical images for pre-training models to gain domain-specific priors. Vision +foundation models recently have achieved remarkable success in natural images. +Hence, adapting rapidly advancing vision foundation models from natural images +to few-shot clinical tasks holds great promise. MedFMC has recently organized a +challenge to shed more light on this topic at NeurIPS 2023. In this work, we +present our challenge solution. We observe that a simple variant of fine-tuning +with partial freezing shows remarkable performance. Empirical evidence +demonstrates that this approach could outperform various common fine-tuning +methods under limited sample sizes. Additionally, we explore enhanced +utilization of semantic supervision to boost performance. We propose a novel +approach that contextualizes labels via large language models (LLMs). Our +findings reveal that the context generated by LLMs significantly enhances the +discrimination of semantic embeddings for similar categories, resulting in a +notable performance improvement of 3%-5% in 1-shot settings compared to +commonly employed one-hot labels and other semantic supervision methods. Our +solution secures the 1st place in the MedFMC challenge. + +
+
+
+
+
+ + ♻ ☆ Cyto R-CNN and CytoNuke Dataset: Towards reliable whole-cell + segmentation in bright-field histological images + + +
+ Background: Cell segmentation in bright-field histological slides is a +crucial topic in medical image analysis. Having access to accurate segmentation +allows researchers to examine the relationship between cellular morphology and +clinical observations. Unfortunately, most segmentation methods known today are +limited to nuclei and cannot segment the cytoplasm. + Material & Methods: We present a new network architecture, Cyto R-CNN, that is +able to accurately segment whole cells (with both the nucleus and the +cytoplasm) in bright-field images. We also present a new dataset, CytoNuke, +consisting of several thousand manual annotations of head and neck squamous +cell carcinoma cells. Utilizing this dataset, we compared the performance of +Cyto R-CNN to other popular cell segmentation algorithms, including QuPath's +built-in algorithm, StarDist and Cellpose. To evaluate segmentation +performance, we calculated AP50, AP75 and measured 17 morphological and +staining-related features for all detected cells. We compared these +measurements to the gold standard of manual segmentation using the +Kolmogorov-Smirnov test. + Results: Cyto R-CNN achieved an AP50 of 58.65% and an AP75 of 11.56% in +whole-cell segmentation, outperforming all other methods (QuPath +$19.46/0.91\%$; StarDist $45.33/2.32\%$; Cellpose $31.85/5.61\%$). Cell +features derived from Cyto R-CNN showed the best agreement to the gold standard +($\bar{D} = 0.15$) outperforming QuPath ($\bar{D} = 0.22$), StarDist ($\bar{D} += 0.25$) and Cellpose ($\bar{D} = 0.23$). + Conclusion: Our newly proposed Cyto R-CNN architecture outperforms current +algorithms in whole-cell segmentation while providing more reliable cell +measurements than any other model. This could improve digital pathology +workflows, potentially leading to improved diagnosis. Moreover, our published +dataset can be used to develop further models in the future. + +
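The distribution-agreement comparison maps directly onto a two-sample Kolmogorov-Smirnov test per feature; the sketch below uses synthetic numbers, not the paper's measurements:

```python
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
manual_area = rng.normal(300, 40, size=500)      # gold-standard cell areas
predicted_area = rng.normal(310, 45, size=480)   # model-derived cell areas

stat, p_value = ks_2samp(manual_area, predicted_area)
print(f"KS statistic D = {stat:.3f}, p = {p_value:.3g}")
# A smaller D means the predicted feature distribution agrees more closely
# with manual segmentation; averaging D over features gives a summary score.
```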
+
+
+
+
+ + ♻ ☆ Exploring Homogeneous and Heterogeneous Consistent Label Associations + for Unsupervised Visible-Infrared Person ReID + + +
+ Unsupervised visible-infrared person re-identification (USL-VI-ReID) aims to +retrieve pedestrian images of the same identity from different modalities +without annotations. While prior work focuses on establishing cross-modality +pseudo-label associations to bridge the modality-gap, they ignore maintaining +the instance-level homogeneous and heterogeneous consistency in pseudo-label +space, resulting in coarse associations. In response, we introduce a +Modality-Unified Label Transfer (MULT) module that simultaneously accounts for +both homogeneous and heterogeneous fine-grained instance-level structures, +yielding high-quality cross-modality label associations. It models both +homogeneous and heterogeneous affinities, leveraging them to define the +inconsistency for the pseudo-labels and then minimize it, leading to +pseudo-labels that maintain alignment across modalities and consistency within +intra-modality structures. Additionally, a straightforward plug-and-play Online +Cross-memory Label Refinement (OCLR) module is proposed to further mitigate the +impact of noisy pseudo-labels while simultaneously aligning different +modalities, coupled with a Modality-Invariant Representation Learning (MIRL) +framework. Experiments demonstrate that our proposed method outperforms +existing USL-VI-ReID methods, highlighting the superiority of our MULT in +comparison to other cross-modality association methods. The code will be +available. + +
+
+
+
+
+ + ♻ ☆ Multi-Robot Relative Pose Estimation in SE(2) with Observability + Analysis: A Comparison of Extended Kalman Filtering and Robust Pose Graph + Optimization + + +
+ In this study, we address multi-robot localization issues, with a specific +focus on cooperative localization and observability analysis of relative pose +estimation. Cooperative localization involves enhancing each robot's +information through a communication network and message passing. If odometry +data from a target robot can be transmitted to the ego robot, observability of +their relative pose estimation can be achieved through range-only or +bearing-only measurements, provided both robots have non-zero linear +velocities. In cases where odometry data from a target robot are not directly +transmitted but estimated by the ego robot, both range and bearing measurements +are necessary to ensure observability of relative pose estimation. For +ROS/Gazebo simulations, we explore four sensing and communication structures. +We compare extended Kalman filtering (EKF) and pose graph optimization (PGO) +estimation using different robust loss functions (filtering and smoothing with +varying batch sizes of sliding windows) in terms of estimation accuracy. In +hardware experiments, two Turtlebot3 equipped with UWB modules are used for +real-world inter-robot relative pose estimation, applying both EKF and PGO and +comparing their performance. + +
+
+ comment: 20 pages, 21 figures +
+
+
+
+
+ + ♻ ☆ Beyond Prototypes: Semantic Anchor Regularization for Better + Representation Learning AAAI 2024 + + +
+ One of the ultimate goals of representation learning is to achieve +compactness within a class and well-separability between classes. Many +outstanding metric-based and prototype-based methods following the +Expectation-Maximization paradigm have been proposed for this objective. +However, they inevitably introduce biases into the learning process, +particularly with long-tail distributed training data. In this paper, we reveal +that the class prototype need not be derived from training +features, and propose a novel perspective in which pre-defined class anchors +serve as feature centroids to unidirectionally guide feature learning. +However, the pre-defined anchors may have a large semantic distance from the +pixel features, which prevents them from being directly applied. To address +this issue and generate feature centroids independent of feature learning, a +simple yet effective Semantic Anchor Regularization (SAR) is proposed. SAR +ensures the interclass separability of semantic anchors in the semantic space +by employing a classifier-aware auxiliary cross-entropy loss during training +via disentanglement learning. By pulling the learned features to these semantic +anchors, several advantages can be attained: 1) intra-class compactness and +natural inter-class separability; 2) avoidance of bias or errors induced by +feature learning; and 3) robustness to the long-tailed problem. The +proposed SAR can be used in a plug-and-play manner in existing models. +Extensive experiments demonstrate that SAR performs better than previous +sophisticated prototype-based methods. The implementation is available at +https://github.com/geyanqi/SAR. + +
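The anchor-guided idea can be sketched as a pull loss toward fixed, well-separated class vectors; using random orthonormal anchors and a cosine pull is an illustrative reading, not the paper's exact formulation:

```python
import torch
import torch.nn.functional as F

num_classes, dim = 10, 128
# Rows of a random orthogonal matrix: unit-norm, mutually orthogonal anchors.
anchors = torch.linalg.qr(torch.randn(dim, dim))[0][:num_classes]

def anchor_pull_loss(features, labels):
    """Pull each normalized feature toward its class anchor. Because anchors
    are constants, no bias from (possibly long-tailed) feature statistics
    leaks back into the class centroids."""
    f = F.normalize(features, dim=1)
    return (1.0 - (f * anchors[labels]).sum(dim=1)).mean()  # cosine pull
```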
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ GeoDiffusion: Text-Prompted Geometric Control for Object Detection Data + Generation ICLR 2024 + + +
+ Diffusion models have attracted significant attention due to their remarkable +ability to create content and generate data for tasks like image +classification. However, the usage of diffusion models to generate +high-quality object detection data remains an underexplored area, where not +only image-level perceptual quality but also geometric conditions such as +bounding boxes and camera views are essential. Previous studies have utilized +either copy-paste synthesis or layout-to-image (L2I) generation with +specifically designed modules to encode the semantic layouts. In this paper, we +propose GeoDiffusion, a simple framework that can flexibly translate +various geometric conditions into text prompts and empower pre-trained +text-to-image (T2I) diffusion models for high-quality detection data +generation. Unlike previous L2I methods, our GeoDiffusion is able to encode not +only the bounding boxes but also extra geometric conditions such as camera +views in self-driving scenes. Extensive experiments demonstrate GeoDiffusion +outperforms previous L2I methods while being 4x faster to train. To +the best of our knowledge, this is the first work to adopt diffusion models for +layout-to-image generation with geometric conditions and demonstrate that +L2I-generated images can be beneficial for improving the performance of object +detectors. + +
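Translating geometric conditions into a text prompt might look like the following serialization sketch; the location-token format and coordinate binning are assumptions for illustration, not the paper's exact encoding:

```python
def layout_to_prompt(camera_view: str, boxes) -> str:
    """boxes: list of (category, x0, y0, x1, y1) with coordinates in [0, 1].
    Serializes the camera view and each box into a T2I-friendly prompt."""
    def bin_coord(v: float, bins: int = 256) -> str:
        return f"<loc{int(v * (bins - 1))}>"
    parts = [f"camera view: {camera_view}."]
    for cat, x0, y0, x1, y1 in boxes:
        parts.append(cat + " " + "".join(bin_coord(v) for v in (x0, y0, x1, y1)))
    return " ".join(parts)

print(layout_to_prompt("front", [("car", 0.1, 0.5, 0.4, 0.9)]))
# e.g. "camera view: front. car <loc25><loc127><loc102><loc229>"
```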
+
+ comment: Accepted by ICLR 2024. Project Page: + https://kaichen1998.github.io/projects/geodiffusion/ +
+
+
+
+
+ + ♻ ☆ BiSwift: Bandwidth Orchestrator for Multi-Stream Video Analytics on Edge + + +
+ High-definition (HD) cameras for surveillance and road traffic have +experienced tremendous growth, demanding intensive computation resources for +real-time analytics. Recently, offloading frames from the front-end device to +the back-end edge server has shown great promise. In multi-stream competitive +environments, efficient bandwidth management and proper scheduling are crucial +to ensure both high inference accuracy and high throughput. To achieve this +goal, we propose BiSwift, a bi-level framework that scales concurrent +real-time video analytics via a novel adaptive hybrid codec integrated with +multi-level pipelines, and a global bandwidth controller for multiple video +streams. The lower-level front-back-end collaborative mechanism (called the +adaptive hybrid codec) locally optimizes the accuracy and accelerates +end-to-end video analytics for a single stream. The upper-level scheduler aims +at accuracy fairness among multiple streams via the global bandwidth +controller. Our evaluation shows that BiSwift can perform real-time object +detection on 9 streams with an edge device equipped with only an NVIDIA +RTX3070 (8G) GPU. BiSwift improves accuracy by 10%~21% and delivers +1.2~9x the throughput of state-of-the-art video analytics pipelines. + +
+
+ comment: Accepted by 2024 IEEE INFOCOM +
+
+
+
+
+ + ♻ ☆ LayerAct: Advanced activation mechanism utilizing layer-direction + normalization for CNNs with BatchNorm + + +
+ In this work, we propose a novel activation mechanism aimed at establishing +layer-level activation (LayerAct) functions for CNNs with BatchNorm. These +functions are designed to be more noise-robust compared to existing +element-level activation functions by reducing the layer-level fluctuation of +the activation outputs due to shifts in inputs. Moreover, the LayerAct functions +achieve this noise-robustness independent of the activation's saturation state, +which limits the activation output space and complicates efficient training. We +present an analysis and experiments demonstrating that LayerAct functions +exhibit superior noise-robustness compared to element-level activation +functions, and empirically show that these functions have a zero-like mean +activation. Experimental results with three clean and three out-of-distribution +benchmark datasets for image classification tasks show that LayerAct functions +excel in handling noisy datasets, outperforming element-level activation +functions, while the performance on clean datasets is also superior in most +cases. + +
+
+ comment: 10 pages, 3 figures, 3 tables except appendix +
+
+
+
+
+ + ♻ ☆ AnomalyCLIP: Object-agnostic Prompt Learning for Zero-shot Anomaly + Detection + + +
+ Zero-shot anomaly detection (ZSAD) requires detection models trained using +auxiliary data to detect anomalies without any training sample in a target +dataset. It is a crucial task when training data is not accessible due to +various concerns, e.g., data privacy, yet it is challenging since the models +need to generalize to anomalies across different domains where the appearance +of foreground objects, abnormal regions, and background features, such as +defects/tumors on different products/organs, can vary significantly. Recently +large pre-trained vision-language models (VLMs), such as CLIP, have +demonstrated strong zero-shot recognition ability in various vision tasks, +including anomaly detection. However, their ZSAD performance is weak since the +VLMs focus more on modeling the class semantics of the foreground objects +rather than the abnormality/normality in the images. In this paper, we introduce +a novel approach, namely AnomalyCLIP, to adapt CLIP for accurate ZSAD across +different domains. The key insight of AnomalyCLIP is to learn object-agnostic +text prompts that capture generic normality and abnormality in an image +regardless of its foreground objects. This allows our model to focus on the +abnormal image regions rather than the object semantics, enabling generalized +normality and abnormality recognition on diverse types of objects. Large-scale +experiments on 17 real-world anomaly detection datasets show that AnomalyCLIP +achieves superior zero-shot performance of detecting and segmenting anomalies +in datasets of highly diverse class semantics from various defect inspection +and medical imaging domains. Code will be made available at +https://github.com/zqhang/AnomalyCLIP. + +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Position bias in features + + +
+ The purpose of modeling document relevance for search engines is to rank +better in subsequent searches. Document-specific historical click-through rates +can be important features in a dynamic ranking system that updates as we +accumulate more samples. This paper describes the properties of several such +features, and tests them in controlled experiments. Extending the inverse +propensity weighting method to documents creates an unbiased estimate of +document relevance. This feature can approximate relevance accurately, leading +to near-optimal ranking in ideal circumstances. However, it has high variance +that increases with the degree of position bias. Furthermore, +inaccurate position bias estimation leads to poor performance. Under several +scenarios this feature can perform worse than biased click-through rates. This +paper underscores the need for accurate position bias estimation, and is unique +in suggesting the simultaneous use of biased and unbiased features. + +
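A worked example of the document-level inverse-propensity estimate, with made-up propensities, also shows where the high variance comes from:

```python
import numpy as np

# Impression log for one document (illustrative numbers).
positions = np.array([1, 1, 2, 3, 5, 8])   # rank at which it was shown
clicks    = np.array([1, 0, 1, 0, 0, 1])   # click outcome per impression
prop_by_pos = {1: 1.0, 2: 0.6, 3: 0.4, 5: 0.2, 8: 0.1}  # P(examined | position)
prop = np.array([prop_by_pos[p] for p in positions])

biased_ctr   = clicks.mean()                          # ignores position bias
unbiased_rel = (clicks / prop).sum() / len(clicks)    # IPW relevance estimate

print(f"biased CTR: {biased_ctr:.3f}, IPW relevance: {unbiased_rel:.3f}")
# Variance note: a single click at a low-propensity position contributes
# 1/(n * propensity), which blows up as position bias grows -- exactly the
# high-variance behavior described above.
```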
+
+
+
+
+ + ☆ eXplainable Bayesian Multi-Perspective Generative Retrieval + + +
+ Modern deterministic retrieval pipelines prioritize achieving +state-of-the-art performance but often lack interpretability in +decision-making. These models face challenges in assessing uncertainty, leading +to overconfident predictions. To overcome these limitations, we integrate +uncertainty calibration and interpretability into a retrieval pipeline. +Specifically, we introduce Bayesian methodologies and multi-perspective +retrieval to calibrate uncertainty within a retrieval pipeline. We incorporate +techniques such as LIME and SHAP to analyze the behavior of a black-box +reranker model. The importance scores derived from these explanation +methodologies serve as supplementary relevance scores to enhance the base +reranker model. We evaluate the resulting performance enhancements achieved +through uncertainty calibration and interpretable reranking on Question +Answering and Fact Checking tasks. Our methods demonstrate substantial +performance improvements across three KILT datasets. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ☆ Delivery Optimized Discovery in Behavioral User Segmentation under + Budget Constraint + + +
+ Users' behavioral footprints online enable firms to discover behavior-based +user segments (or, segments) and deliver segment-specific messages to users. +Following the discovery of segments, delivery of messages to users through +preferred media channels like Facebook and Google can be challenging, as only a +portion of users in a behavior segment find a match in a medium, and only a +fraction of those matched actually see the message (exposure). Even +high-quality discovery becomes futile when delivery fails. Many sophisticated +algorithms exist for discovering behavioral segments; however, these ignore the +delivery component. The problem is compounded because (i) the discovery is +performed on the behavior data space in firms' data (e.g., user clicks), while +the delivery is predicated on the static data space (e.g., geo, age) as defined +by media; and (ii) firms work under a budget constraint. We introduce a +stochastic optimization based algorithm for delivery optimized discovery of +behavioral user segmentation and offer new metrics to address the joint +optimization. We leverage optimization under a budget constraint for delivery +combined with a learning-based component for discovery. Extensive experiments +on a public dataset from Google and a proprietary dataset show the +effectiveness of our approach by simultaneously improving delivery metrics, +reducing budget spend and achieving strong predictive performance in discovery. + +
+
+
+
+
+ + ☆ Modified K-means with Cluster Assignment -- Application to COVID-19 Data + + +
+ Text extraction is a highly subjective problem which depends on the dataset +that one is working on and the kind of summarization details that need to be +extracted. All the steps, ranging from preprocessing of the data to the choice +of an optimal model for predictions, depend on the problem and the corpus at +hand. In this paper, we describe a text extraction model whose aim is to +extract word-specific semantic information, such that we can get all related +and meaningful information about that word in a succinct format. This model can +obtain meaningful results and can augment a ubiquitous search model or a +standard clustering or topic modelling algorithm. By utilizing a new +two-cluster assignment technique with the K-means model, we improve the +ontology of the retrieved text. We further apply a vector average damping +technique for flexible movement of clusters. Our experimental results on a +recent COVID-19 corpus show that we obtain good results based on main keywords. + +<br>
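+ A minimal sketch of K-means with damped centroid movement, one reading of +the "vector average damping" idea above; the damping factor and update rule +are assumptions, not the paper's exact formulation:
+```python
+import numpy as np
+
+def damped_kmeans(X, k, damping=0.5, iters=50, seed=0):
+    rng = np.random.default_rng(seed)
+    centroids = X[rng.choice(len(X), k, replace=False)].copy()
+    for _ in range(iters):
+        # Assign each point to its nearest centroid.
+        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
+        labels = dists.argmin(axis=1)
+        for j in range(k):
+            members = X[labels == j]
+            if len(members):
+                # Damped update: move only part of the way toward the mean,
+                # so clusters drift flexibly instead of jumping.
+                centroids[j] += damping * (members.mean(axis=0) - centroids[j])
+    return labels, centroids
+
+X = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 5.0])
+labels, centroids = damped_kmeans(X, k=2)
+```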
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Entire Chain Uplift Modeling with Context-Enhanced Learning for + Intelligent Marketing WWW2024 + + +
+ Uplift modeling, vital in online marketing, seeks to accurately measure the +impact of various strategies, such as coupons or discounts, on different users +by predicting the Individual Treatment Effect (ITE). In an e-commerce setting, +user behavior follows a defined sequential chain, including impression, click, +and conversion. Marketing strategies exert varied uplift effects at each stage +within this chain, impacting metrics like click-through and conversion rate. +Despite its utility, existing research has neglected inter-task impacts across +all stages under a specific treatment and has insufficiently utilized the +treatment information, potentially introducing substantial bias into subsequent +marketing decisions. We identify these two issues as the chain-bias problem and +the treatment-unadaptive problem. This paper introduces the Entire Chain UPlift +method with context-enhanced learning (ECUP), devised to tackle these issues. +ECUP consists of two primary components: 1) the Entire Chain-Enhanced Network, +which utilizes user behavior patterns to estimate ITE throughout the entire +chain space, models the various impacts of treatments on each task, and +integrates task prior information to enhance context awareness across all +stages, and 2) the Treatment-Enhanced Network, which facilitates fine-grained +treatment modeling through bit-level feature interactions, thereby enabling +adaptive feature adjustment. Extensive experiments on public and industrial +datasets validate ECUP's effectiveness. Moreover, ECUP has been deployed on the +Meituan food delivery platform, serving millions of daily active users, with +the related dataset released for future research. + +<br>
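+ A hedged sketch of the quantity ECUP predicts: the Individual Treatment +Effect, estimated here with a generic two-model (T-learner style) stand-in on +synthetic data, not the ECUP architecture itself:
+```python
+import numpy as np
+from sklearn.ensemble import GradientBoostingClassifier
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(1000, 5))          # user features
+t = rng.integers(0, 2, size=1000)       # treatment: coupon shown or not
+# Synthetic outcome: the coupon only helps users with a positive feature 0.
+y = (rng.random(1000) < 0.1 + 0.2 * t * (X[:, 0] > 0)).astype(int)
+
+m_treat = GradientBoostingClassifier().fit(X[t == 1], y[t == 1])
+m_ctrl = GradientBoostingClassifier().fit(X[t == 0], y[t == 0])
+
+# ITE = predicted conversion probability if treated minus if untreated.
+ite = m_treat.predict_proba(X)[:, 1] - m_ctrl.predict_proba(X)[:, 1]
+print("mean estimated uplift:", ite.mean())
+```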
+
+ comment: Accepted by WWW2024 +
+
+
+
+
+ + ☆ Video Editing for Video Retrieval + + +
+ Though pre-training vision-language models on large-scale web videos has +demonstrated significant benefits for video-text retrieval, fine-tuning still +plays a critical role and relies on manually annotated clips with start and end +times, which requires considerable human effort. To address this issue, we +explore an alternative, cheaper source of annotations, single timestamps, for +video-text retrieval. We initialise clips from timestamps in a heuristic way to +warm up a retrieval model. Then a video clip editing method is proposed to +refine the initial rough boundaries to improve retrieval performance. A +student-teacher network is introduced for video clip editing. The teacher model +is employed to edit the clips in the training set whereas the student model +trains on the edited clips. The teacher weights are updated from the student's +after the student's performance increases. Our method is model agnostic and +applicable to any retrieval model. We conduct experiments based on three +state-of-the-art retrieval models, COOT, VideoCLIP and CLIP4Clip. Experiments +conducted on three video retrieval datasets, YouCook2, DiDeMo and +ActivityNet-Captions show that our edited clips consistently improve retrieval +performance over initial clips across all three retrieval models. + +<br>
+
+
+
+
+ + ♻ ☆ Language is All a Graph Needs EACL 2024 + + +
+ The emergence of large-scale pre-trained language models has revolutionized +various AI research domains. Transformer-based Large Language Models (LLMs) +have gradually replaced CNNs and RNNs to unify the fields of computer vision +and natural language processing. Compared with independent data samples such as +images, videos or texts, graphs usually contain rich structural and relational +information. Meanwhile, language, especially natural language, being one of the +most expressive mediums, excels in describing complex structures. However, +existing work on incorporating graph problems into the generative language +modeling framework remains very limited. Considering the rising prominence of +LLMs, it becomes essential to explore whether LLMs can also replace GNNs as the +foundation model for graphs. In this paper, we propose InstructGLM +(Instruction-finetuned Graph Language Model) with highly scalable prompts based +on natural language instructions. We use natural language to describe the +multi-scale geometric structure of the graph and then instruction-finetune an +LLM to perform graph tasks, which enables Generative Graph Learning. Our method +surpasses all GNN baselines on the ogbn-arxiv, Cora and PubMed datasets, +underscoring its effectiveness and shedding light on generative LLMs as a new +foundation model for graph machine learning. Our code is open-sourced at +https://github.com/agiresearch/InstructGLM. + +<br>
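+ A minimal sketch of the core idea: verbalize a node's multi-hop neighborhood +as a natural language instruction an LLM can be finetuned on. The prompt +template below is an illustrative assumption, not InstructGLM's exact format:
+```python
+edges = {"paper_A": ["paper_B", "paper_C"], "paper_B": ["paper_C"]}
+features = {"paper_A": "GNN survey", "paper_B": "graph transformers",
+            "paper_C": "message passing"}
+
+def describe_node(node, hops=1):
+    # Turn graph structure into sentences, one hop at a time.
+    lines = [f"Node {node} is about: {features[node]}."]
+    frontier = [node]
+    for _ in range(hops):
+        nxt = []
+        for u in frontier:
+            for v in edges.get(u, []):
+                lines.append(f"Node {u} links to node {v} ({features[v]}).")
+                nxt.append(v)
+        frontier = nxt
+    lines.append(f"Question: which category does node {node} belong to?")
+    return "\n".join(lines)
+
+print(describe_node("paper_A"))
+```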
+
+ comment: In EACL 2024 +
+
+
+
+
+ + ♻ ☆ An In-depth Investigation of User Response Simulation for Conversational + Search + + +
+ Conversational search has seen increased recent attention in both the IR and +NLP communities. It seeks to clarify and solve users' search needs through +multi-turn natural language interactions. However, most existing systems are +trained and demonstrated with recorded or artificial conversation logs. +Eventually, conversational search systems should be trained, evaluated, and +deployed in an open-ended setting with unseen conversation trajectories. A key +challenge is that training and evaluating such systems both require a +human-in-the-loop, which is expensive and does not scale. One strategy is to +simulate users, thereby reducing the scaling costs. However, current user +simulators are either limited to only responding to yes-no questions from the +conversational search system or unable to produce high-quality responses in +general. + In this paper, we show that existing user simulation systems could be +significantly improved by a smaller finetuned natural language generation +model. However, rather than merely reporting it as the new state-of-the-art, we +consider it a strong baseline and present an in-depth investigation of +simulating user responses for conversational search. Our goal is to supplement +existing work with an insightful hand-analysis of the challenges left unsolved +by the baseline, and to propose our solutions. The challenges we identified +include (1) a blind spot that is difficult to learn, and (2) a specific type of +misevaluation in the standard setup. We propose a new generation system to +effectively cover the training blind spot and suggest a new evaluation setup to +avoid misevaluation. Our proposed system leads to significant improvements over +existing systems and large language models such as GPT-4. Additionally, our +analysis provides insights into the nature of user simulation to facilitate +future work. + +<br>
+
+ comment: To appear in The Web Conference 2024, 8 pages with Appendices +
+
+
+
+
+ + ♻ ☆ Zero-1-to-3: Domain-level Zero-shot Cognitive Diagnosis via One Batch of + Early-bird Students towards Three Diagnostic Objectives AAAI2024 + + +
+ Cognitive diagnosis seeks to estimate the cognitive states of students by +exploring their logged practice quiz data. It plays a pivotal role in +personalized learning guidance within intelligent education systems. In this +paper, we focus on an important, practical, yet often underexplored task: +domain-level zero-shot cognitive diagnosis (DZCD), which arises due to the +absence of student practice logs in newly launched domains. Recent cross-domain +diagnostic models have been demonstrated to be a promising strategy for DZCD. +These methods primarily focus on how to transfer student states across domains. +However, they might inadvertently incorporate non-transferable information into +student representations, thereby limiting the efficacy of knowledge transfer. +To tackle this, we propose Zero-1-to-3, a domain-level zero-shot cognitive +diagnosis framework via one batch of early-bird students towards three +diagnostic objectives. Our approach starts by pre-training a diagnosis model +with dual regularizers, which decouple student states into domain-shared +and domain-specific parts. The shared cognitive signals can be transferred to +the target domain, enriching the cognitive priors for the new domain, which +ensures the cognitive state propagation objective. Subsequently, we devise a +strategy to generate simulated practice logs for cold-start students by +analyzing the behavioral patterns of early-bird students, fulfilling the +domain-adaptation goal. Consequently, we refine the cognitive states of +cold-start students as diagnostic outcomes via virtual data, aligning with the +diagnosis-oriented goal. Finally, extensive experiments on six real-world +datasets highlight the efficacy of our model for DZCD and its practical +application in question recommendation. The code is publicly available at +https://github.com/bigdata-ustc/Zero-1-to-3. + +<br>
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Detection of ChatGPT Fake Science with the xFakeBibs Learning Algorithm + + +
+ ChatGPT is becoming a new reality. In this paper, we demonstrate a method for +distinguishing ChatGPT-generated publications from those produced by +scientists. The objective of this work is to introduce a newly designed +supervised network-driven algorithm that illustrates how to predict +machine-generated content. The premise is that ChatGPT content exhibits +behavior that is distinctive and can be set apart from scientific articles. The +algorithm was trained and tested on publications for three specific diseases, +with each model constructed from 100 abstracts. Additionally, the algorithm +underwent k-Folds calibration (depending on the availability of the data) to +establish a lower-upper bound range of acceptance. The network training model +of ChatGPT showed a lower number of nodes and a higher number of edges when +compared with models of real article abstracts. The algorithm was executed in +single-mode to predict the class of one type of dataset at a time and achieved +>94% accuracy. It was also executed in multi-mode on mixed documents of ChatGPT +and PubMed abstracts. The algorithm remarkably predicted real articles with a +precision of 100% and, on rare occasions, 96%-98%. However, ChatGPT content was +often misclassified as real publications, being detected with at most 88% +accuracy across the datasets of the three diseases. Our results also showed +that the publication year of the articles mixed with ChatGPT-generated content +may play a role in detecting the correct class, where the older the +publication, the better the prediction. + +<br>
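+ A hedged sketch of the network-driven premise: build word co-occurrence +graphs from abstracts and compare node/edge counts, the statistics the +abstract reports as differing between generated and real text. The toy data +and tokenization are assumptions, not the xFakeBibs algorithm itself:
+```python
+import itertools
+import networkx as nx
+
+def cooccurrence_graph(abstracts):
+    g = nx.Graph()
+    for text in abstracts:
+        tokens = set(text.lower().split())
+        g.add_nodes_from(tokens)
+        # Connect every pair of words that co-occur in the same abstract.
+        g.add_edges_from(itertools.combinations(sorted(tokens), 2))
+    return g
+
+real = ["alzheimer biomarkers measured in cerebrospinal fluid cohorts"]
+fake = ["alzheimer disease is a disease of the brain and the brain ages"]
+for name, docs in [("real", real), ("generated", fake)]:
+    g = cooccurrence_graph(docs)
+    print(name, "nodes:", g.number_of_nodes(), "edges:", g.number_of_edges())
+```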
+
+ comment: 14 pages, 6 figures, 4 tables, 2 algorithms +
+
+
+
+
+
+
+
+ + Multimedia 1 + +
+
+
+ + ☆ M$^3$Face: A Unified Multi-Modal Multilingual Framework for Human Face + Generation and Editing + + +
+ Human face generation and editing represent an essential task in the era of +computer vision and the digital world. Recent studies have shown remarkable +progress in multi-modal face generation and editing, for instance, using face +segmentation to guide image generation. However, it may be challenging for some +users to create these conditioning modalities manually. Thus, we introduce +M3Face, a unified multi-modal multilingual framework for controllable face +generation and editing. This framework enables users to utilize only text input +to generate controlling modalities automatically, for instance, semantic +segmentation or facial landmarks, and subsequently generate face images. We +conduct extensive qualitative and quantitative experiments to showcase our +framework's face generation and editing capabilities. Additionally, we propose +the M3CelebA Dataset, a large-scale multi-modal and multilingual face dataset +containing high-quality images, semantic segmentations, facial landmarks, and +different captions for each image in multiple languages. The code and the +dataset will be released upon publication. + +<br>
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ ExTTNet: A Deep Learning Algorithm for Extracting Table Texts from + Invoice Images + + +
+ In this work, product tables in invoices are obtained autonomously via a deep +learning model named ExTTNet. Firstly, text is obtained from invoice images +using Optical Character Recognition (OCR) techniques. The Tesseract OCR engine +[37] is used for this process. Afterwards, the number of features is increased +by using feature extraction methods to improve accuracy. The labeling process +is done according to whether each text obtained from OCR is a table element or +not. In this study, a multilayer artificial neural network model is used. The +training was carried out with an Nvidia RTX 3090 graphics card and took $162$ +minutes. As a result of the training, the F1 score is $0.92$. + +<br>
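+ A minimal sketch of the pipeline the abstract outlines: OCR the invoice with +Tesseract, featurize each token, and classify table vs. non-table text with a +small multilayer network. Features, toy labels, and the file path are +illustrative assumptions, not the paper's exact setup:
+```python
+from sklearn.neural_network import MLPClassifier
+
+def token_features(word, line_no):
+    # Crude per-token features: length, digit ratio, and line position.
+    digits = sum(c.isdigit() for c in word)
+    return [len(word), digits / max(len(word), 1), line_no]
+
+# Toy annotated tokens: (word, line_number, is_table_element).
+train = [("Widget", 5, 1), ("12.99", 5, 1), ("3", 5, 1),
+         ("Invoice", 1, 0), ("Date:", 2, 0), ("Thanks!", 9, 0)]
+X = [token_features(w, ln) for w, ln, _ in train]
+y = [label for _, _, label in train]
+clf = MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=500).fit(X, y)
+
+# At inference time, tokens would come from OCR on a real invoice image:
+#   import pytesseract; from PIL import Image
+#   data = pytesseract.image_to_data(Image.open("invoice.png"),
+#                                    output_type=pytesseract.Output.DICT)
+print(clf.predict([token_features("42.00", 6)]))  # likely a table element
+```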
+
+ comment: 6 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Diffusion Cross-domain Recommendation + + +
+ It is always a challenge for recommender systems to give high-quality +outcomes to cold-start users. One potential solution to alleviate the data +sparsity problem for cold-start users in the target domain is to add data from +an auxiliary domain. Finding a proper way to extract knowledge from an +auxiliary domain and transfer it into a target domain is one of the main +objectives of cross-domain recommendation (CDR) research. Among the existing +methods, the mapping approach is a popular way to implement cross-domain +recommendation models (CDRs). For models of this type, a mapping module plays +the role of transforming data from one domain to another, and it primarily +determines the performance of mapping-approach CDRs. Recently, diffusion +probability models (DPMs) have achieved impressive success in image synthesis +related tasks. They recover images from noise-added samples, which can be +viewed as a data transformation process with outstanding performance. To +further enhance the performance of CDRs, we first reveal the potential +connection between DPMs and the mapping modules of CDRs, and then propose a +novel CDR model named Diffusion Cross-domain Recommendation (DiffCDR). More +specifically, we first adopt the theory of DPMs and design a Diffusion Module +(DIM), which generates a user's embedding in the target domain. To reduce the +negative impact of the randomness introduced in DIM and improve stability, we +employ an Alignment Module to produce aligned user embeddings. In addition, we +consider the label data of the target domain and form a task-oriented loss +function, which enables our DiffCDR to adapt to specific tasks. By conducting +extensive experiments on real-world datasets, we demonstrate the effectiveness +and adaptability of DiffCDR in outperforming baseline models on various CDR +tasks in both cold-start and warm-start scenarios. + +<br>
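+ A hedged sketch of the diffusion idea behind such a mapping module: corrupt +the target-domain embedding with scheduled noise and train a denoiser +conditioned on the source-domain embedding to predict that noise. Shapes, the +schedule, and the conditioning are assumptions, not DiffCDR's exact design:
+```python
+import torch
+import torch.nn as nn
+
+dim, steps = 64, 100
+betas = torch.linspace(1e-4, 0.02, steps)
+alphas_bar = torch.cumprod(1.0 - betas, dim=0)
+
+denoiser = nn.Sequential(nn.Linear(dim * 2, 256), nn.ReLU(),
+                         nn.Linear(256, dim))
+opt = torch.optim.Adam(denoiser.parameters(), lr=1e-3)
+
+src = torch.randn(32, dim)   # source-domain user embeddings (toy data)
+tgt = torch.randn(32, dim)   # target-domain embeddings to generate
+
+t = torch.randint(0, steps, (32,))
+noise = torch.randn_like(tgt)
+a = alphas_bar[t].unsqueeze(1)
+noisy_tgt = a.sqrt() * tgt + (1.0 - a).sqrt() * noise   # forward process
+
+# Predict the injected noise, conditioned on the source embedding.
+pred = denoiser(torch.cat([noisy_tgt, src], dim=1))
+loss = nn.functional.mse_loss(pred, noise)
+loss.backward(); opt.step()
+```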
+
+
+
+
+ + ☆ Enhancing Complex Question Answering over Knowledge Graphs through + Evidence Pattern Retrieval WWW 2024 + + +
+ Information retrieval (IR) methods for KGQA consist of two stages: subgraph +extraction and answer reasoning. We argue that current subgraph extraction +methods underestimate the importance of structural dependencies among evidence +facts. We propose Evidence Pattern Retrieval (EPR) to explicitly model the +structural dependencies during subgraph extraction. We implement EPR by +indexing the atomic adjacency patterns of resource pairs. Given a question, we +perform dense retrieval to obtain atomic patterns formed by resource pairs. We +then enumerate their combinations to construct candidate evidence patterns. +These evidence patterns are scored using a neural model, and the best one is +selected to extract a subgraph for downstream answer reasoning. Experimental +results demonstrate that the EPR-based approach improves the F1 scores of +IR-KGQA methods by over 10 points on ComplexWebQuestions and achieves +competitive performance on WebQuestionsSP. + +<br>
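+ A minimal sketch of the enumeration step as described above: combine +retrieved atomic adjacency patterns into candidate evidence patterns and keep +the best-scoring one. The patterns and the scorer are toy stand-ins for the +dense retriever and neural model:
+```python
+from itertools import combinations
+
+# Hypothetical atomic patterns retrieved for a question: (head, rel, tail).
+atomic = [("Person", "born_in", "City"), ("City", "located_in", "Country"),
+          ("Person", "works_for", "Org")]
+
+def score(pattern_set):
+    # Stand-in for the neural scorer: reward patterns sharing endpoints,
+    # i.e., ones that chain into a connected structure.
+    ends = [e for (h, _, t) in pattern_set for e in (h, t)]
+    return len(ends) - len(set(ends))
+
+candidates = [c for r in (1, 2) for c in combinations(atomic, r)]
+best = max(candidates, key=score)
+print(best)  # the chained born_in / located_in pair wins
+```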
+
+ comment: Accepted to TheWebConf'24 (WWW 2024). This is a preprint version; the + CR version will include more details. Github: + https://github.com/nju-websoft/EPR-KGQA +
+
+
+
+
+ + ☆ PatSTEG: Modeling Formation Dynamics of Patent Citation Networks via The + Semantic-Topological Evolutionary Graph + + +
+ Patent documents in the patent database (PatDB) are crucial for research, +development, and innovation as they contain valuable technical information. +However, PatDB presents a multifaceted challenge compared to publicly available +preprocessed databases due to the intricate nature of the patent text and the +inherent sparsity within the patent citation network. Although patent text +analysis and citation analysis bring new opportunities to explore patent data +mining, no existing work exploits their complementarity. To this end, we +propose a joint semantic-topological evolutionary graph learning approach +(PatSTEG) to model the formation dynamics of patent citation networks. More +specifically, we first create a real-world dataset of Chinese patents named +CNPat and leverage its patent texts and citations to construct a patent +citation network. Then, PatSTEG is modeled to study the evolutionary dynamics +of patent citation formation by considering the semantic and topological +information jointly. Extensive experiments are conducted on CNPat and public +datasets to prove the superiority of PatSTEG over other state-of-the-art +methods. All the results provide valuable references for patent literature +research and technical exploration. + +<br>
+
+
+
+
+ + ☆ Position Paper: Why the Shooting in the Dark Method Dominates + Recommender Systems Practice; A Call to Abandon Anti-Utopian Thinking + + +
+ Applied recommender systems research is in a curious position. While there is +a very rigorous protocol for measuring performance by A/B testing, best +practice for finding a `B' to test does not explicitly target performance but +rather targets a proxy measure. The success or failure of a given A/B test then +depends entirely on whether the proposed proxy is better correlated with +performance than the previous proxy. No principle exists to identify offline +whether one proxy is better than another, leaving practitioners shooting in the +dark. The purpose of this position paper is to question this anti-Utopian +thinking and argue that a non-standard use of deep learning stacks actually has +the potential to unlock reward-optimizing recommendation. + +<br>
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Prototypical Contrastive Learning through Alignment and Uniformity for + Recommendation + + +
+ Graph Collaborative Filtering (GCF), one of the most widely adopted +recommendation system methods, effectively captures intricate relationships +between user and item interactions. Graph Contrastive Learning (GCL) based GCF +has gained significant attention as it leverages self-supervised techniques to +extract valuable signals from real-world scenarios. However, many methods learn +instance discrimination tasks by constructing contrastive pairs through random +sampling. Such GCL approaches suffer from sampling bias: the negatives might be +semantically similar to the positives, leading to a loss of effective feature +representation. To address these problems, we present the +\underline{Proto}typical contrastive learning through \underline{A}lignment and +\underline{U}niformity for recommendation, which is called \textbf{ProtoAU}. +Specifically, we first propose prototypes (cluster centroids) as a latent space +to ensure consistency across different augmentations from the origin graph, +eliminating the need for random sampling of contrastive pairs. Furthermore, the +absence of explicit negatives means that directly optimizing the consistency +loss between instance and prototype could easily result in dimensional +collapse. Therefore, we propose aligning and maintaining uniformity in the +prototypes of users and items as optimization objectives to avoid falling into +trivial solutions. Finally, we conduct extensive experiments on four datasets +and evaluate performance on the link prediction task. Experimental results +demonstrate that the proposed ProtoAU outperforms other representative methods. +The source code of our proposed ProtoAU is available at +\url{https://github.com/oceanlvr/ProtoAU}. + +<br>
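+ A hedged sketch of the alignment and uniformity objectives (in the sense of +Wang & Isola, 2020) that the method's name points to, applied to instance and +prototype embeddings; the loss weight and how ProtoAU combines them exactly +are assumptions:
+```python
+import torch
+import torch.nn.functional as F
+
+def align_loss(x, y):
+    # Pull each instance toward its matched prototype.
+    return (x - y).norm(dim=1).pow(2).mean()
+
+def uniform_loss(x, t=2.0):
+    # Spread embeddings over the hypersphere to avoid dimensional collapse.
+    return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()
+
+users = F.normalize(torch.randn(128, 64), dim=1)
+protos = F.normalize(torch.randn(128, 64), dim=1)  # assigned cluster centroids
+loss = align_loss(users, protos) + 0.5 * uniform_loss(users)
+print(loss.item())
+```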
+
+
+
+
+ + ☆ Locally-Adaptive Quantization for Streaming Vector Search + + +
+ Retrieving the most similar vector embeddings to a given query among a +massive collection of vectors has long been a key component of countless +real-world applications. The recently introduced Retrieval-Augmented Generation +is one of the most prominent examples. For many of these applications, the +database evolves over time by inserting new data and removing outdated data. In +these cases, the retrieval problem is known as streaming similarity search. +While Locally-Adaptive Vector Quantization (LVQ), a highly efficient vector +compression method, yields state-of-the-art search performance for non-evolving +databases, its usefulness in the streaming setting has not yet been +established. In this work, we study LVQ in streaming similarity search. In +support of our evaluation, we introduce two improvements of LVQ: Turbo LVQ and +multi-means LVQ that boost its search performance by up to 28% and 27%, +respectively. Our studies show that LVQ and its new variants enable +blazing-fast vector search, outperforming its closest competitor by up to 9.4x +for identically distributed data and by up to 8.8x under the challenging +scenario of data distribution shifts (i.e., where the statistical distribution +of the data changes over time). We release our contributions as part of +Scalable Vector Search, an open-source library for high-performance similarity +search. + +<br>
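+ A minimal sketch of the locally-adaptive idea at LVQ's core: quantize each +vector with its own per-vector range rather than a global one. The actual LVQ +memory layout and the Turbo/multi-means variants are more involved:
+```python
+import numpy as np
+
+def quantize(v, bits=8):
+    lo, hi = v.min(), v.max()              # per-vector (local) range
+    scale = (hi - lo) / (2 ** bits - 1) or 1.0
+    codes = np.round((v - lo) / scale).astype(np.uint8)
+    return codes, lo, scale                # codes plus per-vector metadata
+
+def dequantize(codes, lo, scale):
+    return codes.astype(np.float32) * scale + lo
+
+v = np.random.randn(128).astype(np.float32)
+codes, lo, scale = quantize(v)
+err = np.abs(dequantize(codes, lo, scale) - v).max()
+print("max reconstruction error:", err)
+```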
+
+
+
+
+ + ☆ Improving Large-Scale k-Nearest Neighbor Text Categorization with Label + Autoencoders + + +
+ In this paper, we introduce a multi-label lazy learning approach to deal with +automatic semantic indexing in large document collections in the presence of +complex and structured label vocabularies with high inter-label correlation. +The proposed method is an evolution of the traditional k-Nearest Neighbors +algorithm that uses a large autoencoder trained to map the large label space to +a reduced-size latent space and to regenerate the predicted labels from this +latent space. We have evaluated our proposal on a large portion of the MEDLINE +biomedical document collection, which uses the Medical Subject Headings (MeSH) +thesaurus as a controlled vocabulary. In our experiments we propose and +evaluate several document representation approaches and different label +autoencoder configurations. + +<br>
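+ A hedged sketch of the label-autoencoder variant of kNN described above: +encode the neighbors' label vectors into the latent space, average them there, +and decode back to a multi-label prediction. Layer sizes, the aggregation, and +the threshold are assumptions (and the autoencoder would of course be trained +to reconstruct label vectors first):
+```python
+import torch
+import torch.nn as nn
+
+n_labels, latent = 5000, 128
+enc = nn.Sequential(nn.Linear(n_labels, 1024), nn.ReLU(),
+                    nn.Linear(1024, latent))
+dec = nn.Sequential(nn.Linear(latent, 1024), nn.ReLU(),
+                    nn.Linear(1024, n_labels), nn.Sigmoid())
+
+def predict_labels(neighbor_label_vecs, threshold=0.5):
+    z = enc(neighbor_label_vecs).mean(dim=0)  # aggregate neighbors in latent
+    probs = dec(z)                            # regenerate the full label space
+    return (probs > threshold).nonzero().squeeze(-1)
+
+neighbors = torch.zeros(10, n_labels)         # toy MeSH-like label vectors
+neighbors[:, [3, 42]] = 1.0                   # k retrieved documents' labels
+print(predict_labels(neighbors))
+```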
+
+ comment: 22 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Driven Recommendation System Algorithm + + +
+ In this paper, we propose a novel graph neural network-based recommendation +model called KGLN, which leverages Knowledge Graph (KG) information to enhance +the accuracy and effectiveness of personalized recommendations. We first use a +single-layer neural network to merge individual node features in the graph, and +then adjust the aggregation weights of neighboring entities by incorporating +influence factors. The model evolves from a single layer to multiple layers +through iteration, enabling entities to access extensive multi-order associated +entity information. The final step involves integrating features of entities +and users to produce a recommendation score. Model performance was evaluated by +comparing the effects of various aggregation methods and influence factors. In +tests on the MovieLens-1M and Book-Crossing datasets, KGLN shows an Area Under +the ROC curve (AUC) improvement of 0.3% to 5.9% and 1.1% to 8.2%, respectively, +which is better than existing benchmark methods like LibFM, DeepFM, Wide&Deep, +and RippleNet. + +<br>
+
+
+
+
+ + ♻ ☆ Bringing order into the realm of Transformer-based language models for + artificial intelligence and law + + +
+ Transformer-based language models (TLMs) have widely been recognized to be a +cutting-edge technology for the successful development of deep-learning-based +solutions to problems and applications that require natural language processing +and understanding. Like for other textual domains, TLMs have indeed pushed the +state-of-the-art of AI approaches for many tasks of interest in the legal +domain. Despite the first Transformer model being proposed about six years ago, +this technology has progressed at an unprecedented rate, whereby BERT and +related models represent a major reference, also in the legal domain. This +article provides the first systematic overview of TLM-based methods for +AI-driven problems and tasks in the legal sphere. A major goal is to highlight +research advances in this field so as to understand, on the one hand, how the +Transformers have contributed to the success of AI in supporting legal +processes, and on the other hand, what are the current limitations and +opportunities for further research development. + +<br>
+
+ comment: Please refer to the published version: Greco, C.M., Tagarelli, A. + (2023) Bringing order into the realm of Transformer-based language models for + artificial intelligence and law. Artif Intell Law, Springer Nature. November + 2023. https://doi.org/10.1007/s10506-023-09374-7 +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Wavelet-Decoupling Contrastive Enhancement Network for Fine-Grained + Skeleton-Based Action Recognition ICASSP 2024 + + +
+ Skeleton-based action recognition has attracted much attention, benefiting +from its succinctness and robustness. However, the minimal inter-class +variation in similar action sequences often leads to confusion. The inherent +spatiotemporal coupling characteristics make it challenging to mine the subtle +differences in joint motion trajectories, which is critical for distinguishing +confusing fine-grained actions. To alleviate this problem, we propose a +Wavelet-Attention Decoupling (WAD) module that utilizes discrete wavelet +transform to effectively disentangle salient and subtle motion features in the +time-frequency domain. Then, the decoupling attention adaptively recalibrates +their temporal responses. To further amplify the discrepancies in these subtle +motion features, we propose a Fine-grained Contrastive Enhancement (FCE) module +to enhance attention towards trajectory features by contrastive learning. +Extensive experiments are conducted on the coarse-grained dataset NTU RGB+D and +the fine-grained dataset FineGYM. Our methods perform competitively compared to +state-of-the-art methods and can discriminate confusing fine-grained actions +well. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ LLIC: Large Receptive Field Transform Coding with Adaptive Weights for + Learned Image Compression + + +
+ The effective receptive field (ERF) plays an important role in transform +coding: it determines how much redundancy can be removed during the transform +and how many spatial priors can be utilized to synthesize textures during the +inverse transform. Existing methods rely on stacks of small kernels, whose ERF +remains too small, or on heavy non-local attention mechanisms, which limit the +potential of high-resolution image coding. To tackle this issue, we propose +Large Receptive Field Transform Coding with Adaptive Weights for Learned Image +Compression (LLIC). Specifically, for the first time in the learned image +compression community, we introduce a few large kernel-based depth-wise +convolutions to reduce more redundancy while maintaining modest complexity. Due +to the wide diversity of images, we propose to enhance the adaptability of +convolutions by generating weights in a self-conditioned manner. The large +kernels cooperate with non-linear embedding and gate mechanisms for better +expressiveness and lighter point-wise interactions. We also investigate +improved training techniques to fully exploit the potential of large kernels. +In addition, to enhance the interactions among channels, we propose adaptive +channel-wise bit allocation via generating a channel importance factor in a +self-conditioned manner. To demonstrate the effectiveness of the proposed +transform coding, we align the entropy model to compare with existing transform +methods and obtain the models LLIC-STF, LLIC-ELIC, and LLIC-TCM. Extensive +experiments demonstrate that our proposed LLIC models achieve significant +improvements over corresponding baselines and attain state-of-the-art +performance with a better trade-off between performance and complexity. + +<br>
+
+ comment: Fix typos +
+
+
+
+
+ + ♻ ☆ Zero-Shot End-to-End Spoken Language Understanding via Cross-Modal + Selective Self-Training + + +
+ End-to-end (E2E) spoken language understanding (SLU) is constrained by the +cost of collecting speech-semantics pairs, especially when label domains +change. Hence, we explore \textit{zero-shot} E2E SLU, which learns E2E SLU +without speech-semantics pairs, instead using only speech-text and +text-semantics pairs. Previous work achieved zero-shot by pseudolabeling all +speech-text transcripts with a natural language understanding (NLU) model +learned on text-semantics corpora. However, this method requires the domains of +speech-text and text-semantics to match, which often mismatch due to separate +collections. Furthermore, using the entire collected speech-text corpus from +any domains leads to \textit{imbalance} and \textit{noise} issues. To address +these, we propose \textit{cross-modal selective self-training} (CMSST). CMSST +tackles imbalance by clustering in a joint space of the three modalities +(speech, text, and semantics) and handles label noise with a selection network. +We also introduce two benchmarks for zero-shot E2E SLU, covering matched and +found speech (mismatched) settings. Experiments show that CMSST improves +performance in both settings, with significantly reduced sample sizes and +training time. Our code and data are released at +https://github.com/amazon-science/zero-shot-E2E-slu. + +<br>
+
+ comment: 18 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ GPT-4V with Emotion: A Zero-shot Benchmark for Generalized Emotion + Recognition + + +
+ Recently, GPT-4 with Vision (GPT-4V) has demonstrated remarkable visual +capabilities across various tasks, but its performance in emotion recognition +has not been fully evaluated. To bridge this gap, we present the quantitative +evaluation results of GPT-4V on 19 benchmark datasets covering 5 tasks: visual +sentiment analysis, micro-expression recognition, facial emotion recognition, +dynamic facial emotion recognition, and multimodal emotion recognition. This +paper collectively refers to these tasks as ``Generalized Emotion Recognition +(GER)''. Through experimental analysis, we observe that GPT-4V generally +outperforms supervised systems in visual sentiment analysis, highlighting its +powerful visual understanding capabilities. Meanwhile, GPT-4V shows the ability +to integrate multimodal clues and exploit temporal information, which is also +critical for emotion recognition. Despite these achievements, GPT-4V is +primarily tailored to general-purpose domains and cannot recognize +micro-expressions, which require specialized knowledge. To the best of our +knowledge, this paper provides the first quantitative assessment of GPT-4V for +the GER tasks, offering valuable insights to researchers in this field. It can +also serve as a zero-shot benchmark for subsequent research. Our code and +evaluation results are available at: +https://github.com/zeroQiaoba/gpt4v-emotion. + +<br>
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 89 + +
+
+
+ + ☆ Position Paper: Generalized grammar rules and structure-based + generalization beyond classical equivariance for lexical tasks and + transduction + + +
+ Compositional generalization is one of the main properties that differentiate +lexical learning in humans from state-of-the-art neural networks. We propose a +general framework for building models that can generalize compositionally using +the concept of Generalized Grammar Rules (GGRs), a class of symmetry-based +compositional constraints for transduction tasks, which we view as a +transduction analogue of equivariance constraints in physics-inspired tasks. +Besides formalizing generalized notions of symmetry for language transduction, +our framework is general enough to contain many existing works as special +cases. We present ideas on how GGRs might be implemented, and in the process +draw connections to reinforcement learning and other areas of research. + +<br>
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ TravelPlanner: A Benchmark for Real-World Planning with Language Agents + + +
+ Planning has been part of the core pursuit for artificial intelligence since +its conception, but earlier AI agents mostly focused on constrained settings +because many of the cognitive substrates necessary for human-level planning +have been lacking. Recently, language agents powered by large language models +(LLMs) have shown interesting capabilities such as tool use and reasoning. Are +these language agents capable of planning in more complex settings that are out +of the reach of prior AI agents? To advance this investigation, we propose +TravelPlanner, a new planning benchmark that focuses on travel planning, a +common real-world planning scenario. It provides a rich sandbox environment, +various tools for accessing nearly four million data records, and 1,225 +meticulously curated planning intents and reference plans. Comprehensive +evaluations show that the current language agents are not yet capable of +handling such complex planning tasks: even GPT-4 only achieves a success rate +of 0.6%. Language agents struggle to stay on task, use the right tools to +collect information, or keep track of multiple constraints. However, we note +that the mere possibility for language agents to tackle such a complex problem +is in itself non-trivial progress. TravelPlanner provides a challenging yet +meaningful testbed for future language agents. + +<br>
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ MAGDi: Structured Distillation of Multi-Agent Interaction Graphs + Improves Reasoning in Smaller Language Models + + +
+ Multi-agent interactions between Large Language Model (LLM) agents have shown +major improvements on diverse reasoning tasks. However, these involve long +generations from multiple models across several rounds, making them expensive. +Moreover, these multi-agent approaches fail to provide a final, single model +for efficient inference. To address this, we introduce MAGDi, a new method for +structured distillation of the reasoning interactions between multiple LLMs +into smaller LMs. MAGDi teaches smaller models by representing multi-agent +interactions as graphs, augmenting a base student model with a graph encoder, +and distilling knowledge using three objective functions: next-token +prediction, a contrastive loss between correct and incorrect reasoning, and a +graph-based objective to model the interaction structure. Experiments on seven +widely-used commonsense and math reasoning benchmarks show that MAGDi improves +the reasoning capabilities of smaller models, outperforming several methods +that distill from a single teacher and multiple teachers. Moreover, MAGDi also +demonstrates an order of magnitude higher efficiency over its teachers. We +conduct extensive analyses to show that MAGDi (1) enhances the generalizability +to out-of-domain tasks, (2) scales positively with the size and strength of the +base student model, and (3) obtains larger improvements (via our multi-teacher +training) when applying self-consistency - an inference technique that relies +on model diversity. + +
+
+ comment: 15 pages; First two authors contributed equally; GitHub: + https://github.com/dinobby/MAGDi +
+
+
+
+
+ + ☆ KB-Plugin: A Plug-and-play Framework for Large Language Models to Induce + Programs over Low-resourced Knowledge Bases + + +
+ Program induction (PI) has become a promising paradigm for using knowledge +bases (KBs) to help large language models (LLMs) answer complex +knowledge-intensive questions. Nonetheless, PI typically relies on a large +number of parallel question-program pairs to make the LLM aware of the schema +of the given KB, and is thus challenging for many low-resourced KBs that lack +annotated data. To this end, we propose KB-Plugin, a plug-and-play framework +that enables LLMs to induce programs over any low-resourced KB. Firstly, +KB-Plugin adopts self-supervised learning to encode the detailed schema +information of a given KB into a pluggable module, namely the schema plugin. +Secondly, KB-Plugin utilizes abundant annotated data from a rich-resourced KB +to train another pluggable module, namely the PI plugin, which helps the LLM +extract question-relevant schema information from the schema plugin of any KB +and utilize this information to induce programs over this KB. Experiments on +five heterogeneous KBQA datasets show that KB-Plugin achieves performance +better than or comparable to SoTA PI methods for low-resourced KBs with a +25$\times$ smaller backbone LLM, and even approaches the performance of +supervised methods. Our code and data are available at +https://github.com/THU-KEG/KB-Plugin. + +<br>
+
+
+
+
+ + ☆ Style Vectors for Steering Generative Large Language Models EACL2024 + + +<br>
+ This research explores strategies for steering the output of large language +models (LLMs) towards specific styles, such as sentiment, emotion, or writing +style, by adding style vectors to the activations of hidden layers during text +generation. We show that style vectors can be simply computed from recorded +layer activations for input texts in a specific style in contrast to more +complex training-based approaches. Through a series of experiments, we +demonstrate the effectiveness of activation engineering using such style +vectors to influence the style of generated text in a nuanced and +parameterisable way, distinguishing it from prompt engineering. The presented +research constitutes a significant step towards developing more adaptive and +effective AI-empowered interactive systems. + +
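+ A hedged sketch of the activation-based recipe above: average a layer's +hidden activations over styled vs. neutral texts, take the difference as the +style vector, and add a scaled copy back in during generation via a forward +hook. The model, layer index, and scale are assumptions for illustration:
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+layer = model.transformer.h[6]                   # assumed injection layer
+
+def mean_activation(texts):
+    acts = []
+    def hook(mod, inp, out):
+        acts.append(out[0].mean(dim=1))          # average over token positions
+    h = layer.register_forward_hook(hook)
+    with torch.no_grad():
+        for t in texts:
+            model(**tok(t, return_tensors="pt"))
+    h.remove()
+    return torch.cat(acts).mean(dim=0)
+
+style_vec = (mean_activation(["I absolutely love this!", "What a joyful day."])
+             - mean_activation(["The meeting is at noon.", "It is raining."]))
+
+def steer(mod, inp, out):
+    # Shift the layer's hidden states toward the target style.
+    return (out[0] + 4.0 * style_vec,) + out[1:]
+
+h = layer.register_forward_hook(steer)
+ids = model.generate(**tok("The weather today", return_tensors="pt"),
+                     max_new_tokens=20)
+h.remove()
+print(tok.decode(ids[0]))
+```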
+
+ comment: Will be published as findings paper at EACL2024 - 18th Conference of + the European Chapter of the Association for Computational Linguistics +
+
+
+
+
+ + ☆ Nomic Embed: Training a Reproducible Long Context Text Embedder + + +
+ This technical report describes the training of nomic-embed-text-v1, the +first fully reproducible, open-source, open-weights, open-data, 8192 context +length English text embedding model that outperforms both OpenAI Ada-002 and +OpenAI text-embedding-3-small on short and long-context tasks. We release the +training code and model weights under an Apache 2 license. In contrast with +other open-source models, we release a training data loader with 235 million +curated text pairs that allows for the full replication of nomic-embed-text-v1. +You can find code and data to replicate the model at +https://github.com/nomic-ai/contrastors + +
+
+
+
+
+ + ☆ Towards Sustainable Workplace Mental Health: A Novel Approach to Early + Intervention and Support + + +
+ Employee well-being is a critical concern in the contemporary workplace, as +highlighted by the American Psychological Association's 2021 report, indicating +that 71% of employees experience stress or tension. This stress contributes +significantly to workplace attrition and absenteeism, with 61% of attrition and +16% of sick days attributed to poor mental health. A major challenge for +employers is that employees often remain unaware of their mental health issues +until they reach a crisis point, resulting in limited utilization of corporate +well-being benefits. This research addresses this challenge by presenting a +groundbreaking stress detection algorithm that provides real-time support +preemptively. Leveraging automated chatbot technology, the algorithm +objectively measures mental health levels by analyzing chat conversations, +offering personalized treatment suggestions in real-time based on linguistic +biomarkers. The study explores the feasibility of integrating these innovations +into practical learning applications within real-world contexts and introduces +a chatbot-style system integrated into the broader employee experience +platform. This platform, encompassing various features, aims to enhance overall +employee well-being, detect stress in real time, and proactively engage with +individuals to improve support effectiveness, demonstrating a 22% increase when +assistance is provided early. Overall, the study emphasizes the importance of +fostering a supportive workplace environment for employees' mental health. + +
+
+
+
+
+ + ☆ BAT: Learning to Reason about Spatial Sounds with Large Language Models + + +
+ Spatial sound reasoning is a fundamental human skill, enabling us to navigate +and interpret our surroundings based on sound. In this paper we present BAT, +which combines the spatial sound perception ability of a binaural acoustic +scene analysis model with the natural language reasoning capabilities of a +large language model (LLM) to replicate this innate ability. To address the +lack of existing datasets of in-the-wild spatial sounds, we synthesized a +binaural audio dataset using AudioSet and SoundSpaces 2.0. Next, we developed +SpatialSoundQA, a spatial sound-based question-answering dataset, offering a +range of QA tasks that train BAT in various aspects of spatial sound perception +and reasoning. The acoustic front end encoder of BAT is a novel spatial audio +encoder named Spatial Audio Spectrogram Transformer, or Spatial-AST, which by +itself achieves strong performance across sound event detection, spatial +localization, and distance estimation. By integrating Spatial-AST with LLaMA-2 +7B model, BAT transcends standard Sound Event Localization and Detection (SELD) +tasks, enabling the model to reason about the relationships between the sounds +in its environment. Our experiments demonstrate BAT's superior performance on +both spatial sound perception and reasoning, showcasing the immense potential +of LLMs in navigating and interpreting complex spatial audio environments. + +
+
+ comment: Preprint, work in progress +
+
+
+
+
+ + ☆ TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent + Constitution + + +
+ The emergence of LLM-based agents has garnered considerable attention, yet +their trustworthiness remains an under-explored area. As agents can directly +interact with the physical environment, their reliability and safety are +critical. This paper presents an Agent-Constitution-based agent framework, +TrustAgent, an initial investigation into improving the safety dimension of +trustworthiness in LLM-based agents. This framework consists of three +strategies: a pre-planning strategy which injects safety knowledge into the +model prior to plan generation, an in-planning strategy which bolsters safety +during plan generation, and a post-planning strategy which ensures safety +through post-planning inspection. Through experimental analysis, we demonstrate +how these approaches can effectively elevate an LLM agent's safety by +identifying and preventing potential dangers. Furthermore, we explore the +intricate relationships between safety and helpfulness, and between the model's +reasoning ability and its efficacy as a safe agent. This paper underscores the +imperative of integrating safety awareness and trustworthiness into the design +and deployment of LLM-based agents, not only to enhance their performance but +also to ensure their responsible integration into human-centric environments. +Data and code are available at https://github.com/agiresearch/TrustAgent. + +<br>
+
+ comment: 16 pages, 3 figures, 5 tables, comments and suggestions are welcome +
+
+
+
+
+ + ☆ Automating Sound Change Prediction for Phylogenetic Inference: A + Tukanoan Case Study + + +
+ We describe a set of new methods to partially automate linguistic +phylogenetic inference given (1) cognate sets with their respective protoforms +and sound laws, (2) a mapping from phones to their articulatory features and +(3) a typological database of sound changes. We train a neural network on these +sound change data to weight articulatory distances between phones and predict +intermediate sound change steps between historical protoforms and their modern +descendants, replacing a linguistic expert in part of a parsimony-based +phylogenetic inference algorithm. In our best experiments on Tukanoan +languages, this method produces trees with a Generalized Quartet Distance of +0.12 from a tree that used expert annotations, a significant improvement over +other semi-automated baselines. We discuss potential benefits and drawbacks to +our neural approach and parsimony-based tree prediction. We also experiment +with a minimal generalization learner for automatic sound law induction, +finding it comparably effective to sound laws from expert annotation. Our code +is publicly available at https://github.com/cmu-llab/aiscp. + +
+
+ comment: Accepted to LChange 2023 +
+
+
+
+
+ + ☆ How Paralingual are Paralinguistic Representations? A Case Study in + Speech Emotion Recognition + + +
+ Pre-trained Models (PTMs) have facilitated substantial progress in the field +of Speech Emotion Recognition (SER). SER is an area with applications ranging +from Human-Computer Interaction to Healthcare. Recent studies have leveraged +various PTM representations as input features for downstream models for SER. +PTMs specifically pre-trained for paralinguistic tasks have obtained +state-of-the-art (SOTA) performance for SER. However, such PTMs haven't been +evaluated for SER in multilingual settings and have been tested only on +English. We fill this gap by performing a comprehensive comparative study of +five PTMs (TRILLsson, wav2vec2, XLS-R, x-vector, Whisper) to assess the +effectiveness of a paralingual PTM (TRILLsson) for SER across multiple +languages. Representations from TRILLsson achieved the best performance among +all the PTMs. This demonstrates that TRILLsson is able to effectively capture +the various paralinguistic features from speech data for better SER. We also +show that downstream models using TRILLsson representations achieve SOTA +performance in terms of accuracy across various multi-lingual datasets. + +<br>
+
+
+
+
+ + ☆ Deep Active Learning for Data Mining from Conflict Text Corpora + + +
+ High-resolution event data on armed conflict and related processes have +revolutionized the study of political contention with datasets like UCDP GED, +ACLED, etc. However, most of these datasets limit themselves to collecting +spatio-temporal (high-resolution) and intensity data. Information on dynamics, +such as targets, tactics, purposes, etc., is rarely collected owing to the +extreme workload of collecting data. However, most datasets rely on a rich +corpus of textual data, allowing further mining of information connected to +each event. This paper proposes one such approach that is inexpensive and +high-performance, leveraging active learning - an iterative process of +improving a machine learning model based on sequential (guided) human input. +Active learning is employed to step-wise train (fine-tune) a large, +encoder-only language model adapted for extracting sub-classes of events +relating to conflict dynamics. The approach shows performance similar to human +(gold-standard) coding while reducing the amount of required human annotation +by as much as 99%. + +<br>
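+ A minimal sketch of the active-learning loop described above: repeatedly +fine-tune a classifier and route only the least-certain documents to a human +coder. The model choice, features, and batch size are assumptions (the paper +uses a large encoder-only language model rather than this lightweight +stand-in):
+```python
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+docs = ["troops shelled the village", "protesters marched downtown"] * 50
+labels = np.array([1, 0] * 50)           # 1 = armed clash, 0 = protest (toy)
+
+X = TfidfVectorizer().fit_transform(docs)
+labeled = list(range(4))                 # small seed set coded by a human
+pool = [i for i in range(len(docs)) if i not in labeled]
+
+for _ in range(3):
+    clf = LogisticRegression().fit(X[labeled], labels[labeled])
+    probs = clf.predict_proba(X[pool])[:, 1]
+    # Uncertainty sampling: query the documents nearest the decision boundary.
+    queried = [pool[i] for i in np.abs(probs - 0.5).argsort()[:2]]
+    labeled.extend(queried)              # a human would label these here
+    pool = [i for i in pool if i not in queried]
+```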
+
+ comment: 40 pages, 6 figures. Paper presented at the Using LLMs and + Text-as-Data in Political Science Research Workshop at the University of + Barcelona, 29 January 2024 +
+
+
+
+
+ + ☆ An Empirical Analysis of Diversity in Argument Summarization EACL2024 + + +
+ Presenting high-level arguments is a crucial task for fostering participation +in online societal discussions. Current argument summarization approaches miss +an important facet of this task -- capturing diversity -- which is important +for accommodating multiple perspectives. We introduce three aspects of +diversity: those of opinions, annotators, and sources. We evaluate approaches +to a popular argument summarization task called Key Point Analysis, which shows +how these approaches struggle to (1) represent arguments shared by few people, +(2) deal with data from various sources, and (3) align with subjectivity in +human-provided annotations. We find that both general-purpose LLMs and +dedicated KPA models exhibit this behavior, but have complementary strengths. +Further, we observe that diversification of training data may ameliorate +generalization. Addressing diversity in argument summarization requires a mix +of strategies to deal with subjectivity. + +
+
+ comment: Accepted at EACL2024 (main proceedings) +
+
+
+
+
+ + ☆ Decoding Speculative Decoding + + +
+ Speculative Decoding is a widely used technique to speed up inference for +Large Language Models (LLMs) without modifying the outcome. When performing +inference on an LLM, speculative decoding uses a smaller draft model that +generates speculative tokens and then uses the target LLM to verify those draft +tokens. The speedup provided by speculative decoding heavily depends on the +choice of the draft model. It has been widely suggested to select a draft model +that provides a high probability of the generated token being accepted by the +LLM to achieve the highest throughput. However, our experiments indicate the +contrary: throughput diminishes as the probability of the generated tokens +being accepted by the target model increases. To understand this phenomenon, we +perform extensive experiments to characterize the different factors that affect +speculative decoding and how those factors interact and affect the speedups. +Based on our experiments, we describe an analytical model that can be used to +decide the right draft model for a given workload. Further, using our insights, +we design a new draft model for LLaMA-65B that can provide 30% higher +throughput than existing draft models. + +<br>
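+ A hedged sketch of the draft-and-verify control flow behind speculative +decoding, with toy stand-in models; real implementations verify all drafts in +one batched target forward pass and use a probabilistic acceptance rule rather +than this greedy check:
+```python
+import random
+
+def draft_model(ctx):      # cheap model: proposes the next token
+    return random.choice(["the", "cat", "sat", "on"])
+
+def target_model(ctx):     # expensive model: the authoritative next token
+    return ["the", "cat", "sat", "on", "the", "mat"][len(ctx) % 6]
+
+def speculative_decode(prompt, n_tokens, k=4):
+    out = list(prompt)
+    while len(out) < n_tokens:
+        ctx, drafts = list(out), []
+        for _ in range(k):             # draft k tokens cheaply
+            tok = draft_model(ctx)
+            drafts.append(tok)
+            ctx.append(tok)
+        for tok in drafts:             # verify against the target model
+            expected = target_model(out)
+            if tok == expected:
+                out.append(tok)        # accepted: speedup for free
+            else:
+                out.append(expected)   # rejected: take the target's token
+                break                  # and restart drafting from here
+    return out
+
+print(speculative_decode(["the"], 8))
+```
+ The paper's counterintuitive finding concerns the cost side of this loop: a +draft model accurate enough to be accepted often may itself be too slow to pay +off.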
+
+
+
+
+ + ☆ K-Level Reasoning with Large Language Models + + +
+ While Large Language Models (LLMs) have demonstrated their proficiency in +complex reasoning tasks, their performance in dynamic, interactive, and +competitive scenarios - such as business strategy and stock market analysis - +remains underexplored. To bridge this gap, we formally explore the dynamic +reasoning capabilities of LLMs for decision-making in rapidly evolving +environments. We introduce two game theory-based pilot challenges that mirror +the complexities of real-world dynamic decision-making. These challenges are +well-defined, enabling clear, controllable, and precise evaluation of LLMs' +dynamic reasoning abilities. Through extensive experiments, we find that +existing reasoning methods tend to falter in dynamic settings that require +k-level thinking - a key concept not tackled by previous works. To address +this, we propose a novel reasoning approach for LLMs, named "K-Level +Reasoning". This approach adopts the perspective of rivals to recursively +employ k-level thinking based on available historical information, which +significantly improves the prediction accuracy of rivals' subsequent moves and +informs more strategic decision-making. This research not only sets a robust +quantitative benchmark for the assessment of dynamic reasoning but also +markedly enhances the proficiency of LLMs in dynamic contexts. + +
+
+
+
+
+ + ☆ Multilingual Gradient Word-Order Typology from Universal Dependencies EACL 2024 + + +
+ While information from the field of linguistic typology has the potential to +improve performance on NLP tasks, reliable typological data is a prerequisite. +Existing typological databases, including WALS and Grambank, suffer from +inconsistencies primarily caused by their categorical format. Furthermore, +typological categorisations by definition differ significantly from the +continuous nature of phenomena, as found in natural language corpora. In this +paper, we introduce a new seed dataset made up of continuous-valued data, +rather than categorical data, that can better reflect the variability of +language. While this initial dataset focuses on word-order typology, we also +present the methodology used to create the dataset, which can be easily adapted +to generate data for a broader set of features and languages. + +
+
+ comment: EACL 2024 +
+
+
+
+
+ + ☆ Distractor Generation for Multiple-Choice Questions: A Survey of + Methods, Datasets, and Evaluation + + +
+ Distractors are important in learning evaluation. This paper surveys +distractor generation tasks using English multiple-choice question datasets for +textual and multimodal contexts. In particular, this paper presents a thorough +literature review of the recent studies on distractor generation tasks, +discusses multiple choice components and their characteristics, analyzes the +related datasets, and summarizes the evaluation metrics of distractor +generation. Our investigation reveals that more than half of the datasets are +human-generated from educational sources in specific domains such as Science +and English, and are largely text-based, with a lack of open-domain and +multimodal datasets. + +<br>
+
+
+
+
+ + ☆ A Hybrid Strategy for Chat Transcript Summarization + + +
+ Text summarization is the process of condensing a piece of text to fewer +sentences, while still preserving its content. Chat transcript, in this +context, is a textual copy of a digital or online conversation between a +customer (caller) and agent(s). This paper presents an indigenously (locally) +developed hybrid method that first combines extractive and abstractive +summarization techniques in compressing ill-punctuated or un-punctuated chat +transcripts to produce more readable punctuated summaries and then optimizes +the overall quality of summarization through reinforcement learning. Extensive +testing, evaluations, comparisons, and validation have demonstrated the +efficacy of this approach for large-scale deployment of chat transcript +summarization, in the absence of manually generated reference (annotated) +summaries. + +
+
+ comment: Journal Paper (13 Pages, 7 Figures, 4 Tables). arXiv admin note: text + overlap with arXiv:2103.10599 +
+
+
+
+
+ + ☆ Code-Switched Language Identification is Harder Than You Think EACL 2024 + + +
+ Code switching (CS) is a very common phenomenon in written and spoken +communication but one that is handled poorly by many natural language +processing applications. Looking to the application of building CS corpora, we +explore CS language identification (LID) for corpus building. We make the task +more realistic by scaling it to more languages and considering models with +simpler architectures for faster inference. We also reformulate the task as a +sentence-level multi-label tagging problem to make it more tractable. Having +defined the task, we investigate three reasonable models for this task and +define metrics which better reflect desired performance. We present empirical +evidence that no current approach is adequate and finally provide +recommendations for future work in this area. + +
+
+ comment: EACL 2024 +
+
+
+
+
+ + ☆ A Comparative Analysis of Conversational Large Language Models in + Knowledge-Based Text Generation EACL 2024 + + +
+ Generating natural language text from graph-structured data is essential for +conversational information seeking. Semantic triples derived from knowledge +graphs can serve as a valuable source for grounding responses from +conversational agents by providing a factual basis for the information they +communicate. This is especially relevant in the context of large language +models, which offer great potential for conversational interaction but are +prone to hallucinating, omitting, or producing conflicting information. In this +study, we conduct an empirical analysis of conversational large language models +in generating natural language text from semantic triples. We compare four +large language models of varying sizes with different prompting techniques. +Through a series of benchmark experiments on the WebNLG dataset, we analyze the +models' performance and identify the most common issues in the generated +predictions. Our findings show that the capabilities of large language models +in triple verbalization can be significantly improved through few-shot +prompting, post-processing, and efficient fine-tuning techniques, particularly +for smaller models that exhibit lower zero-shot performance. + +
+ comment: Accepted to EACL 2024 +
+ + ☆ AMOR: A Recipe for Building Adaptable Modular Knowledge Agents Through + Process Feedback + + +
+ The notable success of large language models (LLMs) has sparked an upsurge in building language agents to complete various complex tasks. We present AMOR, an agent framework based on open-source LLMs, which reasons with external knowledge bases and adapts to specific domains through human supervision of the reasoning process. AMOR builds its reasoning logic over a finite state machine (FSM) that solves problems through autonomous executions and transitions over disentangled modules. This allows humans to provide direct feedback to the individual modules, and thus naturally forms process supervision. Based on this reasoning and feedback framework, we develop AMOR through two-stage fine-tuning: warm-up and adaptation. The former fine-tunes the LLM with examples automatically constructed from various public datasets, enabling AMOR to generalize across different knowledge environments, while the latter tailors AMOR to specific domains using process feedback. Extensive experiments across multiple domains demonstrate the advantage of AMOR over strong baselines, thanks to its FSM-based reasoning and process feedback mechanism.
+ comment: Work in progress
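+ As a rough illustration of the FSM-style control flow described above, the following sketch wires a few modules into an explicit state machine. The state names, module behavior, and stopping condition are hypothetical stand-ins, not AMOR's actual specification:
+ from typing import Callable, Dict, Tuple
+
+ def decompose(state: dict) -> Tuple[str, dict]:
+     # A real module would call an LLM; here we simply pass the question through.
+     state["subquestions"] = [state["question"]]
+     return "RETRIEVE", state
+
+ def retrieve(state: dict) -> Tuple[str, dict]:
+     # Placeholder for a knowledge-base lookup.
+     state["evidence"] = [f"(passage for: {q})" for q in state["subquestions"]]
+     return "ANSWER", state
+
+ def answer(state: dict) -> Tuple[str, dict]:
+     state["answer"] = "answer grounded in " + "; ".join(state["evidence"])
+     return "DONE", state
+
+ MODULES: Dict[str, Callable[[dict], Tuple[str, dict]]] = {
+     "DECOMPOSE": decompose, "RETRIEVE": retrieve, "ANSWER": answer,
+ }
+
+ def run_agent(question: str) -> dict:
+     node, state = "DECOMPOSE", {"question": question, "trace": []}
+     while node != "DONE":
+         state["trace"].append(node)  # the per-module trace is what makes process feedback possible
+         node, state = MODULES[node](state)
+     return state
+
+ print(run_agent("Which paper introduced the FSM-based agent framework?"))
+ Because each transition is an explicit module call, a human can attach feedback to any single step in the trace rather than only to the final answer.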
+ + ☆ The Queen of England is not England's Queen: On the Lack of Factual + Coherency in PLMs EACL + + +
+ Factual knowledge encoded in Pre-trained Language Models (PLMs) enriches their representations and justifies their use as knowledge bases. Previous work has focused on probing PLMs for factual knowledge by measuring how often they can correctly predict an object entity given a subject and a relation, and on improving fact retrieval by optimizing the prompts used for querying PLMs. In this work, we consider a complementary aspect, namely the coherency of factual knowledge in PLMs, i.e., how often a PLM can predict the subject entity given its own initial prediction of the object entity. This goes beyond evaluating how much PLMs know and focuses on the internal state of knowledge inside them. Our results indicate that PLMs have low coherency using manually written, optimized, and paraphrased prompts, but that including an evidence paragraph leads to substantial improvement. This shows that PLMs fail to model inverse relations and need further enhancements before they can handle retrieving facts from their parameters coherently and be considered as knowledge bases.
+ comment: Accepted to EACL Findings 2024
+ + ☆ The effect of diversity on group decision-making + + +
+ We explore different aspects of cognitive diversity and its effect on the success of group deliberation. To evaluate this, we use 500 dialogues from small, online groups discussing the Wason Card Selection task - the DeliData corpus. Leveraging the corpus, we perform quantitative analysis evaluating three different measures of cognitive diversity. First, we analyse the effect of group size as a proxy measure for diversity. Second, we evaluate the effect of the size of the initial idea pool. Finally, we look into the content of the discussion by analysing discussed solutions, discussion patterns, and how conversational probing can improve those characteristics. Despite the reputation of groups for compounding bias, we show that small groups can, through dialogue, overcome intuitive biases and improve individual decision-making. Across a large sample and different operationalisations, we consistently find that greater cognitive diversity is associated with more successful group deliberation. Code and data used for the analysis are available in the anonymised repository: https://anonymous.4open.science/r/cogsci24-FD6D
+ + ☆ Different Tastes of Entities: Investigating Human Label Variation in + Named Entity Annotations EACL 2024 + + +
+ Named Entity Recognition (NER) is a key information extraction task with a +long-standing tradition. While recent studies address and aim to correct +annotation errors via re-labeling efforts, little is known about the sources of +human label variation, such as text ambiguity, annotation error, or guideline +divergence. This is especially the case for high-quality datasets and beyond +English CoNLL03. This paper studies disagreements in expert-annotated named +entity datasets for three languages: English, Danish, and Bavarian. We show +that text ambiguity and artificial guideline changes are dominant factors for +diverse annotations among high-quality revisions. We survey student annotations +on a subset of difficult entities and substantiate the feasibility and +necessity of manifold annotations for understanding named entity ambiguities +from a distributional perspective. + +
+ comment: 9 pages; Accepted at UnImplicit workshop at EACL 2024 +
+ + ☆ Sequence Shortening for Context-Aware Machine Translation ACL + + +
+ Context-aware Machine Translation aims to improve translations of sentences by incorporating surrounding sentences as context. Towards this task, two main architectures have been applied, namely single-encoder (based on concatenation) and multi-encoder models. In this study, we show that a special case of the multi-encoder architecture, where the latent representation of the source sentence is cached and reused as the context in the next step, achieves higher accuracy on the contrastive datasets (where the models have to rank the correct translation among the provided sentences) and BLEU and COMET scores comparable to the single- and multi-encoder approaches. Furthermore, we investigate the application of Sequence Shortening to the cached representations. We test three pooling-based shortening techniques and introduce two novel methods, Latent Grouping and Latent Selecting, where the network learns to group tokens or to select the tokens to be cached as context. Our experiments show that the two methods achieve BLEU and COMET scores and contrastive accuracies competitive with the other tested methods, while potentially allowing for higher interpretability and reducing the growth of memory requirements with increased context size.
+ comment: Findings of the ACL: EACL 2024
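+ To make the pooling-based shortening concrete, here is a small PyTorch sketch that mean-pools a cached context representation into fixed-size groups before reuse. The group size and the choice of mean pooling are illustrative, and the learned Latent Grouping/Selecting methods are not reproduced here:
+ import torch
+
+ def shorten_by_mean_pooling(cached: torch.Tensor, group_size: int) -> torch.Tensor:
+     # cached: (batch, seq_len, hidden) latent representation of the previous sentence.
+     # Returns (batch, ceil(seq_len / group_size), hidden).
+     b, t, h = cached.shape
+     pad = (-t) % group_size  # zero-pad so seq_len divides evenly (slightly biases the last group)
+     if pad:
+         cached = torch.cat([cached, cached.new_zeros(b, pad, h)], dim=1)
+     return cached.view(b, -1, group_size, h).mean(dim=2)
+
+ ctx = torch.randn(2, 10, 16)  # toy cached representation
+ print(shorten_by_mean_pooling(ctx, group_size=4).shape)  # torch.Size([2, 3, 16])
+ The shortened cache grows with the number of context sentences divided by the group size, which is the memory saving the abstract alludes to.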
+ + ☆ On Measuring Context Utilization in Document-Level MT Systems + + +
+ Document-level translation models are usually evaluated using general metrics such as BLEU, which are not informative about the benefits of context. Current work on context-aware evaluation, such as contrastive methods, only measures translation accuracy on words that need context for disambiguation. Such measures cannot reveal whether the translation model uses the correct supporting context. We propose to complement accuracy-based evaluation with measures of context utilization. We find that perturbation-based analysis (comparing models' performance when provided with correct versus random context) is an effective measure of overall context utilization. For a finer-grained phenomenon-specific evaluation, we propose to measure how much the supporting context contributes to handling context-dependent discourse phenomena. We show that automatically annotated supporting context gives similar conclusions to human-annotated context and can be used as an alternative for cases where human annotations are not available. Finally, we highlight the importance of using discourse-rich datasets when assessing context utilization.
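+ The perturbation-based measure described above reduces to a score difference between runs with the true context and runs with randomly sampled context. In this sketch, translate_and_score is a stub standing in for a document-level MT system plus a quality metric such as COMET:
+ import random
+
+ def translate_and_score(sentence: str, context: str) -> float:
+     # Stub: a real setup would translate `sentence` given `context` and score the output.
+     return 0.8 if "she" in context else 0.6  # toy: the context resolves a pronoun
+
+ def context_utilization(docs):
+     # docs: list of (sentence, correct_context) pairs.
+     drop = 0.0
+     for sentence, correct_ctx in docs:
+         random_ctx = random.choice([c for _, c in docs])
+         drop += translate_and_score(sentence, correct_ctx) - translate_and_score(sentence, random_ctx)
+     return drop / len(docs)
+
+ data = [("Elle est partie.", "Mary said she was tired."),
+         ("Il pleut.", "The weather turned bad.")]
+ print(context_utilization(data))  # > 0 suggests the model actually uses the context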
+ + ☆ StepCoder: Improve Code Generation with Reinforcement Learning from + Compiler Feedback + + +
+ The advancement of large language models (LLMs) has significantly propelled the field of code generation. Previous work integrated reinforcement learning (RL) with compiler feedback to explore the output space of LLMs and enhance code generation quality. However, the lengthy code generated by LLMs in response to complex human requirements makes RL exploration a challenge. Also, since the unit tests may not cover the complicated code, optimizing LLMs by using these unexecuted code snippets is ineffective. To tackle these challenges, we introduce StepCoder, a novel RL framework for code generation, consisting of two main components: CCCS addresses the exploration challenge by breaking the long-sequence code generation task into a Curriculum of Code Completion Subtasks, while FGO optimizes the model by masking the unexecuted code segments, providing Fine-Grained Optimization. In addition, we construct the APPS+ dataset for RL training, which is manually verified to ensure the correctness of unit tests. Experimental results show that our method improves the ability to explore the output space and outperforms state-of-the-art approaches on the corresponding benchmarks.
+ comment: 13 pages, 5 figures
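+ The FGO idea of optimizing only on executed code can be sketched as a masked token-level loss. The executed-line detection (e.g., from a coverage trace of the unit tests) is stubbed out below, and the shapes and loss form are illustrative rather than StepCoder's actual implementation:
+ import torch
+
+ def masked_token_loss(logprobs, advantages, executed_mask):
+     # logprobs:      (seq_len,) log-probabilities of the sampled code tokens
+     # advantages:    (seq_len,) per-token advantage estimates
+     # executed_mask: (seq_len,) 1.0 where the token lies on a line the unit
+     #                tests executed; 0.0 tokens are excluded from the update
+     masked = -(logprobs * advantages) * executed_mask
+     return masked.sum() / executed_mask.sum().clamp(min=1.0)
+
+ logp = torch.log_softmax(torch.randn(8, 50), dim=-1).max(dim=-1).values  # toy values
+ adv = torch.randn(8)
+ mask = torch.tensor([1., 1., 1., 0., 0., 1., 1., 0.])
+ print(masked_token_loss(logp, adv, mask))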
+ + ☆ LLM-based NLG Evaluation: Current Status and Challenges + + +
+ Evaluating natural language generation (NLG) is a vital but challenging +problem in artificial intelligence. Traditional evaluation metrics mainly +capturing content (e.g. n-gram) overlap between system outputs and references +are far from satisfactory, and large language models (LLMs) such as ChatGPT +have demonstrated great potential in NLG evaluation in recent years. Various +automatic evaluation methods based on LLMs have been proposed, including +metrics derived from LLMs, prompting LLMs, and fine-tuning LLMs with labeled +evaluation data. In this survey, we first give a taxonomy of LLM-based NLG +evaluation methods, and discuss their pros and cons, respectively. We also +discuss human-LLM collaboration for NLG evaluation. Lastly, we discuss several +open problems in this area and point out future research directions. + +
+ + ☆ LoTR: Low Tensor Rank Weight Adaptation + + +
+ In this paper we generalize and extend the idea of low-rank adaptation (LoRA) of large language models (LLMs) based on the Transformer architecture. Widely used LoRA-like methods of fine-tuning LLMs are based on matrix factorization of the gradient update. We introduce LoTR, a novel approach for parameter-efficient fine-tuning of LLMs which represents the gradient update to parameters in the form of a tensor decomposition. The low-rank adapter for each layer is constructed as a product of three matrices, and the tensor structure arises from sharing the left and right multipliers of this product among layers. Simultaneous compression of a sequence of layers with a low-rank tensor representation allows LoTR to achieve even better parameter efficiency than LoRA, especially for deep models. Moreover, the core tensor does not depend on the original weight dimension and can be made arbitrarily small, which allows for extremely cheap and fast downstream fine-tuning.
+ comment: Submitted
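+ A minimal reading of the construction above: each layer's update is a product of three matrices with the left and right factors shared across layers, a Tucker-like structure. The sketch below is one interpretation of that idea, not the authors' code; dimension names and initialization are illustrative:
+ import torch
+ import torch.nn as nn
+
+ class LoTRStack(nn.Module):
+     # delta_W_l = A @ G_l @ B, with A (d_out x r) and B (r x d_in) shared
+     # across layers; only the small core G_l (r x r) is layer-specific, so
+     # the parameter count grows much more slowly with depth than per-layer LoRA.
+     def __init__(self, n_layers, d_in, d_out, rank):
+         super().__init__()
+         self.A = nn.Parameter(torch.randn(d_out, rank) * 0.02)
+         self.B = nn.Parameter(torch.zeros(rank, d_in))  # zero init: updates start at zero
+         self.cores = nn.Parameter(torch.randn(n_layers, rank, rank) * 0.02)
+
+     def delta(self, layer: int) -> torch.Tensor:
+         return self.A @ self.cores[layer] @ self.B  # (d_out, d_in)
+
+ stack = LoTRStack(n_layers=12, d_in=768, d_out=768, rank=8)
+ x = torch.randn(4, 768)
+ print((x @ stack.delta(3).T).shape)  # adapter contribution at layer 3: torch.Size([4, 768])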
+ + ☆ Dive into the Chasm: Probing the Gap between In- and Cross-Topic + Generalization EACL 2024 + + +
+ Pre-trained language models (LMs) perform well in In-Topic setups, where +training and testing data come from the same topics. However, they face +challenges in Cross-Topic scenarios where testing data is derived from distinct +topics -- such as Gun Control. This study analyzes various LMs with three +probing-based experiments to shed light on the reasons behind the In- vs. +Cross-Topic generalization gap. Thereby, we demonstrate, for the first time, +that generalization gaps and the robustness of the embedding space vary +significantly across LMs. Additionally, we assess larger LMs and underscore the +relevance of our analysis for recent models. Overall, diverse pre-training +objectives, architectural regularization, or data deduplication contribute to +more robust LMs and diminish generalization gaps. Our research contributes to a +deeper understanding and comparison of language models across different +generalization scenarios. + +
+ comment: EACL 2024 +
+ + ☆ Continual Learning for Large Language Models: A Survey + + +
+ Large language models (LLMs) are not amenable to frequent re-training, due to the high training costs arising from their massive scale. However, updates are necessary to endow LLMs with new skills and keep them up-to-date with rapidly evolving human knowledge. This paper surveys recent works on continual learning for LLMs. Due to the unique nature of LLMs, we catalog continual learning techniques in a novel multi-staged categorization scheme, involving continual pretraining, instruction tuning, and alignment. We contrast continual learning for LLMs with simpler adaptation methods used in smaller models, as well as with other enhancement strategies like retrieval-augmented generation and model editing. Moreover, informed by a discussion of benchmarks and evaluation, we identify several challenges and future work directions for this crucial task.
+ + ☆ What Makes Medical Claims (Un)Verifiable? Analyzing Entity and Relation + Properties for Fact Verification EACL 2024 + + +
+ Biomedical claim verification fails if no evidence can be discovered. In these cases, the fact-checking verdict remains unknown and the claim is unverifiable. To improve upon this, we have to understand whether there are any claim properties that impact its verifiability. In this work we assume that entities and relations define the core variables in a biomedical claim's anatomy and analyze whether their properties help us to differentiate verifiable from unverifiable claims. In a study with trained annotation experts, we prompt them to find evidence for biomedical claims and observe how they refine search queries for their evidence search. This leads to the first corpus for scientific fact verification annotated with subject-relation-object triplets, evidence documents, and fact-checking verdicts (the BEAR-Fact corpus). We find (1) that discovering evidence for negated claims (e.g., X-does-not-cause-Y) is particularly challenging. Further, we see that annotators process queries mostly by adding constraints to the search and by normalizing entities to canonical names. (2) We compare our in-house annotations with a small crowdsourcing setting where we employ medical experts and laypeople. We find that domain expertise does not have a substantial effect on the reliability of annotations. Finally, (3) we demonstrate that it is possible to reliably estimate the success of evidence retrieval purely from the claim text (F1 = .82), whereas identifying unverifiable claims proves more challenging (F1 = .27). The dataset is available at http://www.ims.uni-stuttgart.de/data/bioclaim.
+ comment: Accepted at EACL 2024
+ ☆ Describing Images Fast and Slow: Quantifying and Predicting the Variation in Human Signals during Visuo-Linguistic Processes EACL 2024
+ There is an intricate relation between the properties of an image and how +humans behave while describing the image. This behavior shows ample variation, +as manifested in human signals such as eye movements and when humans start to +describe the image. Despite the value of such signals of visuo-linguistic +variation, they are virtually disregarded in the training of current pretrained +models, which motivates further investigation. Using a corpus of Dutch image +descriptions with concurrently collected eye-tracking data, we explore the +nature of the variation in visuo-linguistic signals, and find that they +correlate with each other. Given this result, we hypothesize that variation +stems partly from the properties of the images, and explore whether image +representations encoded by pretrained vision encoders can capture such +variation. Our results indicate that pretrained models do so to a +weak-to-moderate degree, suggesting that the models lack biases about what +makes a stimulus complex for humans and what leads to variations in human +outputs. + +
+ comment: To appear in EACL 2024 +
+ + ☆ Beyond the Answers: Reviewing the Rationality of Multiple Choice + Question Answering for the Evaluation of Large Language Models + + +
+ In the field of natural language processing (NLP), Large Language Models (LLMs) have precipitated a paradigm shift, markedly enhancing performance in natural language generation tasks. Despite these advancements, the comprehensive evaluation of LLMs remains an inevitable challenge for the community. Recently, the utilization of Multiple Choice Question Answering (MCQA) as a benchmark for LLMs has gained considerable traction. This study investigates the rationality of MCQA as an evaluation method for LLMs. If LLMs genuinely understand the semantics of questions, their performance should exhibit consistency across the varied configurations derived from the same questions. Contrary to this expectation, our empirical findings suggest a notable disparity in the consistency of LLM responses, which we define as REsponse VAriability Syndrome (REVAS) of the LLMs. This indicates that current MCQA-based benchmarks may not adequately capture the true capabilities of LLMs and underscores the need for more robust evaluation mechanisms in assessing their performance.
+ comment: 13 pages, 4 figures
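+ A consistency probe of the kind the abstract argues for can be run by re-asking the same question under shuffled option orderings and checking whether the selected answer is stable. Here, query_model is a stub (with a fake position bias) standing in for an actual LLM call:
+ import random
+ from collections import Counter
+
+ def query_model(question, options):
+     # Stub for an LLM call; simulated with a bias toward the first option.
+     weights = [2.0] + [1.0] * (len(options) - 1)
+     return random.choices(options, weights=weights, k=1)[0]
+
+ def consistency_rate(question, options, n_perms=20):
+     answers = []
+     for _ in range(n_perms):
+         shuffled = random.sample(options, k=len(options))
+         answers.append(query_model(question, shuffled))
+     # Fraction of runs agreeing with the modal answer; 1.0 = fully consistent.
+     return Counter(answers).most_common(1)[0][1] / n_perms
+
+ opts = ["Paris", "London", "Berlin", "Madrid"]
+ print(consistency_rate("What is the capital of France?", opts))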
+ ☆ Skip \n: A simple method to reduce hallucination in Large Vision-Language Models
+ Recent advancements in large vision-language models (LVLMs) have demonstrated impressive capability in understanding visual information with human language. Despite these advances, LVLMs still face challenges with multimodal hallucination, such as generating text descriptions of objects that are not present in the visual information. However, the underlying fundamental reasons for multimodal hallucination remain poorly explored. In this paper, we propose a new perspective, suggesting that the inherent biases in LVLMs might be a key factor in hallucinations. Specifically, we systematically identify a semantic shift bias related to paragraph breaks ('\n\n'), where the content before and after '\n\n' in the training data frequently exhibits significant semantic changes. This pattern leads the model to infer that the content following '\n\n' should be obviously different from the preceding, less hallucinatory, content, thereby increasing the probability of hallucinatory descriptions subsequent to the '\n\n'. We have validated this hypothesis on multiple publicly available LVLMs. Besides, we find that deliberately inserting '\n\n' into the generated description can induce more hallucinations. A simple method is proposed to effectively mitigate the hallucination of LVLMs by skipping the output of '\n'.
+ comment: Technical Report
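+ Operationally, the mitigation amounts to banning the paragraph-break token during decoding. The sketch below shows the core operation on a next-token logits vector; mapping '\n\n' to concrete token ids depends on the tokenizer, so the id used here is a placeholder:
+ import torch
+
+ def suppress_tokens(logits: torch.Tensor, banned_ids) -> torch.Tensor:
+     # Set banned token logits to -inf so they can never be sampled.
+     logits = logits.clone()
+     logits[banned_ids] = float("-inf")
+     return logits
+
+ PARAGRAPH_BREAK_IDS = [198]  # placeholder; look up the real id(s) via the tokenizer
+
+ logits = torch.randn(32000)  # toy next-token logits from an LVLM decoder
+ probs = torch.softmax(suppress_tokens(logits, PARAGRAPH_BREAK_IDS), dim=-1)
+ print(probs[PARAGRAPH_BREAK_IDS])  # tensor([0.])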
+ + ☆ Two Approaches to Diachronic Normalization of Polish Texts + + +
+ This paper discusses two approaches to the diachronic normalization of Polish +texts: a rule-based solution that relies on a set of handcrafted patterns, and +a neural normalization model based on the text-to-text transfer transformer +architecture. The training and evaluation data prepared for the task are +discussed in detail, along with experiments conducted to compare the proposed +normalization solutions. A quantitative and qualitative analysis is made. It is +shown that at the current stage of inquiry into the problem, the rule-based +solution outperforms the neural one on 3 out of 4 variants of the prepared +dataset, although in practice both approaches have distinct advantages and +disadvantages. + +
+ comment: Accepted to the LaTeCH-CLfL 2024 workshop +
+ + ☆ Can MLLMs Perform Text-to-Image In-Context Learning? + + +
+ The evolution from Large Language Models (LLMs) to Multimodal Large Language Models (MLLMs) has spurred research into extending In-Context Learning (ICL) to its multimodal counterpart. Existing studies have primarily concentrated on image-to-text ICL. However, Text-to-Image ICL (T2I-ICL), with its unique characteristics and potential applications, remains underexplored. To address this gap, we formally define the task of T2I-ICL and present CoBSAT, the first T2I-ICL benchmark dataset, encompassing ten tasks. Utilizing our dataset to benchmark six state-of-the-art MLLMs, we uncover considerable difficulties MLLMs encounter in solving T2I-ICL. We identify the primary challenges as the inherent complexity of multimodality and image generation. To overcome these challenges, we explore strategies like fine-tuning and Chain-of-Thought prompting, demonstrating notable improvements. Our code and dataset are available at https://github.com/UW-Madison-Lee-Lab/CoBSAT.
+ + ☆ The Human and the Mechanical: logos, truthfulness, and ChatGPT + + +
+ The paper addresses the question of whether it is appropriate to talk about 'mechanical minds' at all, and whether ChatGPT models can indeed be thought of as realizations of that. Our paper adds a semantic argument to the current debate. The act of human assertion requires the formation of a veridicality judgment. Modification of assertions with modals (John must be at home) and the use of subjective elements (John is obviously at home) indicate that the speaker is manipulating her judgments and, in a cooperative context, intends her epistemic state to be transparent to the addressee. Veridicality judgments are formed on the basis of two components: (i) evidence that relates to reality (exogenous evidence) and (ii) endogenous evidence, such as preferences and private beliefs. 'Mechanical minds' lack these two components: (i) they do not relate to reality and (ii) they do not have endogenous evidence. Therefore they lack the ability to form a belief about the world and a veridicality judgment altogether. They can only mimic that judgment, but the output is not grounded in the very foundations for it.
+ comment: Under submission
+ + ☆ In-Context Learning for Few-Shot Nested Named Entity Recognition + + +
+ In nested Named Entity Recognition (NER), entities are nested within each other, thus requiring more data annotations to address. This has led to the development of few-shot nested NER, where the prevalence of pretrained language models with in-context learning (ICL) offers promising solutions. In this work, we introduce an effective and innovative ICL framework for the setting of few-shot nested NER. We improve the ICL prompt by devising a novel example demonstration selection mechanism, the EnDe retriever. In the EnDe retriever, we employ contrastive learning to perform three types of representation learning, in terms of semantic similarity, boundary similarity, and label similarity, to generate high-quality demonstration examples. Extensive experiments over three nested NER and four flat NER datasets demonstrate the efficacy of our system.
+ comment: 5 figures
+ + ☆ Towards a Unified Language Model for Knowledge-Intensive Tasks Utilizing + External Corpus + + +
+ The advent of large language models (LLMs) has showcased their efficacy across various domains, yet they often hallucinate, especially in knowledge-intensive tasks that require external knowledge sources. To improve the factual accuracy of language models, retrieval-augmented generation (RAG) has emerged as a popular solution. However, traditional retrieval modules often rely on large-scale document indexes, which can be disconnected from generative tasks. Through the generative retrieval (GR) approach, language models can achieve superior retrieval performance by directly generating relevant document identifiers (DocIDs). However, the relationship between GR and downstream tasks, as well as the potential of LLMs in GR, remains unexplored. In this paper, we present a unified language model that utilizes an external corpus to handle various knowledge-intensive tasks by seamlessly integrating generative retrieval, closed-book generation, and RAG. In order to achieve effective retrieval and generation through a unified continuous decoding process, we introduce the following mechanisms: (1) a ranking-oriented DocID decoding strategy, which improves ranking ability by directly learning from a DocID ranking list; (2) a continuous generation strategy to facilitate effective and efficient RAG; (3) well-designed auxiliary DocID understanding tasks to enhance the model's comprehension of DocIDs and their relevance to downstream tasks. Our approach is evaluated on the widely used KILT benchmark using two variants of backbone models: an encoder-decoder T5 model and a decoder-only LLM, Llama2. Experimental results showcase the superior performance of our models in both retrieval and downstream knowledge-intensive tasks.
+ + ☆ Efficient Prompt Caching via Embedding Similarity + + +
+ Large language models (LLMs) have achieved huge success in numerous natural language processing (NLP) tasks. However, they face the challenge of significant resource consumption during inference. In this paper, we aim to improve the inference efficiency of LLMs via prompt caching, i.e., if the current prompt can be answered by the same response as a previous prompt, one can directly utilize that previous response without calling the LLM. Specifically, we focus on the prediction accuracy of prompt caching for single-round question-answering tasks via embedding similarity. Existing embeddings of prompts mostly capture whether two prompts are semantically similar, which is not necessarily equivalent to whether the same response can answer them. Therefore, we propose a distillation-based method to fine-tune existing embeddings for better caching prediction. Theoretically, we provide finite-sample guarantees for the convergence of our method under different types of loss functions. Empirically, we carefully construct a hard dataset based on Kwiatkowski et al. (2019) on which an existing embedding model (Wang et al., 2022) only achieves an AUC of 0.51. We then fine-tune this embedding model, which significantly improves the AUC of caching prediction from 0.51 to 0.81. We also conduct simulations demonstrating that our trained models achieve better caching efficiency than the previous embedding model.
+ comment: 21 pages, 3 figures
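+ The caching decision reduces to a nearest-neighbor lookup in embedding space with a similarity threshold. In the sketch below, embed is a stub (a fine-tuned, e.g. distilled, embedding would replace it in practice) and the threshold value is arbitrary:
+ import numpy as np
+
+ def embed(prompt: str) -> np.ndarray:
+     # Stub embedding: a deterministic pseudo-random unit vector per prompt.
+     rng = np.random.default_rng(abs(hash(prompt)) % (2 ** 32))
+     v = rng.normal(size=128)
+     return v / np.linalg.norm(v)
+
+ class PromptCache:
+     def __init__(self, threshold=0.9):
+         self.threshold = threshold
+         self.keys, self.responses = [], []
+
+     def lookup(self, prompt):
+         if not self.keys:
+             return None
+         q = embed(prompt)
+         sims = np.stack(self.keys) @ q  # cosine similarity, since vectors are unit-norm
+         i = int(np.argmax(sims))
+         return self.responses[i] if sims[i] >= self.threshold else None
+
+     def insert(self, prompt, response):
+         self.keys.append(embed(prompt))
+         self.responses.append(response)
+
+ cache = PromptCache()
+ cache.insert("What is 2+2?", "4")
+ print(cache.lookup("What is 2+2?"))      # hit: "4"
+ print(cache.lookup("Capital of Peru?"))  # miss: None
+ The paper's contribution lies precisely in making the embedding predictive of response reuse rather than of semantic similarity; the stub above sidesteps that learning problem.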
+ + ☆ Streaming Sequence Transduction through Dynamic Compression + + +
+ We introduce STAR (Stream Transduction with Anchor Representations), a novel +Transformer-based model designed for efficient sequence-to-sequence +transduction over streams. STAR dynamically segments input streams to create +compressed anchor representations, achieving nearly lossless compression (12x) +in Automatic Speech Recognition (ASR) and outperforming existing methods. +Moreover, STAR demonstrates superior segmentation and latency-quality +trade-offs in simultaneous speech-to-text tasks, optimizing latency, memory +footprint, and quality. + +
+ + ☆ LLM-Detector: Improving AI-Generated Chinese Text Detection with + Open-Source LLM Instruction Tuning + + +
+ ChatGPT and other general large language models (LLMs) have achieved remarkable success, but they have also raised concerns about the misuse of AI-generated texts. Existing AI-generated text detection models, such as those based on BERT and RoBERTa, are prone to in-domain over-fitting, leading to poor out-of-domain (OOD) detection performance. In this paper, we first collected Chinese text responses generated by human experts and 9 types of LLMs in answer to questions from multiple domains, and further created a dataset that mixed human-written sentences and sentences polished by LLMs. We then proposed LLM-Detector, a novel method for both document-level and sentence-level text detection through instruction tuning of LLMs. Our method leverages the wealth of knowledge LLMs acquire during pre-training, enabling them to detect the text they generate. Instruction tuning aligns the model's responses with the user's expected text detection tasks. Experimental results show that previous methods struggle with sentence-level AI-generated text detection and OOD detection. In contrast, our proposed method not only significantly outperforms baseline methods in both sentence-level and document-level text detection but also demonstrates strong generalization capabilities. Furthermore, since LLM-Detector is trained on open-source LLMs, it is easy to customize for deployment.
+ comment: 17 pages, 13 tables, 7 figures
+ + ☆ CABINET: Content Relevance based Noise Reduction for Table Question + Answering ICLR 2024 + + +
+ Table understanding capability of Large Language Models (LLMs) has been +extensively studied through the task of question-answering (QA) over tables. +Typically, only a small part of the whole table is relevant to derive the +answer for a given question. The irrelevant parts act as noise and are +distracting information, resulting in sub-optimal performance due to the +vulnerability of LLMs to noise. To mitigate this, we propose CABINET (Content +RelevAnce-Based NoIse ReductioN for TablE QuesTion-Answering) - a framework to +enable LLMs to focus on relevant tabular data by suppressing extraneous +information. CABINET comprises an Unsupervised Relevance Scorer (URS), trained +differentially with the QA LLM, that weighs the table content based on its +relevance to the input question before feeding it to the question-answering LLM +(QA LLM). To further aid the relevance scorer, CABINET employs a weakly +supervised module that generates a parsing statement describing the criteria of +rows and columns relevant to the question and highlights the content of +corresponding table cells. CABINET significantly outperforms various tabular +LLM baselines, as well as GPT3-based in-context learning methods, is more +robust to noise, maintains outperformance on tables of varying sizes, and +establishes new SoTA performance on WikiTQ, FeTaQA, and WikiSQL datasets. We +release our code and datasets at https://github.com/Sohanpatnaik106/CABINET_QA. + +
+ comment: Accepted at ICLR 2024 (spotlight) +
+ + ☆ AccentFold: A Journey through African Accents for Zero-Shot ASR + Adaptation to Target Accents EACL + + +
+ Despite advancements in speech recognition, accented speech remains challenging. While previous approaches have focused on modeling techniques or creating accented speech datasets, gathering sufficient data for the multitude of accents, particularly in the African context, remains impractical due to their sheer diversity and associated budget constraints. To address these challenges, we propose AccentFold, a method that exploits spatial relationships between learned accent embeddings to improve downstream Automatic Speech Recognition (ASR). Our exploratory analysis of speech embeddings representing 100+ African accents reveals interesting spatial accent relationships highlighting geographic and genealogical similarities, capturing consistent phonological and morphological regularities, all learned empirically from speech. Furthermore, we discover accent relationships previously uncharacterized by the Ethnologue. Through empirical evaluation, we demonstrate the effectiveness of AccentFold by showing that, for out-of-distribution (OOD) accents, sampling accent subsets for training based on AccentFold information outperforms strong baselines with a relative WER improvement of 4.6%. AccentFold presents a promising approach for improving ASR performance on accented speech, particularly in the context of African accents, where data scarcity and budget constraints pose significant challenges. Our findings emphasize the potential of leveraging linguistic relationships to improve zero-shot ASR adaptation to target accents.
+ comment: Accepted to EACL Findings 2024
+ + ☆ A Multi-Agent Conversational Recommender System + + +
+ Due to their strong capabilities in conducting fluent, multi-turn conversations with users, Large Language Models (LLMs) have the potential to further improve the performance of Conversational Recommender Systems (CRS). Unlike the aimless chit-chat that LLMs excel at, CRS has a clear target, so it is imperative to control the dialogue flow in the LLM to successfully recommend appropriate items to the users. Furthermore, user feedback in CRS can assist the system in better modeling user preferences, which has been ignored by existing studies. However, simply prompting an LLM to conduct conversational recommendation cannot address the above two key challenges. In this paper, we propose the Multi-Agent Conversational Recommender System (MACRS), which contains two essential modules. First, we design a multi-agent act planning framework, which can control the dialogue flow based on four LLM-based agents. This cooperative multi-agent framework generates various candidate responses based on different dialogue acts and then chooses the most appropriate one as the system response, which helps MACRS plan suitable dialogue acts. Second, we propose a user feedback-aware reflection mechanism which leverages user feedback to reason about errors made in previous turns, adjust the dialogue act planning, and extract higher-level user information from implicit semantics. We conduct extensive experiments based on a user simulator to demonstrate the effectiveness of MACRS in recommendation and user preference collection. Experimental results illustrate that MACRS improves the user interaction experience compared to directly using LLMs.
+ + ☆ PokéLLMon: A Human-Parity Agent for Pokémon Battles with Large + Language Models + + +
+ We introduce PokéLLMon, the first LLM-embodied agent that achieves human-parity performance in tactical battle games, as demonstrated in Pokémon battles. The design of PokéLLMon incorporates three key strategies: (i) in-context reinforcement learning that instantly consumes text-based feedback derived from battles to iteratively refine the policy; (ii) knowledge-augmented generation that retrieves external knowledge to counteract hallucination and enables the agent to act timely and properly; (iii) consistent action generation to mitigate the panic switching phenomenon when the agent faces a powerful opponent and wants to elude the battle. We show that online battles against humans demonstrate PokéLLMon's human-like battle strategies and just-in-time decision making, achieving a 49% win rate in the Ladder competitions and a 56% win rate in the invited battles. Our implementation and playable battle logs are available at: https://github.com/git-disl/PokeLLMon.
+ comment: 10 pages
+ + ☆ DTS-SQL: Decomposed Text-to-SQL with Small Large Language Models + + +
+ Leading models for the text-to-SQL task heavily rely on proprietary Large +Language Models (LLMs), posing concerns over data privacy. Closing the +performance gap between small open-source models and large proprietary models +is crucial to mitigate this reliance. To this end, we introduce a novel +two-stage fine-tuning approach that decomposes the task into two simpler tasks. +Through comprehensive evaluation on two large cross-domain datasets and two +small LLMs, we show that this approach improves execution accuracy by 3 to 7 +percent, effectively aligning the performance of open-source models with their +proprietary counterparts. + +
+ + ☆ Interpretation of Intracardiac Electrograms Through Textual + Representations + + +
+ Understanding the irregular electrical activity of atrial fibrillation (AFib) has been a key challenge in electrocardiography. For serious cases of AFib, catheter ablations are performed to collect intracardiac electrograms (EGMs). EGMs offer intricately detailed and localized electrical activity of the heart and are an ideal modality for interpretable cardiac studies. Recent advancements in artificial intelligence (AI) have allowed some works to utilize deep learning frameworks to interpret EGMs during AFib. Additionally, language models (LMs) have shown exceptional performance in generalizing to unseen domains, especially in healthcare. In this study, we are the first to leverage pretrained LMs for finetuning of EGM interpolation and AFib classification via masked language modeling. We formulate the EGM as a textual sequence and present competitive performance on AFib classification compared against other representations. Lastly, we provide a comprehensive interpretability study to provide multi-perspective intuition about the model's behavior, which could greatly benefit clinical use.
+ comment: 16 pages, 7 figures
+ + ☆ Vaccine: Perturbation-aware Alignment for Large Language Model + + +
+ The new paradigm of finetuning-as-a-service introduces a new attack surface for Large Language Models (LLMs): a small amount of harmful data uploaded by users can easily trick the finetuning into producing an alignment-broken model. We conduct an empirical analysis and uncover a harmful embedding drift phenomenon, a probable cause of the alignment-broken effect. Inspired by our findings, we propose Vaccine, a perturbation-aware alignment technique to mitigate the security risk of user finetuning. The core idea of Vaccine is to produce invariant hidden embeddings by progressively adding crafted perturbation to them in the alignment phase. This enables the embeddings to withstand harmful perturbation from un-sanitized user data in the finetuning phase. Our results on open-source mainstream LLMs (e.g., Llama2, OPT, Vicuna) demonstrate that Vaccine can boost the robustness of alignment against embedding drift induced by harmful prompts while preserving reasoning ability on benign prompts. Our code is available at https://github.com/git-disl/Vaccine.
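+ The core idea, adding crafted perturbation to hidden embeddings during alignment so that they become invariant, resembles one-step gradient-based adversarial training on the embeddings. The sketch below follows that reading with a toy classifier; it is a loose reconstruction under that assumption, not the paper's algorithm:
+ import torch
+ import torch.nn as nn
+
+ model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2))
+ loss_fn = nn.CrossEntropyLoss()
+ opt = torch.optim.SGD(model.parameters(), lr=1e-2)
+
+ def perturbed_step(emb, labels, eps=0.05):
+     # 1) find a loss-increasing direction with respect to the embeddings
+     emb_adv = emb.detach().clone().requires_grad_(True)
+     loss_fn(model(emb_adv), labels).backward()
+     delta = eps * emb_adv.grad.sign()  # crafted, norm-bounded perturbation
+     # 2) train on the perturbed embeddings so representations stay stable under drift
+     opt.zero_grad()
+     loss = loss_fn(model(emb.detach() + delta), labels)
+     loss.backward()
+     opt.step()
+     return loss.item()
+
+ emb = torch.randn(8, 16)  # stand-in for hidden embeddings of alignment data
+ labels = torch.randint(0, 2, (8,))
+ print(perturbed_step(emb, labels))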
+ + ☆ Reasoning Capacity in Multi-Agent Systems: Limitations, Challenges and + Human-Centered Solutions + + +
+ The remarkable performance of large language models (LLMs) on a variety of tasks brings forth many opportunities as well as challenges for utilizing them in production settings. Towards the practical adoption of LLMs, multi-agent systems hold great promise to augment, integrate, and orchestrate LLMs in the larger context of enterprise platforms that use existing proprietary data and models to tackle complex real-world tasks. Despite the tremendous success of these systems, current approaches rely on narrow, single-focus objectives for optimization and evaluation, often overlooking potential constraints in real-world scenarios, including restricted budgets, resources, and time. Furthermore, interpreting, analyzing, and debugging these systems requires different components to be evaluated in relation to one another. This demand is currently not met by existing methodologies. In this position paper, we introduce the concept of reasoning capacity as a unifying criterion to enable the integration of constraints during optimization and to establish connections among different components within the system, which also enables a more holistic and comprehensive approach to evaluation. We present a formal definition of reasoning capacity and illustrate its utility in identifying limitations within each component of the system. We then argue how these limitations can be addressed with a self-reflective process wherein human feedback is used to alleviate shortcomings in reasoning and enhance the overall consistency of the system.
+ + ☆ Let's Negotiate! A Survey of Negotiation Dialogue Systems EACL 2024 + + +
+ Negotiation is a crucial ability in human communication. Recently, there has +been a resurgent research interest in negotiation dialogue systems, whose goal +is to create intelligent agents that can assist people in resolving conflicts +or reaching agreements. Although there have been many explorations into +negotiation dialogue systems, a systematic review of this task has not been +performed to date. We aim to fill this gap by investigating recent studies in +the field of negotiation dialogue systems, and covering benchmarks, evaluations +and methodologies within the literature. We also discuss potential future +directions, including multi-modal, multi-party and cross-cultural negotiation +scenarios. Our goal is to provide the community with a systematic overview of +negotiation dialogue systems and to inspire future research. + +
+ comment: Accepted by EACL 2024 (findings). arXiv admin note: substantial text + overlap with arXiv:2212.09072 +
+ + ☆ Specialized Language Models with Cheap Inference from Limited Domain + Data + + +
+ Large language models have emerged as a versatile tool but are challenging to +apply to tasks lacking large inference budgets and large in-domain training +sets. This work formalizes these constraints and distinguishes four important +variables: the pretraining budget (for training before the target domain is +known), the specialization budget (for training after the target domain is +known), the inference budget, and the in-domain training set size. Across these +settings, we compare different approaches from the machine learning literature. +Limited by inference cost, we find better alternatives to the standard practice +of training very large vanilla transformer models. In particular, we show that +hyper-networks and mixture of experts have better perplexity for large +pretraining budgets, while small models trained on importance sampled datasets +are attractive for large specialization budgets. + +
+ + ☆ Reading Between the Tweets: Deciphering Ideological Stances of + Interconnected Mixed-Ideology Communities + + +
+ Recent advances in NLP have improved our ability to understand the nuanced +worldviews of online communities. Existing research focused on probing +ideological stances treats liberals and conservatives as separate groups. +However, this fails to account for the nuanced views of the organically formed +online communities and the connections between them. In this paper, we study +discussions of the 2020 U.S. election on Twitter to identify complex +interacting communities. Capitalizing on this interconnectedness, we introduce +a novel approach that harnesses message passing when finetuning language models +(LMs) to probe the nuanced ideologies of these communities. By comparing the +responses generated by LMs and real-world survey results, our method shows +higher alignment than existing baselines, highlighting the potential of using +LMs in revealing complex ideologies within and across interconnected +mixed-ideology communities. + +
+ + ♻ ☆ Two Failures of Self-Consistency in the Multi-Step Reasoning of LLMs + + +
+ Large language models (LLMs) have achieved widespread success on a variety of in-context few-shot tasks, but this success is typically evaluated via correctness rather than consistency. We argue that self-consistency is an important criterion for valid multi-step reasoning in tasks where the solution is composed of the answers to multiple sub-steps. We propose two types of self-consistency that are particularly important for multi-step reasoning: hypothetical consistency (a model's ability to predict what its output would be in a hypothetical other context) and compositional consistency (consistency of a model's final outputs when intermediate sub-steps are replaced with the model's outputs for those steps). We demonstrate that multiple variants of the GPT-3/-4 models exhibit poor consistency rates across both types of consistency on a variety of tasks.
+ comment: Accepted to TMLR: https://openreview.net/forum?id=5nBqY1y96B
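+ Compositional consistency, as defined above, can be tested by substituting the model's own sub-step answer back into the composed question and comparing final answers. Here, ask_model is a canned stub in place of an actual LLM call, with one answer made deliberately inconsistent:
+ def ask_model(question: str) -> str:
+     # Stub LLM; a real test would query GPT-style models here.
+     canned = {
+         "What year was Barack Obama born?": "1961",
+         "What is 2024 - 1961?": "63",
+         "What is 2024 minus the year Barack Obama was born?": "62",  # inconsistent
+     }
+     return canned.get(question, "unknown")
+
+ def compositional_consistency(composed: str, sub_q: str, template: str) -> bool:
+     sub_answer = ask_model(sub_q)
+     direct = ask_model(composed)
+     via_substitution = ask_model(template.format(sub_answer))
+     return direct == via_substitution
+
+ print(compositional_consistency(
+     composed="What is 2024 minus the year Barack Obama was born?",
+     sub_q="What year was Barack Obama born?",
+     template="What is 2024 - {}?",
+ ))  # False: the composed answer disagrees with the substituted one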
+ + ♻ ☆ Is Self-Repair a Silver Bullet for Code Generation? ICLR 2024 + + +
+ Large language models have shown remarkable aptitude in code generation, but +still struggle to perform complex tasks. Self-repair -- in which the model +debugs and repairs its own code -- has recently become a popular way to boost +performance in these settings. However, despite its increasing popularity, +existing studies of self-repair have been limited in scope; in many settings, +its efficacy thus remains poorly understood. In this paper, we analyze Code +Llama, GPT-3.5 and GPT-4's ability to perform self-repair on problems taken +from HumanEval and APPS. We find that when the cost of carrying out repair is +taken into account, performance gains are often modest, vary a lot between +subsets of the data, and are sometimes not present at all. We hypothesize that +this is because self-repair is bottlenecked by the model's ability to provide +feedback on its own code; using a stronger model to artificially boost the +quality of the feedback, we observe substantially larger performance gains. +Similarly, a small-scale study in which we provide GPT-4 with feedback from +human participants suggests that even for the strongest models, self-repair +still lags far behind what can be achieved with human-level debugging. + +
+ comment: Accepted to ICLR 2024. Added additional Code Llama experiments and + fixed a data processing error harming Code Llama's reported self-repair + performance on HumanEval +
+ + ♻ ☆ DSPy Assertions: Computational Constraints for Self-Refining Language + Model Pipelines + + +
+ Chaining language model (LM) calls as composable modules is fueling a new way of programming, but ensuring LMs adhere to important constraints requires heuristic "prompt engineering". We introduce LM Assertions, a programming construct for expressing computational constraints that LMs should satisfy. We integrate our constructs into the recent DSPy programming model for LMs and present new strategies that allow DSPy to compile programs with LM Assertions into more reliable and accurate systems. We also propose strategies to use assertions at inference time for automatic self-refinement with LMs. We report on four diverse case studies for text generation and find that LM Assertions improve not only compliance with imposed rules but also downstream task performance, passing constraints up to 164% more often and generating up to 37% more high-quality responses. Our reference implementation of LM Assertions is integrated into DSPy at https://github.com/stanfordnlp/dspy.
+ comment: Arnav*, Manish*, Shangyin* contributed equally to this work
+ + ♻ ☆ NoFunEval: Funny How Code LMs Falter on Requirements Beyond Functional + Correctness + + +
+ Existing evaluation benchmarks for language models of code (code LMs) focus almost exclusively on whether the LMs can generate functionally correct code. In real-world software engineering, developers think beyond functional correctness. They have requirements on "how" a functionality should be implemented to meet overall system design objectives like efficiency, security, and maintainability. They would also trust the code LMs more if the LMs demonstrated a robust understanding of requirements and code semantics. We propose a new benchmark, NoFunEval, to evaluate code LMs on non-functional requirements and simple classification instances for both functional and non-functional requirements. We propose a prompting method, Coding Concepts (CoCo), as a way for a developer to communicate the domain knowledge to the LMs. We conduct an extensive evaluation of twenty-two code LMs. Our finding is that they generally falter when tested on our benchmark, hinting at fundamental blindspots in their training setups. Surprisingly, even the classification accuracy on functional-correctness instances derived from the popular HumanEval benchmark is low, calling into question the depth of their comprehension and the source of their success in generating functionally correct code in the first place. We will release our benchmark and evaluation scripts publicly at https://aka.ms/NoFunEval.
+ comment: Preprint
+ + ♻ ☆ Foundation Model's Embedded Representations May Detect Distribution + Shift + + +
+ Sampling biases can cause distribution shifts between train and test datasets +for supervised learning tasks, obscuring our ability to understand the +generalization capacity of a model. This is especially important considering +the wide adoption of pre-trained foundational neural networks -- whose behavior +remains poorly understood -- for transfer learning (TL) tasks. We present a +case study for TL on the Sentiment140 dataset and show that many pre-trained +foundation models encode different representations of Sentiment140's manually +curated test set $M$ from the automatically labeled training set $P$, +confirming that a distribution shift has occurred. We argue training on $P$ and +measuring performance on $M$ is a biased measure of generalization. Experiments +on pre-trained GPT-2 show that the features learnable from $P$ do not improve +(and in fact hamper) performance on $M$. Linear probes on pre-trained GPT-2's +representations are robust and may even outperform overall fine-tuning, +implying a fundamental importance for discerning distribution shift in +train/test splits for model interpretation. + +
+ comment: 17 pages, 8 figures, 5 tables +
+ + ♻ ☆ How Powerful are Decoder-Only Transformer Neural Models? + + +
+ In this article we prove that the general transformer neural model undergirding modern large language models (LLMs) is Turing complete under reasonable assumptions. This is the first work to directly address the Turing completeness of the underlying technology employed in GPT-x, as past work has focused on the more expressive, full auto-encoder transformer architecture. From this theoretical analysis, we show that the sparsity/compressibility of the word embedding is an important consideration for Turing completeness to hold. We also show that Transformers are a variant of the B machines studied by Hao Wang.
+ + ♻ ☆ CroissantLLM: A Truly Bilingual French-English Language Model + + +
+ We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T English and French tokens, to bring to the research and industrial community a high-performance, fully open-sourced bilingual model that runs swiftly on consumer-grade local hardware. To that end, we pioneer the approach of training an intrinsically bilingual model with a 1:1 English-to-French pretraining data ratio, a custom tokenizer, and bilingual finetuning datasets. We release the training dataset, notably containing a French split with manually curated, high-quality, and varied data sources. To assess performance outside of English, we craft a novel benchmark, FrenchBench, consisting of an array of classification and generation tasks, covering various orthogonal aspects of model performance in the French language. Additionally, rooted in transparency and to foster further Large Language Model research, we release codebases and dozens of checkpoints across various model sizes, training data distributions, and training steps, as well as fine-tuned Chat models and strong translation models. We evaluate our model through the FMTI framework and validate 81% of the transparency criteria, far beyond the scores of even most open initiatives. This work enriches the NLP landscape, breaking away from previous English-centric work in order to strengthen our understanding of multilinguality in language models.
+ + ♻ ☆ Topic Bias in Emotion Classification EACL 2024 + + +
+ Emotion corpora are typically sampled based on keyword/hashtag search or by +asking study participants to generate textual instances. In any case, these +corpora are not uniform samples representing the entirety of a domain. We +hypothesize that this practice of data acquisition leads to unrealistic +correlations between overrepresented topics in these corpora that harm the +generalizability of models. Such topic bias could lead to wrong predictions for +instances like "I organized the service for my aunt's funeral." when funeral +events are over-represented for instances labeled with sadness, despite the +emotion of pride being more appropriate here. In this paper, we study this +topic bias both from the data and the modeling perspective. We first label a +set of emotion corpora automatically via topic modeling and show that emotions +in fact correlate with specific topics. Further, we see that emotion +classifiers are confounded by such topics. Finally, we show that the +established debiasing method of adversarial correction via gradient reversal +mitigates the issue. Our work points out issues with existing emotion corpora +and that more representative resources are required for fair evaluation of +models predicting affective concepts from text. + +
+ comment: accepted to W-NUT at EACL 2024 +
+ + ♻ ☆ Leveraging Professional Radiologists' Expertise to Enhance LLMs' + Evaluation for Radiology Reports + + +
+ In radiology, Artificial Intelligence (AI) has significantly advanced report generation, but automatic evaluation of these AI-produced reports remains challenging. Current metrics, such as Conventional Natural Language Generation (NLG) and Clinical Efficacy (CE), often fall short in capturing the semantic intricacies of clinical contexts or overemphasize clinical details, undermining report clarity. To overcome these issues, our proposed method synergizes the expertise of professional radiologists with Large Language Models (LLMs), like GPT-3.5 and GPT-4. Utilizing In-Context Instruction Learning (ICIL) and Chain of Thought (CoT) reasoning, our approach aligns LLM evaluations with radiologist standards, enabling detailed comparisons between human- and AI-generated reports. This is further enhanced by a regression model that aggregates sentence evaluation scores. Experimental results show that our "Detailed GPT-4 (5-shot)" model achieves a 0.48 score, outperforming the METEOR metric by 0.19, while our "Regressed GPT-4" model shows even greater alignment with expert evaluations, exceeding the best existing metric by a 0.35 margin. Moreover, the robustness of our explanations has been validated through a thorough iterative strategy. We plan to publicly release annotations from radiology experts, setting a new standard for accuracy in future assessments. This underscores the potential of our approach in enhancing the quality assessment of AI-driven medical reports.
+ + ♻ ☆ A Linguistic Comparison between Human and ChatGPT-Generated + Conversations ICPR + + +
+ This study explores linguistic differences between human and LLM-generated +dialogues, using 19.5K dialogues generated by ChatGPT-3.5 as a companion to the +EmpathicDialogues dataset. The research employs Linguistic Inquiry and Word +Count (LIWC) analysis, comparing ChatGPT-generated conversations with human +conversations across 118 linguistic categories. Results show greater +variability and authenticity in human dialogues, but ChatGPT excels in +categories such as social processes, analytical style, cognition, attentional +focus, and positive emotional tone, reinforcing recent findings of LLMs being +"more human than human." However, no significant difference was found in +positive or negative affect between ChatGPT and human dialogues. Classifier +analysis of dialogue embeddings indicates implicit coding of the valence of +affect despite no explicit mention of affect in the conversations. The research +also contributes a novel, companion ChatGPT-generated dataset of conversations +between two independent chatbots, which were designed to replicate a corpus of +human conversations available for open access and used widely in AI research on +language modeling. Our findings increase understanding of ChatGPT's linguistic +capabilities and inform ongoing efforts to distinguish between human and +LLM-generated text, which is critical in detecting AI-generated fakes, +misinformation, and disinformation. + +
+ comment: Preprint. Pending review and feedback from ICPRAI2024 +
+ + ♻ ☆ Towards Efficient and Exact Optimization of Language Model Alignment + + +
+ The alignment of language models with human preferences is vital for their application in real-world tasks. The problem is formulated as optimizing the model's policy to maximize the expected reward that reflects human preferences with minimal deviation from the initial policy. While considered a straightforward solution, reinforcement learning (RL) suffers from high variance in policy updates, which impedes efficient policy improvement. Recently, direct preference optimization (DPO) was proposed to directly optimize the policy from preference data. Though simple to implement, DPO is derived based on the optimal policy that is not assured to be achieved in practice, which undermines its convergence to the intended solution. In this paper, we propose efficient exact optimization (EXO) of the alignment objective. We prove that EXO is guaranteed to optimize in the same direction as the RL algorithms asymptotically for arbitrary parametrization of the policy, while enabling efficient optimization by circumventing the complexities associated with RL algorithms. We compare our method to DPO with both theoretical and empirical analyses, and further demonstrate the advantages of our method over existing approaches on realistic human preference data.
+ comment: 24 pages, 9 figures
+ + ♻ ☆ The Role of Data Curation in Image Captioning + + +
+ Image captioning models are typically trained by treating all samples
+equally, neglecting to account for mismatched or otherwise difficult data
+points. In contrast, recent work has shown the effectiveness of training models
+by scheduling the data using curriculum learning strategies. This paper
+contributes to this direction by actively curating difficult samples in
+datasets without increasing the total number of samples. We explore the effect
+of using three data curation methods within the training process: complete
+removal of a sample, caption replacement, or image replacement via a
+text-to-image generation model. Experiments on the Flickr30K and COCO datasets
+with the BLIP and BEiT-3 models demonstrate that these curation methods do
+indeed yield improved image captioning models, underscoring their efficacy.
+
+
+
+
+
+
+ + ♻ ☆ Scaling Sparse Fine-Tuning to Large Language Models + + +
+ Large Language Models (LLMs) are difficult to fully fine-tune (e.g., with +instructions or human feedback) due to their sheer number of parameters. A +family of parameter-efficient sparse fine-tuning methods have proven promising +in terms of performance but their memory requirements increase proportionally +to the size of the LLMs. In this work, we scale sparse fine-tuning to +state-of-the-art LLMs like LLaMA 2 7B and 13B. We propose SpIEL, a novel sparse +fine-tuning method which, for a desired density level, maintains an array of +parameter indices and the deltas of these parameters relative to their +pretrained values. It iterates over: (a) updating the active deltas, (b) +pruning indices (based on the change of magnitude of their deltas) and (c) +regrowth of indices. For regrowth, we explore two criteria based on either the +accumulated gradients of a few candidate parameters or their approximate +momenta estimated using the efficient SM3 optimizer. We experiment with +instruction-tuning of LLMs on standard dataset mixtures, finding that SpIEL is +often superior to popular parameter-efficient fine-tuning methods like LoRA +(low-rank adaptation) in terms of performance and comparable in terms of run +time. We additionally show that SpIEL is compatible with both quantization and +efficient optimizers, to facilitate scaling to ever-larger model sizes. We +release the code for SpIEL at https://github.com/AlanAnsell/peft and for the +instruction-tuning experiments at https://github.com/ducdauge/sft-llm. + +
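+ As a rough illustration of the update/prune/regrow cycle described above (not
+the reference SpIEL implementation; the pruning and regrowth criteria are
+simplified to plain magnitudes):
+
+import torch
+
+def spiel_step(weight, deltas, idx, grad, lr=1e-3, k_swap=2):
+    # (a) update the active deltas with a gradient step
+    deltas = deltas - lr * grad.flatten()[idx]
+    # (b) prune the k_swap active entries with the smallest delta magnitudes
+    drop = torch.argsort(deltas.abs())[:k_swap]
+    keep = torch.ones(idx.numel(), dtype=torch.bool)
+    keep[drop] = False
+    # (c) regrow the k_swap inactive entries with the largest gradients
+    inactive = torch.ones(weight.numel(), dtype=torch.bool)
+    inactive[idx] = False
+    cand = torch.where(inactive)[0]
+    order = torch.argsort(grad.flatten()[cand].abs(), descending=True)
+    idx = torch.cat([idx[keep], cand[order[:k_swap]]])
+    deltas = torch.cat([deltas[keep], torch.zeros(k_swap)])
+    return deltas, idx
+
+w, idx, deltas = torch.randn(10), torch.tensor([0, 3, 7]), torch.zeros(3)
+deltas, idx = spiel_step(w, deltas, idx, grad=torch.randn(10))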
+
+
+
+
+ + ♻ ☆ Geo-Encoder: A Chunk-Argument Bi-Encoder Framework for Chinese + Geographic Re-Ranking EACL 2024 + + +
+ The Chinese geographic re-ranking task aims to find the most relevant
+addresses among retrieved candidates, which is crucial for location-related
+services such as navigation maps. Unlike general sentences, geographic contexts
+are closely intertwined with geographical concepts, from general spans (e.g.,
+province) to specific spans (e.g., road). Given this feature, we propose an
+innovative framework, namely Geo-Encoder, to more effectively integrate Chinese
+geographical semantics into re-ranking pipelines. Our methodology begins by
+employing off-the-shelf tools to associate text with geographical spans,
+treating them as chunking units. Then, we present a multi-task learning module
+to simultaneously acquire an effective attention matrix that determines chunk
+contributions to extra semantic representations. Furthermore, we put forth an
+asynchronous update mechanism for the proposed additional task, aiming to make
+the model capable of effectively focusing on specific chunks. Experiments on
+two distinct Chinese geographic re-ranking datasets show that the Geo-Encoder
+achieves significant improvements when compared to state-of-the-art baselines.
+Notably, it leads to a substantial improvement in the Hit@1 score of MGEO-BERT,
+increasing it by 6.22% from 62.76 to 68.98 on the GeoTES dataset.
+
+
+
+ comment: 15 pages, 5 figures, EACL 2024 main +
+
+
+
+
+ + ♻ ☆ UNSEE: Unsupervised Non-contrastive Sentence Embeddings EACL 2024 + + +
+ We present UNSEE: Unsupervised Non-Contrastive Sentence Embeddings, a novel +approach that outperforms SimCSE in the Massive Text Embedding benchmark. Our +exploration begins by addressing the challenge of representation collapse, a +phenomenon observed when contrastive objectives in SimCSE are replaced with +non-contrastive objectives. To counter this issue, we propose a straightforward +solution known as the target network, effectively mitigating representation +collapse. The introduction of the target network allows us to leverage +non-contrastive objectives, maintaining training stability while achieving +performance improvements comparable to contrastive objectives. Our method has +achieved peak performance in non-contrastive sentence embeddings through +meticulous fine-tuning and optimization. This comprehensive effort has yielded +superior sentence representation models, showcasing the effectiveness of our +approach. + +
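+ A minimal sketch of the target-network idea, assuming a BYOL-style
+exponential-moving-average (EMA) copy of the online encoder (UNSEE's exact
+schedule and architecture are not given in the abstract):
+
+import copy
+import torch
+
+def make_target(online: torch.nn.Module) -> torch.nn.Module:
+    target = copy.deepcopy(online)
+    for p in target.parameters():
+        p.requires_grad_(False)  # the target is never updated by gradients
+    return target
+
+@torch.no_grad()
+def ema_update(online, target, tau=0.999):
+    # Slowly track the online encoder; this asymmetry is what helps prevent
+    # the trivial (collapsed) solution in non-contrastive training.
+    for p_o, p_t in zip(online.parameters(), target.parameters()):
+        p_t.mul_(tau).add_(p_o, alpha=1.0 - tau)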
+
+ comment: Accepted to EACL 2024 +
+
+
+
+
+ + ♻ ☆ Language Models as Inductive Reasoners EACL 2024 + + +
+ Inductive reasoning is a core component of human intelligence. In past
+research on inductive reasoning within computer science, formal language has
+been used as the representation of knowledge (more specifically, facts and
+rules). However, formal language can cause systematic problems for inductive
+reasoning, such as the inability to handle raw input like natural language,
+sensitivity to mislabeled data, and incapacity to handle ambiguous input. To
+this end, we propose a new paradigm (task) for inductive reasoning, which is to
+induce natural language rules from natural language facts, and create a dataset
+termed DEER containing 1.2k rule-fact pairs for the task, where rules and facts
+are written in natural language. New automatic metrics are also proposed and
+analysed for the evaluation of this task. With DEER, we investigate a modern
+approach for inductive reasoning where we use natural language as the
+representation for knowledge instead of formal language and use pretrained
+language models as "reasoners". Moreover, we provide the first comprehensive
+analysis of how well pretrained language models can induce natural language
+rules from natural language facts. We also propose a new framework, drawing
+insights from the philosophy literature, for this task, which, as we show in
+the experiment section, surpasses baselines in both automatic and human
+evaluations. We discuss our future perspectives for inductive reasoning in
+Section 7. Dataset and code are available at
+https://github.com/ZonglinY/Inductive_Reasoning.
+
+
+
+ comment: Accepted by EACL 2024 (main) +
+
+
+
+
+ + ♻ ☆ Document-Level In-Context Few-Shot Relation Extraction via Pre-Trained + Language Models + + +
+ Relation extraction aims at inferring structured human knowledge from textual
+documents. State-of-the-art methods based on language models commonly have two
+limitations: (1) they require named entities either to be given as input or to
+be inferred, which introduces additional noise, and (2) they require human
+annotations of documents. As a remedy, we present a novel framework for
+document-level in-context few-shot relation extraction via pre-trained language
+models. We achieve crucial benefits in that we eliminate the need for both
+named entity recognition and human annotation of documents. Unlike existing
+methods based on fine-tuning, our framework is flexible in that it can be
+easily updated for a new set of relations without re-training. We evaluate our
+framework using DocRED, the largest publicly available dataset for
+document-level relation extraction, and demonstrate that our framework achieves
+state-of-the-art performance. Finally, we show that our framework actually
+performs much better than the original labels from the development set of
+DocRED. To the best of our knowledge, we are the first to reformulate the
+document-level relation extraction task as a tailored in-context few-shot
+learning paradigm.
+
+
+
+
+
+
+ + ♻ ☆ Bridging Cultural Nuances in Dialogue Agents through Cultural Value + Surveys EACL 2024 + + +
+ The cultural landscape of interactions with dialogue agents is a compelling
+yet relatively unexplored territory. It is clear that various sociocultural
+aspects -- from communication styles and beliefs to shared metaphors and
+knowledge -- profoundly impact these interactions. To delve deeper into this
+dynamic, we introduce cuDialog, a first-of-its-kind benchmark for dialogue
+generation with a cultural lens. We also develop baseline models capable of
+extracting cultural attributes from dialogue exchanges, with the goal of
+enhancing the predictive accuracy and quality of dialogue agents. To
+effectively co-learn cultural understanding and multi-turn dialogue
+predictions, we propose to incorporate cultural dimensions with dialogue
+encoding features. Our experimental findings highlight that incorporating
+cultural value surveys boosts alignment with references and cultural markers,
+demonstrating its considerable influence on personalization and dialogue
+quality. To facilitate further exploration in this exciting domain, we make
+our benchmark publicly accessible at https://github.com/yongcaoplus/cuDialog.
+
+
+
+ comment: 17 pages, 7 figures, EACL 2024 findings
+
+
+
+
+
+ + ♻ ☆ Soaring from 4K to 400K: Extending LLM's Context with Activation Beacon + + +
+ The utilization of long contexts poses a major challenge for LLMs due to
+their limited context window size. Although the context window can be extended
+through fine-tuning, doing so incurs a considerable cost at both training and
+inference time and exerts an unfavorable impact on the LLM's original
+capabilities. In this work, we propose a new method called Activation Beacon,
+which condenses the LLM's raw activations into compact forms such that the LLM
+can perceive a longer context with a limited context window. Activation Beacon
+is introduced as a plug-in module, which fully preserves the LLM's original
+capability in short contexts. It works with a sliding window to process the
+long context in a streaming fashion, leading to competitive memory and time
+efficiency in both training and inference. Activation Beacon is trained with
+short-sequence data of diversified condensing ratios. Thanks to such a
+treatment, it effectively learns to support different context lengths with a
+small training cost. Our experiment verifies Activation Beacon's effectiveness
+for context extension: it accomplishes a high-quality $100\times$ extension of
+Llama-2-7B's context (from 4K to 400K); meanwhile, it also achieves superior
+performance across a variety of long-context language modeling and
+understanding tasks. The source code and model checkpoint are available at
+\url{https://github.com/FlagOpen/FlagEmbedding}.
+
+
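+ As a toy sketch of the condensing idea (a plug-in module mapping a window of
+raw activations to a much smaller set of "beacon" activations via learned
+attention pooling; the real Activation Beacon architecture differs):
+
+import torch
+import torch.nn as nn
+
+class Condenser(nn.Module):
+    def __init__(self, dim, n_beacons):
+        super().__init__()
+        self.queries = nn.Parameter(torch.randn(n_beacons, dim) / dim ** 0.5)
+        self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+
+    def forward(self, window):                # window: (batch, win_len, dim)
+        q = self.queries.expand(window.size(0), -1, -1)
+        condensed, _ = self.attn(q, window, window)
+        return condensed                       # (batch, n_beacons, dim)
+
+x = torch.randn(1, 1024, 256)                  # one sliding window of activations
+print(Condenser(256, 16)(x).shape)             # 64x condensing -> (1, 16, 256)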
+
+
+
+
+ + ♻ ☆ Deception Abilities Emerged in Large Language Models + + +
+ Large language models (LLMs) are currently at the forefront of intertwining
+artificial intelligence (AI) systems with human communication and everyday
+life. Thus, aligning them with human values is of great importance. However,
+given the steady increase in reasoning abilities, future LLMs are suspected of
+becoming able to deceive human operators and to utilize this ability to bypass
+monitoring efforts. As a prerequisite to this, LLMs need to possess a
+conceptual understanding of deception strategies. This study reveals that such
+strategies emerged in state-of-the-art LLMs, such as GPT-4, but were
+non-existent in earlier LLMs. We conduct a series of experiments showing that
+state-of-the-art LLMs are able to understand and induce false beliefs in other
+agents, that their performance in complex deception scenarios can be amplified
+utilizing chain-of-thought reasoning, and that eliciting Machiavellianism in
+LLMs can alter their propensity to deceive. In sum, revealing hitherto unknown
+machine behavior in LLMs, our study contributes to the nascent field of machine
+psychology.
+
+
+
+
+
+
+ + ♻ ☆ Spike No More: Stabilizing the Pre-training of Large Language Models + + +
+ Loss spikes often occur during pre-training of large language models. The +spikes degrade the performance of large language models and sometimes ruin the +pre-training. Since the pre-training needs a vast computational budget, we +should avoid such spikes. To investigate the cause of loss spikes, we focus on +gradients of internal layers. Through theoretical analyses, we reveal two +causes of the exploding gradients, and provide requirements to prevent the +explosion. In addition, we propose a method to satisfy the requirements by +combining the initialization method and a simple modification to embeddings. We +conduct various experiments to verify our theoretical analyses empirically. +Experimental results indicate that the combination is effective in preventing +spikes during pre-training. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Multi-Relational Hyperbolic Word Embeddings from Natural Language + Definitions EACL 2024 + + +
+ Natural language definitions possess a recursive, self-explanatory semantic
+structure that can support representation learning methods able to preserve
+explicit conceptual relations and constraints in the latent space. This paper
+presents a multi-relational model that explicitly leverages such a structure to
+derive word embeddings from definitions. By automatically extracting the
+relations linking defined and defining terms from dictionaries, we demonstrate
+how the problem of learning word embeddings can be formalised via a
+translational framework in Hyperbolic space and used as a proxy to capture the
+global semantic structure of definitions. An extensive empirical analysis
+demonstrates that the framework can help impose the desired structural
+constraints while preserving the semantic mapping required for controllable and
+interpretable traversal. Moreover, the experiments reveal the superiority of
+the Hyperbolic word embeddings over their Euclidean counterparts and
+demonstrate that the multi-relational approach can obtain competitive results
+when compared to state-of-the-art neural models, with the advantage of being
+intrinsically more efficient and interpretable.
+
+
+
+ comment: Accepted at the 18th Conference of the European Chapter of the + Association for Computational Linguistics (EACL 2024), camera-ready +
+
+
+
+
+ + ♻ ☆ DQNC2S: DQN-based Cross-stream Crisis event Summarizer ECIR 2024 + + +
+ Summarizing multiple disaster-relevant data streams simultaneously is
+particularly challenging, as existing Retrieve&Re-ranking strategies suffer
+from the inherent redundancy of multi-stream data and limited scalability in a
+multi-query setting. This work proposes an online approach to crisis timeline
+generation based on weak annotation with Deep Q-Networks. It selects the
+relevant pieces of text on the fly, requiring neither human annotations nor
+content re-ranking. This makes the inference time independent of the number of
+input queries. The proposed approach also incorporates a redundancy filter into
+the reward function to effectively handle cross-stream content overlaps. The
+achieved ROUGE and BERTScore results are superior to those of best-performing
+models on the CrisisFACTS 2022 benchmark.
+
+
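+ A hypothetical sketch of a reward with such a redundancy filter: reward the
+(weakly annotated) relevance of a candidate snippet, but penalize it when it is
+too similar to already-selected content (the paper's exact reward shaping is
+not reproduced here):
+
+import numpy as np
+
+def reward(candidate_vec, relevance, selected_vecs, sim_threshold=0.8):
+    if selected_vecs:
+        sims = [np.dot(candidate_vec, v)
+                / (np.linalg.norm(candidate_vec) * np.linalg.norm(v))
+                for v in selected_vecs]
+        if max(sims) > sim_threshold:
+            return -1.0              # redundant: discourage selecting it
+    return float(relevance)          # otherwise reward the relevance label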
+
+ comment: accepted at ECIR 2024 +
+
+
+
+
+ + ♻ ☆ An Assessment on Comprehending Mental Health through Large Language + Models + + +
+ Mental health challenges pose considerable global burdens on individuals and
+communities. Recent data indicates that more than 20% of adults may encounter
+at least one mental disorder in their lifetime. On the one hand, the
+advancements in large language models have facilitated diverse applications,
+yet a significant research gap persists in understanding and enhancing the
+potential of large language models within the domain of mental health. On the
+other hand, across various applications, an outstanding question involves the
+capacity of large language models to comprehend expressions of human mental
+health conditions in natural language. This study presents an initial
+evaluation of large language models in addressing this gap. To this end, we
+compare the performance of Llama-2 and ChatGPT with classical machine learning
+and deep learning models. Our results on the DAIC-WOZ dataset show that
+transformer-based models, like BERT or XLNet, outperform the large language
+models.
+
+
+
+
+
+
+ + ♻ ☆ Contrastive Preference Optimization: Pushing the Boundaries of LLM + Performance in Machine Translation + + +
+ Moderate-sized large language models (LLMs) -- those with 7B or 13B
+parameters -- exhibit promising machine translation (MT) performance. However,
+even the top-performing 13B LLM-based translation models, like ALMA, do not
+match the performance of state-of-the-art conventional encoder-decoder
+translation models or larger-scale LLMs such as GPT-4. In this study, we bridge
+this performance gap. We first assess the shortcomings of supervised
+fine-tuning for LLMs in the MT task, emphasizing the quality issues present in
+the reference data, despite being human-generated. Then, in contrast to SFT,
+which mimics reference translations, we introduce Contrastive Preference
+Optimization (CPO), a novel approach that trains models to avoid generating
+adequate but not perfect translations. Applying CPO to ALMA models with only
+22K parallel sentences and 12M parameters yields significant improvements. The
+resulting model, called ALMA-R, can match or exceed the performance of the WMT
+competition winners and GPT-4 on the WMT'21, WMT'22 and WMT'23 test datasets.
+
+
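+ The abstract does not spell out the CPO loss; a generic contrastive
+preference loss in the same spirit (push the model's likelihood of the
+preferred translation above that of the dispreferred one) can be sketched as:
+
+import torch
+import torch.nn.functional as F
+
+def preference_loss(logp_preferred, logp_rejected, beta=0.1):
+    # logp_*: summed token log-probabilities of each translation under the model
+    return -F.logsigmoid(beta * (logp_preferred - logp_rejected)).mean()
+
+print(preference_loss(torch.tensor([-10.0]), torch.tensor([-14.0])))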
+
+
+
+
+ + ♻ ☆ Evil Geniuses: Delving into the Safety of LLM-based Agents + + +
+ Rapid advancements in large language models (LLMs) have revitalized interest
+in LLM-based agents, which exhibit impressive human-like behaviors and
+cooperative capabilities in various scenarios. However, these agents also bring
+some exclusive risks, stemming from the complexity of interaction environments
+and the usability of tools. This paper delves into the safety of LLM-based
+agents from three perspectives: agent quantity, role definition, and attack
+level. Specifically, we initially propose to employ a template-based attack
+strategy on LLM-based agents to study the influence of agent quantity. In
+addition, to address interaction environment and role specificity issues, we
+introduce Evil Geniuses (EG), an effective attack method that autonomously
+generates prompts related to the original role to examine the impact across
+various role definitions and attack levels. EG leverages Red-Blue exercises,
+significantly improving the aggressiveness of the generated prompts and their
+similarity to the original roles. Our evaluations on CAMEL, Metagpt and
+ChatDev, based on GPT-3.5 and GPT-4, demonstrate high success rates. Extensive
+evaluation and discussion reveal that these agents are less robust, prone to
+more harmful behaviors, and capable of generating stealthier content than LLMs,
+highlighting significant safety challenges and guiding future research. Our
+code is available at https://github.com/T1aNS1R/Evil-Geniuses.
+
+
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Sentence Representations: From the BERT Epoch + to the ChatGPT Era and Beyond EACL'24 + + +
+ Sentence representations are a critical component in NLP applications such as
+retrieval, question answering, and text classification. They capture the
+meaning of a sentence, enabling machines to understand and reason over human
+language. In recent years, significant progress has been made in developing
+methods for learning sentence representations, including unsupervised,
+supervised, and transfer learning approaches. However, there has been no
+literature review on sentence representations until now. In this paper, we
+provide an overview of the different methods for sentence representation
+learning, focusing mostly on deep learning models. We provide a systematic
+organization of the literature, highlighting the key contributions and
+challenges in this area. Overall, our review highlights the importance of this
+area in natural language processing, the progress made in sentence
+representation learning, and the challenges that remain. We conclude with
+directions for future research, suggesting potential avenues for improving the
+quality and efficiency of sentence representations.
+
+
+
+ comment: Accepted to EACL'24 +
+
+
+
+
+ + ♻ ☆ A Chain-of-Thought Is as Strong as Its Weakest Link: A Benchmark for + Verifiers of Reasoning Chains + + +
+ Prompting language models to provide step-by-step answers (e.g., +"Chain-of-Thought") is the prominent approach for complex reasoning tasks, +where more accurate reasoning chains typically improve downstream task +performance. Recent literature discusses automatic methods to verify reasoning +steps to evaluate and improve their correctness. However, no fine-grained +step-level datasets are available to enable thorough evaluation of such +verification methods, hindering progress in this direction. We introduce +Reveal: Reasoning Verification Evaluation, a new dataset to benchmark automatic +verifiers of complex Chain-of-Thought reasoning in open-domain question +answering settings. Reveal includes comprehensive labels for the relevance, +attribution to evidence passages, and logical correctness of each reasoning +step in a language model's answer, across a wide variety of datasets and +state-of-the-art language models. + +
+
+ comment: Dataset at https://huggingface.co/datasets/google/reveal +
+
+
+
+
+ + ♻ ☆ CBQ: Cross-Block Quantization for Large Language Models + + +
+ Post-training quantization (PTQ) has played a key role in compressing large
+language models (LLMs) with ultra-low costs. However, existing PTQ methods only
+focus on handling the outliers within one layer or one block, which ignores the
+dependency between blocks and leads to severe performance degradation in
+low-bit settings. In this paper, we propose CBQ, a cross-block
+reconstruction-based PTQ method for LLMs. CBQ employs a cross-block dependency
+using a homologous reconstruction scheme, establishing long-range dependencies
+across multiple blocks to minimize error accumulation. Furthermore, CBQ
+incorporates a coarse-to-fine preprocessing (CFP) strategy for suppressing
+weight and activation outliers, coupled with an adaptive LoRA-Rounding
+technique for precise weight quantization. These innovations enable CBQ to not
+only handle extreme outliers effectively but also improve overall quantization
+accuracy. Extensive experiments show that CBQ achieves superior low-bit
+quantization (W4A4, W4A8, W2A16) and outperforms existing state-of-the-art
+methods across various LLMs and datasets. Notably, CBQ quantizes the LLAMA1-65B
+model to 4 bits within only 4.3 hours on a single GPU, achieving a commendable
+tradeoff between performance and quantization efficiency.
+
+
+
+
+
+
+ + ♻ ☆ A Survey of Large Language Models in Medicine: Principles, Applications, + and Challenges + + +
+ Large language models (LLMs), such as ChatGPT, have received substantial
+attention due to their capabilities for understanding and generating human
+language. The use of LLMs in medicine to assist physicians with patient care is
+emerging as a promising research direction in both artificial intelligence and
+clinical medicine. This review provides a comprehensive overview of the
+principles, applications, and challenges faced by LLMs in medicine. We address
+the following specific questions: 1) How should medical LLMs be built? 2) What
+are the measures for the downstream performance of medical LLMs? 3) How should
+medical LLMs be utilized in real-world clinical practice? 4) What challenges
+arise from the use of medical LLMs? and 5) How should we better construct and
+utilize medical LLMs? This review aims to provide insights into the
+opportunities and challenges of LLMs in medicine, and serve as a practical
+resource for constructing effective medical LLMs. We also maintain a regularly
+updated list of practical guides on medical LLMs at
+https://github.com/AI-in-Health/MedLLMsPracticalGuide.
+
+
+
+ comment: Preprint. Version 3. 54 pages +
+
+
+
+
+ + ♻ ☆ Neuron Patching: Neuron-level Model Editing on Code Generation and LLMs + + +
+ Large Language Models are successfully adopted in software engineering,
+especially in code generation. Updating these models with new knowledge is very
+expensive, and is often required to fully realize their value. In this paper,
+we propose a novel and effective model editing approach, \textsc{MENT}, to
+patch LLMs in coding tasks. Based on the mechanism of generative LLMs,
+\textsc{MENT} enables model editing in next-token predictions, and further
+supports common coding tasks. \textsc{MENT} is effective, efficient, and
+reliable. It can correct a neural model by patching just one or two neurons. As
+the pioneering work on neuron-level model editing of generative models, we
+formalize the editing process and introduce the involved concepts. Besides, we
+also introduce new measures to evaluate its generalization ability, and build a
+benchmark for further study. Our approach is evaluated on three coding tasks,
+including API-seq recommendation, line-level code generation, and
+pseudocode-to-code translation. It outperforms the state-of-the-art by a
+significant margin on both effectiveness and efficiency measures. In addition,
+we demonstrate the usage of \textsc{MENT} for LLM reasoning in software
+engineering. By editing the LLM knowledge with \textsc{MENT}, the directly or
+indirectly dependent behaviors in the chain-of-thought change accordingly and
+automatically.
+
+
+
+ comment: 12 pages, 5 figures, 6 tables, under peer review +
+
+
+
+
+ + ♻ ☆ On the Semantics of LM Latent Space: A Vocabulary-defined Approach + + +
+ Understanding the latent space of language models (LM) is crucial to refining
+their performance and interpretability. Existing analyses often fall short in
+providing disentangled (model-centric) insights into LM semantics, and neglect
+essential aspects of LM adaptation. In response, we introduce a pioneering
+method called vocabulary-defined semantics, which establishes a reference frame
+within the LM latent space, ensuring disentangled semantic analysis grounded in
+the LM vocabulary. Our approach transcends prior entangled analysis, leveraging
+the LM vocabulary for model-centric insights. Furthermore, we propose a novel
+technique to compute logits, emphasising differentiability and local isotropy,
+and introduce a neural clustering module for semantically calibrating data
+representations during LM adaptation. Through extensive experiments across
+diverse text understanding datasets, our approach outperforms state-of-the-art
+methods of retrieval-augmented generation and parameter-efficient finetuning,
+showcasing its efficacy and broad applicability. Our findings not only shed
+light on LM mechanics, but also offer practical solutions to enhance LM
+performance and interpretability.
+
+
+
+ comment: under peer review +
+
+
+
+
+ + ♻ ☆ Recent Advances in Hate Speech Moderation: Multimodality and the Role of + Large Models + + +
+ In the evolving landscape of online communication, moderating hate speech +(HS) presents an intricate challenge, compounded by the multimodal nature of +digital content. This comprehensive survey delves into the recent strides in HS +moderation, spotlighting the burgeoning role of large language models (LLMs) +and large multimodal models (LMMs). Our exploration begins with a thorough +analysis of current literature, revealing the nuanced interplay between +textual, visual, and auditory elements in propagating HS. We uncover a notable +trend towards integrating these modalities, primarily due to the complexity and +subtlety with which HS is disseminated. A significant emphasis is placed on the +advances facilitated by LLMs and LMMs, which have begun to redefine the +boundaries of detection and moderation capabilities. We identify existing gaps +in research, particularly in the context of underrepresented languages and +cultures, and the need for solutions to handle low-resource settings. The +survey concludes with a forward-looking perspective, outlining potential +avenues for future research, including the exploration of novel AI +methodologies, the ethical governance of AI in moderation, and the development +of more nuanced, context-aware systems. This comprehensive overview aims to +catalyze further research and foster a collaborative effort towards more +sophisticated, responsible, and human-centric approaches to HS moderation in +the digital era. WARNING: This paper contains offensive examples. + +
+
+ comment: Preprint; Under-Review +
+
+
+
+
+ + ♻ ☆ SuperCLUE-Math6: Graded Multi-Step Math Reasoning Benchmark for LLMs in + Chinese + + +
+ We introduce SuperCLUE-Math6 (SC-Math6), a new benchmark dataset to evaluate
+the mathematical reasoning abilities of Chinese language models. SC-Math6 is
+designed as an upgraded Chinese version of the GSM8K dataset with enhanced
+difficulty, diversity, and application scope. It consists of over 2000
+mathematical word problems requiring multi-step reasoning and providing natural
+language solutions. We propose an innovative scheme to quantify the reasoning
+capability of large models based on performance over problems with different
+reasoning steps. Experiments on 13 representative Chinese models demonstrate a
+clear stratification of reasoning levels, with top models like GPT-4 showing
+superior performance. SC-Math6 fills the gap in Chinese mathematical reasoning
+benchmarks and provides a comprehensive testbed to advance the intelligence of
+Chinese language models.
+
+
+
+ comment: Dataset revised and finalized, results updated with new model; 8 + pages, 7 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ The Neglected Tails of Vision-Language Models + + +
+ Vision-language models (VLMs) excel in zero-shot recognition but their +performance varies greatly across different visual concepts. For example, +although CLIP achieves impressive accuracy on ImageNet (60-80%), its +performance drops below 10% for more than ten concepts like night snake, +presumably due to their limited presence in the pretraining data. However, +measuring the frequency of concepts in VLMs' large-scale datasets is +challenging. We address this by using large language models (LLMs) to count the +number of pretraining texts that contain synonyms of these concepts. Our +analysis confirms that popular datasets, such as LAION, exhibit a long-tailed +concept distribution, yielding biased performance in VLMs. We also find that +downstream applications of VLMs, including visual chatbots (e.g., GPT-4V) and +text-to-image models (e.g., Stable Diffusion), often fail to recognize or +generate images of rare concepts identified by our method. To mitigate the +imbalanced performance of zero-shot VLMs, we propose REtrieval-Augmented +Learning (REAL). First, instead of prompting VLMs using the original class +names, REAL uses their most frequent synonyms found in pretraining texts. This +simple change already outperforms costly human-engineered and LLM-enriched +prompts over nine benchmark datasets. Second, REAL trains a linear classifier +on a small yet balanced set of pretraining data retrieved using concept +synonyms. REAL surpasses the previous zero-shot SOTA, using 400x less storage +and 10,000x less training time! + +
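+ A sketch of REAL's first step as described above: swap each class name for
+its most frequent synonym in the pretraining texts before building the
+zero-shot prompt (the synonym counts below are toy values; the paper derives
+them from the pretraining corpus with LLM assistance):
+
+def most_frequent_synonym(class_name, synonyms, corpus_counts):
+    return max(synonyms[class_name], key=lambda s: corpus_counts.get(s, 0))
+
+synonyms = {"night snake": ["night snake", "hypsiglena", "desert night snake"]}
+corpus_counts = {"night snake": 120, "hypsiglena": 30, "desert night snake": 450}
+
+name = most_frequent_synonym("night snake", synonyms, corpus_counts)
+print(f"a photo of a {name}.")   # prompt fed to the VLM text encoder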
+
+ comment: Project Page: + https://shubhamprshr27.github.io/neglected-tails-of-vlms/ +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 108 + +
+
+
+ + ☆ Immersive Video Compression using Implicit Neural Representations + + +
+ Recent work on implicit neural representations (INRs) has evidenced their +potential for efficiently representing and encoding conventional video content. +In this paper we, for the first time, extend their application to immersive +(multi-view) videos, by proposing MV-HiNeRV, a new INR-based immersive video +codec. MV-HiNeRV is an enhanced version of a state-of-the-art INR-based video +codec, HiNeRV, which was developed for single-view video compression. We have +modified the model to learn a different group of feature grids for each view, +and share the learnt network parameters among all views. This enables the model +to effectively exploit the spatio-temporal and the inter-view redundancy that +exists within multi-view videos. The proposed codec was used to compress +multi-view texture and depth video sequences in the MPEG Immersive Video (MIV) +Common Test Conditions, and tested against the MIV Test model (TMIV) that uses +the VVenC video codec. The results demonstrate the superior performance of +MV-HiNeRV, with significant coding gains (up to 72.33%) over TMIV. The +implementation of MV-HiNeRV will be published for further development and +evaluation. + +
+
+
+
+
+ + ☆ NeuroCine: Decoding Vivid Video Sequences from Human Brain Activities + + +
+ In the pursuit to understand the intricacies of the human brain's visual
+processing, reconstructing dynamic visual experiences from brain activities
+emerges as a challenging yet fascinating endeavor. While recent advancements
+have achieved success in reconstructing static images from non-invasive brain
+recordings, the domain of translating continuous brain activities into video
+format remains underexplored. In this work, we introduce NeuroCine, a novel
+dual-phase framework targeting the inherent challenges of decoding fMRI data,
+such as noise, spatial redundancy, and temporal lag. This framework proposes
+spatial masking and temporal interpolation-based augmentation for contrastive
+learning of fMRI representations, and a diffusion model enhanced by dependent
+prior noise for video generation. Tested on a publicly available fMRI dataset,
+our method shows promising results, outperforming the previous
+state-of-the-art models by notable margins of ${20.97\%}$, ${31.00\%}$ and
+${12.30\%}$ respectively on decoding the brain activities of three subjects in
+the fMRI dataset, as measured by SSIM. Additionally, our attention analysis
+suggests that the model aligns with existing brain structures and functions,
+indicating its biological plausibility and interpretability.
+
+
+
+ comment: under review +
+
+
+
+
+ + ☆ Boximator: Generating Rich and Controllable Motions for Video Synthesis + + +
+ Generating rich and controllable motion is a pivotal challenge in video +synthesis. We propose Boximator, a new approach for fine-grained motion +control. Boximator introduces two constraint types: hard box and soft box. +Users select objects in the conditional frame using hard boxes and then use +either type of boxes to roughly or rigorously define the object's position, +shape, or motion path in future frames. Boximator functions as a plug-in for +existing video diffusion models. Its training process preserves the base +model's knowledge by freezing the original weights and training only the +control module. To address training challenges, we introduce a novel +self-tracking technique that greatly simplifies the learning of box-object +correlations. Empirically, Boximator achieves state-of-the-art video quality +(FVD) scores, improving on two base models, and further enhanced after +incorporating box constraints. Its robust motion controllability is validated +by drastic increases in the bounding box alignment metric. Human evaluation +also shows that users favor Boximator generation results over the base model. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ Deep Continuous Networks ICML 2021 + + +
+ CNNs and computational models of biological vision share some fundamental +principles, which opened new avenues of research. However, fruitful cross-field +research is hampered by conventional CNN architectures being based on spatially +and depthwise discrete representations, which cannot accommodate certain +aspects of biological complexity such as continuously varying receptive field +sizes and dynamics of neuronal responses. Here we propose deep continuous +networks (DCNs), which combine spatially continuous filters, with the +continuous depth framework of neural ODEs. This allows us to learn the spatial +support of the filters during training, as well as model the continuous +evolution of feature maps, linking DCNs closely to biological models. We show +that DCNs are versatile and highly applicable to standard image classification +and reconstruction problems, where they improve parameter and data efficiency, +and allow for meta-parametrization. We illustrate the biological plausibility +of the scale distributions learned by DCNs and explore their performance in a +neuroscientifically inspired pattern completion task. Finally, we investigate +an efficient implementation of DCNs by changing input contrast. + +
+
+ comment: Presented at ICML 2021 +
+
+
+
+
+ + ☆ SLYKLatent, a Learning Framework for Facial Features Estimation + + +
+ In this research, we present SLYKLatent, a novel approach for enhancing gaze +estimation by addressing appearance instability challenges in datasets due to +aleatoric uncertainties, covariant shifts, and test domain generalization. +SLYKLatent utilizes Self-Supervised Learning for initial training with facial +expression datasets, followed by refinement with a patch-based tri-branch +network and an inverse explained variance-weighted training loss function. Our +evaluation on benchmark datasets achieves an 8.7% improvement on Gaze360, +rivals top MPIIFaceGaze results, and leads on a subset of ETH-XGaze by 13%, +surpassing existing methods by significant margins. Adaptability tests on +RAF-DB and Affectnet show 86.4% and 60.9% accuracies, respectively. Ablation +studies confirm the effectiveness of SLYKLatent's novel components. This +approach has strong potential in human-robot interaction. + +
+
+
+
+
+ + ☆ Closing the Gap in Human Behavior Analysis: A Pipeline for Synthesizing + Trimodal Data + + +
+ In pervasive machine learning, especially in Human Behavior Analysis (HBA),
+RGB has been the primary modality due to its accessibility and richness of
+information. However, linked with its benefits are challenges, including
+sensitivity to lighting conditions and privacy concerns. One possibility to
+overcome these vulnerabilities is to resort to different modalities. For
+instance, thermal is particularly adept at accentuating human forms, while
+depth adds crucial contextual layers. Despite their known benefits, only a few
+HBA-specific datasets that integrate these modalities exist. To address this
+shortage, our research introduces a novel generative technique for creating
+trimodal, i.e., RGB, thermal, and depth, human-focused datasets. This technique
+capitalizes on human segmentation masks derived from RGB images, combined with
+thermal and depth backgrounds that are sourced automatically. With these two
+ingredients, we synthesize depth and thermal counterparts from existing RGB
+data utilizing conditional image-to-image translation. By employing this
+approach, we generate trimodal data that can be leveraged to train models for
+settings with limited data, poor lighting conditions, or privacy-sensitive
+areas.
+
+
+
+
+
+
+ + ☆ HyperPlanes: Hypernetwork Approach to Rapid NeRF Adaptation + + +
+ Neural radiance fields (NeRFs) are a widely accepted standard for +synthesizing new 3D object views from a small number of base images. However, +NeRFs have limited generalization properties, which means that we need to use +significant computational resources to train individual architectures for each +item we want to represent. To address this issue, we propose a few-shot +learning approach based on the hypernetwork paradigm that does not require +gradient optimization during inference. The hypernetwork gathers information +from the training data and generates an update for universal weights. As a +result, we have developed an efficient method for generating a high-quality 3D +object representation from a small number of images in a single step. This has +been confirmed by direct comparison with the state-of-the-art solutions and a +comprehensive ablation study. + +
+
+
+
+
+ + ☆ Cross-view Masked Diffusion Transformers for Person Image Synthesis + + +
+ We present X-MDPT (Cross-view Masked Diffusion Prediction Transformers), a +novel diffusion model designed for pose-guided human image generation. X-MDPT +distinguishes itself by employing masked diffusion transformers that operate on +latent patches, a departure from the commonly-used Unet structures in existing +works. The model comprises three key modules: 1) a denoising diffusion +Transformer, 2) an aggregation network that consolidates conditions into a +single vector for the diffusion process, and 3) a mask cross-prediction module +that enhances representation learning with semantic information from the +reference image. X-MDPT demonstrates scalability, improving FID, SSIM, and +LPIPS with larger models. Despite its simple design, our model outperforms +state-of-the-art approaches on the DeepFashion dataset while exhibiting +efficiency in terms of training parameters, training time, and inference speed. +Our compact 33MB model achieves an FID of 7.42, surpassing a prior Unet latent +diffusion approach (FID 8.07) using only $11\times$ fewer parameters. Our best +model surpasses the pixel-based diffusion with $\frac{2}{3}$ of the parameters +and achieves $5.43 \times$ faster inference. + +
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+ + ☆ Advancing Brain Tumor Inpainting with Generative Models + + +
+ Synthesizing healthy brain scans from diseased brain scans offers a potential
+solution to address the limitations of general-purpose algorithms, such as
+tissue segmentation and brain extraction algorithms, which may not effectively
+handle diseased images. We consider this a 3D inpainting task and investigate
+the adaptation of 2D inpainting methods to meet the requirements of 3D magnetic
+resonance imaging (MRI) data. Our contributions encompass potential
+modifications tailored to MRI-specific needs, and we conducted evaluations of
+multiple inpainting techniques using the BraTS2023 Inpainting datasets to
+assess their efficacy and limitations.
+
+
+
+
+
+
+ + ☆ Self-Attention through Kernel-Eigen Pair Sparse Variational Gaussian + Processes + + +
+ While the great capability of Transformers significantly boosts prediction
+accuracy, it could also yield overconfident predictions and require calibrated
+uncertainty estimation, which can be commonly tackled by Gaussian processes
+(GPs). Existing works apply GPs with symmetric kernels under variational
+inference to the attention kernel; however, this omits the fact that attention
+kernels are in essence asymmetric. Moreover, the complexity of deriving the GP
+posteriors remains high for large-scale data. In this work, we propose
+Kernel-Eigen Pair Sparse Variational Gaussian Processes (KEP-SVGP) for building
+uncertainty-aware self-attention, where the asymmetry of attention kernels is
+tackled by Kernel SVD (KSVD) and a reduced complexity is acquired. Through
+KEP-SVGP, i) the SVGP pair induced by the two sets of singular vectors from
+KSVD w.r.t. the attention kernel fully characterizes the asymmetry; ii) using
+only a small set of adjoint eigenfunctions from KSVD, the derivation of SVGP
+posteriors can be based on the inversion of a diagonal matrix containing
+singular values, contributing to a reduction in time complexity; iii) an
+evidence lower bound is derived so that variational parameters can be optimized
+towards this objective. Experiments verify our excellent performances and
+efficiency on in-distribution, distribution-shift and out-of-distribution
+benchmarks.
+
+
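+ To see the asymmetry referred to above: viewed as a kernel on the input
+tokens, softmax attention uses different projections for its two arguments, so
+in general (standard formulation, not necessarily the paper's notation)
+
+ $$\kappa(\mathbf{x}_i, \mathbf{x}_j) = \exp\!\Big(\tfrac{1}{\sqrt{d}}\,(W_q \mathbf{x}_i)^{\top}(W_k \mathbf{x}_j)\Big) \neq \kappa(\mathbf{x}_j, \mathbf{x}_i) \quad \text{since } W_q \neq W_k,$$
+
+which is why a symmetric-kernel GP treatment cannot fully characterize it.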
+
+ comment: We propose Kernel-Eigen Pair Sparse Variational Gaussian Processes + (KEP-SVGP) for building uncertainty-aware self-attention where the asymmetry + of attention kernel is tackled by KSVD and a reduced time complexity is + acquired +
+
+
+
+
+ + ☆ Synthetic Data for the Mitigation of Demographic Biases in Face + Recognition + + +
+ This study investigates the possibility of mitigating the demographic biases
+that affect face recognition technologies through the use of synthetic data.
+Demographic biases have the potential to impact individuals from specific
+demographic groups, and can be identified by observing disparate performance of
+face recognition systems across demographic groups. They primarily arise from
+the unequal representation of demographic groups in the training data. In
+recent times, synthetic data have emerged as a solution to some problems that
+affect face recognition systems. In particular, during the generation process
+it is possible to specify the desired demographic and facial attributes of the
+images, in order to control the demographic distribution of the synthesized
+dataset and fairly represent the different demographic groups. We propose to
+use synthetic data to fine-tune existing face recognition systems that present
+demographic biases. We use synthetic datasets generated with GANDiffFace, a
+novel framework able to synthesize datasets for face recognition with a
+controllable demographic distribution and realistic intra-class variations. We
+consider multiple datasets representing different demographic groups for
+training and evaluation. Also, we fine-tune different face recognition systems
+and evaluate their demographic fairness with different metrics. Our results
+support the proposed approach and the use of synthetic data to mitigate
+demographic biases in face recognition.
+
+
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Scaled 360 layouts: Revisiting non-central panoramas + + +
+ From a non-central panorama, 3D lines can be recovered by geometric
+reasoning. However, their sensitivity to noise and the complex geometric
+modeling required have left these panoramas very little investigated. In this
+work we present a novel approach for 3D layout recovery of indoor environments
+using single non-central panoramas. We obtain the boundaries of the structural
+lines of the room from a non-central panorama using deep learning and exploit
+the properties of non-central projection systems in a new geometrical
+processing to recover the scaled layout. We solve the problem for Manhattan
+environments, handling occlusions, and also for Atlanta environments, in a
+unified method. The experiments performed improve on the state-of-the-art
+methods for 3D layout recovery from a single panorama. Our approach is the
+first work using deep learning with non-central panoramas and recovering the
+scale of single panorama layouts.
+
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2401.17058 +
+
+
+
+
+ + ☆ 3D Vertebrae Measurements: Assessing Vertebral Dimensions in Human Spine + Mesh Models Using Local Anatomical Vertebral Axes + + +
+ Vertebral morphological measurements are important across various
+disciplines, including spinal biomechanics and clinical applications, pre- and
+post-operatively. These measurements also play a crucial role in
+anthropological longitudinal studies, where spinal metrics are repeatedly
+documented over extended periods. Traditionally, such measurements have been
+manually conducted, a process that is time-consuming. In this study, we
+introduce a novel, fully automated method for measuring vertebral morphology
+using 3D meshes of lumbar and thoracic spine models. Our experimental results
+demonstrate the method's capability to accurately measure low-resolution
+patient-specific vertebral meshes with a mean absolute error (MAE) of 1.09 mm,
+and those derived from artificially created lumbar spines, where the average
+MAE value was 0.7 mm. Our qualitative analysis indicates that measurements
+obtained using our method on 3D spine models can be accurately reprojected back
+onto the original medical images if these images are available.
+
+
+
+
+
+
+ + ☆ Visual Gyroscope: Combination of Deep Learning Features and Direct + Alignment for Panoramic Stabilization + + +
+ In this article we present a visual gyroscope based on equirectangular +panoramas. We propose a new pipeline where we take advantage of combining three +different methods to obtain a robust and accurate estimation of the attitude of +the camera. We quantitatively and qualitatively validate our method on two +image sequences taken with a $360^\circ$ dual-fisheye camera mounted on +different aerial vehicles. + +
+
+
+
+
+ + ☆ GaMeS: Mesh-Based Adapting and Modification of Gaussian Splatting + + +
+ In recent years, a range of neural network-based methods for image rendering
+have been introduced. For instance, widely-researched neural radiance fields
+(NeRF) rely on a neural network to represent 3D scenes, allowing for realistic
+view synthesis from a small number of 2D images. However, most NeRF models are
+constrained by long training and inference times. In comparison, Gaussian
+Splatting (GS) is a novel, state-of-the-art technique for rendering points in a
+3D scene by approximating their contribution to image pixels through Gaussian
+distributions, warranting fast training and swift, real-time rendering. A
+drawback of GS is the absence of a well-defined approach for its conditioning,
+due to the necessity of conditioning several hundred thousand Gaussian
+components. To solve this, we introduce the Gaussian Mesh Splatting (GaMeS)
+model, a hybrid of a mesh and a Gaussian distribution, which pins all Gaussian
+splats to the object surface (mesh). The unique contribution of our method is
+defining Gaussian splats solely based on their location on the mesh, allowing
+for automatic adjustments in position, scale, and rotation during animation. As
+a result, we obtain high-quality renders while generating views in real time.
+Furthermore, we demonstrate that in the absence of a predefined mesh, it is
+possible to fine-tune the initial mesh during the learning process.
+
+
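+ An illustrative sketch of deriving a (flat) Gaussian's parameters from the
+mesh triangle it sits on, so that moving the mesh automatically moves the
+splat; the actual GaMeS parametrization is more elaborate:
+
+import numpy as np
+
+def gaussian_from_triangle(v0, v1, v2, thickness=1e-3):
+    mean = (v0 + v1 + v2) / 3.0          # centroid of the face
+    e1, e2 = v1 - v0, v2 - v0            # in-plane spread directions
+    n = np.cross(e1, e2)
+    n /= np.linalg.norm(n)               # face normal
+    # Flat Gaussian: spread along the face, almost none along the normal.
+    cov = np.outer(e1, e1) + np.outer(e2, e2) + thickness**2 * np.outer(n, n)
+    return mean, cov
+
+m, c = gaussian_from_triangle(np.zeros(3), np.array([1.0, 0, 0]),
+                              np.array([0, 1.0, 0]))
+print(m, np.diag(c))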
+
+
+
+
+ + ☆ Convolution kernel adaptation to calibrated fisheye BMVC + + +
+ Convolution kernels are the basic structural component of convolutional
+neural networks (CNNs). In recent years there has been growing interest in
+fisheye cameras for many applications. However, the radially symmetric
+projection model of these cameras produces high distortions that affect the
+performance of CNNs, especially when the field of view is very large. In this
+work, we tackle this problem by proposing a method that leverages the
+calibration of cameras to deform the convolution kernel accordingly and adapt
+to the distortion. That way, the receptive field of the convolution is similar
+to that of standard convolutions in perspective images, allowing us to take
+advantage of networks pre-trained on large perspective datasets. We show how,
+with just a brief fine-tuning stage on a small dataset, we improve the
+performance of the network on the calibrated fisheye with respect to standard
+convolutions in depth estimation and semantic segmentation.
+
+
+
+ comment: Previously presented at BMVC: https://proceedings.bmvc2023.org/721/ +
+
+
+
+
+ + ☆ Mission Critical -- Satellite Data is a Distinct Modality in Machine + Learning + + +
+ Satellite data has the potential to inspire a seismic shift for machine +learning -- one in which we rethink existing practices designed for traditional +data modalities. As machine learning for satellite data (SatML) gains traction +for its real-world impact, our field is at a crossroads. We can either continue +applying ill-suited approaches, or we can initiate a new research agenda that +centers around the unique characteristics and challenges of satellite data. +This position paper argues that satellite data constitutes a distinct modality +for machine learning research and that we must recognize it as such to advance +the quality and impact of SatML research across theory, methods, and +deployment. We outline critical discussion questions and actionable suggestions +to transform SatML from merely an intriguing application area to a dedicated +research discipline that helps move the needle on big challenges for machine +learning and society. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ EmoSpeaker: One-shot Fine-grained Emotion-Controlled Talking Face + Generation + + +
+ Implementing fine-grained emotion control is crucial for emotion generation
+tasks because it enhances the expressive capability of the generative model,
+allowing it to accurately and comprehensively capture and express various
+nuanced emotional states, thereby improving the emotional quality and
+personalization of generated content. Generating fine-grained facial animations
+that accurately portray emotional expressions using only a portrait and an
+audio recording presents a challenge. In order to address this challenge, we
+propose a visual attribute-guided audio decoupler. This enables obtaining
+content vectors solely related to the audio content, enhancing the stability of
+subsequent lip movement coefficient predictions. To achieve more precise
+emotional expression, we introduce a fine-grained emotion coefficient
+prediction module. Additionally, we propose an emotion intensity control method
+using a fine-grained emotion matrix. Through these, effective control over
+emotional expression in the generated videos and finer classification of
+emotion intensity are accomplished. Subsequently, a series of 3DMM coefficient
+generation networks are designed to predict 3D coefficients, followed by the
+utilization of a rendering network to generate the final video. Our
+experimental results demonstrate that our proposed method, EmoSpeaker,
+outperforms existing emotional talking face generation methods in terms of
+expression variation and lip synchronization. Project page:
+https://peterfanfan.github.io/EmoSpeaker/
+
+
+
+
+
+
+ + ☆ XAI for Skin Cancer Detection with Prototypes and Non-Expert Supervision MICCAI 2023 + + +
+ Skin cancer detection through dermoscopy image analysis is a critical task. +However, existing models used for this purpose often lack interpretability and +reliability, raising the concern of physicians due to their black-box nature. +In this paper, we propose a novel approach for the diagnosis of melanoma using +an interpretable prototypical-part model. We introduce a guided supervision +based on non-expert feedback through the incorporation of: 1) binary masks, +obtained automatically using a segmentation network; and 2) user-refined +prototypes. These two distinct information pathways aim to ensure that the +learned prototypes correspond to relevant areas within the skin lesion, +excluding confounding factors beyond its boundaries. Experimental results +demonstrate that, even without expert supervision, our approach achieves +superior performance and generalization compared to non-interpretable models. + +
+
+ comment: Accepted in the iMIMIC Workshop @ MICCAI 2023 +
+
+
+
+
+ + ☆ ALERT-Transformer: Bridging Asynchronous and Synchronous Machine + Learning for Real-Time Event-based Spatio-Temporal Data + + +
+ We seek to enable classic processing of continuous ultra-sparse
+spatiotemporal data generated by event-based sensors with dense machine
+learning models. We propose a novel hybrid pipeline composed of asynchronous
+sensing and synchronous processing that combines several ideas: (1) an
+embedding based on PointNet models -- the ALERT module -- that can continuously
+integrate new events and dismiss old ones thanks to a leakage mechanism, (2) a
+flexible readout of the embedded data that allows any downstream model to be
+fed with always up-to-date features at any sampling rate, (3) exploiting the
+input sparsity in a patch-based approach inspired by the Vision Transformer to
+optimize the efficiency of the method. These embeddings are then processed by a
+transformer model trained for object and gesture recognition. Using this
+approach, we achieve state-of-the-art performance with lower latency than
+competitors. We also demonstrate that our asynchronous model can operate at any
+desired sampling rate.
+
+
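+ A toy sketch of such a leakage mechanism: each new batch of event features is
+integrated while older contributions decay exponentially, so stale events fade
+out of the embedding (the ALERT module's actual PointNet-based design is not
+shown here):
+
+import torch
+
+class LeakyEventMemory:
+    def __init__(self, dim, leak=0.95):
+        self.state = torch.zeros(dim)
+        self.leak = leak
+
+    def update(self, event_features):    # (n_events, dim) since last readout
+        self.state = self.leak * self.state + event_features.sum(dim=0)
+        return self.state                # always up-to-date embedding
+
+mem = LeakyEventMemory(dim=8)
+for _ in range(3):                        # asynchronous event batches arrive
+    print(mem.update(torch.randn(4, 8)).norm())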
+
+ comment: Preprint version. 8 pages, 7 figures, under review +
+
+
+
+
+ + ☆ SiMA-Hand: Boosting 3D Hand-Mesh Reconstruction by Single-to-Multi-View + Adaptation + + +
+ Estimating 3D hand mesh from RGB images is a longstanding track, in which
+occlusion is one of the most challenging problems. Existing attempts towards
+this task often fail when the occlusion dominates the image space. In this
+paper, we propose SiMA-Hand, aiming to boost the mesh reconstruction
+performance by Single-to-Multi-view Adaptation. First, we design a multi-view
+hand reconstructor to fuse information across multiple views by holistically
+adopting feature fusion at the image, joint, and vertex levels. Then, we
+introduce a single-view hand reconstructor equipped with SiMA. Though taking
+only one view as input at inference, the shape and orientation features in the
+single-view reconstructor can be enriched by learning non-occluded knowledge
+from the extra views at training, enhancing the reconstruction precision on the
+occluded regions. We conduct experiments on the Dex-YCB and HanCo benchmarks
+with challenging object- and self-caused occlusion cases, manifesting that
+SiMA-Hand consistently achieves superior performance over the state of the art.
+Code will be released on https://github.com/JoyboyWang/SiMA-Hand Pytorch.
+
+
+
+
+
+
+ + ☆ Efficient Dynamic-NeRF Based Volumetric Video Coding with Rate + Distortion Optimization + + +
+ Volumetric videos, benefiting from immersive 3D realism and interactivity,
+hold vast potential for various applications, while the tremendous data volume
+poses significant challenges for compression. Recently, NeRF has demonstrated
+remarkable potential in volumetric video compression thanks to its simple
+representation and powerful 3D modeling capabilities, where a notable work is
+ReRF. However, ReRF separates modeling from the compression process, resulting
+in suboptimal compression efficiency. In contrast, in this paper, we propose a
+more compact volumetric video compression method based on dynamic NeRF.
+Specifically, we decompose the NeRF representation into the coefficient fields
+and the basis fields, incrementally updating the basis fields in the temporal
+domain to achieve dynamic modeling. Additionally, we perform end-to-end joint
+optimization on the modeling and compression process to further improve the
+compression efficiency. Extensive experiments demonstrate that our method
+achieves higher compression efficiency compared to ReRF on various datasets.
+
+ 

+
+
+
+
+ + ☆ Cheating Suffix: Targeted Attack to Text-To-Image Diffusion Models with + Multi-Modal Priors + + +
+ Diffusion models have been widely deployed in various image generation
+tasks, demonstrating an extraordinary connection between image and text
+modalities. However, they face the risk of being maliciously exploited to
+generate harmful or sensitive images by appending a specific suffix to the
+original prompt. Existing works mainly focus on using single-modal information
+to conduct attacks, which fails to utilize multi-modal features and results in
+less than satisfactory performance. By integrating multi-modal priors (MMP),
+i.e., both text and image features, we propose a targeted attack method named
+MMP-Attack in this work. Specifically, the goal of MMP-Attack is to add a
+target object into the image content while simultaneously removing the original
+object. MMP-Attack shows a notable advantage over existing works with superior
+universality and transferability, and can effectively attack commercial
+text-to-image (T2I) models such as DALL-E 3. To the best of our knowledge, this
+marks the first successful transfer-based attack on commercial T2I models. Our
+code is publicly available at https://github.com/ydc123/MMP-Attack.
+
+ 

+
+ comment: 10 figures +
+
+
+
+
+ + ☆ LIR: Efficient Degradation Removal for Lightweight Image Restoration + + +
+ Recently, there have been significant advancements in image restoration
+based on CNNs and transformers. However, the inherent characteristics of the
+image restoration task are often overlooked. Many works focus on basic block
+design and stack numerous such blocks into the model, leading to redundant
+parameters and unnecessary computations that hinder the efficiency of image
+restoration. In this paper, we propose a Lightweight Image Restoration network
+called LIR to efficiently remove degradation (blur, rain, noise, haze, etc.).
+A key component in LIR is the Efficient Adaptive Attention (EAA) Block, which
+is mainly composed of Adaptive Filters and Attention Blocks. It is capable of
+adaptively sharpening contours, removing degradation, and capturing global
+information in various image restoration scenes in an efficient and
+computation-friendly manner. In addition, through a simple structural design,
+LIR addresses the degradations existing in the local and global residual
+connections that are ignored by modern networks. Extensive experiments
+demonstrate that our LIR achieves performance comparable to state-of-the-art
+networks on most benchmarks with fewer parameters and computations. Notably,
+LIR produces visual results that are better aligned with human aesthetics than
+those of state-of-the-art networks.
+
+ 

+
+
+
+
+ + ☆ FindingEmo: An Image Dataset for Emotion Recognition in the Wild + + +
+ We introduce FindingEmo, a new image dataset containing annotations for 25k +images, specifically tailored to Emotion Recognition. Contrary to existing +datasets, it focuses on complex scenes depicting multiple people in various +naturalistic, social settings, with images being annotated as a whole, thereby +going beyond the traditional focus on faces or single individuals. Annotated +dimensions include Valence, Arousal and Emotion label, with annotations +gathered using Prolific. Together with the annotations, we release the list of +URLs pointing to the original images, as well as all associated source code. + +
+
+ comment: 30 pages, 21 figures, 12 tables +
+
+
+
+
+
+ ☆ Describing Images Fast and Slow: Quantifying and Predicting
+ the Variation in Human Signals during Visuo-Linguistic Processes EACL 2024
+
+
+ 

+ There is an intricate relation between the properties of an image and how +humans behave while describing the image. This behavior shows ample variation, +as manifested in human signals such as eye movements and when humans start to +describe the image. Despite the value of such signals of visuo-linguistic +variation, they are virtually disregarded in the training of current pretrained +models, which motivates further investigation. Using a corpus of Dutch image +descriptions with concurrently collected eye-tracking data, we explore the +nature of the variation in visuo-linguistic signals, and find that they +correlate with each other. Given this result, we hypothesize that variation +stems partly from the properties of the images, and explore whether image +representations encoded by pretrained vision encoders can capture such +variation. Our results indicate that pretrained models do so to a +weak-to-moderate degree, suggesting that the models lack biases about what +makes a stimulus complex for humans and what leads to variations in human +outputs. + +
+
+ comment: To appear in EACL 2024 +
+
+
+
+
+
+ ☆ Skip \n: A simple method to reduce hallucination in
+ Large Vision-Language Models
+
+
+ 

+ Recent advancements in large vision-language models (LVLMs) have
+demonstrated impressive capability in visual information understanding with
+human language. Despite these advances, LVLMs still face challenges with
+multimodal hallucination, such as generating text descriptions of objects that
+are not present in the visual information. However, the underlying fundamental
+reasons for multimodal hallucinations remain poorly explored. In this paper, we
+propose a new perspective, suggesting that the inherent biases in LVLMs might
+be a key factor in hallucinations. Specifically, we systematically identify a
+semantic shift bias related to paragraph breaks ('\n\n'), where the content
+before and after '\n\n' in the training data frequently exhibits significant
+semantic changes. This pattern leads the model to infer that the content
+following '\n\n' should be markedly different from the preceding content,
+thereby increasing the probability of hallucinatory descriptions after the
+'\n\n'. We have validated this hypothesis on multiple publicly available LVLMs.
+Besides, we find that deliberately inserting '\n\n' into the generated
+description can induce more hallucinations. A simple method is proposed to
+effectively mitigate the hallucination of LVLMs by skipping the output of '\n'.
+
+ 

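+
+ A hedged sketch of the proposed mitigation as a decoding-time constraint:
+ mask the logits of the token ids that encode '\n' (or '\n\n') before
+ sampling, so the model cannot emit a paragraph break. The banned ids would
+ come from the tokenizer of a concrete LVLM; they are hard-coded here for
+ illustration.
+
+ import torch
+
+ def skip_newline(logits, banned_ids):
+     """Set banned token logits to -inf so they are never sampled."""
+     logits = logits.clone()
+     logits[..., banned_ids] = float("-inf")
+     return logits
+
+ # toy 10-token vocabulary; suppose ids 7 and 8 encode newline tokens
+ logits = torch.randn(1, 10)
+ masked = skip_newline(logits, banned_ids=[7, 8])
+ assert masked[0, 7].item() == float("-inf")
+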
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Simulator-Free Visual Domain Randomization via Video Games + + +
+ Domain randomization is an effective computer vision technique for improving +transferability of vision models across visually distinct domains exhibiting +similar content. Existing approaches, however, rely extensively on tweaking +complex and specialized simulation engines that are difficult to construct, +subsequently affecting their feasibility and scalability. This paper introduces +BehAVE, a video understanding framework that uniquely leverages the plethora of +existing commercial video games for domain randomization, without requiring +access to their simulation engines. Under BehAVE (1) the inherent rich visual +diversity of video games acts as the source of randomization and (2) player +behavior -- represented semantically via textual descriptions of actions -- +guides the *alignment* of videos with similar content. We test BehAVE on 25 +games of the first-person shooter (FPS) genre across various video and text +foundation models and we report its robustness for domain randomization. BehAVE +successfully aligns player behavioral patterns and is able to zero-shot +transfer them to multiple unseen FPS games when trained on just one FPS game. +In a more challenging setting, BehAVE manages to improve the zero-shot +transferability of foundation models to unseen FPS games (up to 22%) even when +trained on a game of a different genre (Minecraft). Code and dataset can be +found at https://github.com/nrasajski/BehAVE. + +
+
+
+
+
+ + ☆ A general framework for rotation invariant point cloud analysis ICASSP 2024 + + +
+ We propose a general method for deep learning based point cloud analysis,
+which is invariant to rotation of the inputs. Classical methods are vulnerable
+to rotation, as they usually take aligned point clouds as input. Principal
+Component Analysis (PCA) is a practical approach to achieve rotation
+invariance. However, there are still some gaps between the theory and practical
+algorithms. In this work, we present a thorough study on designing rotation
+invariant algorithms for point cloud analysis. We first formulate it as a
+permutation invariant problem, then propose a general framework which can be
+combined with any backbone. Our method is beneficial for further research such
+as 3D pre-training and multi-modal learning. Experiments show that our method
+achieves comparable or better performance than state-of-the-art approaches on
+common benchmarks. Code is available at
+https://github.com/luoshuqing2001/RI_framework.
+
+ 

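+
+ A minimal sketch of the PCA route to rotation invariance mentioned above:
+ project each point cloud onto its principal axes so that any rotated copy
+ lands in (nearly) the same canonical pose. The sign ambiguity of the axes is
+ one of the theory-practice gaps such frameworks must handle; the simple
+ third-moment convention below is our illustrative choice, not the paper's.
+
+ import numpy as np
+
+ def pca_canonicalize(points):
+     """Map an (N, 3) point cloud into its PCA frame."""
+     centered = points - points.mean(axis=0)
+     _, _, vt = np.linalg.svd(centered, full_matrices=False)
+     proj = centered @ vt.T
+     signs = np.sign((proj ** 3).sum(axis=0))  # fix per-axis sign flips
+     signs[signs == 0] = 1.0
+     return proj * signs
+
+ pts = np.random.rand(100, 3)
+ c, s = np.cos(0.7), np.sin(0.7)
+ R = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
+ # the canonicalized clouds agree up to numerical noise
+ print(np.abs(pca_canonicalize(pts) - pca_canonicalize(pts @ R.T)).max())
+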
+
+ comment: 5 pages, 1 figure, accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ AutoGCN -- Towards Generic Human Activity Recognition with Neural + Architecture Search + + +
+ This paper introduces AutoGCN, a generic Neural Architecture Search (NAS) +algorithm for Human Activity Recognition (HAR) using Graph Convolution Networks +(GCNs). HAR has gained attention due to advances in deep learning, increased +data availability, and enhanced computational capabilities. At the same time, +GCNs have shown promising results in modeling relationships between body key +points in a skeletal graph. While domain experts often craft dataset-specific +GCN-based methods, their applicability beyond this specific context is severely +limited. AutoGCN seeks to address this limitation by simultaneously searching +for the ideal hyperparameters and architecture combination within a versatile +search space using a reinforcement controller while balancing optimal +exploration and exploitation behavior with a knowledge reservoir during the +search process. We conduct extensive experiments on two large-scale datasets +focused on skeleton-based action recognition to assess the proposed algorithm's +performance. Our experimental results underscore the effectiveness of AutoGCN +in constructing optimal GCN architectures for HAR, outperforming conventional +NAS and GCN methods, as well as random search. These findings highlight the +significance of a diverse search space and an expressive input representation +to enhance the network performance and generalizability. + +
+
+
+
+
+ + ☆ Deep Multimodal Fusion of Data with Heterogeneous Dimensionality via + Projective Networks + + +
+ The use of multimodal imaging has led to significant improvements in the +diagnosis and treatment of many diseases. Similar to clinical practice, some +works have demonstrated the benefits of multimodal fusion for automatic +segmentation and classification using deep learning-based methods. However, +current segmentation methods are limited to fusion of modalities with the same +dimensionality (e.g., 3D+3D, 2D+2D), which is not always possible, and the +fusion strategies implemented by classification methods are incompatible with +localization tasks. In this work, we propose a novel deep learning-based +framework for the fusion of multimodal data with heterogeneous dimensionality +(e.g., 3D+2D) that is compatible with localization tasks. The proposed +framework extracts the features of the different modalities and projects them +into the common feature subspace. The projected features are then fused and +further processed to obtain the final prediction. The framework was validated +on the following tasks: segmentation of geographic atrophy (GA), a late-stage +manifestation of age-related macular degeneration, and segmentation of retinal +blood vessels (RBV) in multimodal retinal imaging. Our results show that the +proposed method outperforms the state-of-the-art monomodal methods on GA and +RBV segmentation by up to 3.10% and 4.64% Dice, respectively. + +
+
+ comment: Accepted for publication in the IEEE Journal of Biomedical and Health + Informatics (JBHI) +
+
+
+
+
+ + ☆ Phrase Grounding-based Style Transfer for Single-Domain Generalized + Object Detection + + +
+ Single-domain generalized object detection aims to enhance a model's
+generalizability to multiple unseen target domains using only data from a
+single source domain during training. This is a practical yet challenging task,
+as it requires the model to address domain shift without incorporating target
+domain data into training. In this paper, we propose a novel phrase
+grounding-based style transfer (PGST) approach for the task. Specifically, we
+first define textual prompts to describe potential objects for each unseen
+target domain. Then, we leverage the grounded language-image pre-training
+(GLIP) model to learn the style of these target domains and achieve style
+transfer from the source to the target domain. The style-transferred source
+visual features are semantically rich and could be close to their imaginary
+counterparts in the target domain. Finally, we employ these style-transferred
+visual features to fine-tune GLIP. By introducing imaginary counterparts, the
+detector can be effectively generalized to unseen target domains using only a
+single source domain for training. Extensive experimental results on five
+diverse weather driving benchmarks demonstrate that our proposed approach
+achieves state-of-the-art performance, even surpassing some domain adaptive
+methods that incorporate target domain images into the training process. The
+source code and pre-trained models will be made available.
+
+ 

+
+ comment: 22 pages, 13 figures +
+
+
+
+
+ + ☆ AGILE: Approach-based Grasp Inference Learned from Element Decomposition + + +
+ Humans, experts at grasping, take hand-object positioning into account when
+picking up objects. This work proposes a method that enables a robot
+manipulator to learn the same skill: grasping objects in the way best suited to
+how the gripper has approached the object. Built on deep learning, the proposed
+method consists of two main stages. To generalize the network to unseen
+objects, the proposed Approach-based Grasping Inference involves an element
+decomposition stage that splits an object into its main parts, each with one or
+more annotated grasps for a particular approach of the gripper. Subsequently, a
+grasp detection network utilizes the elements decomposed by Mask R-CNN and the
+information on the approach of the gripper to detect the element the gripper
+has approached and the most optimal grasp. To train the networks, the study
+introduces a robotic grasping dataset collected in the CoppeliaSim simulation
+environment. The dataset involves 10 different objects with annotated element
+decomposition masks and grasp rectangles. The proposed method achieves a 90%
+grasp success rate on seen objects and 78% on unseen objects in the CoppeliaSim
+simulation environment. Lastly, simulation-to-reality domain adaptation is
+performed by applying transformations to the training set collected in
+simulation and augmenting the dataset, which results in a 70% physical grasp
+success rate using a Delta parallel robot and a 2-fingered gripper.
+
+ 

+
+ comment: Conference Paper, ICROM 2023, 8 pages, 8 figures +
+
+
+
+
+ + ☆ Bi-CryptoNets: Leveraging Different-Level Privacy for Encrypted + Inference + + +
+ Privacy-preserving neural networks have attracted increasing attention in
+recent years, and various algorithms have been developed to balance accuracy,
+computational complexity, and information security from the cryptographic
+point of view. This work takes a different view, starting from the input data
+and the structure of neural networks. We decompose the input data (e.g.,
+images) into sensitive and insensitive segments according to importance and
+privacy. The sensitive segment includes important and private information, such
+as human faces, which we protect with strong homomorphic encryption, whereas
+the insensitive segment contains background content, to which we add
+perturbations. We propose the bi-CryptoNets, i.e., plaintext and ciphertext
+branches, to deal with the two segments, respectively, where the ciphertext
+branch can utilize information from the plaintext branch through unidirectional
+connections. We adopt knowledge distillation for our bi-CryptoNets by
+transferring representations from a well-trained teacher neural network.
+Empirical studies show the effectiveness of our bi-CryptoNets and the resulting
+decrease in inference latency.
+
+ 

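+
+ A toy sketch of the input decomposition described above: split an image into
+ a sensitive segment (destined for homomorphic encryption) and an insensitive
+ remainder that is merely perturbed. The mask would come from a face detector
+ or similar; everything here is illustrative, and the encryption step itself
+ is omitted.
+
+ import numpy as np
+
+ def split_and_perturb(image, mask, noise_std=0.1):
+     """Return (sensitive, insensitive) segments of `image` given a
+     binary `mask`; the insensitive part gets additive noise."""
+     rng = np.random.default_rng(0)
+     sensitive = image * mask  # -> ciphertext branch (encrypt here)
+     noise = rng.normal(0.0, noise_std, image.shape)
+     insensitive = (image + noise) * (1 - mask)  # -> plaintext branch
+     return sensitive, insensitive
+
+ img = np.random.rand(64, 64)
+ mask = np.zeros_like(img)
+ mask[16:48, 16:48] = 1.0  # pretend a face was detected here
+ s, ins = split_and_perturb(img, mask)
+ print(s.max(), round(float(ins.std()), 3))
+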
+
+
+
+
+ + ☆ UCVC: A Unified Contextual Video Compression Framework with Joint + P-frame and B-frame Coding + + +
+ This paper presents a learned video compression method in response to the
+video compression track of the 6th Challenge on Learned Image Compression
+(CLIC) at DCC 2024. Specifically, we propose a unified contextual video
+compression framework (UCVC) for joint P-frame and B-frame coding. Each
+non-intra frame refers to two neighboring decoded frames, which can be either
+both from the past for P-frame compression, or one from the past and one from
+the future for B-frame compression. In the training stage, the model parameters
+are jointly optimized with both P-frames and B-frames. Benefiting from these
+designs, the framework can support both P-frame and B-frame coding and achieve
+compression efficiency comparable to that of frameworks specifically designed
+for P-frame or B-frame coding. For the challenge submission, we report the
+optimal compression efficiency by selecting appropriate frame types for each
+test sequence. Our team name is PKUSZ-LVC.
+
+ 

+
+ comment: DCC2024, CLIC2024 +
+
+
+
+
+ + ☆ Spiking CenterNet: A Distillation-boosted Spiking Neural Network for + Object Detection + + +
+ In the era of AI at the edge, self-driving cars, and climate change, the need +for energy-efficient, small, embedded AI is growing. Spiking Neural Networks +(SNNs) are a promising approach to address this challenge, with their +event-driven information flow and sparse activations. We propose Spiking +CenterNet for object detection on event data. It combines an SNN CenterNet +adaptation with an efficient M2U-Net-based decoder. Our model significantly +outperforms comparable previous work on Prophesee's challenging GEN1 Automotive +Detection Dataset while using less than half the energy. Distilling the +knowledge of a non-spiking teacher into our SNN further increases performance. +To the best of our knowledge, our work is the first approach that takes +advantage of knowledge distillation in the field of spiking object detection. + +
+
+ comment: 8 pages, 5 figures. Submitted to WCCI-2024 +
+
+
+
+
+ + ☆ Spectrum-guided Feature Enhancement Network for Event Person + Re-Identification + + +
+ As a cutting-edge biosensor, the event camera holds significant potential in
+the field of computer vision, particularly regarding privacy preservation.
+However, compared to traditional cameras, event streams often contain noise and
+possess extremely sparse semantics, posing a formidable challenge for
+event-based person re-identification (event Re-ID). To address this, we
+introduce a novel event person re-identification network: the Spectrum-guided
+Feature Enhancement Network (SFE-Net). This network consists of two innovative
+components: the Multi-grain Spectrum Attention Mechanism (MSAM) and the
+Consecutive Patch Dropout Module (CPDM). MSAM employs a Fourier spectrum
+transform strategy to filter event noise, while also utilizing an event-guided
+multi-granularity attention strategy to enhance and capture discriminative
+person semantics. CPDM employs a consecutive patch dropout strategy to generate
+multiple incomplete feature maps, encouraging the deep Re-ID model to equally
+perceive each effective region of the person's body and capture robust person
+descriptors. Extensive experiments on event Re-ID datasets demonstrate that our
+SFE-Net achieves the best performance in this task.
+
+ 

+
+
+
+
+ + ☆ Cascaded Scaling Classifier: class incremental learning with probability + scaling + + +
+ Humans are capable of acquiring new knowledge and transferring learned
+knowledge into different domains with little forgetting. The same ability,
+called Continual Learning, is challenging to achieve with neural networks,
+since learning new tasks causes forgetting of past ones. This forgetting can be
+mitigated by replaying stored samples from past tasks, but a large memory may
+be needed for long sequences of tasks, and replay can lead to overfitting on
+the saved samples. In this paper, we propose a novel regularisation approach,
+Margin Dampening, and a novel incremental classifier, the Cascaded Scaling
+Classifier. The former combines a soft constraint and a knowledge distillation
+approach to preserve past learned knowledge while allowing the model to learn
+new patterns effectively. The latter is a gated incremental classifier that
+helps the model modify past predictions without directly interfering with
+them. This is achieved by modifying the output of the model with auxiliary
+scaling functions. We empirically show that our approach performs well on
+multiple benchmarks against well-established baselines, and we study each
+component of our proposal and how their combinations affect the final results.
+
+ 

+
+
+
+
+ + ☆ Can Shape-Infused Joint Embeddings Improve Image-Conditioned 3D + Diffusion? + + +
+ Recent advancements in deep generative models, particularly the application
+of CLIP (Contrastive Language-Image Pretraining) to Denoising Diffusion
+Probabilistic Models (DDPMs), have demonstrated remarkable effectiveness in
+text-to-image generation. The well-structured embedding space of CLIP has also
+been extended to image-to-shape generation with DDPMs, yielding notable
+results. Despite these successes, some fundamental questions arise: Does CLIP
+ensure the best results in shape generation from images? Can we leverage
+conditioning to bring explicit 3D knowledge into the generative process and
+obtain better quality? This study introduces CISP (Contrastive Image-Shape
+Pre-training), designed to enhance 3D shape synthesis guided by 2D images. CISP
+aims to enrich the CLIP framework by aligning 2D images with 3D shapes in a
+shared embedding space, specifically capturing 3D characteristics potentially
+overlooked by CLIP's text-image focus. Our comprehensive analysis assesses
+CISP's guidance performance against CLIP-guided models, focusing on generation
+quality, diversity, and coherence of the produced shapes with the conditioning
+image. We find that, while matching CLIP in generation quality and diversity,
+CISP substantially improves coherence with input images, underscoring the value
+of incorporating 3D knowledge into generative models. These findings suggest a
+promising direction for advancing the synthesis of 3D visual content by
+integrating multimodal systems with 3D representations.
+
+ 

+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ PRIME: Protect Your Videos From Malicious Editing + + +
+ With the development of generative models, the quality of generated content
+keeps increasing. Recently, open-source models have made it surprisingly easy
+to manipulate and edit photos and videos with just a few simple prompts. While
+these cutting-edge technologies have gained popularity, they have also given
+rise to concerns regarding the privacy and portrait rights of individuals.
+Malicious users can exploit these tools for deceptive or illegal purposes.
+Although some previous works focus on protecting photos against generative
+models, we find there are still gaps between protecting videos and images in
+terms of both efficiency and effectiveness. Therefore, we introduce our
+protection method, PRIME, to significantly reduce the time cost and improve
+protection performance. To evaluate the proposed method, we consider both
+objective metrics and human subjective metrics. Our evaluation results indicate
+that PRIME costs only 8.3% of the GPU hours required by the previous
+state-of-the-art method while achieving better protection results on both human
+evaluation and objective metrics. Code can be found at
+https://github.com/GuanlinLee/prime.
+
+ 

+
+
+
+
+ + ☆ Delving into Decision-based Black-box Attacks on Semantic Segmentation + + +
+ Semantic segmentation is a fundamental visual task that finds extensive
+deployment in applications with security-sensitive considerations. Nonetheless,
+recent work illustrates the adversarial vulnerability of semantic segmentation
+models to white-box attacks. However, their adversarial robustness against
+black-box attacks has not been fully explored. In this paper, we present the
+first exploration of black-box decision-based attacks on semantic segmentation.
+First, we analyze the challenges that semantic segmentation poses for
+decision-based attacks through a case study. Then, to address these challenges,
+we propose Discrete Linear Attack (DLA), a decision-based attack on semantic
+segmentation. Based on random search and a proxy index, we utilize discrete
+linear noises for perturbation exploration and calibration to achieve high
+attack efficiency. We conduct an adversarial robustness evaluation on 5 models
+from Cityscapes and ADE20K under 8 attacks. DLA shows its formidable power on
+Cityscapes by dramatically reducing PSPNet's mIoU from an impressive 77.83% to
+a mere 2.14% with just 50 queries.
+
+ 

+
+
+
+
+ + ☆ Taming Uncertainty in Sparse-view Generalizable NeRF via Indirect + Diffusion Guidance + + +
+ Neural Radiance Fields (NeRF) have demonstrated effectiveness in synthesizing +novel views. However, their reliance on dense inputs and scene-specific +optimization has limited their broader applicability. Generalizable NeRFs +(Gen-NeRF), while intended to address this, often produce blurring artifacts in +unobserved regions with sparse inputs, which are full of uncertainty. In this +paper, we aim to diminish the uncertainty in Gen-NeRF for plausible renderings. +We assume that NeRF's inability to effectively mitigate this uncertainty stems +from its inherent lack of generative capacity. Therefore, we innovatively +propose an Indirect Diffusion-guided NeRF framework, termed ID-NeRF, to address +this uncertainty from a generative perspective by leveraging a distilled +diffusion prior as guidance. Specifically, to avoid model confusion caused by +directly regularizing with inconsistent samplings as in previous methods, our +approach introduces a strategy to indirectly inject the inherently missing +imagination into the learned implicit function through a diffusion-guided +latent space. Empirical evaluation across various benchmarks demonstrates the +superior performance of our approach in handling uncertainty with sparse +inputs. + +
+
+
+
+
+ + ☆ TSJNet: A Multi-modality Target and Semantic Awareness Joint-driven + Image Fusion Network + + +
+ Multi-modality image fusion involves integrating complementary information
+from different modalities into a single image. Current methods primarily focus
+on enhancing image fusion with a single advanced task, such as incorporating
+semantic or object-related information into the fusion process, which makes it
+challenging to achieve multiple objectives simultaneously. We introduce a
+target and semantic awareness joint-driven fusion network called TSJNet.
+TSJNet comprises fusion, detection, and segmentation subnetworks arranged in a
+series structure. It leverages object and semantically relevant information
+derived from dual high-level tasks to guide the fusion network. Additionally,
+we propose a local significant feature extraction module with a double parallel
+branch structure to fully capture the fine-grained features of cross-modal
+images and foster interaction among modalities, targets, and segmentation
+information. We conducted extensive experiments on four publicly available
+datasets (MSRS, M3FD, RoadScene, and LLVIP). The results demonstrate that
+TSJNet can generate visually pleasing fused results, achieving average
+increases of 2.84% in object detection mAP@0.5 and 7.47% in segmentation mIoU
+compared to the state-of-the-art methods.
+
+ 

+
+
+
+
+ + ☆ Structured World Modeling via Semantic Vector Quantization ICLR 2024 + + +
+ Neural discrete representations are crucial components of modern neural
+networks. However, their main limitation is that the primary strategies such as
+VQ-VAE can only provide representations at the patch level. Therefore, one of
+the main goals of representation learning, acquiring structured, semantic, and
+compositional abstractions such as the color and shape of an object, remains
+elusive. In this paper, we present the first approach to semantic neural
+discrete representation learning. The proposed model, called Semantic
+Vector-Quantized Variational Autoencoder (SVQ), leverages recent advances in
+unsupervised object-centric learning to address this limitation. Specifically,
+we observe that a simple approach that quantizes at the object level poses a
+significant challenge and propose constructing scene representations
+hierarchically, from low-level discrete concept schemas to object
+representations. Additionally, we suggest a novel method for structured
+semantic world modeling by training a prior over these representations,
+enabling the generation of images by sampling the semantic properties of the
+objects in the scene. In experiments on various 2D and 3D object-centric
+datasets, we find that our model achieves superior generation performance
+compared to non-semantic vector quantization methods such as VQ-VAE and
+previous object-centric generative models. Furthermore, we find that the
+semantic discrete representations can solve downstream scene understanding
+tasks that require reasoning about the properties of different objects in the
+scene.
+
+ 

+
+ comment: Accepted in ICLR 2024 +
+
+
+
+
+ + ☆ Unsupervised Generation of Pseudo Normal PET from MRI with Diffusion + Model for Epileptic Focus Localization SP + + +
+ [18F]fluorodeoxyglucose (FDG) positron emission tomography (PET) has emerged
+as a crucial tool in identifying the epileptic focus, especially in cases where
+magnetic resonance imaging (MRI) diagnosis yields indeterminate results. FDG
+PET can provide the metabolic information of glucose and help identify abnormal
+areas that are not easily found through MRI. However, the effectiveness of FDG
+PET-based assessment and diagnosis depends on the selection of a healthy
+control group. The healthy control group typically consists of healthy
+individuals similar to epilepsy patients in terms of age, gender, and other
+aspects, whose normal FDG PET data serve as a reference for enhancing the
+accuracy and reliability of the epilepsy diagnosis. However, significant
+challenges arise when a healthy PET control group is unattainable. Yaakub et
+al. have previously introduced a Pix2PixGAN-based method for MRI to PET
+translation. This method used paired MRI and FDG PET scans from healthy
+individuals for training, and produced pseudo normal FDG PET images from
+patient MRIs that are subsequently used for lesion detection. However, this
+approach requires a large amount of high-quality, paired MRI and PET images
+from healthy control subjects, which may not always be available. In this
+study, we investigated unsupervised learning methods for unpaired MRI to PET
+translation for generating pseudo normal FDG PET for epileptic focus
+localization. Two deep learning methods, CycleGAN and SynDiff, were employed,
+and we found that the diffusion-based method achieved improved performance in
+accurately localizing the epileptic focus.
+
+ 

+
+ comment: SPIE Medical Imaging 2024 +
+
+
+
+
+ + ☆ Segment Any Change + + +
+ Visual foundation models have achieved remarkable results in zero-shot image
+classification and segmentation, but zero-shot change detection remains an open
+problem. In this paper, we propose the segment any change models (AnyChange), a
+new type of change detection model that supports zero-shot prediction and
+generalization on unseen change types and data distributions. AnyChange is
+built on the segment anything model (SAM) via our training-free adaptation
+method, bitemporal latent matching. By revealing and exploiting intra-image and
+inter-image semantic similarities in SAM's latent space, bitemporal latent
+matching endows SAM with zero-shot change detection capabilities in a
+training-free way. We also propose a point query mechanism to enable
+AnyChange's zero-shot object-centric change detection capability. We perform
+extensive experiments to confirm the effectiveness of AnyChange for zero-shot
+change detection. AnyChange sets a new record on the SECOND benchmark for
+unsupervised change detection, exceeding the previous SOTA by up to 4.4% F1
+score, and achieving comparable accuracy with negligible manual annotations (1
+pixel per image) for supervised change detection.
+
+ 

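+
+ The core of bitemporal latent matching can be sketched in a few lines:
+ compare the dense latent features of the two acquisition dates and flag
+ pixels whose semantics disagree. Using cosine similarity with a fixed
+ threshold is our simplification; the paper derives the matching from SAM's
+ latent space rather than from the random features used here.
+
+ import torch
+ import torch.nn.functional as F
+
+ def bitemporal_change_map(feat_t1, feat_t2, threshold=0.5):
+     """feat_*: (C, H, W) embeddings of the same scene at two times;
+     returns a boolean (H, W) mask of changed pixels."""
+     sim = F.cosine_similarity(feat_t1, feat_t2, dim=0)  # (H, W)
+     return sim < threshold
+
+ f1, f2 = torch.randn(256, 64, 64), torch.randn(256, 64, 64)
+ print(bitemporal_change_map(f1, f2).float().mean())  # changed fraction
+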
+
+ comment: technical report, 12 pages +
+
+
+
+
+ + ☆ DeepBranchTracer: A Generally-Applicable Approach to Curvilinear + Structure Reconstruction Using Multi-Feature Learning AAAI 2024 + + +
+ Curvilinear structures, which include line-like continuous objects, are +fundamental geometrical elements in image-based applications. Reconstructing +these structures from images constitutes a pivotal research area in computer +vision. However, the complex topology and ambiguous image evidence render this +process a challenging task. In this paper, we introduce DeepBranchTracer, a +novel method that learns both external image features and internal geometric +characteristics to reconstruct curvilinear structures. Firstly, we formulate +the curvilinear structures extraction as a geometric attribute estimation +problem. Then, a curvilinear structure feature learning network is designed to +extract essential branch attributes, including the image features of centerline +and boundary, and the geometric features of direction and radius. Finally, +utilizing a multi-feature fusion tracing strategy, our model iteratively traces +the entire branch by integrating the extracted image and geometric features. We +extensively evaluated our model on both 2D and 3D datasets, demonstrating its +superior performance over existing segmentation and reconstruction methods in +terms of accuracy and continuity. + +
+
+ comment: 10 pages, 6 figures, AAAI 2024 accepted +
+
+
+
+
+ + ☆ Ambient-Pix2PixGAN for Translating Medical Images from Noisy Data SP + + +
+ Image-to-image translation is a common computer vision task whose impact on
+medical imaging has been growing rapidly. Deep learning-based methods that
+employ conditional generative adversarial networks (cGANs), such as Pix2PixGAN,
+have been extensively explored to perform image-to-image translation tasks.
+However, when noisy medical image data are considered, such methods cannot be
+directly applied to produce clean images. Recently, an augmented GAN
+architecture named AmbientGAN has been proposed that can be trained on noisy
+measurement data to synthesize high-quality clean medical images. Inspired by
+AmbientGAN, in this work, we propose a new cGAN architecture,
+Ambient-Pix2PixGAN, for performing medical image-to-image translation tasks
+using noisy measurement data. Numerical studies that consider MRI-to-PET
+translation are conducted. Both traditional image quality metrics and
+task-based image quality metrics are employed to assess the proposed
+Ambient-Pix2PixGAN. It is demonstrated that our proposed Ambient-Pix2PixGAN can
+be successfully trained on noisy measurement data to produce high-quality
+translated images in the target imaging modality.
+
+ 

+
+ comment: SPIE Medical Imaging 2024 +
+
+
+
+
+ + ☆ AmbientCycleGAN for Establishing Interpretable Stochastic Object Models + Based on Mathematical Phantoms and Medical Imaging Measurements SP + + +
+ Medical imaging systems that are designed for producing diagnostically +informative images should be objectively assessed via task-based measures of +image quality (IQ). Ideally, computation of task-based measures of IQ needs to +account for all sources of randomness in the measurement data, including the +variability in the ensemble of objects to be imaged. To address this need, +stochastic object models (SOMs) that can generate an ensemble of synthesized +objects or phantoms can be employed. Various mathematical SOMs or phantoms were +developed that can interpretably synthesize objects, such as lumpy object +models and parameterized torso phantoms. However, such SOMs that are purely +mathematically defined may not be able to comprehensively capture realistic +object variations. To establish realistic SOMs, it is desirable to use +experimental data. An augmented generative adversarial network (GAN), +AmbientGAN, was recently proposed for establishing SOMs from medical imaging +measurements. However, it remains unclear to which extent the +AmbientGAN-produced objects can be interpretably controlled. This work +introduces a novel approach called AmbientCycleGAN that translates mathematical +SOMs to realistic SOMs by use of noisy measurement data. Numerical studies that +consider clustered lumpy background (CLB) models and real mammograms are +conducted. It is demonstrated that our proposed method can stably establish +SOMs based on mathematical models and noisy measurement data. Moreover, the +ability of the proposed AmbientCycleGAN to interpretably control image features +in the synthesized objects is investigated. + +
+
+ comment: SPIE Medical Imaging 2024 +
+
+
+
+
+ + ☆ Faster Inference of Integer SWIN Transformer by Removing the GELU + Activation AAAI 2024 + + +
+ The SWIN transformer is a prominent vision transformer model that achieves
+state-of-the-art accuracy in image classification tasks. Despite this success,
+its unique architecture causes slower inference compared with similar deep
+neural networks. Integer quantization of the model is one of the methods used
+to improve its inference latency. However, state-of-the-art methods have not
+been able to fully quantize the model. In this work, we improve upon the
+inference latency of the state-of-the-art methods by removing the
+floating-point operations associated with the GELU activation in the SWIN
+transformer. While previous work proposed replacing the non-integer operations
+with linear approximation functions, we propose to replace GELU with the ReLU
+activation. The advantage of ReLU over previous methods is its low memory and
+computation complexity. We use iterative knowledge distillation to compensate
+for the accuracy lost by replacing GELU with ReLU. We quantize our GELU-less
+SWIN transformer and show that on an RTX 4090 NVIDIA GPU we can improve the
+inference latency of the quantized SWIN transformer by at least 11% while
+maintaining an accuracy drop of under 0.5% on the ImageNet evaluation dataset.
+
+ 

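+
+ The two ingredients described above, swapping GELU for ReLU and recovering
+ accuracy with knowledge distillation, can be sketched as follows. The
+ recursive module swap is standard PyTorch; the distillation temperature and
+ toy networks are illustrative assumptions, not the paper's settings.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ def replace_gelu_with_relu(model):
+     """Recursively swap every nn.GELU for nn.ReLU, removing the
+     transcendental ops that block full integer quantization."""
+     for name, child in model.named_children():
+         if isinstance(child, nn.GELU):
+             setattr(model, name, nn.ReLU(inplace=True))
+         else:
+             replace_gelu_with_relu(child)
+     return model
+
+ def distill_step(student, teacher, x, T=2.0):
+     """KL loss between softened teacher and student logits."""
+     with torch.no_grad():
+         t = teacher(x)
+     s = student(x)
+     return F.kl_div(F.log_softmax(s / T, -1), F.softmax(t / T, -1),
+                     reduction="batchmean") * T * T
+
+ teacher = nn.Sequential(nn.Linear(8, 16), nn.GELU(), nn.Linear(16, 4))
+ student = replace_gelu_with_relu(
+     nn.Sequential(nn.Linear(8, 16), nn.GELU(), nn.Linear(16, 4)))
+ distill_step(student, teacher, torch.randn(2, 8)).backward()
+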
+
+ comment: 5 pages, 1 figure. Submitted to Edge Intelligence Workshop III, an + AAAI 2024 workshop +
+
+
+
+
+ + ☆ A Comprehensive Survey on 3D Content Generation + + +
+ Recent years have witnessed remarkable advances in artificial intelligence
+generated content (AIGC), with diverse input modalities, e.g., text, image,
+video, audio and 3D. Among these, 3D is the visual modality closest to the
+real-world 3D environment and carries enormous knowledge. 3D content generation
+offers both academic and practical value while also presenting formidable
+technical challenges. This review aims to consolidate developments within the
+burgeoning domain of 3D content generation. Specifically, a new taxonomy is
+proposed that categorizes existing approaches into three types: 3D native
+generative methods, 2D prior-based 3D generative methods, and hybrid 3D
+generative methods. The survey covers approximately 60 papers spanning the
+major techniques. Besides, we discuss the limitations of current 3D content
+generation techniques, and point out open challenges as well as promising
+directions for future work. Accompanied with this survey, we have established a
+project website where resources on 3D content generation research are provided.
+The project page is available at https://github.com/hitcslj/Awesome-AIGC-3D.
+
+ 

+
+
+
+
+ + ☆ Enhanced Urban Region Profiling with Adversarial Self-Supervised + Learning + + +
+ Urban region profiling is pivotal for smart cities, but mining fine-grained
+semantics from noisy and incomplete urban data remains challenging. In
+response, we propose a novel self-supervised graph collaborative filtering
+model for urban region embedding called EUPAS. Specifically, region
+heterogeneous graphs containing human mobility data, points of interest (POIs),
+and geographic neighborhood details for each region are fed into the model,
+which generates region embeddings that preserve intra-region and inter-region
+dependencies through GCNs and multi-head attention. Meanwhile, we introduce
+spatial perturbation augmentation to generate positive samples that are
+semantically similar and spatially close to the anchor, preparing for
+subsequent contrastive learning. Furthermore, adversarial training is employed
+to construct an effective pretext task by generating strong positive pairs and
+mining hard negative pairs for the region embeddings. Finally, we jointly
+optimize supervised and self-supervised learning to encourage the model to
+capture the high-level semantics of region embeddings while ignoring the noisy
+and unimportant details. Extensive experiments on real-world datasets
+demonstrate the superiority of our model over state-of-the-art methods.
+
+ 

+
+
+
+
+ + ☆ 2AFC Prompting of Large Multimodal Models for Image Quality Assessment + + +
+ While abundant research has been conducted on improving the high-level visual
+understanding and reasoning capabilities of large multimodal models (LMMs),
+their visual quality assessment (IQA) ability has been relatively
+under-explored. Here we take initial steps towards this goal by employing
+two-alternative forced choice (2AFC) prompting, as 2AFC is widely regarded as
+the most reliable way of collecting human opinions of visual quality.
+Subsequently, the global quality score of each image estimated by a particular
+LMM can be efficiently aggregated using maximum a posteriori estimation.
+Meanwhile, we introduce three evaluation criteria: consistency, accuracy, and
+correlation, to provide comprehensive quantifications and deeper insights into
+the IQA capability of five LMMs. Extensive experiments show that existing LMMs
+exhibit remarkable IQA ability on coarse-grained quality comparison, but there
+is room for improvement on fine-grained quality discrimination. The proposed
+dataset sheds light on the future development of IQA models based on LMMs. The
+code will be made publicly available at https://github.com/h4nwei/2AFC-LMMs.
+
+ 

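+
+ One way to aggregate 2AFC outcomes into global quality scores is the classic
+ Bradley-Terry iteration below. The paper uses a maximum a posteriori
+ estimate; this unregularized maximum-likelihood version is a simplified
+ stand-in with made-up win counts.
+
+ import numpy as np
+
+ def bradley_terry_scores(wins, iters=200):
+     """wins[i, j] = how often image i was preferred over image j in
+     2AFC prompts; returns normalized global quality scores."""
+     n = wins.shape[0]
+     total = wins + wins.T  # comparisons played between each pair
+     s = np.ones(n)
+     for _ in range(iters):
+         denom = (total / (s[:, None] + s[None, :])).sum(axis=1)
+         s = wins.sum(axis=1) / np.maximum(denom, 1e-12)
+         s /= s.sum()
+     return s
+
+ # toy example: image 0 beats images 1 and 2 most of the time
+ wins = np.array([[0, 8, 9], [2, 0, 6], [1, 4, 0]], dtype=float)
+ print(bradley_terry_scores(wins))  # highest score for image 0
+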
+
+
+
+
+ + ☆ Source-Free Unsupervised Domain Adaptation with Hypothesis Consolidation + of Prediction Rationale + + +
+ Source-Free Unsupervised Domain Adaptation (SFUDA) is a challenging task +where a model needs to be adapted to a new domain without access to target +domain labels or source domain data. The primary difficulty in this task is +that the model's predictions may be inaccurate, and using these inaccurate +predictions for model adaptation can lead to misleading results. To address +this issue, this paper proposes a novel approach that considers multiple +prediction hypotheses for each sample and investigates the rationale behind +each hypothesis. By consolidating these hypothesis rationales, we identify the +most likely correct hypotheses, which we then use as a pseudo-labeled set to +support a semi-supervised learning procedure for model adaptation. To achieve +the optimal performance, we propose a three-step adaptation process: model +pre-adaptation, hypothesis consolidation, and semi-supervised learning. +Extensive experimental results demonstrate that our approach achieves +state-of-the-art performance in the SFUDA task and can be easily integrated +into existing approaches to improve their performance. The codes are available +at \url{https://github.com/GANPerf/HCPR}. + +
+
+
+
+
+ + ☆ Scale Equalization for Multi-Level Feature Fusion + + +
+ Deep neural networks have exhibited remarkable performance in a variety of
+computer vision fields, especially in semantic segmentation tasks. Their
+success is often attributed to multi-level feature fusion, which enables them
+to understand both global and local information from an image. However, we
+found that multi-level features from parallel branches are on different scales.
+This scale disequilibrium is a universal and unwanted flaw that leads to
+detrimental gradient descent, thereby degrading performance in semantic
+segmentation. We discover that scale disequilibrium is caused by bilinear
+upsampling, which is supported by both theoretical and empirical evidence.
+Based on this observation, we propose injecting scale equalizers to achieve
+scale equilibrium across multi-level features after bilinear upsampling. Our
+proposed scale equalizers are easy to implement, applicable to any
+architecture, hyperparameter-free, free of extra computational cost, and
+guarantee scale equilibrium for any dataset. Experiments showed that adopting
+scale equalizers consistently improved the mIoU index across various target
+datasets, including ADE20K, PASCAL VOC 2012, and Cityscapes, as well as various
+decoder choices, including UPerHead, PSPHead, ASPPHead, SepASPPHead, and
+FCNHead.
+
+ 

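+
+ A minimal sketch of one way to realize the proposed scale equilibrium:
+ rescale each bilinearly upsampled branch to unit standard deviation before
+ fusion, so that no branch dominates the gradients. The paper's exact
+ equalizer may differ; this only illustrates the hyperparameter-free,
+ architecture-agnostic spirit.
+
+ import torch
+ import torch.nn.functional as F
+
+ def scale_equalizer(feats):
+     """Normalize each feature map to unit std before fusion."""
+     return [f / f.std().clamp_min(1e-6) for f in feats]
+
+ # multi-level features upsampled to a common resolution
+ levels = [torch.randn(1, 64, s, s) for s in (8, 16, 32)]
+ upsampled = [F.interpolate(f, size=(32, 32), mode="bilinear",
+                            align_corners=False) for f in levels]
+ fused = torch.cat(scale_equalizer(upsampled), dim=1)
+ print([round(f.std().item(), 3) for f in scale_equalizer(upsampled)])
+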
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ DeepAAT: Deep Automated Aerial Triangulation for Fast UAV-based Mapping + + +
+ Automated Aerial Triangulation (AAT), aiming to restore image pose and
+reconstruct sparse points simultaneously, plays a pivotal role in earth
+observation. With a rich research heritage spanning several decades in
+photogrammetry, AAT has evolved into a fundamental process widely applied in
+large-scale Unmanned Aerial Vehicle (UAV) based mapping. Despite its
+advancements, classic AAT methods still face challenges like low efficiency and
+limited robustness. This paper introduces DeepAAT, a deep learning network
+designed specifically for AAT of UAV imagery. DeepAAT considers both spatial
+and spectral characteristics of imagery, enhancing its capability to resolve
+erroneous matching pairs and accurately predict image poses. DeepAAT marks a
+significant leap in AAT's efficiency, ensuring thorough scene coverage and
+precision. Its processing speed outpaces incremental AAT methods by hundreds of
+times and global AAT methods by tens of times while maintaining a comparable
+level of reconstruction accuracy. Additionally, DeepAAT's scene clustering and
+merging strategy facilitates rapid localization and pose determination for
+large-scale UAV images, even under constrained computing resources. The
+experimental results demonstrate DeepAAT's substantial improvements over
+conventional AAT methods, highlighting its potential in the efficiency and
+accuracy of UAV-based 3D reconstruction tasks. To benefit the photogrammetry
+community, the code of DeepAAT will be released at:
+https://github.com/WHU-USI3DV/DeepAAT.
+
+ 

+
+
+
+
+ + ☆ Seeing Objects in a Cluttered World: Computational Objectness from + Motion in Video + + +
+ Perception of the visually disjoint surfaces of our cluttered world as whole
+objects, physically distinct from those overlapping them, is a cognitive
+phenomenon called objectness that forms the basis of our visual perception.
+Shared by all vertebrates and present at birth in humans, it enables
+object-centric representation and reasoning about the visual world. We present
+a computational approach to objectness that leverages motion cues and
+spatio-temporal attention using a pair of supervised spatio-temporal
+R(2+1)U-Nets. The first network detects motion boundaries and classifies the
+pixels at those boundaries in terms of their local foreground-background sense.
+This motion boundary sense (MBS) information is passed, along with a
+spatio-temporal object attention cue, to an attentional surface perception
+(ASP) module which infers the form of the attended object over a sequence of
+frames and classifies its 'pixels' as visible or obscured. The spatial form of
+the attention cue is flexible, but it must loosely track the attended object,
+which need not be visible. We demonstrate the ability of this simple but novel
+approach to infer objectness from phenomenology without object models, and show
+that it delivers robust perception of individual attended objects in cluttered
+scenes, even with blur and camera shake. We show that our data diversity and
+augmentation minimize bias and facilitate transfer to real video. Finally, we
+describe how this computational objectness capability can grow in
+sophistication and anchor a robust modular video object perception framework.
+
+ 

+
+ comment: 10 pages, 11 figures, plus 18 pages of Supplemental Information +
+
+
+
+
+ + ☆ A Single Simple Patch is All You Need for AI-generated Image Detection + + +
+ The recent development of generative models unleashes the potential of
+generating hyper-realistic fake images. To prevent the malicious usage of fake
+images, AI-generated image detection aims to distinguish fake images from real
+ones. Nevertheless, existing methods usually suffer from poor generalizability
+across different generators. In this work, we propose an embarrassingly simple
+approach named SSP, i.e., feeding the noise pattern of a Single Simple Patch
+(SSP) to a binary classifier, which achieves a 14.6% relative improvement over
+a recent method on the GenImage dataset. Our SSP method is robust and
+generalizable, and could serve as a simple yet competitive baseline for future
+methods.
+
+ 

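+
+ The SSP recipe above is nearly a one-liner. A hedged sketch: take one patch,
+ remove its low frequencies with a simple denoiser (a Gaussian blur here, our
+ assumption rather than the paper's choice), and hand the residual "noise
+ pattern" to a binary classifier (omitted).
+
+ import numpy as np
+ from scipy.ndimage import gaussian_filter
+
+ def patch_noise_pattern(image, top=0, left=0, size=32, sigma=1.0):
+     """Return the high-frequency residual of a single patch, the
+     fingerprint that generators tend to imprint differently from
+     real cameras; this residual is the classifier's input."""
+     patch = image[top:top + size, left:left + size].astype(np.float32)
+     return patch - gaussian_filter(patch, sigma=sigma)
+
+ img = np.random.rand(256, 256)
+ residual = patch_noise_pattern(img)
+ print(residual.shape, residual.std())
+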
+
+
+
+
+ + ☆ A Survey for Foundation Models in Autonomous Driving + + +
+ The advent of foundation models has revolutionized the fields of natural +language processing and computer vision, paving the way for their application +in autonomous driving (AD). This survey presents a comprehensive review of more +than 40 research papers, demonstrating the role of foundation models in +enhancing AD. Large language models contribute to planning and simulation in +AD, particularly through their proficiency in reasoning, code generation and +translation. In parallel, vision foundation models are increasingly adapted for +critical tasks such as 3D object detection and tracking, as well as creating +realistic driving scenarios for simulation and testing. Multi-modal foundation +models, integrating diverse inputs, exhibit exceptional visual understanding +and spatial reasoning, crucial for end-to-end AD. This survey not only provides +a structured taxonomy, categorizing foundation models based on their modalities +and functionalities within the AD domain but also delves into the methods +employed in current research. It identifies the gaps between existing +foundation models and cutting-edge AD approaches, thereby charting future +research directions and proposing a roadmap for bridging these gaps. + +
+
+
+
+
+ + ☆ Compositional Generative Modeling: A Single Model is Not All You Need + + +
+ Large monolithic generative models trained on massive amounts of data have +become an increasingly dominant approach in AI research. In this paper, we +argue that we should instead construct large generative systems by composing +smaller generative models together. We show how such a compositional generative +approach enables us to learn distributions in a more data-efficient manner, +enabling generalization to parts of the data distribution unseen at training +time. We further show how this enables us to program and construct new +generative models for tasks completely unseen at training. Finally, we show +that in many cases, we can discover separate compositional components from +data. + +
+
+
+
+
+ + ☆ How many views does your deep neural network use for prediction? + + +
+ The generalization ability of Deep Neural Networks (DNNs) is still not fully
+understood, despite numerous theoretical and empirical analyses. Recently,
+Allen-Zhu & Li (2023) introduced the concept of multi-views to explain the
+generalization ability of DNNs, but their main target is ensemble or distilled
+models, and no method is discussed for estimating the multi-views used in the
+prediction for a specific input. In this paper, we propose Minimal Sufficient
+Views (MSVs), which are similar to multi-views but can be efficiently computed
+for real images. MSVs are a set of minimal and distinct features in an input,
+each of which preserves the model's prediction for that input. We empirically
+show that there is a clear relationship between the number of MSVs and
+prediction accuracy across models, including convolutional and transformer
+models, suggesting that a multi-view-like perspective is also important for
+understanding the generalization ability of (non-ensemble or non-distilled)
+DNNs.
+
+ 

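+
+ A greedy reading of the MSV idea, ours rather than the authors' exact
+ procedure: blank out parts of the input one by one and keep only those whose
+ removal flips the prediction; the survivors form one minimal view that
+ preserves the model's output.
+
+ import numpy as np
+
+ def minimal_sufficient_view(predict, x, parts, baseline=0.0):
+     """Greedily drop `parts` (e.g., image regions) of `x` that the
+     prediction does not depend on; returns the indices kept."""
+     label = np.argmax(predict(x))
+     keep = list(parts)
+     for part in parts:
+         trial = x.copy()
+         trial[part] = baseline
+         if np.argmax(predict(trial)) == label:
+             keep.remove(part)  # prediction survives without this part
+             x = trial          # keep it blanked from now on
+     return keep
+
+ # toy model: the class depends only on the sign of feature 0
+ predict = lambda v: np.array([-v[0], v[0]])
+ x = np.array([2.0, 5.0, -3.0])
+ print(minimal_sufficient_view(predict, x, parts=[0, 1, 2]))  # -> [0]
+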
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Inversion by Direct Iteration: An Alternative to Denoising Diffusion for + Image Restoration + + +
+ Inversion by Direct Iteration (InDI) is a new formulation for supervised
+image restoration that avoids the so-called "regression to the mean" effect and
+produces more realistic and detailed images than existing regression-based
+methods. It does this by gradually improving image quality in small steps,
+similar to generative denoising diffusion models. Image restoration is an
+ill-posed problem where multiple high-quality images are plausible
+reconstructions of a given low-quality input. Therefore, the outcome of a
+single-step regression model is typically an aggregate of all possible
+explanations, and thus lacks detail and realism. The main advantage of InDI is
+that it does not try to predict the clean target image in a single step but
+instead gradually improves the image in small steps, resulting in better
+perceptual quality. While generative denoising diffusion models also work in
+small steps, our formulation is distinct in that it does not require knowledge
+of any analytic form of the degradation process. Instead, we directly learn an
+iterative restoration process from low-quality and high-quality paired
+examples. InDI can be applied to virtually any image degradation, given paired
+training data. In conditional denoising diffusion image restoration, the
+denoising network generates the restored image by repeatedly denoising an
+initial image of pure noise, conditioned on the degraded input. Contrary to
+conditional denoising formulations, InDI directly proceeds by iteratively
+restoring the input low-quality image, producing high-quality results on a
+variety of image restoration tasks, including motion and out-of-focus
+deblurring, super-resolution, compression artifact removal, and denoising.
+
+ 

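+
+ A compact sketch of the small-step inference loop, under our reading of the
+ InDI update rule x_{t-d} = (d/t) * f(x_t, t) + (1 - d/t) * x_t, where f is
+ the trained restoration network. A toy stand-in replaces f so the snippet
+ runs; step count and schedule are illustrative.
+
+ import torch
+
+ def indi_restore(f, y, steps=20):
+     """Start from the degraded input y at t=1 and repeatedly blend the
+     current image with the network's clean estimate."""
+     x, t, d = y.clone(), 1.0, 1.0 / steps
+     for _ in range(steps):
+         x = (d / t) * f(x, t) + (1 - d / t) * x
+         t -= d
+     return x
+
+ f = lambda x, t: x + 0.1          # stand-in for the learned predictor
+ y = torch.zeros(1, 3, 8, 8)
+ print(indi_restore(f, y).mean())  # moved toward f's "clean" estimate
+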
+
+
+
+
+ + ♻ ☆ STELLA: Continual Audio-Video Pre-training with Spatio-Temporal + Localized Alignment + + +
+ Continuously learning a variety of audio-video semantics over time is crucial +for audio-related reasoning tasks in our ever-evolving world. However, this is +a nontrivial problem and poses two critical challenges: sparse spatio-temporal +correlation between audio-video pairs and multimodal correlation overwriting +that forgets audio-video relations. To tackle this problem, we propose a new +continual audio-video pre-training method with two novel ideas: (1) Localized +Patch Importance Scoring: we introduce a multimodal encoder to determine the +importance score for each patch, emphasizing semantically intertwined +audio-video patches. (2) Replay-guided Correlation Assessment: to reduce the +corruption of previously learned audiovisual knowledge due to drift, we propose +to assess the correlation of the current patches on the past steps to identify +the patches exhibiting high correlations with the past steps. Based on the +results from the two ideas, we perform probabilistic patch selection for +effective continual audio-video pre-training. Experimental validation on +multiple benchmarks shows that our method achieves a 3.69%p of relative +performance gain in zero-shot retrieval tasks compared to strong continual +learning baselines, while reducing memory consumption by ~45%. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model + Conversions + + +
+ Converting deep learning models between frameworks is a common step to +maximize model compatibility across devices and leverage optimization features +that may be exclusively provided in one deep learning framework. However, this +conversion process may be riddled with bugs, making the converted models either +undeployable or problematic, considerably degrading their prediction +correctness. + We propose an automated approach for fault localization and repair, Fix-Con, +during model conversion between deep learning frameworks. Fix-Con is capable of +detecting and fixing faults introduced in model input, parameters, +hyperparameters, and the model graph during conversion. + Fix-Con uses a set of fault types mined from surveying conversion issues +raised to localize potential conversion faults in the converted target model, +and then repairs them appropriately, e.g. replacing the parameters of the +target model with those from the source model. This is done iteratively for +every image in the dataset with output label differences between the source +model and the converted target model until all differences are resolved. We +evaluate the effectiveness of Fix-Con in fixing model conversion bugs of three +widely used image recognition models converted across four different deep +learning frameworks. Overall, Fix-Con was able to either completely repair, or +significantly improve the performance of 14 out of the 15 erroneous conversion +cases. + +
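+ The iterate-until-agreement loop described above can be sketched as below;
+predict and the candidate fixes (e.g., copying source parameters into the
+target) are illustrative placeholders rather than Fix-Con's actual API:
+
+def localize_and_repair(source_model, target_model, dataset, fixes, predict):
+    """For each image with an output-label difference, try mined fix
+    strategies until the source and converted target models agree."""
+    for image in dataset:
+        while predict(source_model, image) != predict(target_model, image):
+            repaired = False
+            for fix in fixes:  # candidate repairs from mined fault types
+                candidate = fix(source_model, target_model)
+                if predict(candidate, image) == predict(source_model, image):
+                    target_model, repaired = candidate, True
+                    break
+            if not repaired:
+                break  # this difference is not resolved by known fixes
+    return target_model
+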
+
+ comment: 12 pages, 3 figures, 4 tables, 1 algorithm +
+
+
+
+
+ + ♻ ☆ TadML: A fast temporal action detection with Mechanics-MLP + + +
+ Temporal Action Detection (TAD) is a crucial but challenging task in video
+understanding. It aims to detect both the type and the start-end frames of each
+action instance in a long, untrimmed video. Most current models adopt both RGB
+and Optical-Flow streams for the TAD task, so original RGB frames must first be
+converted into Optical-Flow frames at additional computation and time cost,
+which is an obstacle to real-time processing. Moreover, many models adopt
+two-stage strategies, which slow down inference and require complicated tuning
+of proposal generation. By comparison, we propose a one-stage anchor-free
+temporal localization method using the RGB stream only, in which a novel
+Newtonian Mechanics-MLP architecture is established. It achieves accuracy
+comparable to existing state-of-the-art models while surpassing their inference
+speed by a large margin, reaching 4.44 videos per second on THUMOS14. In
+applications, inference is even faster because no optical-flow conversion is
+needed. This also shows that MLPs have great potential in downstream tasks such
+as TAD. The source code is available at https://github.com/BonedDeng/TadML
+

+
+ comment: 8 pages, 3 figures
+

+
+
+
+
+ + ♻ ☆ MagiCapture: High-Resolution Multi-Concept Portrait Customization + + +
+ Large-scale text-to-image models including Stable Diffusion are capable of +generating high-fidelity photorealistic portrait images. There is an active +research area dedicated to personalizing these models, aiming to synthesize +specific subjects or styles using provided sets of reference images. However, +despite the plausible results from these personalization methods, they tend to +produce images that often fall short of realism and are not yet on a +commercially viable level. This is particularly noticeable in portrait image +generation, where any unnatural artifact in human faces is easily discernible +due to our inherent human bias. To address this, we introduce MagiCapture, a +personalization method for integrating subject and style concepts to generate +high-resolution portrait images using just a few subject and style references. +For instance, given a handful of random selfies, our fine-tuned model can +generate high-quality portrait images in specific styles, such as passport or +profile photos. The main challenge with this task is the absence of ground +truth for the composed concepts, leading to a reduction in the quality of the +final output and an identity shift of the source subject. To address these +issues, we present a novel Attention Refocusing loss coupled with auxiliary +priors, both of which facilitate robust learning within this weakly supervised +learning setting. Our pipeline also includes additional post-processing steps +to ensure the creation of highly realistic outputs. MagiCapture outperforms +other baselines in both quantitative and qualitative evaluations and can also +be generalized to other non-human objects. + +
+
+ comment: 18 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Neural Semantic Surface Maps + + +
+ We present an automated technique for computing a map between two genus-zero
+shapes, which matches semantically corresponding regions to one another. Lack
+of annotated data prohibits direct inference of 3D semantic priors; instead,
+current state-of-the-art methods predominantly optimize geometric properties or
+require varying amounts of manual annotation. To overcome the lack of annotated
+training data, we distill semantic matches from pre-trained vision models: our
+method renders the pair of 3D shapes from multiple viewpoints; the resulting
+renders are then fed into an off-the-shelf image-matching method which
+leverages a pretrained visual model to produce feature points. This yields
+semantic correspondences, which can be projected back to the 3D shapes,
+producing a raw matching that is inaccurate and inconsistent between different
+viewpoints. These correspondences are refined and distilled into an
+inter-surface map by a dedicated optimization scheme, which promotes
+bijectivity and continuity of the output map. We illustrate that our approach
+can generate semantic surface-to-surface maps, eliminating manual annotations
+or any 3D training data requirement. Furthermore, it proves effective in
+scenarios with high semantic complexity, where objects are non-isometrically
+related, as well as in situations where they are nearly isometric.
+

+
+
+
+
+ + ♻ ☆ InstantID: Zero-shot Identity-Preserving Generation in Seconds + + +
+ There has been significant progress in personalized image synthesis with +methods such as Textual Inversion, DreamBooth, and LoRA. Yet, their real-world +applicability is hindered by high storage demands, lengthy fine-tuning +processes, and the need for multiple reference images. Conversely, existing ID +embedding-based methods, while requiring only a single forward inference, face +challenges: they either necessitate extensive fine-tuning across numerous model +parameters, lack compatibility with community pre-trained models, or fail to +maintain high face fidelity. Addressing these limitations, we introduce +InstantID, a powerful diffusion model-based solution. Our plug-and-play module +adeptly handles image personalization in various styles using just a single +facial image, while ensuring high fidelity. To achieve this, we design a novel +IdentityNet by imposing strong semantic and weak spatial conditions, +integrating facial and landmark images with textual prompts to steer the image +generation. InstantID demonstrates exceptional performance and efficiency, +proving highly beneficial in real-world applications where identity +preservation is paramount. Moreover, our work seamlessly integrates with +popular pre-trained text-to-image diffusion models like SD1.5 and SDXL, serving +as an adaptable plugin. Our codes and pre-trained checkpoints will be available +at https://github.com/InstantID/InstantID. + +
+
+ comment: Technical Report, project page available at + https://instantid.github.io/ +
+
+
+
+
+ + ♻ ☆ The Role of Data Curation in Image Captioning + + +
+ Image captioning models are typically trained by treating all samples
+equally, neglecting to account for mismatched or otherwise difficult data
+points. In contrast, recent work has shown the effectiveness of training models
+by scheduling the data using curriculum learning strategies. This paper
+contributes to this direction by actively curating difficult samples in
+datasets without increasing the total number of samples. We explore the effect
+of using three data curation methods within the training process: complete
+removal of a sample, caption replacement, or image replacement via a
+text-to-image generation model. Experiments on the Flickr30K and COCO datasets
+with the BLIP and BEiT-3 models demonstrate that these curation methods do
+indeed yield improved image captioning models, underscoring their efficacy.
+

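+ A hedged sketch of the three curation actions, applied only to samples
+flagged as difficult; the difficulty, captioner, and t2i (text-to-image)
+callables are assumed stand-ins for the paper's scoring and generation models:
+
+def curate(dataset, difficulty, threshold, mode, captioner=None, t2i=None):
+    """dataset: iterable of (image, caption) pairs; mode selects one of
+    the three curation methods described above."""
+    curated = []
+    for image, caption in dataset:
+        if difficulty(image, caption) <= threshold:
+            curated.append((image, caption))           # easy: keep as-is
+        elif mode == "remove":
+            continue                                    # drop the sample
+        elif mode == "replace_caption":
+            curated.append((image, captioner(image)))   # re-caption
+        elif mode == "replace_image":
+            curated.append((t2i(caption), caption))     # synthetic image
+    return curated
+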
+
+
+
+
+ + ♻ ☆ An Algorithm to Train Unrestricted Sequential Discrete Morphological + Neural Networks + + +
+ There have been attempts to insert mathematical morphology (MM) operators
+into convolutional neural networks (CNN), and the most successful endeavor to
+date has been the morphological neural networks (MNN). Although MNN have
+performed better than CNN in solving some problems, they inherit their
+black-box nature. Furthermore, in the case of binary images, they are
+approximations that lose the Boolean lattice structure of MM operators and,
+thus, it is not possible to represent a specific class of W-operators with
+desired properties. In a recent work, we proposed the Discrete Morphological
+Neural Networks (DMNN) for binary image transformation to represent specific
+classes of W-operators and estimate them via machine learning. We also proposed
+a stochastic lattice descent algorithm (SLDA) to learn the parameters of
+Canonical Discrete Morphological Neural Networks (CDMNN), whose architecture is
+composed only of operators that can be decomposed as the supremum, infimum, and
+complement of erosions and dilations. In this paper, we propose an algorithm to
+learn unrestricted sequential DMNN, whose architecture is given by the
+composition of general W-operators. We illustrate the algorithm in a practical
+example.
+

+
+
+
+
+ + ♻ ☆ Untargeted Near-collision Attacks on Biometrics: Real-world Bounds and + Theoretical Limits + + +
+ A biometric recognition system can operate in two distinct modes:
+identification or verification. In the first mode, the system recognizes an
+individual by searching the enrolled templates of all the users for a match. In
+the second mode, the system validates a user's identity claim by comparing the
+fresh provided template with the enrolled template. The biometric
+transformation schemes usually produce binary templates that are better handled
+by cryptographic schemes, and the comparison is based on a distance that leaks
+information about the similarities between two biometric templates. Both the
+experimentally determined false match rate and false non-match rate through
+recognition threshold adjustment define the recognition accuracy, and hence the
+security of the system. To our knowledge, few works provide a formal treatment
+of security in case of minimal information leakage, i.e., the binary outcome of
+a comparison with a threshold. In this paper, we focus on untargeted attacks
+that can be carried out both online and offline, and in both identification and
+verification modes. On the one hand, we focus our analysis on the accuracy
+metrics of biometric systems. We provide the complexity of untargeted attacks
+using the False Match Rate (FMR) and the False Positive Identification Rate
+(FPIR) to address the security of these systems. Studying near-collisions with
+these metrics allows us to estimate the maximum number of users in a database,
+given a chosen FMR, to preserve the security and the accuracy. These results
+are evaluated on systems from the literature. On the other hand, we rely on
+probabilistic modelling to assess the theoretical security limits of biometric
+systems. The study of this metric space, and system parameters (template size,
+threshold and database size), gives us the complexity of untargeted attacks and
+the probability of a near-collision.
+

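+ As a back-of-the-envelope illustration, assuming uniformly random n-bit
+templates (a simplification of the paper's probabilistic model), the chance
+that a single guess near-collides with an enrolled template, and the expected
+cost of an untargeted attack against a database of N users, follow standard
+combinatorics:
+
+% Probability that a random n-bit template lies within Hamming
+% distance t of one fixed enrolled template:
+p(n,t) \;=\; 2^{-n} \sum_{i=0}^{t} \binom{n}{i}
+% Probability that one guess hits at least one of N enrolled users,
+% and the expected number of guesses of an untargeted attack:
+q \;=\; 1 - \bigl(1 - p(n,t)\bigr)^{N},
+\qquad \mathbb{E}[\text{guesses}] \;=\; 1/q
+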
+
+ comment: Addition of results +
+
+
+
+
+ + ♻ ☆ Test-Time Degradation Adaption for Open-Set Image Restoration + + +
+ In contrast to close-set scenarios that restore images from a predefined set
+of degradations, open-set image restoration aims to handle unknown degradations
+that were unforeseen during the pretraining phase, a setting that has been
+little explored as far as we know. In this work, we explicitly study this
+challenging problem and reveal its essence, i.e., the unidentified distribution
+shifts between test and training data. Recently, test-time adaptation has
+emerged as a fundamental method to address such inherent disparities. Inspired
+by this, we propose a test-time degradation adaption framework for open-set
+image restoration, which involves three components: i) a pre-trained and
+degradation-agnostic diffusion model for generating clean images, ii) a
+test-time degradation adapter that adapts to the unknown degradations based on
+the input image during the testing phase, and iii) adapter-guided image
+restoration, which guides the model through the adapter to produce the
+corresponding clean image. Through experiments on multiple degradations absent
+from the training data, we show that our method achieves comparable or even
+better performance than task-specific methods.
+

+
+
+
+
+ + ♻ ☆ Multi-Class Anomaly Detection based on Regularized Discriminative + Coupled hypersphere-based Feature Adaptation + + +
+ In anomaly detection, identification of anomalies across diverse product
+categories is a complex task. This paper introduces a new model by including
+class discriminative properties obtained by a modified Regularized
+Discriminative Variational Auto-Encoder (RD-VAE) in the feature extraction
+process of Coupled-hypersphere-based Feature Adaptation (CFA). By doing so, the
+proposed Regularized Discriminative Coupled-hypersphere-based Feature
+Adaptation (RD-CFA) forms a solution for multi-class anomaly detection. By
+using the discriminative power of RD-VAE to capture intricate class
+distributions, combined with CFA's robust anomaly detection capability, the
+proposed method excels in discerning anomalies across various classes.
+Extensive evaluations on multi-class anomaly detection and localization using
+the MVTec AD and BeanTech AD datasets showcase the effectiveness of RD-CFA
+compared to eight leading contemporary methods.
+

+
+ comment: 14 pages, 6 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Pix4Point: Image Pretrained Standard Transformers for 3D Point Cloud + Understanding 3DV 2024 + + +
+ While Transformers have achieved impressive success in natural language
+processing and computer vision, their performance on 3D point clouds is
+relatively poor. This is mainly due to the limitation of Transformers: a
+demanding need for extensive training data. Unfortunately, in the realm of 3D
+point clouds, the availability of large datasets is a challenge, exacerbating
+the issue of training Transformers for 3D tasks. In this work, we solve the
+data issue of point cloud Transformers from two perspectives: (i) introducing
+more inductive bias to reduce the dependency of Transformers on data, and (ii)
+relying on cross-modality pretraining. More specifically, we first present
+Progressive Point Patch Embedding and a new point cloud Transformer model,
+namely PViT. PViT shares the same backbone as Transformer but is shown to be
+less hungry for data, enabling Transformer to achieve performance comparable
+to the state-of-the-art. Second, we formulate a simple yet effective pipeline
+dubbed "Pix4Point" that allows harnessing Transformers pretrained in the image
+domain to enhance downstream point cloud understanding. This is achieved
+through a modality-agnostic Transformer backbone with the help of a tokenizer
+and decoder specialized in the different domains. When pretrained on a large
+number of widely available images, PViT shows significant gains in the tasks
+of 3D point cloud classification, part segmentation, and semantic segmentation
+on ScanObjectNN, ShapeNetPart, and S3DIS, respectively. Our code and models are
+available at https://github.com/guochengqian/Pix4Point .
+

+
+ comment: camera-ready version at 3DV 2024 +
+
+
+
+
+ + ♻ ☆ InstructPix2NeRF: Instructed 3D Portrait Editing from a Single Image + + +
+ With the success of Neural Radiance Field (NeRF) in 3D-aware portrait +editing, a variety of works have achieved promising results regarding both +quality and 3D consistency. However, these methods heavily rely on per-prompt +optimization when handling natural language as editing instructions. Due to the +lack of labeled human face 3D datasets and effective architectures, the area of +human-instructed 3D-aware editing for open-world portraits in an end-to-end +manner remains under-explored. To solve this problem, we propose an end-to-end +diffusion-based framework termed InstructPix2NeRF, which enables instructed +3D-aware portrait editing from a single open-world image with human +instructions. At its core lies a conditional latent 3D diffusion process that +lifts 2D editing to 3D space by learning the correlation between the paired +images' difference and the instructions via triplet data. With the help of our +proposed token position randomization strategy, we could even achieve +multi-semantic editing through one single pass with the portrait identity +well-preserved. Besides, we further propose an identity consistency module that +directly modulates the extracted identity signals into our diffusion process, +which increases the multi-view 3D identity consistency. Extensive experiments +verify the effectiveness of our method and show its superiority against strong +baselines quantitatively and qualitatively. Source code and pre-trained models +can be found on our project page: +\url{https://mybabyyh.github.io/InstructPix2NeRF}. + +
+
+ comment: https://github.com/mybabyyh/InstructPix2NeRF +
+
+
+
+
+ + ♻ ☆ ResDiff: Combining CNN and Diffusion Model for Image Super-Resolution + + +
+ Adapting the Diffusion Probabilistic Model (DPM) for direct image
+super-resolution is wasteful, given that a simple Convolutional Neural Network
+(CNN) can recover the main low-frequency content. Therefore, we present
+ResDiff, a novel Diffusion Probabilistic Model based on Residual structure for
+Single Image Super-Resolution (SISR). ResDiff utilizes a combination of a CNN,
+which restores primary low-frequency components, and a DPM, which predicts the
+residual between the ground-truth image and the CNN-predicted image. In
+contrast to the common diffusion-based methods that directly use LR images to
+guide the noise towards HR space, ResDiff utilizes the CNN's initial prediction
+to direct the noise towards the residual space between HR space and
+CNN-predicted space, which not only accelerates the generation process but also
+acquires superior sample quality. Additionally, a frequency-domain-based loss
+function for the CNN is introduced to facilitate its restoration, and a
+frequency-domain guided diffusion is designed for the DPM to predict
+high-frequency details. The extensive experiments on multiple benchmark
+datasets demonstrate that ResDiff outperforms previous diffusion-based methods
+in terms of shorter model convergence time, superior generation quality, and
+more diverse samples.
+

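+ The residual decomposition can be sketched as follows; cnn and dpm_sample are
+placeholder callables, and the paper's frequency-domain loss and guidance are
+omitted for brevity:
+
+def resdiff_super_resolve(cnn, dpm_sample, lr_image):
+    """The CNN recovers low-frequency content; the diffusion model
+    samples only the residual toward the ground truth."""
+    base = cnn(lr_image)              # primary low-frequency estimate
+    residual = dpm_sample(cond=base)  # diffusion over (HR - CNN(LR))
+    return base + residual
+
+# Per-pair training target for the diffusion model:
+#   residual_target = hr_image - cnn(lr_image).detach()
+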
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ DINOv2: Learning Robust Visual Features without Supervision + + +
+ The recent breakthroughs in natural language processing for model pretraining +on large quantities of data have opened the way for similar foundation models +in computer vision. These models could greatly simplify the use of images in +any system by producing all-purpose visual features, i.e., features that work +across image distributions and tasks without finetuning. This work shows that +existing pretraining methods, especially self-supervised methods, can produce +such features if trained on enough curated data from diverse sources. We +revisit existing approaches and combine different techniques to scale our +pretraining in terms of data and model size. Most of the technical +contributions aim at accelerating and stabilizing the training at scale. In +terms of data, we propose an automatic pipeline to build a dedicated, diverse, +and curated image dataset instead of uncurated data, as typically done in the +self-supervised literature. In terms of models, we train a ViT model +(Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of +smaller models that surpass the best available all-purpose features, OpenCLIP +(Ilharco et al., 2021) on most of the benchmarks at image and pixel levels. + +
+
+
+
+
+ + ♻ ☆ YOLO-World: Real-Time Open-Vocabulary Object Detection + + +
+ The You Only Look Once (YOLO) series of detectors have established themselves +as efficient and practical tools. However, their reliance on predefined and +trained object categories limits their applicability in open scenarios. +Addressing this limitation, we introduce YOLO-World, an innovative approach +that enhances YOLO with open-vocabulary detection capabilities through +vision-language modeling and pre-training on large-scale datasets. +Specifically, we propose a new Re-parameterizable Vision-Language Path +Aggregation Network (RepVL-PAN) and region-text contrastive loss to facilitate +the interaction between visual and linguistic information. Our method excels in +detecting a wide range of objects in a zero-shot manner with high efficiency. +On the challenging LVIS dataset, YOLO-World achieves 35.4 AP with 52.0 FPS on +V100, which outperforms many state-of-the-art methods in terms of both accuracy +and speed. Furthermore, the fine-tuned YOLO-World achieves remarkable +performance on several downstream tasks, including object detection and +open-vocabulary instance segmentation. + +
+
+ comment: Work still in progress. Code & models are available at: + https://github.com/AILab-CVC/YOLO-World +
+
+
+
+
+ + ♻ ☆ Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis + + +
+ Artificial intelligence (AI) in healthcare, especially in medical imaging, +faces challenges due to data scarcity and privacy concerns. Addressing these, +we introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI +synthesis. This model effectively tackles data scarcity and privacy issues by +integrating semantic conditioning. This involves the channel-wise concatenation +of a conditioning image to the model input, enabling control in image +generation. Med-DDPM demonstrates superior stability and performance compared +to existing 3D brain imaging synthesis methods. It generates diverse, +anatomically coherent images with high visual fidelity. In terms of dice score +accuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the +0.6531 accuracy of real images, and outperforms baseline models. Combined with +real images, it further increases segmentation accuracy to 0.6675, showing the +potential of our proposed method for data augmentation. This model represents +the first use of a diffusion model in 3D semantic brain MRI synthesis, +producing high-quality images. Its semantic conditioning feature also shows +potential for image anonymization in biomedical imaging, addressing data and +privacy issues. We provide the code and model weights for Med-DDPM on our +GitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support +reproducibility. + +
+
+
+
+
+ + ♻ ☆ Calibrating Panoramic Depth Estimation for Practical Localization and + Mapping ICCV 2023 + + +
+ The absolute depth values of surrounding environments provide crucial cues +for various assistive technologies, such as localization, navigation, and 3D +structure estimation. We propose that accurate depth estimated from panoramic +images can serve as a powerful and light-weight input for a wide range of +downstream tasks requiring 3D information. While panoramic images can easily +capture the surrounding context from commodity devices, the estimated depth +shares the limitations of conventional image-based depth estimation; the +performance deteriorates under large domain shifts and the absolute values are +still ambiguous to infer from 2D observations. By taking advantage of the +holistic view, we mitigate such effects in a self-supervised way and fine-tune +the network with geometric consistency during the test phase. Specifically, we +construct a 3D point cloud from the current depth prediction and project the +point cloud at various viewpoints or apply stretches on the current input image +to generate synthetic panoramas. Then we minimize the discrepancy of the 3D +structure estimated from synthetic images without collecting additional data. +We empirically evaluate our method in robot navigation and map-free +localization where our method shows large performance enhancements. Our +calibration method can therefore widen the applicability under various external +conditions, serving as a key component for practical panorama-based machine +vision systems. Code is available through the following link: +\url{https://github.com/82magnolia/panoramic-depth-calibration}. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ IIANet: An Intra- and Inter-Modality Attention Network for Audio-Visual + Speech Separation + + +
+ Recent research has made significant progress in designing fusion modules for
+audio-visual speech separation. However, they predominantly focus on
+multi-modal fusion at a single temporal scale of auditory and visual features
+without employing selective attention mechanisms, which is in sharp contrast
+with the brain. To address this issue, we propose a novel model called Intra-
+and Inter-Attention Network (IIANet), which leverages the attention mechanism
+for efficient audio-visual feature fusion. IIANet consists of two types of
+attention blocks: intra-attention (IntraA) and inter-attention (InterA) blocks,
+where the InterA blocks are distributed at the top, middle and bottom of
+IIANet. Heavily inspired by the way the human brain selectively focuses on
+relevant content at various temporal scales, these blocks maintain the ability
+to learn modality-specific features and enable the extraction of different
+semantics from audio-visual features. Comprehensive experiments on three
+standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)
+demonstrate the effectiveness of IIANet, outperforming previous
+state-of-the-art methods while maintaining comparable inference time. In
+particular, the fast version of IIANet (IIANet-fast) has only 7% of CTCNet's
+MACs and is 40% faster than CTCNet on CPUs while achieving better separation
+quality, showing the great potential of attention mechanism for efficient and
+effective multimodal fusion.
+

+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ On the explainable properties of 1-Lipschitz Neural Networks: An Optimal + Transport Perspective + + +
+ Input gradients have a pivotal role in a variety of applications, including
+adversarial attack algorithms for evaluating model robustness, explainable AI
+techniques for generating Saliency Maps, and counterfactual explanations.
+However, Saliency Maps generated by traditional neural networks are often noisy
+and provide limited insights. In this paper, we demonstrate that, on the
+contrary, the Saliency Maps of 1-Lipschitz neural networks, learned with the
+dual loss of an optimal transportation problem, exhibit desirable XAI
+properties: they are highly concentrated on the essential parts of the image
+with low noise, significantly outperforming state-of-the-art explanation
+approaches across various models and metrics. We also prove that these maps
+align unprecedentedly well with human explanations on ImageNet. To explain the
+particularly beneficial properties of the Saliency Map for such models, we
+prove this gradient encodes both the direction of the transportation plan and
+the direction towards the nearest adversarial attack. Following the gradient
+down to the decision boundary is no longer considered an adversarial attack,
+but rather a counterfactual explanation that explicitly transports the input
+from one class to another. Thus, learning with such a loss jointly optimizes
+the classification objective and the alignment of the gradient, i.e. the
+Saliency Map, to the transportation plan direction. These networks were
+previously known to be certifiably robust by design, and we demonstrate that
+they scale well for large problems and models, and are tailored for
+explainability using a fast and straightforward method.
+

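+ For reference, the optimal-transport dual loss mentioned above rests on the
+standard Kantorovich-Rubinstein duality (textbook material, not a result of
+this paper): the network is trained as a 1-Lipschitz potential, so the Saliency
+Map, its input gradient, has norm at most one everywhere:
+
+W_1(\mu,\nu) \;=\; \sup_{\operatorname{Lip}(f)\le 1}
+  \mathbb{E}_{x\sim\mu}[f(x)] - \mathbb{E}_{x\sim\nu}[f(x)],
+\qquad \lVert \nabla_x f(x) \rVert_2 \le 1 .
+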
+
+
+
+
+ + ♻ ☆ Federated Learning for Large-Scale Scene Modeling with Neural Radiance + Fields + + +
+ We envision a system to continuously build and maintain a map based on +earth-scale neural radiance fields (NeRF) using data collected from vehicles +and drones in a lifelong learning manner. However, existing large-scale +modeling by NeRF has problems in terms of scalability and maintainability when +modeling earth-scale environments. Therefore, to address these problems, we +propose a federated learning pipeline for large-scale modeling with NeRF. We +tailor the model aggregation pipeline in federated learning for NeRF, thereby +allowing local updates of NeRF. In the aggregation step, the accuracy of the +clients' global pose is critical. Thus, we also propose global pose alignment +to align the noisy global pose of clients before the aggregation step. In +experiments, we show the effectiveness of the proposed pose alignment and the +federated learning pipeline on the large-scale scene dataset, Mill19. + +
+
+
+
+
+ + ♻ ☆ DTL: Disentangled Transfer Learning for Visual Recognition AAAI 2024 + + +
+ As pre-trained models rapidly grow larger, the cost of fine-tuning on
+downstream tasks steadily increases, too. To economically fine-tune these
+models, parameter-efficient transfer learning (PETL) is proposed, which only
+tunes a tiny subset of trainable parameters to efficiently learn quality
+representations. However, current PETL methods face the dilemma that the GPU
+memory footprint during training is not reduced as effectively as the number of
+trainable parameters. PETL will likely fail, too, if the full fine-tuning
+encounters the out-of-GPU-memory issue. This phenomenon happens because
+trainable parameters from these methods are generally entangled with the
+backbone, such that a lot of intermediate states have to be stored in GPU
+memory for gradient propagation. To alleviate this problem, we introduce
+Disentangled Transfer Learning (DTL), which disentangles the trainable
+parameters from the backbone using a lightweight Compact Side Network (CSN). By
+progressively extracting task-specific information with a few low-rank linear
+mappings and appropriately adding the information back to the backbone, CSN
+effectively realizes knowledge transfer in various downstream tasks. We
+conducted extensive experiments to validate the effectiveness of our method.
+The proposed method not only reduces a large amount of GPU memory usage and
+trainable parameters, but also outperforms existing PETL methods by a
+significant margin in accuracy, achieving new state-of-the-art on several
+standard benchmarks. The code is available at https://github.com/heekhero/DTL.
+

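+ The disentangled side-branch idea can be sketched as below: a few low-rank
+maps read frozen backbone features stage by stage and accumulate task
+information, so only the side branch needs gradients. Wiring and dimensions are
+illustrative assumptions, not the paper's exact CSN:
+
+import torch.nn as nn
+
+class CompactSideNetwork(nn.Module):
+    def __init__(self, dim, n_stages, rank=8):
+        super().__init__()
+        # One low-rank (dim -> rank -> dim) mapping per backbone stage.
+        self.maps = nn.ModuleList(
+            nn.Sequential(nn.Linear(dim, rank), nn.Linear(rank, dim))
+            for _ in range(n_stages))
+
+    def forward(self, feats):  # feats: per-stage frozen backbone outputs
+        h, out = 0.0, []
+        for f, m in zip(feats, self.maps):
+            h = m(f) + h       # progressively accumulated task signal
+            out.append(f + h)  # frozen feature + trainable side branch
+        return out
+
+ Because gradients flow only through the side branch, the large intermediate
+states needed to backpropagate through the backbone itself need not be kept,
+which is the kind of memory saving the abstract refers to.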
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ MagicPose: Realistic Human Poses and Facial Expressions Retargeting with + Identity-aware Diffusion + + +
+ In this work, we propose MagicPose, a diffusion-based model for 2D human pose +and facial expression retargeting. Specifically, given a reference image, we +aim to generate a person's new images by controlling the poses and facial +expressions while keeping the identity unchanged. To this end, we propose a +two-stage training strategy to disentangle human motions and appearance (e.g., +facial expressions, skin tone and dressing), consisting of (1) the pre-training +of an appearance-control block and (2) learning appearance-disentangled pose +control. Our novel design enables robust appearance control over generated +human images, including body, facial attributes, and even background. By +leveraging the prior knowledge of image diffusion models, MagicPose generalizes +well to unseen human identities and complex poses without the need for +additional fine-tuning. Moreover, the proposed model is easy to use and can be +considered as a plug-in module/extension to Stable Diffusion. + +
+
+ comment: Project Page:https://boese0601.github.io/magicdance/ + Code:https://github.com/Boese0601/MagicDance +
+
+
+
+
+ + ♻ ☆ TVPR: Text-to-Video Person Retrieval and a New Benchmark + + +
+ Most existing methods for text-based person retrieval focus on text-to-image
+person retrieval. Nevertheless, due to the lack of dynamic information provided
+by isolated frames, the performance is hampered when the person is obscured in
+isolated frames or variable motion details are given in the textual
+description. In this paper, we propose a new task called Text-to-Video Person
+Retrieval (TVPR) which aims to effectively overcome the limitations of isolated
+frames. Since there is no dataset or benchmark that describes person videos
+with natural language, we construct a large-scale cross-modal person video
+dataset containing detailed natural language annotations, such as person's
+appearance, actions and interactions with environment, etc., termed the
+Text-to-Video Person Re-identification (TVPReid) dataset, which will be
+publicly available. To this end, a Text-to-Video Person Retrieval Network
+(TVPRN) is proposed. Specifically, TVPRN acquires video representations by
+fusing visual and motion representations of person videos, which can deal with
+temporal occlusion and the absence of variable motion details in isolated
+frames. Meanwhile, we employ the pre-trained BERT to obtain caption
+representations and the relationship between caption and video representations
+to reveal the most relevant person videos. To evaluate the effectiveness of the
+proposed TVPRN, extensive experiments have been conducted on the TVPReid
+dataset. To the best of our knowledge, TVPRN is the first successful attempt to
+use video for the text-based person retrieval task and has achieved
+state-of-the-art performance on the TVPReid dataset. The TVPReid dataset will
+be publicly available to benefit future research.
+

+
+
+
+
+ + ♻ ☆ Prompting Segmentation with Sound Is Generalizable Audio-Visual Source + Localizer AAAI 2024 + + +
+ Never having seen an object and heard its sound simultaneously, can the model
+still accurately localize its visual position from the input audio? In this
+work, we concentrate on the Audio-Visual Localization and Segmentation tasks
+but under the demanding zero-shot and few-shot scenarios. To achieve this goal,
+different from existing approaches that mostly employ the
+encoder-fusion-decoder paradigm to decode localization information from the
+fused audio-visual feature, we introduce the encoder-prompt-decoder paradigm,
+aiming to better fit the data scarcity and varying data distribution dilemmas
+with the help of abundant knowledge from pre-trained models. Specifically, we
+first propose to construct a Semantic-aware Audio Prompt (SAP) to help the
+visual foundation model focus on sounding objects; meanwhile, the semantic gap
+between the visual and audio modalities is also encouraged to shrink. Then, we
+develop a Correlation Adapter (ColA) to keep minimal training efforts as well
+as maintain adequate knowledge of the visual foundation model. Equipped with
+these means, extensive experiments demonstrate that this new paradigm
+outperforms other fusion-based methods in both the unseen class and
+cross-dataset settings. We hope that our work can further promote the
+generalization study of Audio-Visual Localization and Segmentation in practical
+application scenarios.
+

+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Geometry Transfer for Stylizing Radiance Fields + + +
+ Shape and geometric patterns are essential in defining stylistic identity. +However, current 3D style transfer methods predominantly focus on transferring +colors and textures, often overlooking geometric aspects. In this paper, we +introduce Geometry Transfer, a novel method that leverages geometric +deformation for 3D style transfer. This technique employs depth maps to extract +a style guide, subsequently applied to stylize the geometry of radiance fields. +Moreover, we propose new techniques that utilize geometric cues from the 3D +scene, thereby enhancing aesthetic expressiveness and more accurately +reflecting intended styles. Our extensive experiments show that Geometry +Transfer enables a broader and more expressive range of stylizations, thereby +significantly expanding the scope of 3D style transfer. + +
+
+ comment: project page: https://hyblue.github.io/geo-srf/ +
+
+
+
+
+ + ♻ ☆ Completing Visual Objects via Bridging Generation and Segmentation + + +
+ This paper presents a novel approach to object completion, with the primary +goal of reconstructing a complete object from its partially visible components. +Our method, named MaskComp, delineates the completion process through iterative +stages of generation and segmentation. In each iteration, the object mask is +provided as an additional condition to boost image generation, and, in return, +the generated images can lead to a more accurate mask by fusing the +segmentation of images. We demonstrate that the combination of one generation +and one segmentation stage effectively functions as a mask denoiser. Through +alternation between the generation and segmentation stages, the partial object +mask is progressively refined, providing precise shape guidance and yielding +superior object completion results. Our experiments demonstrate the superiority +of MaskComp over existing approaches, e.g., ControlNet and Stable Diffusion, +establishing it as an effective solution for object completion. + +
+
+
+
+
+ + ♻ ☆ Surrogate Model for Geological CO2 Storage and Its Use in Hierarchical + MCMC History Matching + + +
+ Deep-learning-based surrogate models show great promise for use in geological +carbon storage operations. In this work we target an important application - +the history matching of storage systems characterized by a high degree of +(prior) geological uncertainty. Toward this goal, we extend the recently +introduced recurrent R-U-Net surrogate model to treat geomodel realizations +drawn from a wide range of geological scenarios. These scenarios are defined by +a set of metaparameters, which include the horizontal correlation length, mean +and standard deviation of log-permeability, permeability anisotropy ratio, and +constants in the porosity-permeability relationship. An infinite number of +realizations can be generated for each set of metaparameters, so the range of +prior uncertainty is large. The surrogate model is trained with flow simulation +results, generated using the open-source simulator GEOS, for 2000 random +realizations. The flow problems involve four wells, each injecting 1 Mt +CO2/year, for 30 years. The trained surrogate model is shown to provide +accurate predictions for new realizations over the full range of geological +scenarios, with median relative error of 1.3% in pressure and 4.5% in +saturation. The surrogate model is incorporated into a hierarchical Markov +chain Monte Carlo history matching workflow, where the goal is to generate +history matched geomodel realizations and posterior estimates of the +metaparameters. We show that, using observed data from monitoring wells in +synthetic `true' models, geological uncertainty is reduced substantially. This +leads to posterior 3D pressure and saturation fields that display much closer +agreement with the true-model responses than do prior predictions. + +
+
+
+
+
+ + ♻ ☆ Multimodal video and IMU kinematic dataset on daily life activities + using affordable devices (VIDIMU) + + +
+ Human activity recognition and clinical biomechanics are challenging problems
+in physical telerehabilitation medicine. However, most publicly available
+datasets on human body movements cannot be used to study both problems in an
+out-of-the-lab movement acquisition setting. The objective of the VIDIMU
+dataset is to pave the way towards affordable patient gross motor tracking
+solutions for daily life activities recognition and kinematic analysis. The
+dataset includes 13 activities registered using a commodity camera and five
+inertial sensors. The video recordings were acquired from 54 subjects, of which
+16 also had simultaneous recordings of inertial sensors. The novelty of the
+dataset lies in: (i) the clinical relevance of the chosen movements, (ii) the
+combined utilization of affordable video and custom sensors, and (iii) the
+implementation of state-of-the-art tools for multimodal data processing of 3D
+body pose tracking and motion reconstruction in a musculoskeletal model from
+inertial data. The validation confirms that a minimally disturbing acquisition
+protocol, performed under real-life conditions, can provide a comprehensive
+picture of human joint angles during daily life activities.
+

+
+
+
+
+ + ♻ ☆ PICCOLO: Point Cloud-Centric Omnidirectional Localization ICCV 2021 + + +
+ We present PICCOLO, a simple and efficient algorithm for omnidirectional +localization. Given a colored point cloud and a 360 panorama image of a scene, +our objective is to recover the camera pose at which the panorama image is +taken. Our pipeline works in an off-the-shelf manner with a single image given +as a query and does not require any training of neural networks or collecting +ground-truth poses of images. Instead, we match each point cloud color to the +holistic view of the panorama image with gradient-descent optimization to find +the camera pose. Our loss function, called sampling loss, is point +cloud-centric, evaluated at the projected location of every point in the point +cloud. In contrast, conventional photometric loss is image-centric, comparing +colors at each pixel location. With a simple change in the compared entities, +sampling loss effectively overcomes the severe visual distortion of +omnidirectional images, and enjoys the global context of the 360 view to handle +challenging scenarios for visual localization. PICCOLO outperforms existing +omnidirectional localization algorithms in both accuracy and stability when +evaluated in various environments. Code is available at +\url{https://github.com/82magnolia/panoramic-localization/}. + +
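+ The point-cloud-centric sampling loss can be sketched as follows for an
+equirectangular panorama; the angle conventions and nearest-neighbor sampling
+are assumptions for illustration (the actual method optimizes the pose by
+gradient descent on a differentiable version of this quantity):
+
+import numpy as np
+
+def sampling_loss(points, colors, pano, R, t):
+    """points: (N, 3) world coordinates; colors: (N, 3) point colors;
+    pano: (H, W, 3) image in [0, 1]; R, t: world-to-camera pose."""
+    H, W, _ = pano.shape
+    p = points @ R.T + t                      # world -> camera frame
+    lon = np.arctan2(p[:, 0], p[:, 2])        # azimuth in (-pi, pi]
+    lat = np.arcsin(p[:, 1] / np.linalg.norm(p, axis=1))  # elevation
+    u = ((lon / (2 * np.pi) + 0.5) * (W - 1)).astype(int)
+    v = ((lat / np.pi + 0.5) * (H - 1)).astype(int)
+    sampled = pano[v, u]                      # color at each projection
+    return np.mean((sampled - colors) ** 2)   # photometric residual
+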
+
+ comment: Accepted to ICCV 2021 +
+
+
+
+
+ + ♻ ☆ Characteristic Guidance: Non-linear Correction for Diffusion Model at + Large Guidance Scale + + +
+ Popular guidance for denoising diffusion probabilistic model (DDPM) linearly
+combines distinct conditional models together to provide enhanced control over
+samples. However, this approach overlooks nonlinear effects that become
+significant when the guidance scale is large. To address this issue, we propose
+characteristic guidance, a guidance method that provides first-principle
+non-linear correction for classifier-free guidance. Such correction forces the
+guided DDPMs to respect the Fokker-Planck (FP) equation of the diffusion
+process, in a way that is training-free and compatible with existing sampling
+methods. Experiments show that characteristic guidance enhances semantic
+characteristics of prompts and mitigates irregularities in image generation,
+proving effective in diverse applications ranging from simulating magnet phase
+transitions to latent space sampling.
+

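+ For context, the linear combination being corrected is the standard
+classifier-free guidance rule, shown here for reference (the paper's non-linear
+characteristic correction itself is not reproduced):
+
+\tilde{\epsilon}_\theta(x_t, c)
+  \;=\; \epsilon_\theta(x_t, \varnothing)
+  \;+\; w\,\bigl(\epsilon_\theta(x_t, c)
+                 - \epsilon_\theta(x_t, \varnothing)\bigr)
+
+ For guidance scale w > 1, this combined score is generally no longer
+consistent with the Fokker-Planck dynamics of the diffusion process, which is
+the deviation the proposed correction addresses.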
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ CPO: Change Robust Panorama to Point Cloud Localization ECCV 2022 + + +
+ We present CPO, a fast and robust algorithm that localizes a 2D panorama with +respect to a 3D point cloud of a scene possibly containing changes. To robustly +handle scene changes, our approach deviates from conventional feature point +matching, and focuses on the spatial context provided from panorama images. +Specifically, we propose efficient color histogram generation and subsequent +robust localization using score maps. By utilizing the unique equivariance of +spherical projections, we propose very fast color histogram generation for a +large number of camera poses without explicitly rendering images for all +candidate poses. We accumulate the regional consistency of the panorama and +point cloud as 2D/3D score maps, and use them to weigh the input color values +to further increase robustness. The weighted color distribution quickly finds +good initial poses and achieves stable convergence for gradient-based +optimization. CPO is lightweight and achieves effective localization in all +tested scenarios, showing stable performance despite scene changes, repetitive +structures, or featureless regions, which are typical challenges for visual +localization with perspective cameras. Code is available at +\url{https://github.com/82magnolia/panoramic-localization/}. + +
+
+ comment: Accepted to ECCV 2022 +
+
+
+
+
+ + ♻ ☆ Denoising Diffusion Step-aware Models + + +
+ Denoising Diffusion Probabilistic Models (DDPMs) have garnered popularity for +data generation across various domains. However, a significant bottleneck is +the necessity for whole-network computation during every step of the generative +process, leading to high computational overheads. This paper presents a novel +framework, Denoising Diffusion Step-aware Models (DDSM), to address this +challenge. Unlike conventional approaches, DDSM employs a spectrum of neural +networks whose sizes are adapted according to the importance of each generative +step, as determined through evolutionary search. This step-wise network +variation effectively circumvents redundant computational efforts, particularly +in less critical steps, thereby enhancing the efficiency of the diffusion +model. Furthermore, the step-aware design can be seamlessly integrated with +other efficiency-geared diffusion models such as DDIMs and latent diffusion, +thus broadening the scope of computational savings. Empirical evaluations +demonstrate that DDSM achieves computational savings of 49% for CIFAR-10, 61% +for CelebA-HQ, 59% for LSUN-bedroom, 71% for AFHQ, and 76% for ImageNet, all +without compromising the generation quality. + +
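+ At sampling time, the step-aware idea reduces to choosing a network per step
+from a precomputed schedule (found by evolutionary search in the paper); the
+sketch below uses an illustrative interface:
+
+def step_aware_sample(networks, schedule, x_T, step_fn):
+    """networks: list of denoisers of different sizes; schedule[t] is
+    the index of the network to use at step t; step_fn applies one
+    reverse-diffusion update with the chosen network."""
+    x = x_T
+    for t in reversed(range(len(schedule))):
+        net = networks[schedule[t]]  # small net for less critical steps
+        x = step_fn(net, x, t)
+    return x
+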
+
+
+
+
+ + ♻ ☆ Optimal Projection for 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting has garnered extensive attention and application in
+real-time neural rendering. Concurrently, concerns have been raised about the
+limitations of this technology in aspects such as point cloud storage,
+performance, and robustness in sparse viewpoints, leading to various
+improvements. However, there has been a notable lack of attention to the
+projection errors introduced by the local affine approximation inherent in the
+splatting itself, and the consequential impact of these errors on the quality
+of photo-realistic rendering. This paper addresses the projection error
+function of 3D Gaussian Splatting, commencing with the residual error from the
+first-order Taylor expansion of the projection function $\phi$. The analysis
+establishes a correlation between the error and the Gaussian mean position.
+Subsequently, leveraging function optimization theory, this paper analyzes the
+function's minima to provide an optimal projection strategy for Gaussian
+Splatting, referred to as Optimal Gaussian Splatting. Experimental validation
+further confirms that this projection methodology reduces artifacts, resulting
+in a more convincingly realistic rendering.
+

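+ The local affine approximation in question is the standard EWA-style
+linearization of the projection at the Gaussian mean (notation illustrative):
+the screen-space covariance keeps only the Jacobian term, and the dropped
+remainder R(x) is the residual error the paper analyzes:
+
+\phi(x) \;=\; \phi(\mu) \;+\; J_{\phi}(\mu)\,(x - \mu) \;+\; R(x),
+\qquad
+\Sigma' \;=\; J_{\phi}(\mu)\,\Sigma\,J_{\phi}(\mu)^{\top}
+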
+
+
+
+
+ + ♻ ☆ SmartCooper: Vehicular Collaborative Perception with Adaptive Fusion and + Judger Mechanism + + +
+ In recent years, autonomous driving has garnered significant attention due to +its potential for improving road safety through collaborative perception among +connected and autonomous vehicles (CAVs). However, time-varying channel +variations in vehicular transmission environments demand dynamic allocation of +communication resources. Moreover, in the context of collaborative perception, +it is important to recognize that not all CAVs contribute valuable data, and +some CAV data even have detrimental effects on collaborative perception. In +this paper, we introduce SmartCooper, an adaptive collaborative perception +framework that incorporates communication optimization and a judger mechanism +to facilitate CAV data fusion. Our approach begins with optimizing the +connectivity of vehicles while considering communication constraints. We then +train a learnable encoder to dynamically adjust the compression ratio based on +the channel state information (CSI). Subsequently, we devise a judger mechanism +to filter the detrimental image data reconstructed by adaptive decoders. We +evaluate the effectiveness of our proposed algorithm on the OpenCOOD platform. +Our results demonstrate a substantial reduction in communication costs by +23.10\% compared to the non-judger scheme. Additionally, we achieve a +significant improvement on the average precision of Intersection over Union +(AP@IoU) by 7.15\% compared with state-of-the-art schemes. + +
+
+
+
+
+ + ♻ ☆ Machine Unlearning for Image-to-Image Generative Models ICLR 2024 + + +
+ Machine unlearning has emerged as a new paradigm to deliberately forget data
+samples from a given model in order to adhere to stringent regulations.
+However, existing machine unlearning methods have been primarily focused on
+classification models, leaving the landscape of unlearning for generative
+models relatively unexplored. This paper serves as a bridge, addressing the gap
+by providing a unifying framework of machine unlearning for image-to-image
+generative models. Within this framework, we propose a
+computationally-efficient algorithm, underpinned by rigorous theoretical
+analysis, that demonstrates negligible performance degradation on the retain
+samples, while effectively removing the information from the forget samples.
+Empirical studies on two large-scale datasets, ImageNet-1K and Places-365,
+further show that our algorithm does not rely on the availability of the retain
+samples, which further complies with data retention policy. To our best
+knowledge, this work is the first systematic theoretical and empirical
+exploration of machine unlearning specifically tailored for image-to-image
+generative models. Our code is available at
+https://github.com/jpmorganchase/l2l-generator-unlearning.
+

+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ 3DPFIX: Improving Remote Novices' 3D Printing Troubleshooting through + Human-AI Collaboration SC + + +
+ The widespread consumer-grade 3D printers and learning resources online
+enable novices to self-train in remote settings. While troubleshooting plays an
+essential part in 3D printing, the process remains challenging for many remote
+novices even with the help of well-developed online sources, such as online
+troubleshooting archives and online community help. We conducted a formative
+study with 76 active 3D printing users to learn how remote novices leverage
+online resources in troubleshooting and their challenges. We found that remote
+novices cannot fully utilize online resources. For example, the online archives
+statically provide general information, making it hard to search and relate
+their unique cases with existing descriptions. Online communities can
+potentially ease their struggles by providing more targeted suggestions, but a
+helper who can provide custom help is rather scarce, making it hard to obtain
+timely assistance. We propose 3DPFIX, an interactive 3D troubleshooting system
+powered by a pipeline that facilitates Human-AI Collaboration, designed to
+improve novices' 3D printing experiences and thus help them easily accumulate
+domain knowledge. 3DPFIX supports automated diagnosis and solution-seeking, and
+was built upon shared dialogues about failure cases from Q&A discourses
+accumulated in online communities. We leverage social annotations (i.e.,
+comments) to build an annotated failure image dataset for AI classifiers and
+extract a solution pool. Our summative study revealed that using 3DPFIX helped
+participants spend significantly less effort diagnosing failures and find more
+accurate solutions than relying on their common practice. We also found that
+3DPFIX users learn 3D printing domain-specific knowledge. We discuss the
+implications of leveraging community-driven data in developing future Human-AI
+Collaboration designs.
+

+
+ comment: CSCW2024 +
+
+
+
+
+ + ♻ ☆ InstaStyle: Inversion Noise of a Stylized Image is Secretly a Style + Adviser + + +
+ Stylized text-to-image generation focuses on creating images from textual +descriptions while adhering to a style specified by a few reference images. +However, subtle style variations within different reference images can hinder +the model from accurately learning the target style. In this paper, we propose +InstaStyle, a novel approach that excels in generating high-fidelity stylized +images with only a single reference image. Our approach is based on the finding +that the inversion noise from a stylized reference image inherently carries the +style signal, as evidenced by their non-zero signal-to-noise ratio. We employ +DDIM inversion to extract this noise from the reference image and leverage a +diffusion model to generate new stylized images from the "style" noise. +Additionally, the inherent ambiguity and bias of textual prompts impede the +precise conveying of style. To address this, we introduce a learnable style +token via prompt refinement, which enhances the accuracy of the style +description for the reference image. Qualitative and quantitative experimental +results demonstrate that InstaStyle achieves superior performance compared to +current benchmarks. Furthermore, our approach also showcases its capability in +the creative task of style combination with mixed inversion noise. + +
+
+ comment: 21 pages,20 figures +
+
+
+
+
+ + ♻ ☆ Enlighten-Your-Voice: When Multimodal Meets Zero-shot Low-light Image + Enhancement + + +
+ Low-light image enhancement is a crucial visual task, and many unsupervised +methods tend to overlook the degradation of visible information in low-light +scenes, which adversely affects the fusion of complementary information and +hinders the generation of satisfactory results. To address this, our study +introduces "Enlighten-Your-Voice", a multimodal enhancement framework that +innovatively enriches user interaction through voice and textual commands. This +approach does not merely signify a technical leap but also represents a +paradigm shift in user engagement. Our model is equipped with a Dual +Collaborative Attention Module (DCAM) that meticulously caters to distinct +content and color discrepancies, thereby facilitating nuanced enhancements. +Complementarily, we introduce a Semantic Feature Fusion (SFM) plug-and-play +module that synergizes semantic context with low-light enhancement operations, +sharpening the algorithm's efficacy. Crucially, "Enlighten-Your-Voice" +showcases remarkable generalization in unsupervised zero-shot scenarios. The +source code can be accessed from +https://github.com/zhangbaijin/Enlighten-Your-Voice + +
+
+ comment: It needs revision
+

+
+
+
+
+ + ♻ ☆ Guided Interpretable Facial Expression Recognition via Spatial Action + Unit Cues + + +
+ While state-of-the-art facial expression recognition (FER) classifiers
+achieve a high level of accuracy, they lack interpretability, an important
+aspect for end-users. To recognize basic facial expressions, experts resort to
+a codebook associating a set of spatial action units to a facial expression. In
+this paper, we follow the same expert footsteps, and propose a learning
+strategy that allows us to explicitly incorporate spatial action unit (aus)
+cues into the classifier's training to build a deep interpretable model. In
+particular, using this aus codebook, the input image's expression label, and
+facial landmarks, a single action-units heatmap is built to indicate the most
+discriminative regions of interest in the image w.r.t the facial expression. We
+leverage this valuable spatial cue to train a deep interpretable classifier for
+FER. This is achieved by constraining the spatial layer features of a
+classifier to be correlated with the aus map. Using a composite loss, the
+classifier is trained to correctly classify an image while yielding
+interpretable visual layer-wise attention correlated with aus maps, simulating
+the experts' decision process. This is achieved using only the image class
+expression as supervision and without any extra manual annotations. Moreover,
+our method is generic. It can be applied to any CNN- or transformer-based deep
+classifier without the need for architectural change or adding significant
+training time. Our extensive evaluation on two public benchmarks, RAFDB and
+AFFECTNET, shows that our proposed strategy can improve layer-wise
+interpretability without degrading classification performance. In addition, we
+explore a common type of interpretable classifiers that rely on
+Class-Activation Mapping methods (CAMs), and we show that our training
+technique improves the CAM interpretability.
+

+
+ comment: 11 +
+
+
+
+
+ + ♻ ☆ The Neglected Tails of Vision-Language Models + + +
+ Vision-language models (VLMs) excel in zero-shot recognition but their +performance varies greatly across different visual concepts. For example, +although CLIP achieves impressive accuracy on ImageNet (60-80%), its +performance drops below 10% for more than ten concepts like night snake, +presumably due to their limited presence in the pretraining data. However, +measuring the frequency of concepts in VLMs' large-scale datasets is +challenging. We address this by using large language models (LLMs) to count the +number of pretraining texts that contain synonyms of these concepts. Our +analysis confirms that popular datasets, such as LAION, exhibit a long-tailed +concept distribution, yielding biased performance in VLMs. We also find that +downstream applications of VLMs, including visual chatbots (e.g., GPT-4V) and +text-to-image models (e.g., Stable Diffusion), often fail to recognize or +generate images of rare concepts identified by our method. To mitigate the +imbalanced performance of zero-shot VLMs, we propose REtrieval-Augmented +Learning (REAL). First, instead of prompting VLMs using the original class +names, REAL uses their most frequent synonyms found in pretraining texts. This +simple change already outperforms costly human-engineered and LLM-enriched +prompts over nine benchmark datasets. Second, REAL trains a linear classifier +on a small yet balanced set of pretraining data retrieved using concept +synonyms. REAL surpasses the previous zero-shot SOTA, using 400x less storage +and 10,000x less training time! + +
+
+ comment: Project Page: + https://shubhamprshr27.github.io/neglected-tails-of-vlms/ +
+
+
+
+
+ + ♻ ☆ Fossil Image Identification using Deep Learning Ensembles of Data + Augmented Multiviews + + +
+          Identification of fossil species is crucial to evolutionary studies. Recent
+advances in deep learning have shown promising prospects for fossil image
+identification. However, the quantity and quality of labeled fossil images are
+often limited due to fossil preservation, conditioned sampling, and expensive
+and inconsistent label annotation by domain experts, which pose great
+challenges to training deep learning based image classification models. To
+address these challenges, we follow the idea of the wisdom of crowds and
+propose a multiview ensemble framework, which collects Original (O), Gray (G),
+and Skeleton (S) views of each fossil image reflecting its different
+characteristics to train multiple base models, and then makes the final
+decision via soft voting. Experiments on the largest fusulinid dataset with
+2400 images show that the proposed OGS consistently outperforms baselines
+(using a single model for each view), and obtains superior or comparable
+performance compared to OOO (using three base models on the same Original
+view). Besides, as the training data decreases, the proposed framework
+achieves larger gains. Regarding identification consistency with human
+experts, OGS receives the highest agreement both with the original labels of
+the dataset and with the re-identifications of two human experts. The
+validation performance provides a quantitative estimate of consistency across
+different experts and genera. We conclude that the proposed framework presents
+state-of-the-art performance in the fusulinid fossil identification case
+study. The framework is designed for general fossil identification, and we
+expect to see applications to other fossil datasets in future work. The source
+code is publicly available at
+https://github.com/houchengbin/Fossil-Image-Identification to benefit future
+research in fossil image identification.
+
+
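+As a concrete illustration of the decision rule, here is a minimal sketch of
+soft voting across the three views; the view-extraction step and per-view
+models are illustrative placeholders, not the authors' API:
+
+    import numpy as np
+
+    def ogs_predict(image, models, make_views):
+        """models[i] maps a view to a per-class probability vector."""
+        views = make_views(image)  # e.g., [original, gray, skeleton]
+        # Soft voting: average class probabilities over the three views.
+        probs = np.mean([m(v) for m, v in zip(models, views)], axis=0)
+        return int(np.argmax(probs))
+
+Averaging probabilities (rather than taking a majority over hard labels) lets
+a confident view outweigh uncertain ones, which matches the wisdom-of-crowds
+motivation.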
+
+ comment: published in Methods in Ecology and Evolution +
+
+
+
+
+ + ♻ ☆ Text Image Inpainting via Global Structure-Guided Diffusion Models AAAI-24 + + +
+          Real-world text can be damaged by corrosion caused by environmental or
+human factors, which hinders the preservation of the complete styles of texts,
+e.g., texture and structure. These corrosion issues, such as graffiti signs and
+incomplete signatures, make the texts difficult to understand, thereby posing
+significant challenges to downstream applications, e.g., scene text
+recognition and signature identification. Notably, current inpainting
+techniques often fail to adequately address this problem and have difficulties
+restoring accurate text images along with reasonable and consistent styles.
+Formulating this as an open problem of text image inpainting, this paper aims
+to build a benchmark to facilitate its study. In doing so, we establish two
+specific text inpainting datasets which contain scene text images and
+handwritten text images, respectively. Each of them includes images derived
+from real-life and synthetic sources, featuring pairs of original images,
+corrupted images, and other auxiliary information. On top of the datasets, we
+further develop a novel neural framework, the Global Structure-guided Diffusion
+Model (GSDM), as a potential solution. Leveraging the global structure of the
+text as a prior, the proposed GSDM develops an efficient diffusion model to
+recover clean texts. The efficacy of our approach is demonstrated by a thorough
+empirical study, including a substantial boost in both recognition accuracy and
+image quality. These findings not only highlight the effectiveness of our
+method but also underscore its potential to enhance the broader field of text
+image understanding and processing. Code and datasets are available at:
+https://github.com/blackprotoss/GSDM.
+
+
+
+ comment: Accepted by AAAI-24 +
+
+
+
+
+ + ♻ ☆ Geometry aware 3D generation from in-the-wild images in ImageNet + + +
+          Generating accurate 3D models is a challenging problem that traditionally
+requires explicit learning from 3D datasets using supervised learning. Although
+recent advances have shown promise in learning 3D models from 2D images, these
+methods often rely on well-structured datasets with multi-view images of each
+instance or camera pose information. Furthermore, these datasets usually
+contain clean backgrounds with simple shapes, making them expensive to acquire
+and hard to generalize, which limits the applicability of these methods. To
+overcome these limitations, we propose a method for reconstructing 3D geometry
+from the diverse and unstructured ImageNet dataset without camera pose
+information. We use an efficient triplane representation to learn 3D models
+from 2D images and modify the architecture of the generator backbone based on
+StyleGAN2 to adapt to the highly diverse dataset. To prevent mode collapse and
+improve training stability on diverse data, we propose to use multi-view
+discrimination. The trained generator can produce class-conditional 3D models
+as well as renderings from arbitrary viewpoints. The class-conditional
+generation results demonstrate significant improvement over the current
+state-of-the-art method. Additionally, using PTI, we can efficiently
+reconstruct the whole 3D geometry from single-view images.
+
+
+
+
+
+
+ + ♻ ☆ CT-MVSNet: Efficient Multi-View Stereo with Cross-scale Transformer + + +
+          Recent deep multi-view stereo (MVS) methods have widely incorporated
+transformers into cascade networks for high-resolution depth estimation,
+achieving impressive results. However, existing transformer-based methods are
+constrained by their computational costs, preventing their extension to finer
+stages. In this paper, we propose a novel cross-scale transformer (CT) that
+processes feature representations at different stages without additional
+computation. Specifically, we introduce an adaptive matching-aware transformer
+(AMT) that employs different interactive attention combinations at multiple
+scales. This combined strategy enables our network to capture intra-image
+context information and enhance inter-image feature relationships. Besides, we
+present a dual-feature guided aggregation (DFGA) that embeds the coarse global
+semantic information into the finer cost volume construction to further
+strengthen global and local feature awareness. Meanwhile, we design a feature
+metric loss (FM Loss) that evaluates the feature bias before and after
+transformation to reduce the impact of feature mismatch on depth estimation.
+Extensive experiments on the DTU dataset and the Tanks and Temples (T&T)
+benchmark demonstrate that our method achieves state-of-the-art results. Code
+is available at https://github.com/wscstrive/CT-MVSNet.
+
+
+
+ comment: Accepted at the 30th International Conference on Multimedia + Modeling(MMM'24 Oral) +
+
+
+
+
+ + ♻ ☆ VR-based generation of photorealistic synthetic data for training + hand-object tracking models + + +
+          Supervised learning models for precise tracking of hand-object interactions
+(HOI) in 3D require large amounts of annotated data for training. Moreover, it
+is not intuitive for non-experts to label 3D ground truth (e.g. 6DoF object
+pose) on 2D images. To address these issues, we present "blender-hoisynth", an
+interactive synthetic data generator based on the Blender software.
+Blender-hoisynth can scalably generate and automatically annotate visual HOI
+training data. Other competing approaches usually generate synthetic HOI data
+completely without human input. While this may be beneficial in some
+scenarios, HOI applications inherently necessitate direct control over the HOIs
+as an expression of human intent. With blender-hoisynth, it is possible for
+users to interact with objects via virtual hands using standard Virtual Reality
+hardware. The synthetically generated data are characterized by a high degree
+of photorealism and contain visually plausible and physically realistic videos
+of hands grasping objects and moving them around in 3D. To demonstrate the
+efficacy of our data generation, we replace large parts of the training data in
+the well-known DexYCB dataset with hoisynth data and train a state-of-the-art
+HOI reconstruction model with it. We show that there is no significant
+degradation in the model performance despite the data replacement.
+
+
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ Improving Sequential Recommendations with LLMs + + +
+          The sequential recommendation problem has attracted considerable research
+attention in the past few years, leading to the rise of numerous recommendation
+models. In this work, we explore how Large Language Models (LLMs), which are
+nowadays introducing disruptive effects in many AI-based applications, can be
+used to build or improve sequential recommendation approaches. Specifically, we
+design three orthogonal approaches, and hybrids of those, to leverage the power
+of LLMs in different ways. In addition, we investigate the potential of each
+approach by focusing on the technical aspects it comprises and by determining
+an array of alternative choices for each one. We conduct extensive experiments
+on three datasets and explore a large variety of configurations, including
+different language models and baseline recommendation models, to obtain a
+comprehensive picture of the performance of each approach. Among other
+observations, we highlight that initializing state-of-the-art sequential
+recommendation models such as BERT4Rec or SASRec with embeddings obtained from
+an LLM can lead to substantial performance gains in terms of accuracy.
+Furthermore, we find that fine-tuning an LLM for recommendation tasks enables
+it to learn not only the tasks, but also concepts of a domain to some extent.
+We also show that fine-tuning OpenAI GPT leads to considerably better
+performance than fine-tuning Google PaLM 2. Overall, our extensive experiments
+indicate a huge potential value in leveraging LLMs in future recommendation
+approaches. We publicly share the code and data of our experiments to ensure
+reproducibility.
+
+
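+As a sketch of one reported finding (LLM-derived initialization), the snippet
+below seeds a recommender's item-embedding table from text embeddings of item
+descriptions; the embedding source, dimensions, and linear projection are
+assumptions for illustration, not the paper's exact setup:
+
+    import torch
+    import torch.nn as nn
+
+    n_items, llm_dim, rec_dim = 10_000, 1536, 64
+    llm_vecs = torch.randn(n_items, llm_dim)  # stand-in for real LLM embeddings
+    proj = nn.Linear(llm_dim, rec_dim, bias=False)  # map to the recommender's size
+    item_table = nn.Embedding(n_items, rec_dim)
+    with torch.no_grad():
+        item_table.weight.copy_(proj(llm_vecs))
+    # item_table can now initialize BERT4Rec/SASRec before standard training.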
+
+ comment: 33 pages, 12 figures, 7 tables +
+
+
+
+
+ + ☆ Minimizing Regret in Billboard Advertisement under Zonal Influence + Constraint + + +
+          In a typical billboard advertisement setting, an influence provider owns a
+number of digital billboards, and advertisers pay the provider for a specific
+number of views of their advertisement content. If the provider delivers at
+least the demanded influence, it receives the full payment; otherwise it
+receives only a partial payment. From the provider's perspective, delivering
+either more or less than an advertiser's demanded influence is a loss. This
+loss is formalized as 'Regret', and the provider's goal is naturally to
+allocate the billboard slots among the advertisers so that the total regret is
+minimized. In this paper, we study this problem as a discrete optimization
+problem and propose four solution approaches. The first one selects the
+billboard slots from the available ones in an incremental greedy manner, and
+we call this method the Budget Effective Greedy approach. In the second one,
+we introduce randomness into the first, performing the marginal gain
+computation only for a sample of randomly chosen billboard slots. The remaining
+two approaches are further improvements over the second one. We analyze all the
+algorithms to understand their time and space complexity. We implement them
+with real-life trajectory and billboard datasets and conduct a number of
+experiments. It has been observed that the randomized budget effective greedy
+approach takes reasonable computational time while minimizing the regret.
+
+
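+The flavor of the incremental greedy allocation can be sketched as follows;
+the `influence` function and the absolute-deviation regret are simplifying
+assumptions standing in for the paper's formal model:
+
+    def greedy_allocate(slots, advertisers, influence, demand):
+        """influence(slot, adv): views a slot adds toward an advertiser's demand."""
+        supplied = {a: 0.0 for a in advertisers}
+        allocation, remaining = {}, set(slots)
+        while remaining:
+            # Pick the (slot, advertiser) pair with the best marginal regret drop.
+            candidates = [
+                (abs(supplied[a] + influence(s, a) - demand[a])
+                 - abs(supplied[a] - demand[a]), s, a)
+                for s in remaining for a in advertisers]
+            delta, s, a = min(candidates, key=lambda t: t[0])
+            if delta >= 0:  # no remaining assignment reduces total regret
+                break
+            supplied[a] += influence(s, a)
+            allocation[s] = a
+            remaining.remove(s)
+        return allocation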
+
+ comment: 32 Pages +
+
+
+
+
+ + ☆ HimiRec: Modeling Hierarchical Multi-interest for Recommendation + + +
+          Industrial recommender systems usually consist of a retrieval stage and a
+ranking stage in order to handle billion-scale sets of users and items. The
+retrieval stage retrieves candidate items relevant to user interests for
+recommendation and has attracted much attention. Users frequently show
+hierarchical multi-interests: for example, a heavy user of a particular NBA
+team (Golden State Warriors) within Sports may also be a light user of almost
+the entire Animation category, where Sports and Animation sit at the same
+level of the hierarchy. However, most existing methods learn this hierarchical
+difference only implicitly, so fine-grained interest information is averaged
+away, limiting a detailed understanding of the user's different needs across
+heavy and light interests. Therefore, in this work we propose a novel
+two-stage approach that explicitly models hierarchical multi-interest for
+recommendation. In the first stage, hierarchical multi-interest mining,
+hierarchical clustering and a transformer-based model adaptively generate the
+circles or sub-circles that users are interested in. In the second stage,
+partitioning the retrieval space allows the EBR models to deal only with items
+within each circle and to accurately capture the user's refined interests.
+Experimental results show that the proposed approach achieves state-of-the-art
+performance. Our framework has also been successfully deployed at Lofter (one
+of the largest derivative content communities, with 10 million monthly active
+users) for over four months.
+
+
+
+ comment: 4 pages, 4 figures +
+
+
+
+
+ + ☆ Towards a Unified Language Model for Knowledge-Intensive Tasks Utilizing + External Corpus + + +
+          The advent of large language models (LLMs) has showcased their efficacy
+across various domains, yet they often hallucinate, especially in
+knowledge-intensive tasks that require external knowledge sources. To improve
+the factual accuracy of language models, retrieval-augmented generation (RAG)
+has emerged as a popular solution. However, traditional retrieval modules often
+rely on large-scale document indexes, which can be disconnected from generative
+tasks. Through the generative retrieval (GR) approach, language models can
+achieve superior retrieval performance by directly generating relevant document
+identifiers (DocIDs). However, the relationship between GR and downstream
+tasks, as well as the potential of LLMs in GR, remains unexplored. In this
+paper, we present a unified language model that utilizes an external corpus to
+handle various knowledge-intensive tasks by seamlessly integrating generative
+retrieval, closed-book generation, and RAG. In order to achieve effective
+retrieval and generation through a unified continuous decoding process, we
+introduce the following mechanisms: (1) a ranking-oriented DocID decoding
+strategy, which improves ranking ability by directly learning from a DocID
+ranking list; (2) a continuous generation strategy to facilitate effective and
+efficient RAG; (3) well-designed auxiliary DocID understanding tasks to enhance
+the model's comprehension of DocIDs and their relevance to downstream tasks.
+Our approach is evaluated on the widely used KILT benchmark using two variants
+of backbone models: an encoder-decoder T5 model and a decoder-only LLM, Llama2.
+Experimental results showcase the superior performance of our models in both
+retrieval and downstream knowledge-intensive tasks.
+
+
+
+
+
+
+ + ☆ A Multi-Agent Conversational Recommender System + + +
+          Due to strong capabilities in conducting fluent, multi-turn conversations
+with users, Large Language Models (LLMs) have the potential to further improve
+the performance of Conversational Recommender Systems (CRSs). Unlike the
+aimless chit-chat that LLMs excel at, a CRS has a clear target, so it is
+imperative to control the dialogue flow of the LLM to successfully recommend
+appropriate items to the users. Furthermore, user feedback in CRS can assist
+the system in better modeling user preferences, which has been ignored by
+existing studies. However, simply prompting an LLM to conduct conversational
+recommendation cannot address the above two key challenges.
+  In this paper, we propose the Multi-Agent Conversational Recommender System
+(MACRS), which contains two essential modules. First, we design a multi-agent
+act planning framework that controls the dialogue flow based on four LLM-based
+agents. This cooperative multi-agent framework generates various candidate
+responses based on different dialogue acts and then chooses the most
+appropriate one as the system response, which helps MACRS plan suitable
+dialogue acts. Second, we propose a user feedback-aware reflection mechanism
+that leverages user feedback to reason about errors made in previous turns,
+adjust the dialogue act planning, and extract higher-level user information
+from implicit semantics. We conduct extensive experiments based on a user
+simulator to demonstrate the effectiveness of MACRS in recommendation and user
+preference collection. Experimental results illustrate that MACRS provides an
+improved user interaction experience compared to directly using LLMs.
+
+
+
+
+
+
+ + ☆ TransFR: Transferable Federated Recommendation with Pre-trained Language + Models + + +
+          Federated recommendations (FRs), which facilitate multiple local clients in
+collectively learning a global model without disclosing private user data, have
+emerged as a prevalent architecture for privacy-preserving recommendation. In
+conventional FRs, the dominant paradigm is to use discrete identities to
+represent users/clients and items, which are subsequently mapped to
+domain-specific embeddings that participate in model training. Despite
+considerable performance, we reveal three inherent limitations that cannot be
+ignored in federated settings, i.e., non-transferability across domains,
+unavailability in cold-start settings, and potential privacy violations during
+federated training. To this end, we propose TransFR, a transferable federated
+recommendation model with universal textual representations, which delicately
+combines the general capabilities of pre-trained language models with
+personalized abilities obtained by fine-tuning on local private data.
+Specifically, it first learns domain-agnostic representations of items by
+exploiting pre-trained models with public textual corpora. To tailor the model
+to federated recommendation, we further introduce an efficient federated
+fine-tuning and a local training mechanism. This facilitates personalized local
+heads for each client, trained on their private behavior data. By
+incorporating pre-training and fine-tuning within FRs, our model greatly
+improves adaptation efficiency when transferring to a new domain and the
+generalization capacity to address cold-start issues. Through extensive
+experiments on several datasets, we demonstrate that our TransFR model
+surpasses several state-of-the-art FRs in terms of accuracy, transferability,
+and privacy.
+
+
+
+
+
+
+ + ☆ Clarifying the Path to User Satisfaction: An Investigation into + Clarification Usefulness EACL + + +
+          Clarifying questions are an integral component of modern information
+retrieval systems, directly impacting user satisfaction and overall system
+performance. Poorly formulated questions can lead to user frustration and
+confusion, negatively affecting the system's performance. This research
+addresses the urgent need to identify and leverage key features that contribute
+to the classification of clarifying questions, enhancing user satisfaction. To
+gain deeper insights into how different features influence user satisfaction,
+we conduct a comprehensive analysis, considering a broad spectrum of lexical,
+semantic, and statistical features, such as question length and sentiment
+polarity. Our empirical results provide three main insights into the qualities
+of effective query clarification: (1) specific questions are more effective
+than generic ones; (2) the subjectivity and emotional tone of a question play a
+role; and (3) shorter and more ambiguous queries benefit significantly from
+clarification. Based on these insights, we implement feature-integrated user
+satisfaction prediction using various classifiers, both traditional and
+neural-based, including random forest, BERT, and large language models. Our
+experiments show a consistent and significant improvement, particularly in
+traditional classifiers, with a minimum performance boost of 45%. This study
+presents invaluable guidelines for refining the formulation of clarifying
+questions and enhancing both user satisfaction and system performance.
+
+
+
+ comment: EACL +
+
+
+
+
+ + ☆ CoLe and LYS at BioASQ MESINESP8 Task: similarity based descriptor + assignment in Spanish + + +
+          In this paper, we describe our participation in the MESINESP Task of the
+BioASQ biomedical semantic indexing challenge. The participating system follows
+an approach based solely on conventional information retrieval tools. We have
+evaluated various alternatives for extracting index terms from IBECS/LILACS
+documents in order to store them in an Apache Lucene index. Those indexed
+representations are queried using the contents of the article to be annotated,
+and a ranked list of candidate labels is created from the retrieved documents.
+We have also evaluated a limited Label Powerset approach that creates
+meta-labels by joining pairs of DeCS labels with high co-occurrence scores, and
+an alternative method based on label profile matching. Results obtained in
+official runs seem to confirm the suitability of this approach for languages
+like Spanish.
+
+
+
+ comment: Accepted at the 8th BioASQ Workshop at the 11th Conference and Labs + of the Evaluation Forum (CLEF) 2020. 11 pages +
+
+
+
+
+ + ☆ Detection of tortured phrases in scientific literature + + +
+          This paper presents various automatic detection methods to extract
+so-called tortured phrases from scientific papers. These tortured phrases,
+e.g., "flag to clamor" instead of "signal to noise", are the result of
+paraphrasing tools used to escape plagiarism detection. We built a dataset and
+evaluated several strategies to flag previously undocumented tortured phrases.
+The proposed and tested methods are based on language models, relying either on
+embedding similarities or on masked-token predictions. We found that an
+approach using token prediction that propagates the scores to the chunk level
+gives the best results. With a recall of 0.87 and a precision of 0.61, it could
+retrieve new tortured phrases to be submitted to domain experts for validation.
+
+
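+A minimal sketch of the masked-token strategy follows; the model choice, the
+threshold, and the whole-word masking (which ignores multi-piece words) are
+illustrative assumptions, not the paper's exact pipeline:
+
+    from transformers import pipeline
+
+    fill = pipeline("fill-mask", model="bert-base-uncased")
+
+    def token_score(words, i):
+        """How strongly the model predicts the original word back at slot i."""
+        masked = " ".join(w if j != i else fill.tokenizer.mask_token
+                          for j, w in enumerate(words))
+        preds = fill(masked)  # top predictions with scores
+        return next((p["score"] for p in preds
+                     if p["token_str"].strip() == words[i]), 0.0)
+
+    words = "flag to clamor".split()
+    chunk_score = sum(token_score(words, i) for i in range(len(words))) / len(words)
+    print("suspicious" if chunk_score < 0.05 else "ok", chunk_score)
+
+Low chunk-level scores indicate wording the model finds implausible, which is
+the signal used to surface candidate tortured phrases for expert validation.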
+
+
+
+
+ + ♻ ☆ End-to-end Learnable Clustering for Intent Learning in Recommendation + + +
+          Intent learning, which aims to learn users' intents for user understanding
+and item recommendation, has become an active research topic in recent years.
+However, existing methods suffer from complex and cumbersome alternating
+optimization, limiting performance and scalability. To this end, we propose a
+novel intent learning method termed ELCRec, which unifies behavior
+representation learning into an End-to-end Learnable Clustering framework for
+effective and efficient Recommendation. Concretely, we encode users' behavior
+sequences and initialize the cluster centers (latent intents) as learnable
+neurons. Then, we design a novel learnable clustering module to separate
+different cluster centers, thus decoupling users' complex intents. Meanwhile,
+it guides the network to learn intents from behaviors by forcing behavior
+embeddings close to cluster centers. This allows simultaneous optimization of
+recommendation and clustering via mini-batch data. Moreover, we propose
+intent-assisted contrastive learning that uses cluster centers as
+self-supervision signals, further enhancing mutual promotion. Both experimental
+results and theoretical analyses demonstrate the superiority of ELCRec from six
+perspectives. Compared to the runner-up, ELCRec improves NDCG@5 by 8.9% and
+reduces computational costs by 22.5% on the Beauty dataset. Furthermore, owing
+to its scalability and universal applicability, we deploy this method on an
+industrial recommendation system with 130 million page views and achieve
+promising results.
+
+
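+The clustering mechanism described above can be sketched as a small PyTorch
+module; this is an illustration of the stated idea (trainable centers, a pull
+term toward the nearest center, a separation term between centers), not the
+authors' implementation, and the loss weighting is an assumption:
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class LearnableClustering(nn.Module):
+        def __init__(self, n_intents=8, dim=64):
+            super().__init__()
+            self.centers = nn.Parameter(torch.randn(n_intents, dim))
+
+        def forward(self, z):                    # z: (batch, dim) behavior embeddings
+            c = F.normalize(self.centers, dim=-1)
+            z = F.normalize(z, dim=-1)
+            sim = z @ c.T                        # cosine similarity to each center
+            pull = (1 - sim.max(dim=-1).values).mean()  # embeddings -> nearest intent
+            sep = (c @ c.T).triu(1).relu().mean()       # push distinct centers apart
+            return pull + sep
+
+Because both terms are differentiable, this loss can simply be added to the
+recommendation loss and optimized on mini-batches, which is what removes the
+alternating-optimization step.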
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ How Can Recommender Systems Benefit from Large Language Models: A Survey + + +
+ With the rapid development of online services, recommender systems (RS) have +become increasingly indispensable for mitigating information overload. Despite +remarkable progress, conventional recommendation models (CRM) still have some +limitations, e.g., lacking open-world knowledge, and difficulties in +comprehending users' underlying preferences and motivations. Meanwhile, large +language models (LLM) have shown impressive general intelligence and human-like +capabilities, which mainly stem from their extensive open-world knowledge, +reasoning ability, as well as their comprehension of human culture and society. +Consequently, the emergence of LLM is inspiring the design of recommender +systems and pointing out a promising research direction, i.e., whether we can +incorporate LLM and benefit from their knowledge and capabilities to compensate +for the limitations of CRM. In this paper, we conduct a comprehensive survey on +this research direction from the perspective of the whole pipeline in +real-world recommender systems. Specifically, we summarize existing works from +two orthogonal aspects: where and how to adapt LLM to RS. For the WHERE +question, we discuss the roles that LLM could play in different stages of the +recommendation pipeline, i.e., feature engineering, feature encoder, +scoring/ranking function, user interaction, and pipeline controller. For the +HOW question, we investigate the training and inference strategies, resulting +in two fine-grained taxonomy criteria, i.e., whether to tune LLM or not, and +whether to involve conventional recommendation models for inference. Then, we +highlight key challenges in adapting LLM to RS from three aspects, i.e., +efficiency, effectiveness, and ethics. Finally, we summarize the survey and +discuss the future prospects. We actively maintain a GitHub repository for +papers and other related resources: +https://github.com/CHIANGEL/Awesome-LLM-for-RecSys/. + +
+
+ comment: New version released with 27-page main content; Look-up table in + appendix +
+
+
+
+
+ + ♻ ☆ DQNC2S: DQN-based Cross-stream Crisis event Summarizer ECIR 2024 + + +
+          Summarizing multiple disaster-relevant data streams simultaneously is
+particularly challenging as existing Retrieve&Re-ranking strategies suffer from
+the inherent redundancy of multi-stream data and limited scalability in a
+multi-query setting. This work proposes an online approach to crisis timeline
+generation based on weak annotation with Deep Q-Networks. It selects the
+relevant pieces of text on the fly without requiring either human annotations
+or content re-ranking. This makes the inference time independent of the number
+of input queries. The proposed approach also incorporates a redundancy filter
+into the reward function to effectively handle cross-stream content overlaps.
+The achieved ROUGE and BERTScore results are superior to those of the
+best-performing models on the CrisisFACTS 2022 benchmark.
+
+
+
+ comment: accepted at ECIR 2024 +
+
+
+
+
+ + ♻ ☆ Temporally and Distributionally Robust Optimization for Cold-Start + Recommendation AAAI'24 + + +
+          Collaborative Filtering (CF) recommender models highly depend on user-item
+interactions to learn CF representations, thus falling short of recommending
+cold-start items. To address this issue, prior studies mainly introduce item
+features (e.g., thumbnails) for cold-start item recommendation. They learn a
+feature extractor on warm-start items to align feature representations with
+interactions, and then leverage the feature extractor to extract the feature
+representations of cold-start items for interaction prediction. Unfortunately,
+the features of cold-start items, especially the popular ones, tend to diverge
+from those of warm-start ones due to temporal feature shifts, preventing the
+feature extractor from accurately learning feature representations of
+cold-start items.
+  To alleviate the impact of temporal feature shifts, we consider using
+Distributionally Robust Optimization (DRO) to enhance the generalization
+ability of the feature extractor. Nonetheless, existing DRO methods face an
+inconsistency issue: the worst-case warm-start items emphasized during DRO
+training might not align well with the cold-start item distribution. To capture
+the temporal feature shifts and combat this inconsistency issue, we propose a
+novel temporal DRO with new optimization objectives, namely, 1) to integrate a
+worst-case factor to improve the worst-case performance, and 2) to devise a
+shifting factor to capture the shifting trend of item features and enhance the
+optimization of the potentially popular groups in cold-start items. Substantial
+experiments on three real-world datasets validate the superiority of our
+temporal DRO in enhancing the generalization ability of cold-start recommender
+models. The code is available at https://github.com/Linxyhaha/TDRO/.
+
+
+
+ comment: Accepted by AAAI'24 +
+
+
+
+
+ + ♻ ☆ A Survey on Data-Centric Recommender Systems + + +
+ Recommender systems (RSs) have become an essential tool for mitigating +information overload in a range of real-world applications. Recent trends in +RSs have revealed a major paradigm shift, moving the spotlight from +model-centric innovations to data-centric efforts (e.g., improving data quality +and quantity). This evolution has given rise to the concept of data-centric +recommender systems (Data-Centric RSs), marking a significant development in +the field. This survey provides the first systematic overview of Data-Centric +RSs, covering 1) the foundational concepts of recommendation data and +Data-Centric RSs; 2) three primary issues of recommendation data; 3) recent +research developed to address these issues; and 4) several potential future +directions of Data-Centric RSs. + +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ kNN Algorithm for Conditional Mean and Variance Estimation with + Automated Uncertainty Quantification and Variable Selection + + +
+          In this paper, we introduce a kNN-based regression method that synergizes
+the scalability and adaptability of traditional non-parametric kNN models with
+a novel variable selection technique. This method focuses on accurately
+estimating the conditional mean and variance of random response variables,
+thereby effectively characterizing conditional distributions across diverse
+scenarios. Our approach incorporates a robust uncertainty quantification
+mechanism, leveraging our prior estimation work on the conditional mean and
+variance. The employment of kNN ensures scalable computational efficiency in
+predicting intervals and statistical accuracy in line with optimal
+non-parametric rates. Additionally, we introduce a new kNN semi-parametric
+algorithm for estimating ROC curves, accounting for covariates. For selecting
+the smoothing parameter k, we propose an algorithm with theoretical guarantees.
+The incorporation of variable selection significantly enhances the performance
+of the method over conventional kNN techniques in various modeling tasks. We
+validate the approach through simulations in low, moderate, and
+high-dimensional covariate spaces. The algorithm's effectiveness is
+particularly notable in biomedical applications, as demonstrated in two case
+studies. Concluding with a theoretical analysis, we highlight the consistency
+and convergence rate of our method over traditional kNN models, particularly
+when the underlying regression model takes values in a low-dimensional space.
+
+
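+The core estimator is simple to sketch: for a query point, the conditional
+mean is the average response of its k nearest neighbors and the conditional
+variance is their sample variance. This toy version omits the paper's variable
+selection and data-driven choice of k:
+
+    import numpy as np
+    from sklearn.neighbors import NearestNeighbors
+
+    def knn_mean_var(X, y, x_query, k=25):
+        nn = NearestNeighbors(n_neighbors=k).fit(X)
+        _, idx = nn.kneighbors(x_query.reshape(1, -1))
+        neighbors = y[idx[0]]                      # responses of the k neighbors
+        return neighbors.mean(), neighbors.var(ddof=1)
+
+    rng = np.random.default_rng(0)
+    X = rng.normal(size=(1000, 3))
+    y = X[:, 0] ** 2 + rng.normal(scale=0.5, size=1000)  # toy heteroscedastic data
+    print(knn_mean_var(X, y, np.zeros(3)))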
+
+
+
+
+ + ☆ Beyond Lengthscales: No-regret Bayesian Optimisation With Unknown + Hyperparameters Of Any Type + + +
+          Bayesian optimisation requires fitting a Gaussian process model, which in
+turn requires specifying hyperparameters - most of the theoretical literature
+assumes those hyperparameters are known. The commonly used maximum likelihood
+estimator for the hyperparameters of the Gaussian process is consistent only if
+the data fills the space uniformly, which does not have to be the case in
+Bayesian optimisation. Since no guarantees exist regarding the correctness of
+hyperparameter estimation, and those hyperparameters can significantly affect
+the Gaussian process fit, theoretical analysis of Bayesian optimisation with
+unknown hyperparameters is very challenging. Previously proposed algorithms
+with the no-regret property could handle only the special case of unknown
+lengthscales and reproducing kernel Hilbert space norm, and applied only to
+the frequentist case. We propose a novel algorithm, HE-GP-UCB, which is the
+first to enjoy the no-regret property in the case of unknown hyperparameters
+of arbitrary form, and which supports both the Bayesian and frequentist
+settings. Our proof idea is novel and can easily be extended to other variants
+of Bayesian optimisation. We show this by extending our algorithm to the
+adversarially robust optimisation setting under unknown hyperparameters.
+Finally, we empirically evaluate our algorithm on a set of toy problems and
+show that it can outperform the maximum likelihood estimator.
+
+
+
+
+
+
+ + ☆ Position Paper: Generalized grammar rules and structure-based + generalization beyond classical equivariance for lexical tasks and + transduction + + +
+          Compositional generalization is one of the main properties that
+differentiate lexical learning in humans from state-of-the-art neural networks.
+We propose a general framework for building models that can generalize
+compositionally using the concept of Generalized Grammar Rules (GGRs), a class
+of symmetry-based compositional constraints for transduction tasks, which we
+view as a transduction analogue of equivariance constraints in physics-inspired
+tasks. Besides formalizing generalized notions of symmetry for language
+transduction, our framework is general enough to contain many existing works as
+special cases. We present ideas on how GGRs might be implemented, and in the
+process draw connections to reinforcement learning and other areas of research.
+
+
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Stochastic Two Points Method for Deep Model Zeroth-order Optimization + + +
+          Large foundation models, such as large language models, have performed
+exceptionally well in various application scenarios. Building or fully
+fine-tuning such large models is usually prohibitive due to either hardware
+budgets or lack of access to backpropagation. Zeroth-order methods offer a
+promising direction for tackling this challenge, where only forward passes are
+needed to update the model. This paper introduces an efficient Stochastic
+Two-Point (S2P) approach within the gradient-free regime. We present the
+theoretical convergence properties of S2P under general and relaxed smoothness
+assumptions. The theoretical properties also shed light on a faster and more
+stable S2P variant, Accelerated S2P (AS2P), which exploits our new convergence
+properties that better represent the dynamics of deep models in training. Our
+comprehensive empirical results show that AS2P is highly effective in
+optimizing objectives for large deep models, including language models, and
+outperforms standard methods across various model types and scales, with a 2x
+speed-up in training over most of the evaluated tasks.
+
+
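+For intuition, here is a minimal generic two-point zeroth-order update (the
+classic estimator this family of methods builds on), not the paper's S2P/AS2P
+algorithm; step sizes and the toy objective are arbitrary:
+
+    import torch
+
+    def two_point_step(params, loss_fn, lr=1e-2, mu=1e-3):
+        u = torch.randn_like(params)
+        u = u / u.norm()                  # random probe direction
+        # Directional finite difference: two forward passes, no backprop.
+        g = (loss_fn(params + mu * u) - loss_fn(params - mu * u)) / (2 * mu)
+        return params - lr * g * u
+
+    w = torch.zeros(10)
+    loss = lambda p: ((p - 1.0) ** 2).sum()
+    for _ in range(2000):
+        w = two_point_step(w, loss)
+    print(loss(w).item())  # shrinks toward 0 using forward passes only
+
+The appeal for large models is visible here: only the loss value is queried,
+so no activations need to be stored for a backward pass.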
+
+
+
+
+ + ☆ A GP-based Robust Motion Planning Framework for Agile Autonomous Robot + Navigation and Recovery in Unknown Environments ICRA + + +
+ For autonomous mobile robots, uncertainties in the environment and system +model can lead to failure in the motion planning pipeline, resulting in +potential collisions. In order to achieve a high level of robust autonomy, +these robots should be able to proactively predict and recover from such +failures. To this end, we propose a Gaussian Process (GP) based model for +proactively detecting the risk of future motion planning failure. When this +risk exceeds a certain threshold, a recovery behavior is triggered that +leverages the same GP model to find a safe state from which the robot may +continue towards the goal. The proposed approach is trained in simulation only +and can generalize to real world environments on different robotic platforms. +Simulations and physical experiments demonstrate that our framework is capable +of both predicting planner failures and recovering the robot to states where +planner success is likely, all while producing agile motion. + +
+
+ comment: To Appear in 2024 IEEE/RSJ International Conference on Robotics and + Automation (ICRA), 2024 +
+
+
+
+
+ + ☆ L2G2G: a Scalable Local-to-Global Network Embedding with Graph + Autoencoders SC + + +
+          For analysing real-world networks, graph representation learning is a
+popular tool. These methods, such as a graph autoencoder (GAE), typically rely
+on low-dimensional representations, also called embeddings, which are obtained
+through minimising a loss function; these embeddings are used with a decoder
+for downstream tasks such as node classification and edge prediction. While
+GAEs tend to be fairly accurate, they suffer from scalability issues. For
+improved speed, a Local2Global approach, which combines graph patch embeddings
+based on eigenvector synchronisation, was shown to be fast and achieve good
+accuracy. Here we propose L2G2G, a Local2Global method which improves GAE
+accuracy without sacrificing scalability. This improvement is achieved by
+dynamically synchronising the latent node representations while training the
+GAEs. It also benefits from the decoder computing only a local, per-patch
+loss. Hence, aligning the local embeddings in each epoch utilises more
+information from the graph than a single post-training alignment does, while
+maintaining scalability. We illustrate on synthetic benchmarks, as well as
+real-world examples, that L2G2G achieves higher accuracy than the standard
+Local2Global approach and scales efficiently on the larger data sets. We find
+that for large and dense networks, it even outperforms the slow, but assumed
+more accurate, GAEs.
+
+
+
+ comment: 13 pages, 4 figures, Complex Networks 2023, Volume I, SCI 1141 +
+
+
+
+
+ + ☆ Contingency Analysis of a Grid of Connected EVs for Primary Frequency + Control of an Industrial Microgrid Using Efficient Control Scheme + + +
+          After over a century of internal combustion engines ruling the transport
+sector, electric vehicles appear to be on the verge of gaining traction due to
+a slew of advantages, including lower operating costs and lower CO2 emissions.
+By using the Vehicle-to-Grid approach (or Grid-to-Vehicle, if electric
+vehicles (EVs) are utilized as load), EVs can operate as both a load and a
+source. Primary frequency regulation and congestion management are two
+essential capabilities that this technology adds to an industrial microgrid.
+Industrial microgrids are made up of different energy sources such as wind
+farms and PV farms, storage systems, and loads. EVs have gained a lot of
+interest as a technique for frequency management because of their ability to
+regulate quickly, and grid reliability depends on this quick reaction.
+Different contingencies, the state of charge of the electric vehicles, and a
+varying number of EVs in an EV fleet are considered in this work, and a
+control scheme for frequency management is proposed. This control scheme
+enables bidirectional power flow, allowing for primary frequency regulation
+during the various scenarios that an industrial microgrid may encounter over
+the course of a 24-h period. Simulation results will demonstrate that the
+presented controller provides dependable frequency regulation support to the
+industrial microgrid during contingencies, achieving a more reliable system.
+Furthermore, simulation results will show that an industrial microgrid's
+frequency can be enhanced even further by increasing the number of EVs in a
+fleet for the Vehicle-to-Grid approach.
+
+
+
+ comment: Published in energies (MDPI) 2022 +
+
+
+
+
+ + ☆ Natural Counterfactuals With Necessary Backtracking + + +
+ Counterfactual reasoning is pivotal in human cognition and especially +important for providing explanations and making decisions. While Judea Pearl's +influential approach is theoretically elegant, its generation of a +counterfactual scenario often requires interventions that are too detached from +the real scenarios to be feasible. In response, we propose a framework of +natural counterfactuals and a method for generating counterfactuals that are +natural with respect to the actual world's data distribution. Our methodology +refines counterfactual reasoning, allowing changes in causally preceding +variables to minimize deviations from realistic scenarios. To generate natural +counterfactuals, we introduce an innovative optimization framework that permits +but controls the extent of backtracking with a naturalness criterion. Empirical +experiments indicate the effectiveness of our method. + +
+
+
+
+
+ + ☆ Learning from Two Decades of Blood Pressure Data: Demography-Specific + Patterns Across 75 Million Patient Encounters + + +
+ Hypertension remains a global health concern with a rising prevalence, +necessitating effective monitoring and understanding of blood pressure (BP) +dynamics. This study delves into the wealth of information derived from BP +measurement, a crucial approach in informing our understanding of hypertensive +trends. Numerous studies have reported on the relationship between BP variation +and various factors. In this research, we leveraged an extensive dataset +comprising 75 million records spanning two decades, offering a unique +opportunity to explore and analyze BP variations across demographic features +such as age, race, and gender. Our findings revealed that gender-based BP +variation was not statistically significant, challenging conventional +assumptions. Interestingly, systolic blood pressure (SBP) consistently +increased with age, while diastolic blood pressure (DBP) displayed a +distinctive peak in the forties age group. Moreover, our analysis uncovered +intriguing similarities in the distribution of BP among some of the racial +groups. This comprehensive investigation contributes to the ongoing discourse +on hypertension and underscores the importance of considering diverse +demographic factors in understanding BP variations. Our results provide +valuable insights that may inform personalized healthcare approaches tailored +to specific demographic profiles. + +
+
+
+
+
+ + ☆ TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent + Constitution + + +
+ The emergence of LLM-based agents has garnered considerable attention, yet +their trustworthiness remains an under-explored area. As agents can directly +interact with the physical environment, their reliability and safety is +critical. This paper presents an Agent-Constitution-based agent framework, +TrustAgent, an initial investigation into improving the safety dimension of +trustworthiness in LLM-based agents. This framework consists of threefold +strategies: pre-planning strategy which injects safety knowledge to the model +prior to plan generation, in-planning strategy which bolsters safety during +plan generation, and post-planning strategy which ensures safety by +post-planning inspection. Through experimental analysis, we demonstrate how +these approaches can effectively elevate an LLM agent's safety by identifying +and preventing potential dangers. Furthermore, we explore the intricate +relationships between safety and helpfulness, and between the model's reasoning +ability and its efficacy as a safe agent. This paper underscores the imperative +of integrating safety awareness and trustworthiness into the design and +deployment of LLM-based agents, not only to enhance their performance but also +to ensure their responsible integration into human-centric environments. Data +and code are available at https://github.com/agiresearch/TrustAgent. + +
+
+ comment: 16 pages, 3 figures, 5 tables, comments and suggestions are welcome +
+
+
+
+
+ + ☆ Spiking Music: Audio Compression with Event Based Auto-encoders + + +
+ Neurons in the brain communicate information via punctual events called +spikes. The timing of spikes is thought to carry rich information, but it is +not clear how to leverage this in digital systems. We demonstrate that +event-based encoding is efficient for audio compression. To build this +event-based representation we use a deep binary auto-encoder, and under high +sparsity pressure, the model enters a regime where the binary event matrix is +stored more efficiently with sparse matrix storage algorithms. We test this on +the large MAESTRO dataset of piano recordings against vector quantized +auto-encoders. Not only does our "Spiking Music compression" algorithm achieve +a competitive compression/reconstruction trade-off, but selectivity and +synchrony between encoded events and piano key strikes emerge without +supervision in the sparse regime. + +
+
+
+
+
+ + ☆ Understanding Adam Optimizer via Online Learning of Updates: Adam is + FTRL in Disguise + + +
+          Despite the success of the Adam optimizer in practice, the theoretical
+understanding of its algorithmic components still remains limited. In
+particular, most existing analyses of Adam show convergence rates that can
+also be achieved by non-adaptive algorithms like SGD. In this work, we provide
+a different perspective based on online learning that underscores the
+importance of Adam's algorithmic components. Inspired by Cutkosky et al.
+(2023), we consider the framework called online learning of updates, where we
+choose the updates of an optimizer based on an online learner. With this
+framework, the design of a good optimizer is reduced to the design of a good
+online learner. Our main observation is that Adam corresponds to a principled
+online learning framework called Follow-the-Regularized-Leader (FTRL). Building
+on this observation, we study the benefits of its algorithmic components from
+the online learning perspective.
+
+
+
+ comment: Comments would be appreciated! +
+
+
+
+
+ + ☆ Privacy-Preserving Distributed Learning for Residential Short-Term Load + Forecasting + + +
+          In the realm of power systems, the increasing involvement of residential
+users in load forecasting applications has heightened concerns about data
+privacy. Specifically, the load data can inadvertently reveal the daily
+routines of residential users, thereby posing a risk to their property
+security. While federated learning (FL) has been employed to safeguard user
+privacy by enabling model training without the exchange of raw data, these FL
+models have shown vulnerabilities to emerging attack techniques, such as Deep
+Leakage from Gradients and poisoning attacks. To counteract these, we initially
+employ a Secure-Aggregation (SecAgg) algorithm that leverages multiparty
+computation cryptographic techniques to mitigate the risk of gradient leakage.
+However, the introduction of SecAgg necessitates the deployment of additional
+sub-center servers for executing the multiparty computation protocol, thereby
+escalating computational complexity and reducing system robustness, especially
+in scenarios where one or more sub-centers are unavailable. To address these
+challenges, we introduce a Markovian Switching-based distributed training
+framework, whose convergence is substantiated through rigorous theoretical
+analysis. The Distributed Markovian Switching (DMS) topology also shows strong
+robustness towards poisoning attacks. Case studies employing real-world power
+system load data validate the efficacy of our proposed algorithm. It not only
+significantly reduces communication complexity but also maintains accuracy
+levels comparable to traditional FL methods, thereby enhancing the scalability
+of our load forecasting algorithm.
+
+
+
+
+
+
+ + ☆ Adaptive Optimization for Prediction with Missing Data + + +
+ When training predictive models on data with missing entries, the most widely +used and versatile approach is a pipeline technique where we first impute +missing entries and then compute predictions. In this paper, we view prediction +with missing data as a two-stage adaptive optimization problem and propose a +new class of models, adaptive linear regression models, where the regression +coefficients adapt to the set of observed features. We show that some adaptive +linear regression models are equivalent to learning an imputation rule and a +downstream linear regression model simultaneously instead of sequentially. We +leverage this joint-impute-then-regress interpretation to generalize our +framework to non-linear models. In settings where data is strongly not missing +at random, our methods achieve a 2-10% improvement in out-of-sample accuracy. + +
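+One hedged way to make the adaptive idea concrete: letting the regression
+react to which features are observed can be emulated by zero-imputing and
+appending the missingness mask as extra features. This is an affine-in-the-mask
+special case for illustration, not the paper's full method:
+
+    import numpy as np
+    from sklearn.linear_model import LinearRegression
+
+    def augment(X):
+        mask = np.isnan(X).astype(float)           # 1 where a feature is missing
+        return np.hstack([np.nan_to_num(X, nan=0.0), mask])
+
+    rng = np.random.default_rng(1)
+    X = rng.normal(size=(500, 4))
+    y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(scale=0.1, size=500)
+    X[rng.random(X.shape) < 0.2] = np.nan          # inject missing entries
+    model = LinearRegression().fit(augment(X), y)  # imputation rule and
+    print(model.score(augment(X), y))              # regression learned jointly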
+
+ comment: arXiv admin note: text overlap with arXiv:2104.03158 +
+
+
+
+
+ + ☆ Learning Collective Variables for Protein Folding with Labeled Data + Augmentation through Geodesic Interpolation + + +
+          In molecular dynamics (MD) simulations, rare events, such as protein
+folding, are typically studied by means of enhanced sampling techniques, most
+of which rely on the definition of a collective variable (CV) along which the
+acceleration occurs. Obtaining an expressive CV is crucial, but often hindered
+by the lack of information about the particular event, e.g., the transition
+from the unfolded to the folded conformation. We propose a simulation-free data
+augmentation strategy using physics-inspired metrics to generate geodesic
+interpolations resembling protein folding transitions, thereby improving
+sampling efficiency without true transition state samples. Leveraging
+interpolation progress parameters, we introduce a regression-based learning
+scheme for CV models, which outperforms classifier-based methods when
+transition state data is limited and noisy.
+
+
+
+
+
+
+ + ☆ Closing the Gap in Human Behavior Analysis: A Pipeline for Synthesizing + Trimodal Data + + +
+          In pervasive machine learning, especially in Human Behavior Analysis (HBA),
+RGB has been the primary modality due to its accessibility and richness of
+information. However, linked with its benefits are challenges, including
+sensitivity to lighting conditions and privacy concerns. One possibility to
+overcome these vulnerabilities is to resort to different modalities. For
+instance, thermal is particularly adept at accentuating human forms, while
+depth adds crucial contextual layers. Despite their known benefits, only a few
+HBA-specific datasets that integrate these modalities exist. To address this
+shortage, our research introduces a novel generative technique for creating
+trimodal, i.e., RGB, thermal, and depth, human-focused datasets. This technique
+capitalizes on human segmentation masks derived from RGB images, combined with
+thermal and depth backgrounds that are sourced automatically. With these two
+ingredients, we synthesize depth and thermal counterparts from existing RGB
+data utilizing conditional image-to-image translation. By employing this
+approach, we generate trimodal data that can be leveraged to train models for
+settings with limited data, poor lighting conditions, or privacy-sensitive
+areas.
+
+
+
+
+
+
+ + ☆ Decoding Speculative Decoding + + +
+          Speculative Decoding is a widely used technique to speed up inference for
+Large Language Models (LLMs) without modifying the outcome. When performing
+inference on an LLM, speculative decoding uses a smaller draft model to
+generate speculative tokens and then uses the target LLM to verify those draft
+tokens. The speedup provided by speculative decoding heavily depends on the
+choice of the draft model. It has been widely suggested to select a draft model
+that provides a high probability of the generated tokens being accepted by the
+LLM to achieve the highest throughput. However, our experiments indicate the
+contrary, with throughput diminishing as the probability of the generated
+tokens being accepted by the target model increases. To understand this
+phenomenon, we perform extensive experiments to characterize the different
+factors that affect speculative decoding and how those factors interact to
+determine the speedups. Based on our experiments, we describe an analytical
+model which can be used to decide the right draft model for a given workload.
+Further, using our insights, we design a new draft model for LLaMA-65B which
+can provide 30% higher throughput than existing draft models.
+
+
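+The draft-then-verify loop at the heart of the technique can be sketched as
+follows (greedy verification for brevity; real systems verify all drafted
+tokens in one batched target forward pass and use probabilistic acceptance).
+`draft` and `target` are assumed callables returning next-token logits:
+
+    import torch
+
+    @torch.no_grad()
+    def speculative_step(prefix, draft, target, k=4):
+        tokens = list(prefix)
+        for _ in range(k):  # the cheap draft model proposes k tokens
+            tokens.append(int(draft(tokens).argmax()))
+        accepted = list(prefix)
+        for t in tokens[len(prefix):]:  # the target model checks each proposal
+            best = int(target(accepted).argmax())
+            accepted.append(best)
+            if best != t:  # first disagreement ends the speculative run
+                break
+        return accepted
+
+The speedup comes from accepting several draft tokens per expensive target
+check; the paper's observation is that pushing the acceptance rate higher with
+a larger, slower draft model can nonetheless lower overall throughput.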
+
+
+
+
+ + ☆ HyperPlanes: Hypernetwork Approach to Rapid NeRF Adaptation + + +
+ Neural radiance fields (NeRFs) are a widely accepted standard for +synthesizing new 3D object views from a small number of base images. However, +NeRFs have limited generalization properties, which means that we need to use +significant computational resources to train individual architectures for each +item we want to represent. To address this issue, we propose a few-shot +learning approach based on the hypernetwork paradigm that does not require +gradient optimization during inference. The hypernetwork gathers information +from the training data and generates an update for universal weights. As a +result, we have developed an efficient method for generating a high-quality 3D +object representation from a small number of images in a single step. This has +been confirmed by direct comparison with the state-of-the-art solutions and a +comprehensive ablation study. + +
+
+
+
+
+ + ☆ Low-Resource Cross-Domain Singing Voice Synthesis via Reduced + Self-Supervised Speech Representations ICASSP + + +
+ In this paper, we propose a singing voice synthesis model, Karaoker-SSL, that +is trained only on text and speech data as a typical multi-speaker acoustic +model. It is a low-resource pipeline that does not utilize any singing data +end-to-end, since its vocoder is also trained on speech data. Karaoker-SSL is +conditioned by self-supervised speech representations in an unsupervised +manner. We preprocess these representations by selecting only a subset of their +task-correlated dimensions. The conditioning module is indirectly guided to +capture style information during training by multi-tasking. This is achieved +with a Conformer-based module, which predicts the pitch from the acoustic +model's output. Thus, Karaoker-SSL allows singing voice synthesis without +reliance on hand-crafted and domain-specific features. There are also no +requirements for text alignments or lyrics timestamps. To refine the voice +quality, we employ a U-Net discriminator that is conditioned on the target +speaker and follows a Diffusion GAN training scheme. + +
+
+ comment: Accepted to IEEE ICASSP SASB 2024 +
+
+
+
+
+ + ☆ Enhancing Stochastic Gradient Descent: A Unified Framework and Novel + Acceleration Methods for Faster Convergence + + +
+ Based on SGD, previous works have proposed many algorithms that have improved +convergence speed and generalization in stochastic optimization, such as SGDm, +AdaGrad, Adam, etc. However, their convergence analysis under non-convex +conditions is challenging. In this work, we propose a unified framework to +address this issue. For any first-order method, we interpret the update +direction $g_t$ as the sum of the stochastic subgradient $\nabla f_t(x_t)$ and +an additional acceleration term $\frac{2|\langle v_t, \nabla f_t(x_t) \rangle|}{\|v_t\|_2^2} v_t$, so we can discuss the convergence by analyzing +$\langle v_t, \nabla f_t(x_t) \rangle$. Through our framework, we have +discovered two plug-and-play acceleration methods: \textbf{Reject Accelerating} +and \textbf{Random Vector Accelerating}; we theoretically demonstrate that +these two methods can directly lead to an improvement in the convergence rate. +
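The decomposition above is easy to state in code. The sketch below forms the direction g_t = ∇f_t(x_t) + (2|⟨v_t, ∇f_t(x_t)⟩| / ‖v_t‖²) v_t with a randomly drawn v_t, in the spirit of Random Vector Accelerating; the exact sampling scheme for v_t is an assumption of this sketch.

```python
import numpy as np

def accelerated_step(x, grad_fn, lr=0.01, rng=np.random.default_rng(0)):
    """One update using the abstract's decomposition: the stochastic gradient
    plus the acceleration term (2 |<v, g>| / ||v||^2) v, with v drawn at
    random (an assumption; the paper's sampling scheme may differ)."""
    g = grad_fn(x)
    v = rng.standard_normal(x.shape)
    accel = 2.0 * abs(np.dot(v, g)) / np.dot(v, v) * v
    return x - lr * (g + accel)

# Toy usage on f(x) = ||x||^2 / 2, whose gradient is x.
x = np.ones(5)
for _ in range(100):
    x = accelerated_step(x, lambda z: z)
print(np.linalg.norm(x))  # the random term has zero mean, so the norm typically shrinks
```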
+
+
+
+
+ + ☆ Mapping the Multiverse of Latent Representations + + +
+ Echoing recent calls to counter reliability and robustness concerns in +machine learning via multiverse analysis, we present PRESTO, a principled +framework for mapping the multiverse of machine-learning models that rely on +latent representations. Although such models enjoy widespread adoption, the +variability in their embeddings remains poorly understood, resulting in +unnecessary complexity and untrustworthy representations. Our framework uses +persistent homology to characterize the latent spaces arising from different +combinations of diverse machine-learning methods, (hyper)parameter +configurations, and datasets, allowing us to measure their pairwise +(dis)similarity and statistically reason about their distributions. As we +demonstrate both theoretically and empirically, our pipeline preserves +desirable properties of collections of latent representations, and it can be +leveraged to perform sensitivity analysis, detect anomalous embeddings, or +efficiently and effectively navigate hyperparameter search spaces. + +
+
+
+
+
+ + ☆ Advancing Brain Tumor Inpainting with Generative Models + + +
+ Synthesizing healthy brain scans from diseased brain scans offers a potential +solution to address the limitations of general-purpose algorithms, such as +tissue segmentation and brain extraction algorithms, which may not effectively +handle diseased images. We consider this a 3D inpainting task and investigate +the adaptation of 2D inpainting methods to meet the requirements of 3D magnetic +resonance imaging (MRI) data. Our contributions encompass potential +modifications tailored to MRI-specific needs, and we conducted evaluations of +multiple inpainting techniques using the BraTS2023 Inpainting datasets to +assess their efficacy and limitations. +
+
+
+
+
+ + ☆ Why do Random Forests Work? Understanding Tree Ensembles as + Self-Regularizing Adaptive Smoothers + + +
+ Despite their remarkable effectiveness and broad application, the drivers of +success underlying ensembles of trees are still not fully understood. In this +paper, we highlight how interpreting tree ensembles as adaptive and +self-regularizing smoothers can provide new intuition and deeper insight into +this topic. We use this perspective to show that, when studied as smoothers, +randomized tree ensembles not only make predictions that are quantifiably more +smooth than the predictions of the individual trees they consist of, but also +further regulate their smoothness at test-time based on the dissimilarity +between testing and training inputs. First, we use this insight to revisit, +refine and reconcile two recent explanations of forest success by providing a +new way of quantifying the conjectured behaviors of tree ensembles objectively +by measuring the effective degree of smoothing they imply. Then, we move beyond +existing explanations for the mechanisms by which tree ensembles improve upon +individual trees and challenge the popular wisdom that the superior performance +of forests should be understood as a consequence of variance reduction alone. +We argue that the current high-level dichotomy into bias- and +variance-reduction prevalent in statistics is insufficient to understand tree +ensembles -- because the prevailing definition of bias does not capture +differences in the expressivity of the hypothesis classes formed by trees and +forests. Instead, we show that forests can improve upon trees by three distinct +mechanisms that are usually implicitly entangled. In particular, we demonstrate +that the smoothing effect of ensembling can reduce variance in predictions due +to noise in outcome generation, reduce variability in the quality of the +learned function given fixed input data and reduce potential bias in learnable +functions by enriching the available hypothesis space. +
+
+
+
+
+ + ☆ Sliced-Wasserstein Estimation with Spherical Harmonics as Control + Variates + + +
+ The Sliced-Wasserstein (SW) distance between probability measures is defined +as the average of the Wasserstein distances resulting from the associated +one-dimensional projections. As a consequence, the SW distance can be written +as an integral with respect to the uniform measure on the sphere and the Monte +Carlo framework can be employed for calculating the SW distance. Spherical +harmonics are polynomials on the sphere that form an orthonormal basis of the +set of square-integrable functions on the sphere. Putting these two facts +together, a new Monte Carlo method, hereby referred to as Spherical Harmonics +Control Variates (SHCV), is proposed for approximating the SW distance using +spherical harmonics as control variates. The resulting approach is shown to +have good theoretical properties, e.g., a no-error property for Gaussian +measures under a certain form of linear dependency between the variables. +Moreover, an improved rate of convergence, compared to Monte Carlo, is +established for general measures. The convergence analysis relies on the +Lipschitz property associated with the SW integrand. Several numerical +experiments demonstrate the superior performance of SHCV against +state-of-the-art methods for SW distance computation. +
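To illustrate the recipe, the sketch below estimates the squared SW-2 distance by Monte Carlo and variance-reduces it with a single mean-zero control variate built from a degree-2 spherical harmonic (θ₁² − 1/d has expectation zero under the uniform measure on the sphere). The actual SHCV method fits a whole basis of harmonics, so this is a simplified stand-in.

```python
import numpy as np

def sw2_control_variate(X, Y, n_proj=512, rng=np.random.default_rng(0)):
    """Monte Carlo estimate of squared Sliced-Wasserstein-2 between two
    equal-size empirical measures X, Y (n x d), with one control variate."""
    n, d = X.shape
    thetas = rng.standard_normal((n_proj, d))
    thetas /= np.linalg.norm(thetas, axis=1, keepdims=True)   # uniform on the sphere
    # 1D Wasserstein-2^2 between projected samples = mean squared sorted gap.
    f = np.array([np.mean((np.sort(X @ t) - np.sort(Y @ t)) ** 2) for t in thetas])
    phi = thetas[:, 0] ** 2 - 1.0 / d                          # mean-zero control variate
    C = np.cov(f, phi)
    c = C[0, 1] / C[1, 1]                                      # fitted coefficient
    return f.mean() - c * phi.mean()

X = np.random.default_rng(1).normal(size=(200, 3))
Y = np.random.default_rng(2).normal(loc=0.5, size=(200, 3))
print(sw2_control_variate(X, Y))
```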
+
+
+
+
+ + ☆ Connecting the Dots: Is Mode-Connectedness the Key to Feasible + Sample-Based Inference in Bayesian Neural Networks? + + +
+ A major challenge in sample-based inference (SBI) for Bayesian neural +networks is the size and structure of the networks' parameter space. Our work +shows that successful SBI is possible by embracing the characteristic +relationship between weight and function space, uncovering a systematic link +between overparameterization and the difficulty of the sampling problem. +Through extensive experiments, we establish practical guidelines for sampling +and convergence diagnosis. As a result, we present a Bayesian deep ensemble +approach as an effective solution with competitive performance and uncertainty +quantification. + +
+
+
+
+
+ + ☆ Multi-level protein pre-training with Vabs-Net + + +
+ In recent years, there has been a surge in the development of 3D +structure-based pre-trained protein models, representing a significant +advancement over pre-trained protein language models in various downstream +tasks. However, most existing structure-based pre-trained models primarily +focus on the residue level, i.e., alpha carbon atoms, while ignoring other +atoms like side chain atoms. We argue that modeling proteins at both residue +and atom levels is important since the side chain atoms can also be crucial for +numerous downstream tasks, for example, molecular docking. Nevertheless, we +find that naively combining residue and atom information during pre-training +typically fails. We identify a key reason: the information leakage caused by +the inclusion of atom structure in the input, which renders residue-level +pre-training tasks trivial and results in insufficiently expressive residue +representations. To address this issue, we introduce a span mask pre-training +strategy on 3D protein chains to learn meaningful representations of both +residues and atoms. This leads to a simple yet effective approach to learning +protein representation suitable for diverse downstream tasks. Extensive +experimental results on binding site prediction and function prediction tasks +demonstrate our proposed pre-training approach significantly outperforms other +methods. Our code will be made public. +
+
+
+
+
+ + ☆ Self-Attention through Kernel-Eigen Pair Sparse Variational Gaussian + Processes + + +
+ While the great capability of Transformers significantly boosts prediction +accuracy, it could also yield overconfident predictions and require calibrated +uncertainty estimation, which can be commonly tackled by Gaussian processes +(GPs). Existing works apply GPs with symmetric kernels under variational +inference to the attention kernel; however, they omit the fact that attention +kernels are in essence asymmetric. Moreover, the complexity of deriving the GP +posteriors remains high for large-scale data. In this work, we propose +Kernel-Eigen Pair Sparse Variational Gaussian Processes (KEP-SVGP) for building +uncertainty-aware self-attention where the asymmetry of attention kernels is +tackled by Kernel SVD (KSVD) and a reduced complexity is acquired. Through +KEP-SVGP, i) the SVGP pair induced by the two sets of singular vectors from +KSVD w.r.t. the attention kernel fully characterizes the asymmetry; ii) using +only a small set of adjoint eigenfunctions from KSVD, the derivation of SVGP +posteriors can be based on the inversion of a diagonal matrix containing +singular values, contributing to a reduction in time complexity; iii) an +evidence lower bound is derived so that variational parameters can be optimized +towards this objective. Experiments verify the excellent performance and +efficiency of our method on in-distribution, distribution-shift and +out-of-distribution benchmarks. +
+
+ comment: We propose Kernel-Eigen Pair Sparse Variational Gaussian Processes + (KEP-SVGP) for building uncertainty-aware self-attention where the asymmetry + of attention kernel is tackled by KSVD and a reduced time complexity is + acquired +
+
+
+
+
+ + ☆ Deep Conditional Generative Learning: Model and Error Analysis + + +
+ We introduce an Ordinary Differential Equation (ODE) based deep generative +method for learning a conditional distribution, named the Conditional Follmer +Flow. Starting from a standard Gaussian distribution, the proposed flow could +efficiently transform it into the target conditional distribution at time 1. +For effective implementation, we discretize the flow with Euler's method where +we estimate the velocity field nonparametrically using a deep neural network. +Furthermore, we derive a non-asymptotic convergence rate in the Wasserstein +distance between the distribution of the learned samples and the target +distribution, providing the first comprehensive end-to-end error analysis for +conditional distribution learning via ODE flow. Our numerical experiments +showcase its effectiveness across a range of scenarios, from standard +nonparametric conditional density estimation problems to more intricate +challenges involving image data, illustrating its superiority over various +existing conditional density estimation methods. + +
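To make the sampling procedure concrete, here is a minimal sketch of the Euler discretization the abstract describes: starting from standard Gaussian noise at t = 0, the learned velocity field is integrated to t = 1. The `(z, x, t)` interface of the velocity network and the conditioning covariate `x` are assumptions of this sketch.

```python
import torch

def euler_flow_sample(velocity_net, x, z_dim, n_steps=100):
    """Sample from a learned conditional distribution by Euler-discretizing
    the ODE flow: z_{k+1} = z_k + dt * v(z_k, x, t_k), from t = 0 to t = 1."""
    z = torch.randn(x.shape[0], z_dim)     # initial Gaussian sample
    dt = 1.0 / n_steps
    for k in range(n_steps):
        t = torch.full((z.shape[0], 1), k * dt)
        z = z + dt * velocity_net(z, x, t)  # one Euler step of the flow
    return z
```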
+
+
+
+
+ + ☆ Integrating Large Language Models in Causal Discovery: A Statistical + Causal Approach + + +
+ In practical statistical causal discovery (SCD), embedding domain expert +knowledge as constraints into the algorithm is widely accepted as significant +for creating consistent meaningful causal models, despite the recognized +challenges in systematic acquisition of the background knowledge. To overcome +these challenges, this paper proposes a novel methodology for causal inference, +in which SCD methods and knowledge based causal inference (KBCI) with a large +language model (LLM) are synthesized through "statistical causal prompting +(SCP)" for LLMs and prior knowledge augmentation for SCD. Experiments have +revealed that GPT-4 can bring both the LLM-KBCI output and the SCD result +augmented with prior knowledge from LLM-KBCI closer to the ground truth, and +that the SCD result can be further improved if GPT-4 undergoes SCP. +Furthermore, it has become clear that an LLM can improve SCD with its +background knowledge, even if the LLM does not contain information on the +dataset. The proposed approach can thus address challenges such as dataset +biases and limitations, illustrating the potential of LLMs to improve +data-driven causal inference across diverse scientific domains. +
+
+
+
+
+ + ☆ Improving importance estimation in covariate shift for providing + accurate prediction error + + +
+ In traditional Machine Learning, an algorithm's predictions are based on the +assumption that the data follows the same distribution in both the training and +the test datasets. However, in real-world data this condition does not hold +and, for instance, the distribution of the covariates changes whereas the +conditional distribution of the targets remains unchanged. This situation is +called the covariate shift problem, under which standard error estimation may +no longer be accurate. In this context, the importance is a measure commonly +used to alleviate the influence of covariate shift on error estimations. Its +main drawback is that it is not easy to compute. The Kullback-Leibler +Importance Estimation Procedure (KLIEP) is capable of estimating the importance +in a promising way. Despite its good performance, however, it ignores target +information, since it uses only covariate information to compute the +importance. This paper therefore explores the potential performance improvement +obtained when target information is considered in the computation of the +importance, and redefines the importance so that it generalizes in this way. +Besides the potential improvement in performance, including target information +makes the approach applicable to the real-world plankton classification problem +that motivates this research and that is characterized by high dimensionality, +since considering targets rather than covariates reduces both the computation +and the noise in the covariates. The impact of taking target information into +account is also explored when Logistic Regression (LR), Kernel Mean Matching +(KMM), Ensemble Kernel Mean Matching (EKMM) and the naive predecessor of KLIEP, +Kernel Density Estimation (KDE), estimate the importance. The experimental +results show more accurate error estimation when using target information, +especially for the most promising method, KLIEP. +
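Since the abstract lists Logistic Regression (LR) among the importance estimators, the sketch below shows the standard discriminative recipe for estimating importance weights and plugging them into an error estimate. It is a generic, covariate-only illustration, not the paper's target-aware redefinition.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def importance_weights(X_train, X_test):
    """Estimate w(x) = p_test(x) / p_train(x) with the discriminative trick:
    fit a probabilistic classifier separating test (label 1) from train
    (label 0), then w(x) = P(1|x)/P(0|x) * n_train/n_test."""
    X = np.vstack([X_train, X_test])
    y = np.r_[np.zeros(len(X_train)), np.ones(len(X_test))]
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    p = clf.predict_proba(X_train)[:, 1]
    return p / (1 - p) * len(X_train) / len(X_test)

def weighted_error(losses, weights):
    # Importance-weighted estimate of the test-domain error from training losses.
    return np.average(losses, weights=weights)
```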
+
+
+
+
+ + ☆ Mission Critical -- Satellite Data is a Distinct Modality in Machine + Learning + + +
+ Satellite data has the potential to inspire a seismic shift for machine +learning -- one in which we rethink existing practices designed for traditional +data modalities. As machine learning for satellite data (SatML) gains traction +for its real-world impact, our field is at a crossroads. We can either continue +applying ill-suited approaches, or we can initiate a new research agenda that +centers around the unique characteristics and challenges of satellite data. +This position paper argues that satellite data constitutes a distinct modality +for machine learning research and that we must recognize it as such to advance +the quality and impact of SatML research across theory, methods, and +deployment. We outline critical discussion questions and actionable suggestions +to transform SatML from merely an intriguing application area to a dedicated +research discipline that helps move the needle on big challenges for machine +learning and society. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Learning the Market: Sentiment-Based Ensemble Trading Agents + + +
+ We propose the integration of sentiment analysis and deep-reinforcement +learning ensemble algorithms for stock trading, and design a strategy capable +of dynamically altering its employed agent given concurrent market sentiment. +In particular, we create a simple-yet-effective method for extracting news +sentiment and combine this with general improvements upon existing works, +resulting in automated trading agents that effectively consider both +qualitative market factors and quantitative stock data. We show that our +approach results in a strategy that is profitable, robust, and risk-minimal -- +outperforming the traditional ensemble strategy as well as single agent +algorithms and market metrics. Our findings determine that the conventional +practice of switching ensemble agents every fixed number of months is +sub-optimal, and that a dynamic sentiment-based framework greatly unlocks +additional performance within these agents. Furthermore, as we have designed +our algorithm with simplicity and efficiency in mind, we hypothesize that the +transition of our method from historical evaluation towards real-time trading +with live data should be relatively simple. +
+
+
+
+
+ + ☆ Few-Shot Learning on Graphs: from Meta-learning to Pre-training and + Prompting + + +
+ Graph representation learning, a critical step in graph-centric tasks, has +seen significant advancements. Earlier techniques often operate in an +end-to-end setting, where performance heavily relies on the availability of +ample labeled data. This constraint has spurred the emergence of few-shot +learning on graphs, where only a few task-specific labels are available for +each task. Given the extensive literature in this field, this survey endeavors +to synthesize recent developments, provide comparative insights, and identify +future directions. We systematically categorize existing studies into three +major families: meta-learning approaches, pre-training approaches, and hybrid +approaches, with a finer-grained classification in each family to aid readers +in their method selection process. Within each category, we analyze the +relationships among these methods and compare their strengths and limitations. +Finally, we outline prospective future directions for few-shot learning on +graphs to catalyze continued innovation in this field. + +
+
+
+
+
+ + ☆ From Words to Molecules: A Survey of Large Language Models in Chemistry IJCAI 2024 + + +
+ In recent years, Large Language Models (LLMs) have achieved significant +success in natural language processing (NLP) and various interdisciplinary +areas. However, applying LLMs to chemistry is a complex task that requires +specialized domain knowledge. This paper provides a thorough exploration of the +nuanced methodologies employed in integrating LLMs into the field of chemistry, +delving into the complexities and innovations at this interdisciplinary +juncture. Specifically, our analysis begins with examining how molecular +information is fed into LLMs through various representation and tokenization +methods. We then categorize chemical LLMs into three distinct groups based on +the domain and modality of their input data, and discuss approaches for +integrating these inputs for LLMs. Furthermore, this paper delves into the +pretraining objectives with adaptations to chemical LLMs. After that, we +explore the diverse applications of LLMs in chemistry, including novel +paradigms for their application in chemistry tasks. Finally, we identify +promising research directions, including further integration with chemical +knowledge, advancements in continual learning, and improvements in model +interpretability, paving the way for groundbreaking developments in the field. + +
+
+ comment: Submitted to IJCAI 2024 survey track +
+
+
+
+
+ + ☆ Conditioning non-linear and infinite-dimensional diffusion processes + + +
+ Generative diffusion models and many stochastic models in science and +engineering naturally live in infinite dimensions before discretisation. To +incorporate observed data for statistical and learning tasks, one needs to +condition on observations. While recent work has treated conditioning linear +processes in infinite dimensions, conditioning non-linear processes in infinite +dimensions has not been explored. This paper conditions function-valued +stochastic processes without prior discretisation. To do so, we use an +infinite-dimensional version of Girsanov's theorem to condition a +function-valued stochastic process, leading to a stochastic differential +equation (SDE) for the conditioned process involving the score. We apply this +technique to do time series analysis for shapes of organisms in evolutionary +biology, where we discretise via the Fourier basis and then learn the +coefficients of the score function with score matching methods. +
+
+
+
+
+ + ☆ Approximate Control for Continuous-Time POMDPs AISTATS 2024 + + +
+ This work proposes a decision-making framework for partially observable +systems in continuous time with discrete state and action spaces. As optimal +decision-making becomes intractable for large state spaces we employ +approximation methods for the filtering and the control problem that scale well +with an increasing number of states. Specifically, we approximate the +high-dimensional filtering distribution by projecting it onto a parametric +family of distributions, and integrate it into a control heuristic based on the +fully observable system to obtain a scalable policy. We demonstrate the +effectiveness of our approach on several partially observed systems, including +queueing systems and chemical reaction networks. + +
+
+ comment: To be published in AISTATS 2024 +
+
+
+
+
+ + ☆ A Data-Driven Analysis of Robust Automatic Piano Transcription + + +
+ Algorithms for automatic piano transcription have improved dramatically in +recent years due to new datasets and modeling techniques. Recent developments +have focused primarily on adapting new neural network architectures, such as +the Transformer and Perceiver, in order to yield more accurate systems. In this +work, we study transcription systems from the perspective of their training +data. By measuring their performance on out-of-distribution annotated piano +data, we show how these models can severely overfit to acoustic properties of +the training data. We create a new set of audio for the MAESTRO dataset, +captured automatically in a professional studio recording environment via +Yamaha Disklavier playback. Using various data augmentation techniques when +training with the original and re-performed versions of the MAESTRO dataset, we +achieve state-of-the-art note-onset accuracy of 88.4 F1-score on the MAPS +dataset, without seeing any of its training data. We subsequently analyze these +data augmentation techniques in a series of ablation studies to better +understand their influence on the resulting models. + +
+
+ comment: Accepted for publication in IEEE Signal Processing Letters on 31 + January 2024
+
+
+
+
+ + ☆ Sequence Shortening for Context-Aware Machine Translation ACL + + +
+ Context-aware Machine Translation aims to improve translations of sentences +by incorporating surrounding sentences as context. Towards this task, two main +architectures have been applied, namely single-encoder (based on concatenation) +and multi-encoder models. In this study, we show that a special case of +multi-encoder architecture, where the latent representation of the source +sentence is cached and reused as the context in the next step, achieves higher +accuracy on the contrastive datasets (where the models have to rank the correct +translation among the provided sentences) and comparable BLEU and COMET scores +as the single- and multi-encoder approaches. Furthermore, we investigate the +application of Sequence Shortening to the cached representations. We test three +pooling-based shortening techniques and introduce two novel methods - Latent +Grouping and Latent Selecting, where the network learns to group tokens or +selects the tokens to be cached as context. Our experiments show that the two +methods achieve competitive BLEU and COMET scores and accuracies on the +contrastive datasets to the other tested methods while potentially allowing for +higher interpretability and reducing the growth of memory requirements with +increased context size. + +
+
+ comment: Findings of the ACL: EACL 2024 +
+
+
+
+
+ + ☆ SMLP: Symbolic Machine Learning Prover + + +
+ Symbolic Machine Learning Prover (SMLP) is a tool and a library for system +exploration based on data samples obtained by simulating or executing the +system on a number of input vectors. SMLP aims at exploring the system based on +this data by taking a grey-box approach: SMLP combines statistical methods of +data exploration with building and exploring machine learning models in a close +feedback loop with the system's response, and exploring these models by +combining probabilistic and formal methods. SMLP has been applied in an +industrial setting at Intel for analyzing and optimizing hardware designs at +the analog level. SMLP is a general-purpose tool and can be applied to systems +that can be sampled and modeled by machine learning models. +
+
+ comment: 12 pages, 4 figures. (submitted) +
+
+
+
+
+ + ☆ Objective and subjective evaluation of speech enhancement methods in the + UDASE task of the 7th CHiME challenge + + +
+ Supervised models for speech enhancement are trained using artificially +generated mixtures of clean speech and noise signals. However, the synthetic +training conditions may not accurately reflect real-world conditions +encountered during testing. This discrepancy can result in poor performance +when the test domain significantly differs from the synthetic training domain. +To tackle this issue, the UDASE task of the 7th CHiME challenge aimed to +leverage real-world noisy speech recordings from the test domain for +unsupervised domain adaptation of speech enhancement models. Specifically, this +test domain corresponds to the CHiME-5 dataset, characterized by real +multi-speaker and conversational speech recordings made in noisy and +reverberant domestic environments, for which ground-truth clean speech signals +are not available. In this paper, we present the objective and subjective +evaluations of the systems that were submitted to the CHiME-7 UDASE task, and +we provide an analysis of the results. This analysis reveals a limited +correlation between subjective ratings and several supervised nonintrusive +performance metrics recently proposed for speech enhancement. Conversely, the +results suggest that more traditional intrusive objective metrics can be used +for in-domain performance evaluation using the reverberant LibriCHiME-5 dataset +developed for the challenge. The subjective evaluation indicates that all +systems successfully reduced the background noise, but always at the expense of +increased distortion. Out of the four speech enhancement methods evaluated +subjectively, only one demonstrated an improvement in overall quality compared +to the unprocessed noisy speech, highlighting the difficulty of the task. The +tools and audio material created for the CHiME-7 UDASE task are shared with the +community. + +
+
+
+
+
+ + ☆ Bass Accompaniment Generation via Latent Diffusion ICASSP 2024 + + +
+ The ability to automatically generate music that appropriately matches an +arbitrary input track is a challenging task. We present a novel controllable +system for generating single stems to accompany musical mixes of arbitrary +length. At the core of our method are audio autoencoders that efficiently +compress audio waveform samples into invertible latent representations, and a +conditional latent diffusion model that takes as input the latent encoding of a +mix and generates the latent encoding of a corresponding stem. To provide +control over the timbre of generated samples, we introduce a technique to +ground the latent space to a user-provided reference style during diffusion +sampling. For further improving audio quality, we adapt classifier-free +guidance to avoid distortions at high guidance strengths when generating an +unbounded latent space. We train our model on a dataset of pairs of mixes and +matching bass stems. Quantitative experiments demonstrate that, given an input +mix, the proposed system can generate basslines with user-specified timbres. +Our controllable conditional audio generation framework represents a +significant step forward in creating generative AI tools to assist musicians in +music production. + +
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ☆ XAI for Skin Cancer Detection with Prototypes and Non-Expert Supervision MICCAI 2023 + + +
+ Skin cancer detection through dermoscopy image analysis is a critical task. +However, existing models used for this purpose often lack interpretability and +reliability, raising the concern of physicians due to their black-box nature. +In this paper, we propose a novel approach for the diagnosis of melanoma using +an interpretable prototypical-part model. We introduce a guided supervision +based on non-expert feedback through the incorporation of: 1) binary masks, +obtained automatically using a segmentation network; and 2) user-refined +prototypes. These two distinct information pathways aim to ensure that the +learned prototypes correspond to relevant areas within the skin lesion, +excluding confounding factors beyond its boundaries. Experimental results +demonstrate that, even without expert supervision, our approach achieves +superior performance and generalization compared to non-interpretable models. + +
+
+ comment: Accepted in the iMIMIC Workshop @ MICCAI 2023 +
+
+
+
+
+ + ☆ Climbing the Ladder of Interpretability with Counterfactual Concept + Bottleneck Models + + +
+ Current deep learning models are not designed to simultaneously address three +fundamental questions: predict class labels to solve a given classification +task (the "What?"), explain task predictions (the "Why?"), and imagine +alternative scenarios that could result in different predictions (the "What +if?"). The inability to answer these questions represents a crucial gap in +deploying reliable AI agents, calibrating human trust, and deepening +human-machine interaction. To bridge this gap, we introduce CounterFactual +Concept Bottleneck Models (CF-CBMs), a class of models designed to efficiently +address the above queries all at once without the need to run post-hoc +searches. Our results show that CF-CBMs produce: accurate predictions (the +"What?"), simple explanations for task predictions (the "Why?"), and +interpretable counterfactuals (the "What if?"). CF-CBMs can also sample or +estimate the most probable counterfactual to: (i) explain the effect of concept +interventions on tasks, (ii) show users how to get a desired class label, and +(iii) propose concept interventions via "task-driven" interventions. + +
+
+
+
+
+ + ☆ Zero-Shot Machine Unlearning at Scale via Lipschitz Regularization + + +
+ To comply with AI and data regulations, the need to forget private or +copyrighted information from trained machine learning models is increasingly +important. The key challenge in unlearning is forgetting the necessary data in +a timely manner, while preserving model performance. In this work, we address +the zero-shot unlearning scenario, whereby an unlearning algorithm must be able +to remove data given only a trained model and the data to be forgotten. Under +such a definition, existing state-of-the-art methods are insufficient. Building +on the concepts of Lipschitz continuity, we present a method that induces +smoothing of the forget sample's output, with respect to perturbations of that +sample. We show this smoothing successfully results in forgetting while +preserving general model performance. We perform extensive empirical evaluation +of our method over a range of contemporary benchmarks, verifying that our +method achieves state-of-the-art performance under the strict constraints of +zero-shot unlearning. + +
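A minimal sketch of the smoothing idea described above: nudge the model so that its output at a forget sample is insensitive to perturbations of that sample. The specific loss used here (mean squared output discrepancy over Gaussian perturbations) is an assumption; the paper's exact objective may differ.

```python
import torch

def lipschitz_forget_step(model, x_forget, opt, n_pert=8, sigma=0.1):
    """One unlearning step in the spirit of the abstract: penalize how much
    the output at a forget sample changes under small input perturbations."""
    model.train()
    noise = sigma * torch.randn((n_pert,) + x_forget.shape)
    out_ref = model(x_forget.unsqueeze(0)).detach()      # reference output
    out_pert = model(x_forget.unsqueeze(0) + noise)      # outputs at perturbed copies
    loss = ((out_pert - out_ref) ** 2).mean()            # smoothing objective (assumption)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()
```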
+
+
+
+
+ + ☆ Query-Efficient Correlation Clustering with Noisy Oracle + + +
+ We study a general clustering setting in which we have $n$ elements to be +clustered, and we aim to perform as few queries as possible to an oracle that +returns a noisy sample of the similarity between two elements. Our setting +encompasses many application domains in which the similarity function is costly +to compute and inherently noisy. We propose two novel formulations of online +learning problems rooted in the paradigm of Pure Exploration in Combinatorial +Multi-Armed Bandits (PE-CMAB): fixed confidence and fixed budget settings. For +both settings, we design algorithms that combine a sampling strategy with a +classic approximation algorithm for correlation clustering and study their +theoretical guarantees. Our results are the first examples of polynomial-time +algorithms that work for the case of PE-CMAB in which the underlying offline +optimization problem is NP-hard. + +
+
+
+
+
+ + ☆ A Probabilistic Model to explain Self-Supervised Representation Learning + + +
+ Self-supervised learning (SSL) learns representations by leveraging an +auxiliary unsupervised task, such as classifying semantically related samples, +e.g. different data augmentations or modalities. Of the many approaches to SSL, +contrastive methods, e.g. SimCLR, CLIP and VicREG, have gained attention for +learning representations that achieve downstream performance close to that of +supervised learning. However, a theoretical understanding of the mechanism +behind these methods remains elusive. We propose a generative latent variable +model for the data and show that several families of discriminative +self-supervised algorithms, including contrastive methods, approximately induce +its latent structure over representations, providing a unifying theoretical +framework. We also justify links to mutual information and the use of a +projection head. Fitting our model generatively, as SimVE, improves performance +over previous VAE methods on common benchmarks (e.g. FashionMNIST, CIFAR10, +CelebA), narrows the gap to discriminative methods on _content_ classification +and, as our analysis predicts, outperforms them where _style_ information is +required, taking a step toward task-agnostic representations. +
+
+
+
+
+ + ☆ ALERT-Transformer: Bridging Asynchronous and Synchronous Machine + Learning for Real-Time Event-based Spatio-Temporal Data + + +
+ We seek to enable classic processing of continuous ultra-sparse +spatiotemporal data generated by event-based sensors with dense machine +learning models. We propose a novel hybrid pipeline composed of asynchronous +sensing and synchronous processing that combines several ideas: (1) an +embedding based on PointNet models -- the ALERT module -- that can continuously +integrate new and dismiss old events thanks to a leakage mechanism, (2) a +flexible readout of the embedded data that allows any downstream model to be +fed with always up-to-date features at any sampling rate, (3) exploiting the +input sparsity in a patch-based approach inspired by Vision Transformer to +optimize the efficiency of the method. These embeddings are then processed by a +transformer model trained for object and gesture recognition. Using this +approach, we achieve state-of-the-art performance with lower latency than +competitors. We also demonstrate that our asynchronous model can operate at any +desired sampling rate. +
+
+ comment: Preprint version. 8 pages, 7 figures, under review +
+
+
+
+
+ + ☆ Emergence of heavy tails in homogenized stochastic gradient descent + + +
+ It has repeatedly been observed that loss minimization by stochastic gradient +descent (SGD) leads to heavy-tailed distributions of neural network parameters. +Here, we analyze a continuous diffusion approximation of SGD, called +homogenized stochastic gradient descent, show that it is asymptotically +heavy-tailed, and give explicit upper and lower bounds on its tail-index. We +validate these bounds in numerical experiments and show that they are typically +close approximations to the empirical tail-index of SGD iterates. In addition, +their explicit form enables us to quantify the interplay between optimization +parameters and the tail-index. Doing so, we contribute to the ongoing +discussion on links between heavy tails and the generalization performance of +neural networks as well as the ability of SGD to avoid suboptimal local minima. +
+
+
+
+
+ + ☆ Regularized boosting with an increasing coefficient magnitude stop + criterion as meta-learner in hyperparameter optimization stacking ensemble + + +
+ In Hyperparameter Optimization (HPO), only the hyperparameter configuration +with the best performance is chosen after performing several trials, +discarding the effort of training a model for every hyperparameter +configuration trial and of building an ensemble of all of them. Such an +ensemble may consist of simply averaging the model predictions or weighting the +models by a certain probability. Recently, other more sophisticated ensemble +strategies, such as the Caruana method or the stacking strategy, have been +proposed. On the one hand, the Caruana method performs well in HPO ensembles, +since it is not affected by the effects of multicollinearity, which is +prevalent in HPO. It just computes the average over a subset of predictions +with replacement. But it does not benefit from the generalization power of a +learning process. On the other hand, stacking methods include a learning +procedure, since a meta-learner is required to perform the ensemble. Yet, one +hardly finds advice about which meta-learner is adequate. Besides, some +meta-learners may suffer from the effects of multicollinearity or need to be +tuned to reduce them. This paper explores meta-learners for stacking ensembles +in HPO that are free of hyperparameter tuning, able to reduce the effects of +multicollinearity and able to exploit the generalization power of the ensemble +learning process. In this respect, the boosting strategy seems promising as a +stacking meta-learner; in fact, it completely removes the effects of +multicollinearity. This paper also proposes an implicit regularization in the +classical boosting method and a novel non-parametric stop criterion suitable +only for boosting and specifically designed for HPO. The synergy between these +two improvements over boosting exhibits competitive and promising predictive +performance compared to other existing meta-learners and to ensemble approaches +for HPO other than stacking. +
+
+
+
+
+ + ☆ LoTR: Low Tensor Rank Weight Adaptation + + +
+ In this paper we generalize and extend the idea of low-rank adaptation (LoRA) +of large language models (LLMs) based on the Transformer architecture. Widely +used LoRA-like methods of fine-tuning LLMs are based on matrix factorization of +the gradient update. We introduce LoTR, a novel approach for +parameter-efficient fine-tuning of LLMs which represents a gradient update to +parameters in the form of a tensor decomposition. The low-rank adapter for each +layer is constructed as a product of three matrices, and the tensor structure +arises from sharing the left and right multipliers of this product among +layers. Simultaneous compression of a sequence of layers with a low-rank tensor +representation allows LoTR to achieve even better parameter efficiency than +LoRA, especially for deep models. Moreover, the core tensor does not depend on +the original weight dimension and can be made arbitrarily small, which allows +for extremely cheap and fast downstream fine-tuning. +
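A sketch of the weight structure described above, assuming PyTorch-style linear layers: each layer's update is U @ C_l @ V, with U and V shared across all layers and only the small r x r core C_l layer-specific. Initialization and scaling choices here are assumptions, not the paper's prescription.

```python
import torch
import torch.nn as nn

class LoTRLinear(nn.Module):
    """Linear layer with a shared-factor, tensor-structured low-rank adapter."""
    def __init__(self, base: nn.Linear, U: nn.Parameter, V: nn.Parameter, rank: int):
        super().__init__()
        self.base = base
        for p in self.base.parameters():          # freeze pretrained weights
            p.requires_grad_(False)
        self.U, self.V = U, V                     # shared (d_out x r) and (r x d_in)
        self.core = nn.Parameter(torch.zeros(rank, rank))  # per-layer core, zero init

    def forward(self, x):
        delta = self.U @ self.core @ self.V       # low tensor-rank weight update
        return self.base(x) + x @ delta.T

d, r, n_layers = 64, 4, 6
U = nn.Parameter(torch.randn(d, r) / d ** 0.5)   # one shared left factor
V = nn.Parameter(torch.randn(r, d) / d ** 0.5)   # one shared right factor
layers = [LoTRLinear(nn.Linear(d, d), U, V, r) for _ in range(n_layers)]
```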
+
+ comment: Submitted +
+
+
+
+
+ + ☆ Critic-Actor for Average Reward MDPs with Function Approximation: A + Finite-Time Analysis + + +
+ In recent years, there has been considerable research activity focused on +carrying out asymptotic and non-asymptotic convergence analyses for +two-timescale actor-critic algorithms, where the actor updates are performed on +a timescale that is slower than that of the critic. In a recent work, the +critic-actor algorithm was presented for the infinite horizon discounted cost +setting in the look-up table case, where the timescales of the actor and the +critic are reversed, and an asymptotic convergence analysis was presented. In +our work, we present the first critic-actor algorithm with function +approximation in the long-run average reward setting, together with the first +finite-time (non-asymptotic) analysis of such a scheme. We obtain optimal +learning rates and prove that our algorithm achieves a sample complexity of +$\mathcal{\tilde{O}}(\epsilon^{-2.08})$ for the mean squared error of the +critic to be upper bounded by $\epsilon$, which is better than the one obtained +for actor-critic in a similar setting. We also show the results of numerical +experiments on three benchmark settings and observe that the critic-actor +algorithm competes well with the actor-critic algorithm. +
+
+
+
+
+ + ☆ Cheating Suffix: Targeted Attack to Text-To-Image Diffusion Models with + Multi-Modal Priors + + +
+ Diffusion models have been widely deployed in various image generation tasks, +demonstrating an extraordinary connection between image and text modalities. +However, they face the challenge of being maliciously exploited to generate +harmful or sensitive images by appending a specific suffix to the original +prompt. Existing works mainly focus on using single-modal information to +conduct attacks, which fails to utilize multi-modal features and results in +less than satisfactory performance. Integrating multi-modal priors (MMP), i.e. +both text and image features, we propose a targeted attack method named +MMP-Attack in this work. Specifically, the goal of MMP-Attack is to add a +target object into the image content while simultaneously removing the original +object. The MMP-Attack shows a notable advantage over existing works with +superior universality and transferability, which can effectively attack +commercial text-to-image (T2I) models such as DALL-E 3. To the best of our +knowledge, this marks the first successful attempt at a transfer-based attack +on commercial T2I models. Our code is publicly available at +\url{https://github.com/ydc123/MMP-Attack}. +
+
+ comment: 10 figures +
+
+
+
+
+ + ☆ Continual Learning for Large Language Models: A Survey + + +
+ Large language models (LLMs) are not amenable to frequent re-training, due to +high training costs arising from their massive scale. However, updates are +necessary to endow LLMs with new skills and keep them up-to-date with rapidly +evolving human knowledge. This paper surveys recent works on continual learning +for LLMs. Due to the unique nature of LLMs, we catalog continual learning +techniques in a novel multi-staged categorization scheme, involving continual +pretraining, instruction tuning, and alignment. We contrast continual learning +for LLMs with simpler adaptation methods used in smaller models, as well as +with other enhancement strategies like retrieval-augmented generation and model +editing. Moreover, informed by a discussion of benchmarks and evaluation, we +identify several challenges and future work directions for this crucial task. +
+
+
+
+
+ + ☆ To the Max: Reinventing Reward in Reinforcement Learning + + +
+ In reinforcement learning (RL), different rewards can define the same optimal +policy but result in drastically different learning performance. For some, the +agent gets stuck with a suboptimal behavior, and for others, it solves the task +efficiently. Choosing a good reward function is hence an extremely important +yet challenging problem. In this paper, we explore an alternative approach to +using rewards for learning. We introduce max-reward RL, where an agent +optimizes the maximum rather than the cumulative reward. Unlike earlier works, +our approach works for deterministic and stochastic environments and can be +easily combined with state-of-the-art RL algorithms. In the experiments, we +study the performance of max-reward RL algorithms in two goal-reaching +environments from Gymnasium-Robotics and demonstrate its benefits over standard +RL. The code is publicly available. + +
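The objective change is easy to state in code: replace the (discounted) sum of rewards with the maximum. Whether and how the discount enters the max-reward return is an assumption of this sketch.

```python
def cumulative_return(rewards, gamma=0.99):
    # Standard RL objective: discounted sum of rewards along a trajectory.
    return sum(gamma ** t * r for t, r in enumerate(rewards))

def max_return(rewards, gamma=0.99):
    # Max-reward objective per the abstract: the maximum reward along the
    # trajectory (applying the discount here is an assumption).
    return max(gamma ** t * r for t, r in enumerate(rewards))

traj = [0.0, 0.2, 1.0, 0.1]
print(cumulative_return(traj), max_return(traj))
```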
+
+
+
+
+ + ☆ TESSERACT: Eliminating Experimental Bias in Malware Classification + across Space and Time (Extended Version) + + +
+ Machine learning (ML) plays a pivotal role in detecting malicious software. +Despite the high F1-scores reported in numerous studies reaching upwards of +0.99, the issue is not completely solved. Malware detectors often experience +performance decay due to constantly evolving operating systems and attack +methods, which can render previously learned knowledge insufficient for +accurate decision-making on new inputs. This paper argues that commonly +reported results are inflated due to two pervasive sources of experimental bias +in the detection task: spatial bias caused by data distributions that are not +representative of a real-world deployment; and temporal bias caused by +incorrect time splits of data, leading to unrealistic configurations. To +address these biases, we introduce a set of constraints for fair experiment +design, and propose a new metric, AUT, for classifier robustness in real-world +settings. We additionally propose an algorithm designed to tune training data +to enhance classifier performance. Finally, we present TESSERACT, an +open-source framework for realistic classifier comparison. Our evaluation +encompasses both traditional ML and deep learning methods, examining published +works on an extensive Android dataset with 259,230 samples over a five-year +span. Additionally, we conduct case studies in the Windows PE and PDF domains. +Our findings identify the existence of biases in previous studies and reveal +that significant performance enhancements are possible through appropriate, +periodic tuning. We explore how mitigation strategies that delay performance +decay can help achieve more stable and better performance over time. +
+
+ comment: 35 pages, submitted to ACM ToPS, under review. arXiv admin note: + text overlap with arXiv:1807.07838
+
+
+
+
+ + ☆ FedMoE: Data-Level Personalization with Mixture of Experts for + Model-Heterogeneous Personalized Federated Learning + + +
+ Federated learning (FL) is widely employed for collaborative training on +decentralized data but faces challenges like data, system, and model +heterogeneity. This has prompted the emergence of model-heterogeneous +personalized federated learning (MHPFL). However, concerns persist regarding +data and model privacy, model performance, communication, and computational +costs in current MHPFL methods. To tackle these concerns, we propose a novel +model-heterogeneous personalized Federated learning algorithm (FedMoE) with the +Mixture of Experts (MoE), renowned for enhancing large language models (LLMs). +It assigns a shared homogeneous small feature extractor and a local gating +network for each client's local heterogeneous large model. (1) During local +training, the local heterogeneous model's feature extractor acts as a local +expert for personalized feature (representation) extraction, while the shared +homogeneous small feature extractor serves as a global expert for generalized +feature extraction. The local gating network produces personalized weights for +extracted representations from both experts on each data sample. The three +models form a local heterogeneous MoE. The weighted mixed representation fuses +global generalized and local personalized features and is processed by the +local heterogeneous large model's header with personalized prediction +information for output. The MoE and prediction header are updated +synchronously. (2) The trained local homogeneous small feature extractors are +sent to the server for cross-client information fusion via aggregation. +Briefly, FedMoE first enhances local model personalization at a fine-grained +data level while supporting model heterogeneity. +
+
+
+
+
+ + ☆ CORE: Mitigating Catastrophic Forgetting in Continual Learning through + Cognitive Replay + + +
+ This paper introduces a novel perspective to significantly mitigate +catastrophic forgetting in continual learning (CL), which emphasizes models' +capacity to preserve existing knowledge and assimilate new information. Current +replay-based methods treat every task and data sample equally and thus cannot +fully exploit the potential of the replay buffer. In response, we propose +COgnitive REplay (CORE), which draws inspiration from human cognitive review +processes. CORE includes two key strategies: Adaptive Quantity Allocation and +Quality-Focused Data Selection. The former adaptively modulates the replay +buffer allocation for each task based on its forgetting rate, while the latter +guarantees the inclusion of representative data that best encapsulates the +characteristics of each task within the buffer. Our approach achieves an +average accuracy of 37.95% on split-CIFAR10, surpassing the best baseline +method by 6.52%. Additionally, it significantly enhances the accuracy of the +poorest-performing task by 6.30% compared to the top baseline. +
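A toy sketch of the Adaptive Quantity Allocation idea: distribute a fixed replay budget across past tasks in proportion to their forgetting rates. The proportional rule is an assumption; CORE's exact allocation may differ.

```python
def allocate_replay_buffer(forgetting_rates, budget):
    """Give each past task replay slots in proportion to its forgetting rate
    (rounding may leave a slot or two unassigned)."""
    total = sum(forgetting_rates)
    return [round(budget * fr / total) for fr in forgetting_rates]

print(allocate_replay_buffer([0.30, 0.10, 0.60], budget=200))  # -> [60, 20, 120]
```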
+
+
+
+
+ + ☆ Skip \n: A simple method to reduce hallucination in + Large Vision-Language Models + + +
+ Recent advancements in large vision-language models (LVLMs) have demonstrated +impressive capability in visual information understanding with human language. +Despite these advances, LVLMs still face challenges with multimodal +hallucination, such as generating text descriptions of objects that are not +present in the visual information. However, the underlying fundamental causes +of multimodal hallucinations remain poorly explored. In this paper, we propose +a new perspective, suggesting that the inherent biases in LVLMs might be a key +factor in hallucinations. Specifically, we systematically identify a semantic +shift bias related to paragraph breaks ('\n\n'), where the content before and +after '\n\n' in the training data frequently exhibits significant semantic +changes. This pattern leads the model to infer that the contents following +'\n\n' should be clearly different from the preceding, less hallucinatory, +contents, thereby increasing the probability of hallucinatory descriptions +subsequent to the '\n\n'. We have validated this hypothesis on multiple +publicly available LVLMs. Besides, we find that deliberately inserting '\n\n' +into the generated description can induce more hallucinations. A simple method +is proposed to effectively mitigate the hallucination of LVLMs by skipping the +output of '\n'. +
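One straightforward way to realize the proposed "skip '\n'" mitigation is to mask the newline token's logit at decode time; the sketch below assumes generic tokenizer ids and is not necessarily the paper's exact procedure.

```python
import torch

def ban_token_logits(logits: torch.Tensor, banned_ids: list[int]) -> torch.Tensor:
    """Set banned token logits to -inf before sampling, so that tokens such as
    '\n' (or the tokenizer's '\n\n' piece) can never be emitted."""
    logits = logits.clone()
    logits[..., banned_ids] = float("-inf")
    return logits

# Usage inside a sampling loop (the token ids are tokenizer-specific assumptions):
# logits = model(input_ids).logits[:, -1]
# logits = ban_token_logits(logits, banned_ids=[newline_id, double_newline_id])
# next_id = torch.softmax(logits, dim=-1).multinomial(1)
```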
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Monotone, Bi-Lipschitz, and Polyak-Łojasiewicz Networks + + +
+ This paper presents a new \emph{bi-Lipschitz} invertible neural network, the +BiLipNet, which has the ability to control both its \emph{Lipschitzness} +(output sensitivity to input perturbations) and \emph{inverse Lipschitzness} +(input distinguishability from different outputs). The main contribution is a +novel invertible residual layer with certified strong monotonicity and +Lipschitzness, which we compose with orthogonal layers to build bi-Lipschitz +networks. The certification is based on incremental quadratic constraints, +which achieves much tighter bounds compared to spectral normalization. +Moreover, we formulate the model inverse calculation as a three-operator +splitting problem, for which fast algorithms are known. Based on the proposed +bi-Lipschitz network, we introduce a new scalar-output network, the PLNet, +which satisfies the Polyak-Łojasiewicz condition. It can be applied to learn +non-convex surrogate losses with favourable properties, e.g., a unique and +efficiently-computable global minimum. +
+
+
+
+
+ + ☆ Shapelet-based Model-agnostic Counterfactual Local Explanations for Time + Series Classification AAAI 2024 + + +
+ In this work, we propose a model-agnostic instance-based post-hoc +explainability method for time series classification. The proposed algorithm, +namely Time-CF, leverages shapelets and TimeGAN to provide counterfactual +explanations for arbitrary time series classifiers. We validate the proposed +method on several real-world univariate time series classification tasks from +the UCR Time Series Archive. The results indicate that the counterfactual +instances generated by Time-CF, when compared to state-of-the-art methods, +demonstrate better performance in terms of four explainability metrics: +closeness, sensibility, plausibility, and sparsity. +
+
+ comment: The paper has been accepted by the XAI4Sci workshop of AAAI 2024 +
+
+
+
+
+ + ☆ Training-time Neuron Alignment through Permutation Subspace for + Improving Linear Mode Connectivity and Model Fusion + + +
+ In deep learning, stochastic gradient descent often yields functionally +similar yet widely scattered solutions in the weight space even under the same +initialization, causing barriers in the Linear Mode Connectivity (LMC) +landscape. Overcoming these barriers is crucial for understanding deep learning +dynamics and enhancing model-fusion algorithms. Previous studies highlight the +role of permutation symmetry in reducing post-training barriers through network +permutation. However, these post-hoc methods, demanding extra computations, are +less effective for larger, complex models (e.g., ViT, LLM) due to numerous +permutation matrices. Thus, in this paper, we study training-time neuron +alignment. Our hypothesis is that a training-time permutation subspace can +reduce LMC barriers for free. We find that pruning at initialization supports +this. Beyond pruning, we introduce TNA-PFN, a simple yet lossless algorithm +using a partial gradient mask during training. TNA-PFN is theoretically and +empirically validated for reducing LMC barriers. It excels in wide model fusion +applications, especially in federated learning, where two algorithms based on +TNA-PFN are proposed to show its prospects even under heterogeneous datasets. +Moreover, TNA-PFN can enhance the generalization of model soup for vision +transformers and ColD fusion for pretrained language models. +
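A minimal sketch of the partial-gradient-mask idea: fix one random binary mask per parameter at initialization and apply it to the gradients at every step, so the masked coordinates never move. The mask ratio and sampling scheme are assumptions of this sketch.

```python
import torch

def make_masks(model, keep_prob=0.8, seed=0):
    """Fix one random binary mask per parameter at initialization
    (keep_prob is an assumption; the paper tunes the mask ratio)."""
    g = torch.Generator().manual_seed(seed)
    return {n: (torch.rand(p.shape, generator=g) < keep_prob).float()
            for n, p in model.named_parameters()}

def masked_step(model, loss, opt, masks):
    """One training step with a partial gradient mask: compute gradients,
    zero the masked-out coordinates, then update."""
    opt.zero_grad()
    loss.backward()
    for n, p in model.named_parameters():
        if p.grad is not None:
            p.grad.mul_(masks[n])
    opt.step()
```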
+
+ comment: preprint +
+
+
+
+
+ + ☆ Fundamental Properties of Causal Entropy and Information Gain + + +
+ Recent developments enable the quantification of causal control given a +structural causal model (SCM). This has been accomplished by introducing +quantities which encode changes in the entropy of one variable when intervening +on another. These measures, named causal entropy and causal information gain, +aim to address limitations in existing information theoretical approaches for +machine learning tasks where causality plays a crucial role. They have not yet +been properly mathematically studied. Our research contributes to the formal +understanding of the notions of causal entropy and causal information gain by +establishing and analyzing fundamental properties of these concepts, including +bounds and chain rules. Furthermore, we elucidate the relationship between +causal entropy and stochastic interventions. We also propose definitions for +causal conditional entropy and causal conditional information gain. Overall, +this exploration paves the way for enhancing causal machine learning tasks +through the study of recently-proposed information theoretic quantities +grounded in considerations about causality. + +
+
+ comment: Accepted for the conference CLeaR (Causal Learning and Reasoning) + 2024. To appear in its proceedings +
+
+
+
+
+ + ☆ SignSGD with Federated Defense: Harnessing Adversarial Attacks through + Gradient Sign Decoding + + +
+ Distributed learning is an effective approach to accelerate model training +using multiple workers. However, substantial communication delays emerge +between workers and a parameter server due to massive costs associated with +communicating gradients. SignSGD with majority voting (signSGD-MV) is a simple +yet effective optimizer that reduces communication costs through one-bit +quantization, yet the convergence rates considerably decrease as adversarial +workers increase. In this paper, we show that the convergence rate is invariant +as the number of adversarial workers increases, provided that the number of +adversarial workers is smaller than that of benign workers. The key idea +showing this counter-intuitive result is our novel signSGD with federated +defense (signSGD-FD). Unlike the traditional approaches, signSGD-FD exploits +the gradient information sent by adversarial workers with the proper weights, +which are obtained through gradient sign decoding. Experimental results +demonstrate signSGD-FD achieves superior convergence rates over traditional +algorithms in various adversarial attack scenarios. + +
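For concreteness, here is a small sketch of the signSGD-with-majority-voting aggregation the abstract builds on. Uniform weights recover signSGD-MV; signSGD-FD would supply non-uniform per-worker weights obtained via gradient sign decoding, which is not reproduced here.

```python
import numpy as np

def signsgd_mv_aggregate(worker_grads, weights=None):
    """signSGD with majority voting: each worker sends the sign of its
    gradient; the server returns the sign of the (optionally weighted) vote."""
    signs = np.sign(np.stack(worker_grads))       # (n_workers, dim), entries in {-1, 0, 1}
    if weights is None:
        weights = np.ones(len(worker_grads))      # uniform weights = plain signSGD-MV
    vote = np.tensordot(weights, signs, axes=1)   # weighted sum of signs per coordinate
    return np.sign(vote)                          # one-bit update direction

grads = [np.array([0.3, -1.2, 0.8]),
         np.array([0.1, -0.4, -0.2]),
         np.array([-2.0, -0.9, 0.5])]
print(signsgd_mv_aggregate(grads))                # -> [ 1. -1.  1.]
```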
+
+
+
+
+ + ☆ Inferring the Langevin Equation with Uncertainty via Bayesian Neural + Networks + + +
+ Pervasive across diverse domains, stochastic systems exhibit fluctuations in +processes ranging from molecular dynamics to climate phenomena. The Langevin +equation has served as a common mathematical model for studying such systems, +enabling predictions of their temporal evolution and analyses of thermodynamic +quantities, including absorbed heat, work done on the system, and entropy +production. However, inferring the Langevin equation from observed trajectories +remains challenging, particularly for nonlinear and high-dimensional systems. +In this study, we present a comprehensive framework that employs Bayesian +neural networks for inferring Langevin equations in both overdamped and +underdamped regimes. Our framework first provides the drift force and diffusion +matrix separately and then combines them to construct the Langevin equation. By +providing a distribution of predictions instead of a single value, our approach +allows us to assess prediction uncertainties, which can prevent potential +misunderstandings and erroneous decisions about the system. We demonstrate the +effectiveness of our framework in inferring Langevin equations for various +scenarios including a neuron model and microscopic engine, highlighting its +versatility and potential impact. + +
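+
+ As a toy illustration of the inference problem (not the authors' Bayesian
+framework), the drift of a simulated overdamped Langevin process can be
+recovered from its trajectory with a crude binned estimator; the paper
+replaces such estimators with Bayesian neural networks that additionally
+provide predictive uncertainty.
+
+```python
+import numpy as np
+
+# Simulate an overdamped Langevin process dx = f(x) dt + sqrt(2 D) dW
+# with drift f(x) = -x (Ornstein-Uhlenbeck) and diffusion constant D.
+rng = np.random.default_rng(1)
+dt, D, n = 1e-3, 0.5, 200_000
+x = np.zeros(n)
+for t in range(n - 1):
+    x[t + 1] = x[t] - x[t] * dt + np.sqrt(2 * D * dt) * rng.normal()
+
+# Crude drift estimator: binned conditional mean of increments over dt.
+bins = np.linspace(-1.5, 1.5, 31)
+centers = 0.5 * (bins[:-1] + bins[1:])
+idx = np.digitize(x[:-1], bins) - 1
+dx = np.diff(x)
+drift_hat = np.array([dx[idx == b].mean() / dt if (idx == b).any() else np.nan
+                      for b in range(len(centers))])
+print("max abs error vs true drift -x:",
+      np.nanmax(np.abs(drift_hat + centers)))
+```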
+
+ comment: 30 pages, 17 figures +
+
+
+
+
+ + ☆ Supervised Algorithmic Fairness in Distribution Shifts: A Survey + + +
+ Supervised fairness-aware machine learning under distribution shifts is an +emerging field that addresses the challenge of maintaining equitable and +unbiased predictions when faced with changes in data distributions from source +to target domains. In real-world applications, machine learning models are +often trained on a specific dataset but deployed in environments where the data +distribution may shift over time due to various factors. This shift can lead to +unfair predictions, disproportionately affecting certain groups characterized +by sensitive attributes, such as race and gender. In this survey, we provide a +summary of various types of distribution shifts and comprehensively investigate +existing methods based on these shifts, highlighting six commonly used +approaches in the literature. Additionally, this survey lists publicly +available datasets and evaluation metrics for empirical studies. We further +explore the interconnection with related research fields, discuss the +significant challenges, and identify potential directions for future studies. + +
+
+
+
+
+ + ☆ KTO: Model Alignment as Prospect Theoretic Optimization + + +
+ Kahneman & Tversky's $\textit{prospect theory}$ tells us that humans perceive +random variables in a biased but well-defined manner; for example, humans are +famously loss-averse. We show that objectives for aligning LLMs with human +feedback implicitly incorporate many of these biases -- the success of these +objectives (e.g., DPO) over cross-entropy minimization can partly be ascribed +to them being $\textit{human-aware loss functions}$ (HALOs). However, the +utility functions these methods attribute to humans still differ from those in +the prospect theory literature. Using a Kahneman-Tversky model of human +utility, we propose a HALO that directly maximizes the utility of generations +instead of maximizing the log-likelihood of preferences, as current methods do. +We call this approach Kahneman-Tversky Optimization (KTO), and it matches or +exceeds the performance of preference-based methods at scales from 1B to 30B. +Crucially, KTO does not need preferences -- only a binary signal of whether an +output is desirable or undesirable for a given input. This makes it far easier +to use in the real world, where preference data is scarce and expensive. + +
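+
+ A rough sketch of a Kahneman-Tversky-style loss on a binary
+desirable/undesirable signal follows; this is our illustration of the general
+idea, not the released KTO objective, and the reference point `z0` is assumed
+fixed here rather than estimated from a KL term as in the paper.
+
+```python
+import torch
+
+def kt_style_loss(logp_policy, logp_ref, desirable, beta=0.1, z0=0.0):
+    # Implied reward of each output relative to a frozen reference model.
+    reward = beta * (logp_policy - logp_ref)
+    # Kahneman-Tversky-style value: gains above the reference point z0 for
+    # desirable outputs, mirrored for undesirable ones (loss aversion).
+    value = torch.where(desirable.bool(),
+                        torch.sigmoid(reward - z0),
+                        torch.sigmoid(z0 - reward))
+    return (1.0 - value).mean()
+
+logp_policy = torch.tensor([-10.0, -12.0])
+logp_ref = torch.tensor([-11.0, -11.0])
+print(kt_style_loss(logp_policy, logp_ref, torch.tensor([1, 0])))
+```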
+
+ comment: preprint +
+
+
+
+
+ + ☆ A Unified Framework for Gradient-based Clustering of Distributed Data + + +
+ We develop a family of distributed clustering algorithms that work over
+networks of users. In the proposed scenario, users contain a local dataset and
+communicate only with their immediate neighbours, with the aim of finding a
+clustering of the full, joint data. The proposed family, termed Distributed
+Gradient Clustering (DGC-$\mathcal{F}_\rho$), is parametrized by $\rho \geq 1$,
+controlling the proximity of users' center estimates, with $\mathcal{F}$
+determining the clustering loss. Specialized to popular clustering losses like
+$K$-means and Huber loss, DGC-$\mathcal{F}_\rho$ gives rise to novel
+distributed clustering algorithms DGC-KM$_\rho$ and DGC-HL$_\rho$, while a
+novel clustering loss based on the logistic function leads to DGC-LL$_\rho$. We
+provide a unified analysis and establish several strong results, under mild
+assumptions. First, the sequence of centers generated by the methods converges
+to a well-defined notion of fixed point, under any center initialization and
+value of $\rho$. Second, as $\rho$ increases, the family of fixed points
+produced by DGC-$\mathcal{F}_\rho$ converges to a notion of consensus fixed
+points. We show that consensus fixed points of DGC-$\mathcal{F}_{\rho}$ are
+equivalent to fixed points of gradient clustering over the full data,
+guaranteeing a clustering of the full data is produced. For the special case of
+Bregman losses, we show that our fixed points converge to the set of Lloyd
+points. Numerical experiments on real data confirm our theoretical findings and
+demonstrate strong performance of the methods.
+
+
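+
+ To convey the flavor of the method, here is a minimal sketch of one
+synchronized step, assuming each user holds its own center estimates and a
+local dataset (the paper's exact update rules and parametrization differ):
+larger rho pulls neighbouring users' centers closer together, consistent with
+the consensus behavior described above.
+
+```python
+import numpy as np
+
+def local_kmeans_grad(X, C):
+    # Gradient of the local loss 0.5 * sum_i min_k ||x_i - c_k||^2 w.r.t. C.
+    assign = ((X[:, None, :] - C[None]) ** 2).sum(-1).argmin(1)
+    G = np.zeros_like(C)
+    for k in range(len(C)):
+        pts = X[assign == k]
+        if len(pts):
+            G[k] = (C[k] - pts).sum(0)
+    return G
+
+def dgc_step(centers, datasets, neighbours, lr=1e-3, rho=10.0):
+    # One step per user: descend the local K-means loss plus a rho-weighted
+    # penalty pulling the user's centers toward its neighbours' centers.
+    new = []
+    for u, C in enumerate(centers):
+        g = local_kmeans_grad(datasets[u], C)
+        for v in neighbours[u]:
+            g = g + rho * (C - centers[v])
+        new.append(C - lr * g)
+    return new
+```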
+
+ comment: 35 pages, 5 figures, 6 tables +
+
+
+
+
+ + ☆ Characterizing Overfitting in Kernel Ridgeless Regression Through the + Eigenspectrum + + +
+ We derive new bounds for the condition number of kernel matrices, which we +then use to enhance existing non-asymptotic test error bounds for kernel +ridgeless regression in the over-parameterized regime for a fixed input +dimension. For kernels with polynomial spectral decay, we recover the bound +from previous work; for exponential decay, our bound is non-trivial and novel. + Our conclusion on overfitting is two-fold: (i) kernel regressors whose +eigenspectrum decays polynomially must generalize well, even in the presence of +noisy labeled training data; these models exhibit so-called tempered +overfitting; (ii) if the eigenspectrum of any kernel ridge regressor decays +exponentially, then it generalizes poorly, i.e., it exhibits catastrophic +overfitting. This adds to the available characterization of kernel ridge +regressors exhibiting benign overfitting as the extremal case where the +eigenspectrum of the kernel decays sub-polynomially. Our analysis combines new +random matrix theory (RMT) techniques with recent tools in the kernel ridge +regression (KRR) literature. + +
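+
+ For context, kernel ridgeless regression is the minimum-norm interpolant
+obtained by solving K alpha = y with no ridge term; a small numerical sketch
+(ours) showing the objects the bounds are about, namely the kernel
+eigenspectrum and the condition number of K:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, d = 100, 5
+X = rng.normal(size=(n, d))
+y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=n)
+
+# Gaussian kernel: its eigenspectrum decays (near-)exponentially.
+sq = ((X[:, None] - X[None]) ** 2).sum(-1)
+K = np.exp(-sq / (2.0 * d))
+
+# Ridgeless (minimum-norm) interpolation: alpha = K^{-1} y, so predictions
+# at the training points reproduce y exactly.
+alpha = np.linalg.solve(K, y)
+
+eigs = np.linalg.eigvalsh(K)[::-1]                # sorted eigenspectrum
+print("condition number:", eigs[0] / eigs[-1])
+print("max training residual:", np.abs(K @ alpha - y).max())
+```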
+
+
+
+
+ + ☆ Bi-CryptoNets: Leveraging Different-Level Privacy for Encrypted + Inference + + +
+ Privacy-preserving neural networks have attracted increasing attention in
+recent years, and various algorithms have been developed to keep the balance
+between accuracy, computational complexity and information security from the
+cryptographic view. This work takes a different view from the input data and
+structure of neural networks. We decompose the input data (e.g., some images)
+into sensitive and insensitive segments according to importance and privacy.
+The sensitive segment includes important and private information such as human
+faces, and we apply strong homomorphic encryption to ensure security, whereas
+the insensitive one contains background content, to which we add perturbations.
+We propose the bi-CryptoNets, i.e., plaintext and ciphertext branches, to deal
+with the two segments, respectively, where the ciphertext branch can utilize
+information from the plaintext branch through unidirectional connections. We
+adopt knowledge distillation for our bi-CryptoNets by transferring
+representations from a well-trained teacher neural network. Empirical studies
+show the effectiveness of our bi-CryptoNets and the resulting decrease in
+inference latency.
+
+
+
+
+
+
+ + ☆ ExtremeCast: Boosting Extreme Value Prediction for Global Weather + Forecast + + +
+ Data-driven weather forecast based on machine learning (ML) has experienced
+rapid development and demonstrated superior performance in the global
+medium-range forecast compared to traditional physics-based dynamical models.
+However, most of these ML models struggle with accurately predicting extreme
+weather, which is closely related to extreme value prediction. Through
+mathematical analysis, we prove that the use of symmetric losses, such as the
+Mean Squared Error (MSE), leads to biased predictions and underestimation of
+extreme values. To address this issue, we introduce Exloss, a novel loss
+function that performs asymmetric optimization and highlights extreme values to
+obtain accurate extreme weather forecasts. Furthermore, we introduce a
+training-free extreme value enhancement strategy named ExEnsemble, which
+increases the variance of pixel values and improves the forecast robustness.
+Combined with an advanced global weather forecast model, extensive experiments
+show that our solution can achieve state-of-the-art performance in extreme
+weather prediction, while maintaining the overall forecast accuracy comparable
+to the top medium-range forecast models.
+
+
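+
+ To illustrate asymmetric optimization (our sketch of the general idea, not
+the paper's Exloss), a squared loss that penalizes underestimation more than
+overestimation discourages the regression-to-the-mean behavior of plain MSE on
+extremes:
+
+```python
+import torch
+
+def asymmetric_mse(pred, target, under_weight=4.0):
+    # Penalize underestimation (pred < target) more than overestimation,
+    # counteracting MSE's tendency to pull extreme values toward the mean.
+    err = pred - target
+    w = torch.ones_like(err)
+    w[err < 0] = under_weight
+    return (w * err ** 2).mean()
+
+pred = torch.tensor([0.8, 1.2])
+target = torch.tensor([1.0, 1.0])
+print(asymmetric_mse(pred, target))  # the underestimate dominates the loss
+```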
+
+
+
+
+ + ☆ Can MLLMs Perform Text-to-Image In-Context Learning? + + +
+ The evolution from Large Language Models (LLMs) to Multimodal Large Language
+Models (MLLMs) has spurred research into extending In-Context Learning (ICL) to
+its multimodal counterpart. Existing studies have primarily concentrated on
+image-to-text ICL. However, Text-to-Image ICL (T2I-ICL), with its unique
+characteristics and potential applications, remains underexplored. To address
+this gap, we formally define the task of T2I-ICL and present CoBSAT, the first
+T2I-ICL benchmark dataset, encompassing ten tasks. Utilizing our dataset to
+benchmark six state-of-the-art MLLMs, we uncover considerable difficulties
+MLLMs encounter in solving T2I-ICL. We identify the primary challenges as the
+inherent complexity of multimodality and image generation. To overcome these
+challenges, we explore strategies like fine-tuning and Chain-of-Thought
+prompting, demonstrating notable improvements. Our code and dataset are
+available at \url{https://github.com/UW-Madison-Lee-Lab/CoBSAT}.
+
+
+
+
+
+
+ + ☆ Spiking CenterNet: A Distillation-boosted Spiking Neural Network for + Object Detection + + +
+ In the era of AI at the edge, self-driving cars, and climate change, the need +for energy-efficient, small, embedded AI is growing. Spiking Neural Networks +(SNNs) are a promising approach to address this challenge, with their +event-driven information flow and sparse activations. We propose Spiking +CenterNet for object detection on event data. It combines an SNN CenterNet +adaptation with an efficient M2U-Net-based decoder. Our model significantly +outperforms comparable previous work on Prophesee's challenging GEN1 Automotive +Detection Dataset while using less than half the energy. Distilling the +knowledge of a non-spiking teacher into our SNN further increases performance. +To the best of our knowledge, our work is the first approach that takes +advantage of knowledge distillation in the field of spiking object detection. + +
+
+ comment: 8 pages, 5 figures. Submitted to WCCI-2024 +
+
+
+
+
+ + ☆ Differentiable and accelerated wavelet transforms on the sphere and ball + + +
+ Directional wavelet dictionaries are hierarchical representations which
+efficiently capture and segment information across scale, location and
+orientation. Such representations demonstrate a particular affinity to physical
+signals, which often exhibit highly anisotropic, localised multiscale
+structure. Many physically important signals are observed over spherical
+domains, such as the celestial sky in cosmology. Leveraging recent advances in
+computational harmonic analysis, we design new highly distributable and
+automatically differentiable directional wavelet transforms on the
+$2$-dimensional sphere $\mathbb{S}^2$ and $3$-dimensional ball $\mathbb{B}^3 =
+\mathbb{R}^+ \times \mathbb{S}^2$ (the space formed by augmenting the sphere
+with the radial half-line). We observe up to a $300$-fold and $21800$-fold
+acceleration for signals on the sphere and ball, respectively, compared to
+existing software, whilst maintaining 64-bit machine precision. Not only do
+these algorithms dramatically accelerate existing spherical wavelet
+transforms, but the gradient information afforded by automatic differentiation
+also unlocks many data-driven analysis techniques previously not possible for
+these spaces. We publicly release both S2WAV and S2BALL, open-sourced JAX
+libraries for our transforms that are automatically differentiable and readily
+deployable both on and over clusters of hardware accelerators (e.g. GPUs &
+TPUs).
+
+
+
+
+
+
+ + ☆ Parametric-Task MAP-Elites + + +
+ Optimizing a set of functions simultaneously by leveraging their similarity +is called multi-task optimization. Current black-box multi-task algorithms only +solve a finite set of tasks, even when the tasks originate from a continuous +space. In this paper, we introduce Parametric-task MAP-Elites (PT-ME), a novel +black-box algorithm to solve continuous multi-task optimization problems. This +algorithm (1) solves a new task at each iteration, effectively covering the +continuous space, and (2) exploits a new variation operator based on local +linear regression. The resulting dataset of solutions makes it possible to +create a function that maps any task parameter to its optimal solution. We show +on two parametric-task toy problems and a more realistic and challenging +robotic problem in simulation that PT-ME outperforms all baselines, including +the deep reinforcement learning algorithm PPO. + +
+
+
+
+
+ + ☆ On the Transferability of Large-Scale Self-Supervision to Few-Shot Audio + Classification ICASSP + + +
+ In recent years, self-supervised learning has excelled for its capacity to +learn robust feature representations from unlabelled data. Networks pretrained +through self-supervision serve as effective feature extractors for downstream +tasks, including Few-Shot Learning. While the evaluation of unsupervised +approaches for few-shot learning is well-established in imagery, it is notably +absent in acoustics. This study addresses this gap by assessing large-scale +self-supervised models' performance in few-shot audio classification. +Additionally, we explore the relationship between a model's few-shot learning +capability and other downstream task benchmarks. Our findings reveal +state-of-the-art performance in some few-shot problems such as +SpeechCommandsv2, as well as strong correlations between speech-based few-shot +problems and various downstream audio tasks. + +
+
+ comment: Camera Ready version as submitted to ICASSP SASB Workshop 2024. 5 + pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ Direct side information learning for zero-shot regression + + +
+ Zero-shot learning provides models for targets for which no instances are
+available, commonly called unobserved targets. The availability of target side
+information becomes crucial in this context in order to properly induce models
+for these targets. The literature offers many strategies to cope with this
+scenario, but they are specifically designed for zero-shot classification,
+mostly in computer vision and image classification, and are neither applicable
+nor easily extensible to a zero-shot regression framework, in which a
+continuous value must be predicted rather than a label. In fact, there is a
+considerable lack of methods for zero-shot regression in the literature. Two
+approaches for zero-shot regression that work in a two-phase procedure were
+recently proposed. They first learn the observed target models through
+classical regression learning, ignoring the target side information. Then, they
+aggregate those observed target models by exploiting the target side
+information, inducing the models for the unobserved targets. Although both have
+shown quite good performance because of the different treatment they grant to
+the common features and to the side information, they exploit features and side
+information separately, precluding a global optimization for providing the
+unobserved target models. This paper proposes a novel method that jointly takes
+features and side information in a one-phase learning process, while treating
+side information properly rather than as common features. A specific kernel
+that properly merges features and side information is proposed for this
+purpose, resulting in a novel approach that exhibits better performance on both
+artificial and real datasets.
+
+
+
+
+
+
+ + ☆ A Differentiable POGLM with Forward-Backward Message Passing + + +
+ The partially observable generalized linear model (POGLM) is a powerful tool
+for understanding neural connectivity under the assumption of existing hidden
+neurons. With spike trains recorded only from visible neurons, existing works
+use variational inference (VI) to learn the POGLM, while contending with the
+difficulty of learning this latent variable model. There are two main issues:
+(1) the sampled Poisson hidden spike count hinders the use of the pathwise
+gradient estimator in VI; and (2) the existing design of the variational model
+is neither expressive nor time-efficient, which further affects the
+performance. For (1), we propose a new differentiable POGLM, which enables the
+pathwise gradient estimator, better than the score function gradient estimator
+used in existing works. For (2), we propose the forward-backward
+message-passing sampling scheme for the variational model. Comprehensive
+experiments show that our differentiable POGLMs with our forward-backward
+message passing produce a better performance on one synthetic and two
+real-world datasets. Furthermore, our new method yields more interpretable
+parameters, underscoring its significance in neuroscience.
+
+
+
+
+
+
+ + ☆ Cascaded Scaling Classifier: class incremental learning with probability + scaling + + +
+ Humans are capable of acquiring new knowledge and transferring learned
+knowledge into different domains, incurring only minor forgetting. The same
+ability, called Continual Learning, is challenging to achieve when operating
+with neural networks due to the forgetting that affects previously learned
+tasks when learning new ones. This forgetting can be mitigated by replaying
+stored samples from past tasks, but a large memory size may be needed for long
+sequences of tasks; moreover, this could lead to overfitting on saved samples.
+In this paper, we propose a novel regularisation approach and a novel
+incremental classifier called Margin Dampening and Cascaded Scaling
+Classifier, respectively. The first combines a soft constraint and a knowledge
+distillation approach to preserve past learned knowledge while allowing the
+model to learn new patterns effectively. The latter is a gated incremental
+classifier, helping the model modify past predictions without directly
+interfering with them. This is achieved by modifying the output of the model
+with auxiliary scaling functions. We empirically show that our approach
+performs well on multiple benchmarks against well-established baselines, and we
+also study each component of our proposal and how the combinations of such
+components affect the final results.
+
+
+
+
+
+
+ + ☆ TEDDY: Trimming Edges with Degree-based Discrimination strategY + + +
+ Since the pioneering work on the lottery ticket hypothesis for graph neural
+networks (GNNs) was proposed in Chen et al. (2021), the study on finding graph
+lottery tickets (GLT) has become one of the pivotal focuses of the GNN
+community, inspiring researchers to discover sparser GLT while achieving
+comparable performance to original dense networks. In parallel, the graph
+structure has gained substantial attention as a crucial factor in GNN training
+dynamics, also elucidated by several recent studies. Despite this, contemporary
+studies on GLT, in general, have not fully exploited inherent pathways in the
+graph structure and have identified tickets in an iterative manner, which is
+time-consuming and inefficient. To address these limitations, we introduce
+TEDDY, a one-shot edge sparsification framework that leverages structural
+information by incorporating edge-degree information. Following edge
+sparsification, we encourage parameter sparsity during training via simple
+projected gradient descent on the $\ell_0$ ball. Given the target sparsity
+levels for both the graph structure and the model parameters, our TEDDY
+facilitates efficient and rapid realization of GLT within a single training.
+Remarkably, our experimental results demonstrate that TEDDY significantly
+surpasses conventional iterative approaches in generalization, even when
+conducting one-shot sparsification that solely utilizes graph structures,
+without taking node features into account.
+
+
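+
+ The parameter-sparsity step is standard projected gradient descent on the
+$\ell_0$ ball; a minimal sketch of the projection, which keeps the k
+largest-magnitude entries and zeroes the rest (ties ignored for simplicity):
+
+```python
+import torch
+
+def project_l0(w, k):
+    # Projection onto the l0 ball of radius k: keep the k largest-magnitude
+    # entries, zero the rest (applied after each gradient update in PGD).
+    flat = w.flatten()
+    if k < flat.numel():
+        thresh = flat.abs().kthvalue(flat.numel() - k).values
+        flat = torch.where(flat.abs() > thresh, flat,
+                           torch.zeros_like(flat))
+    return flat.view_as(w)
+
+w = torch.tensor([[0.3, -1.2], [0.05, 0.7]])
+print(project_l0(w, 2))  # keeps -1.2 and 0.7
+```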
+
+
+
+
+ + ☆ Position Aware 60 GHz mmWave Beamforming for V2V Communications + Utilizing Deep Learning + + +
+ Beamforming techniques are considered essential for compensating the severe
+path loss in millimeter-wave (mmWave) communications by adopting large antenna
+arrays and forming narrow beams to obtain satisfactory received power. However,
+performing accurate beam alignment over such narrow beams for efficient link
+configuration with traditional beam selection approaches, which mainly rely on
+channel state information, typically imposes significant latency and computing
+overheads, often infeasible in highly dynamic scenarios such as
+vehicle-to-vehicle (V2V) communications. In contrast, utilizing out-of-band
+contextual information, such as vehicular position information, is a potential
+alternative to reduce such overheads. In this context, this paper presents a
+deep learning-based solution that utilizes vehicular position information to
+predict the optimal beams having sufficient mmWave received powers, so that the
+best V2V line-of-sight links can be ensured proactively. After experimental
+evaluation of the proposed solution on real-world measured mmWave sensing and
+communications datasets, the results show that the solution can achieve up to
+84.58% of the received power of the link status on average, confirming it as a
+promising solution for beamforming in 60 GHz mmWave-enabled V2V communications.
+
+
+
+ comment: 2024 IEEE International Conference on Communications (ICC), Denver, + CO, USA +
+
+
+
+
+ + ☆ Transformers Learn Nonlinear Features In Context: Nonconvex Mean-field + Dynamics on the Attention Landscape + + +
+ Large language models based on the Transformer architecture have demonstrated +impressive capabilities to learn in context. However, existing theoretical +studies on how this phenomenon arises are limited to the dynamics of a single +layer of attention trained on linear regression tasks. In this paper, we study +the optimization of a Transformer consisting of a fully connected layer +followed by a linear attention layer. The MLP acts as a common nonlinear +representation or feature map, greatly enhancing the power of in-context +learning. We prove in the mean-field and two-timescale limit that the +infinite-dimensional loss landscape for the distribution of parameters, while +highly nonconvex, becomes quite benign. We also analyze the second-order +stability of mean-field dynamics and show that Wasserstein gradient flow almost +always avoids saddle points. Furthermore, we establish novel methods for +obtaining concrete improvement rates both away from and near critical points. +This represents the first saddle point analysis of mean-field dynamics in +general and the techniques are of independent interest. + +
+
+ comment: 32 pages, 1 figure +
+
+
+
+
+ + ☆ Target inductive methods for zero-shot regression + + +
+ This research arises from the need to predict the amount of air pollutants at
+meteorological stations. Air pollution depends on the location of the stations
+(weather conditions and activities in the surroundings). Frequently, the
+surrounding information is not considered in the learning process. This
+information is known beforehand in the absence of unobserved weather conditions
+and remains constant for the same station. Considering the surrounding
+information as side information facilitates generalization for predicting
+pollutants at new stations, leading to a zero-shot regression scenario.
+Available zero-shot methods typically lean towards classification, and are not
+easily extensible to regression. This paper proposes two zero-shot methods for
+regression. The first method is a similarity-based approach that learns models
+from features and aggregates them using side information. However, potential
+knowledge of the feature models may be lost in the aggregation. The second
+method overcomes this drawback by replacing the aggregation procedure and
+learning the correspondence between side information and feature-induced
+models, instead. Both proposals are compared with a baseline procedure using
+artificial datasets, the UCI communities and crime datasets, and the pollutant
+data. Both approaches outperform the baseline method, but the parameter
+learning approach manifests its superiority over the similarity-based method.
+
+
+
+
+
+
+ + ☆ Two Heads Are Better Than One: Boosting Graph Sparse Training via + Semantic and Topological Awareness + + +
+ Graph Neural Networks (GNNs) excel in various graph learning tasks but face +computational challenges when applied to large-scale graphs. A promising +solution is to remove non-essential edges to reduce the computational overheads +in GNN. Previous literature generally falls into two categories: +topology-guided and semantic-guided. The former maintains certain graph +topological properties yet often underperforms on GNNs due to low integration +with neural network training. The latter performs well at lower sparsity on +GNNs but faces performance collapse at higher sparsity levels. With this in +mind, we take the first step to propose a new research line and concept termed +Graph Sparse Training (GST), which dynamically manipulates sparsity at the data +level. Specifically, GST initially constructs a topology & semantic anchor at a +low training cost, followed by performing dynamic sparse training to align the +sparse graph with the anchor. We introduce the Equilibria Sparsification +Principle to guide this process, effectively balancing the preservation of both +topological and semantic information. Ultimately, GST produces a sparse graph +with maximum topological integrity and no performance degradation. Extensive +experiments on 6 datasets and 5 backbones showcase that GST (I) identifies +subgraphs at higher graph sparsity levels (1.67%~15.85% $\uparrow$) than +state-of-the-art sparsification methods, (II) preserves more key spectral +properties, (III) achieves 1.27-3.42$\times$ speedup in GNN inference and (IV) +successfully helps graph adversarial defense and graph lottery tickets. + +
+
+
+
+
+ + ☆ Beyond the Request: Harnessing HTTP Response Headers for Cross-Browser + Web Tracker Classification in an Imbalanced Setting + + +
+ The World Wide Web's connectivity is greatly attributed to the HTTP protocol, +with HTTP messages offering informative header fields that appeal to +disciplines like web security and privacy, especially concerning web tracking. +Despite existing research employing HTTP/S request messages to identify web +trackers, HTTP/S response headers are often overlooked. This study endeavors to +design effective machine learning classifiers for web tracker detection using +HTTP/S response headers. Data from the Chrome, Firefox, and Brave browsers, +obtained through the traffic monitoring browser extension T.EX, serves as our +data set. Eleven supervised models were trained on Chrome data and tested +across all browsers. The results demonstrated high accuracy, F1-score, +precision, recall, and minimal log-loss error for Chrome and Firefox, but +subpar performance on Brave, potentially due to its distinct data distribution +and feature set. The research suggests that these classifiers are viable for +detecting web trackers in Chrome and Firefox. However, real-world application +testing remains pending, and the distinction between tracker types and broader +label sources could be explored in future studies. + +
+
+
+
+
+ + ☆ Flexible Variational Information Bottleneck: Achieving Diverse + Compression with a Single Training + + +
+ Information Bottleneck (IB) is a widely used framework that enables the
+extraction of information related to a target random variable from a source
+random variable. In the objective function, IB controls the trade-off between
+data compression and predictiveness through the Lagrange multiplier $\beta$.
+Traditionally, to find the trade-off to be learned, IB requires a search for
+$\beta$ through multiple training cycles, which is computationally expensive.
+In this study, we introduce Flexible Variational Information Bottleneck (FVIB),
+an innovative framework for classification tasks that can obtain optimal models
+for all values of $\beta$ with a single, computationally efficient training
+run. We theoretically demonstrate that across all reasonable values of $\beta$,
+FVIB can simultaneously maximize an approximation of the objective function for
+Variational Information Bottleneck (VIB), the conventional IB method. Then we
+empirically show that FVIB can learn the VIB objective as effectively as VIB.
+Furthermore, in terms of calibration performance, FVIB outperforms other IB and
+calibration methods by enabling continuous optimization of $\beta$. Our codes
+are available at https://github.com/sotakudo/fvib.
+
+
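+
+ For reference, the conventional VIB objective that FVIB approximates across
+all beta values in one run has the following standard form (a textbook-style
+sketch, not the authors' code):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def vib_loss(logits, mu, logvar, labels, beta):
+    # Cross-entropy (predictiveness) plus beta * KL(q(z|x) || N(0, I))
+    # (compression), with the encoder q(z|x) = N(mu, diag(exp(logvar))).
+    ce = F.cross_entropy(logits, labels)
+    kl = 0.5 * (mu.pow(2) + logvar.exp() - logvar - 1).sum(1).mean()
+    return ce + beta * kl
+```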
+
+
+
+
+ + ☆ Unveiling Delay Effects in Traffic Forecasting: A Perspective from + Spatial-Temporal Delay Differential Equations + + +
+ Traffic flow forecasting is a fundamental research issue for transportation
+planning and management, which serves as a canonical and typical example of
+spatial-temporal predictions. In recent years, Graph Neural Networks (GNNs) and
+Recurrent Neural Networks (RNNs) have achieved great success in capturing
+spatial-temporal correlations for traffic flow forecasting. Yet two
+non-negligible issues have not been well addressed: 1) The message passing in
+GNNs is immediate, while in reality the spatial message interactions among
+neighboring nodes can be delayed. The change of traffic flow at one node will
+take several minutes, i.e., a time delay, to influence its connected neighbors.
+2) Traffic conditions undergo continuous changes. The prediction frequency for
+traffic flow forecasting may vary based on specific scenario requirements. Most
+existing discretized models require retraining for each prediction horizon,
+restricting their applicability. To tackle the above issues, we propose a
+neural Spatial-Temporal Delay Differential Equation model, namely STDDE. It
+incorporates both delay effects and continuity into a unified delay
+differential equation framework, which explicitly models the time delay in
+spatial information propagation. Furthermore, theoretical proofs are provided
+to show its stability. Then we design a learnable traffic-graph time-delay
+estimator, which utilizes the continuity of the hidden states to achieve the
+gradient backward process. Finally, we propose a continuous output module,
+allowing us to accurately predict traffic flow at various frequencies, which
+provides more flexibility and adaptability to different scenarios. Extensive
+experiments show the superiority of the proposed STDDE along with competitive
+computational efficiency.
+
+
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ HW-SW Optimization of DNNs for Privacy-preserving People Counting on + Low-resolution Infrared Arrays DATE 2024 + + +
+ Low-resolution infrared (IR) array sensors enable people counting +applications such as monitoring the occupancy of spaces and people flows while +preserving privacy and minimizing energy consumption. Deep Neural Networks +(DNNs) have been shown to be well-suited to process these sensor data in an +accurate and efficient manner. Nevertheless, the space of DNNs' architectures +is huge and its manual exploration is burdensome and often leads to sub-optimal +solutions. To overcome this problem, in this work, we propose a highly +automated full-stack optimization flow for DNNs that goes from neural +architecture search, mixed-precision quantization, and post-processing, down to +the realization of a new smart sensor prototype, including a Microcontroller +with a customized instruction set. Integrating these cross-layer optimizations, +we obtain a large set of Pareto-optimal solutions in the 3D-space of energy, +memory, and accuracy. Deploying such solutions on our hardware platform, we +improve the state-of-the-art achieving up to 4.2x model size reduction, 23.8x +code size reduction, and 15.38x energy reduction at iso-accuracy. + +
+
+ comment: This paper has been accepted for publication in the DATE 2024 + conference IEEE +
+
+
+
+
+ + ☆ Location Agnostic Adaptive Rain Precipitation Prediction using Deep + Learning + + +
+ Rain precipitation prediction is a challenging task as it depends on weather
+and meteorological features which vary from location to location. As a result,
+a prediction model that performs well at one location does not perform well at
+other locations due to distribution shifts. In addition, due to global warming,
+weather patterns are changing rapidly year by year, which can render those
+models ineffective even at the same location as time passes. In our work, we
+propose an adaptive deep learning-based framework to address the aforementioned
+challenges. Our method generalizes the model to predict precipitation at any
+location, even where methods without adaptation fail. Our method has shown
+43.51%, 5.09%, and 38.62% improvement after adaptation using a deep neural
+network for predicting the precipitation of Paris, Los Angeles, and Tokyo,
+respectively.
+
+
+
+
+
+
+
+ ☆ Efficient Causal Graph Discovery Using Large Language Models
+
+
+
+ We propose a novel framework that leverages LLMs for full causal graph +discovery. While previous LLM-based methods have used a pairwise query +approach, this requires a quadratic number of queries which quickly becomes +impractical for larger causal graphs. In contrast, the proposed framework uses +a breadth-first search (BFS) approach which allows it to use only a linear +number of queries. We also show that the proposed method can easily incorporate +observational data when available, to improve performance. In addition to being +more time and data-efficient, the proposed framework achieves state-of-the-art +results on real-world causal graphs of varying sizes. The results demonstrate +the effectiveness and efficiency of the proposed method in discovering causal +relationships, showcasing its potential for broad applicability in causal graph +discovery tasks across different domains. + +
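+
+ A toy sketch of the BFS idea with one query per visited variable follows;
+`query_children` is a hypothetical stand-in for the paper's LLM prompting,
+and the set of root causes is assumed to be given:
+
+```python
+from collections import deque
+
+def bfs_causal_discovery(variables, roots, query_children):
+    # Breadth-first traversal: each visited variable issues a single query
+    # asking which remaining variables it directly causes, so the number of
+    # queries grows linearly in the number of visited nodes.
+    edges, visited, frontier = [], set(roots), deque(roots)
+    while frontier:
+        v = frontier.popleft()
+        for child in query_children(v, [u for u in variables if u != v]):
+            edges.append((v, child))
+            if child not in visited:
+                visited.add(child)
+                frontier.append(child)
+    return edges
+
+# Toy stand-in for the LLM: answers from a known ground-truth graph.
+truth = {"rain": ["wet_ground"], "wet_ground": ["slippery"], "slippery": []}
+print(bfs_causal_discovery(list(truth), ["rain"], lambda v, c: truth[v]))
+```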
+
+
+
+
+ + ☆ Comparative Evaluation of Weather Forecasting using Machine Learning + Models + + +
+ Gaining a deeper understanding of weather and being able to predict its
+future behavior have always been considered important endeavors for the growth
+of our society. This research paper explores the advancements in understanding
+and predicting nature's behavior, particularly in the context of weather
+forecasting, through the application of machine learning algorithms. By
+leveraging the power of machine learning, data mining, and data analysis
+techniques, significant progress has been made in this field. This study
+focuses on analyzing the contributions of various machine learning algorithms
+in predicting precipitation and temperature patterns using a 20-year dataset
+from a single weather station in Dhaka city. Algorithms such as Gradient
+Boosting, AdaBoosting, Artificial Neural Network, Stacking Random Forest,
+Stacking Neural Network, and Stacking KNN are evaluated and compared based on
+their performance metrics, including confusion matrix measurements. The
+findings highlight remarkable achievements and provide valuable insights into
+their performance and feature correlations.
+
+
+
+
+
+
+ + ☆ A Survey on Self-Supervised Learning for Non-Sequential Tabular Data + + +
+ Self-supervised learning (SSL) has been incorporated into many
+state-of-the-art models in various domains, where SSL defines pretext tasks
+based on unlabeled datasets to learn contextualized and robust representations.
+Recently, SSL has become a new trend in exploring the representation learning
+capability in the realm of tabular data, which is more challenging due to the
+absence of explicit relations for learning descriptive representations. This
+survey aims to systematically review and summarize the recent progress and
+challenges of SSL for non-sequential tabular data (SSL4NS-TD). We first present
+a formal definition of NS-TD and clarify its correlation to related studies.
+Then, these approaches are categorized into three groups -- predictive
+learning, contrastive learning, and hybrid learning, with the motivations and
+strengths of representative methods within each direction. On top of this,
+application issues of SSL4NS-TD are presented, including automatic data
+engineering, cross-table transferability, and domain knowledge integration. In
+addition, we elaborate on existing benchmarks and datasets for NS-TD
+applications to discuss the performance of existing tabular models. Finally, we
+discuss the challenges of SSL4NS-TD and provide potential directions for future
+research. We expect our work to be useful in terms of encouraging more research
+on lowering the barrier to entry of SSL for the tabular domain and improving
+the foundations for implicit tabular data.
+
+
+
+ comment: The paper list can be found at + https://github.com/wwweiwei/awesome-self-supervised-learning-for-tabular-data +
+
+
+
+
+ + ☆ Structured World Modeling via Semantic Vector Quantization ICLR 2024 + + +
+ Neural discrete representations are crucial components of modern neural +networks. However, their main limitation is that the primary strategies such as +VQ-VAE can only provide representations at the patch level. Therefore, one of +the main goals of representation learning, acquiring structured, semantic, and +compositional abstractions such as the color and shape of an object, remains +elusive. In this paper, we present the first approach to semantic neural +discrete representation learning. The proposed model, called Semantic +Vector-Quantized Variational Autoencoder (SVQ), leverages recent advances in +unsupervised object-centric learning to address this limitation. Specifically, +we observe that a simple approach quantizing at the object level poses a +significant challenge and propose constructing scene representations +hierarchically, from low-level discrete concept schemas to object +representations. Additionally, we suggest a novel method for structured +semantic world modeling by training a prior over these representations, +enabling the ability to generate images by sampling the semantic properties of +the objects in the scene. In experiments on various 2D and 3D object-centric +datasets, we find that our model achieves superior generation performance +compared to non-semantic vector quantization methods such as VQ-VAE and +previous object-centric generative models. Furthermore, we find that the +semantic discrete representations can solve downstream scene understanding +tasks that require reasoning about the properties of different objects in the +scene. + +
+
+ comment: Accepted in ICLR 2024 +
+
+
+
+
+ + ☆ Few-Shot Class-Incremental Learning with Prior Knowledge + + +
+ To tackle the issues of catastrophic forgetting and overfitting in few-shot
+class-incremental learning (FSCIL), previous work has primarily concentrated on
+preserving the memory of old knowledge during the incremental phase. The role
+of the pre-trained model in shaping the effectiveness of incremental learning
+is frequently underestimated in these studies. Therefore, to enhance the
+generalization ability of the pre-trained model, we propose Learning with Prior
+Knowledge (LwPK), which introduces nearly free prior knowledge from a few
+unlabeled data of subsequent incremental classes. We cluster unlabeled
+incremental class samples to produce pseudo-labels, then jointly train these
+with labeled base class samples, effectively allocating embedding space for
+both old and new class data. Experimental results indicate that LwPK
+effectively enhances model resilience against catastrophic forgetting, with
+theoretical analysis based on empirical risk minimization and class distance
+measurement corroborating its operational principles. The source code of LwPK
+is publicly available at: \url{https://github.com/StevenJ308/LwPK}.
+
+
+
+
+
+
+ + ♻ ☆ Inversion by Direct Iteration: An Alternative to Denoising Diffusion for + Image Restoration + + +
+ Inversion by Direct Iteration (InDI) is a new formulation for supervised
+image restoration that avoids the so-called "regression to the mean" effect and
+produces more realistic and detailed images than existing regression-based
+methods. It does this by gradually improving image quality in small steps,
+similar to generative denoising diffusion models. Image restoration is an
+ill-posed problem where multiple high-quality images are plausible
+reconstructions of a given low-quality input. Consequently, the outcome of a
+single-step regression model is typically an aggregate of all possible
+explanations, thus lacking detail and realism. The main advantage of InDI is
+that it does not try to predict the clean target image in a single step but
+instead gradually improves the image in small steps, resulting in better
+perceptual quality. While generative denoising diffusion models also work in
+small steps, our formulation is distinct in that it does not require knowledge
+of any analytic form of the degradation process. Instead, we directly learn an
+iterative restoration process from low-quality and high-quality paired
+examples. InDI can be applied to virtually any image degradation, given paired
+training data. In conditional denoising diffusion image restoration the
+denoising network generates the restored image by repeatedly denoising an
+initial image of pure noise, conditioned on the degraded input. Contrary to
+conditional denoising formulations, InDI directly proceeds by iteratively
+restoring the input low-quality image, producing high-quality results on a
+variety of image restoration tasks, including motion and out-of-focus
+deblurring, super-resolution, compression artifact removal, and denoising.
+
+
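+
+ A sketch of the iterative inference loop this describes, assuming a network
+`model(x, t)` trained to predict the clean image from the interpolate
+x_t = (1 - t) * x_clean + t * x_degraded (our paraphrase of the formulation,
+not the authors' code):
+
+```python
+import torch
+
+@torch.no_grad()
+def indi_restore(y, model, steps=20):
+    # Start from the degraded input at t = 1 and walk toward t = 0,
+    # each step blending the current iterate with the network's
+    # prediction of the clean image.
+    x, delta = y.clone(), 1.0 / steps
+    for i in range(steps):
+        t = 1.0 - i * delta
+        x = (1 - delta / t) * x + (delta / t) * model(x, t)
+    return x
+```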
+
+
+
+
+ + ♻ ☆ Nonlinear Filtering with Brenier Optimal Transport Maps + + +
+ This paper is concerned with the problem of nonlinear filtering, i.e., +computing the conditional distribution of the state of a stochastic dynamical +system given a history of noisy partial observations. Conventional sequential +importance resampling (SIR) particle filters suffer from fundamental +limitations, in scenarios involving degenerate likelihoods or high-dimensional +states, due to the weight degeneracy issue. In this paper, we explore an +alternative method, which is based on estimating the Brenier optimal transport +(OT) map from the current prior distribution of the state to the posterior +distribution at the next time step. Unlike SIR particle filters, the OT +formulation does not require the analytical form of the likelihood. Moreover, +it allows us to harness the approximation power of neural networks to model +complex and multi-modal distributions and employ stochastic optimization +algorithms to enhance scalability. Extensive numerical experiments are +presented that compare the OT method to the SIR particle filter and the +ensemble Kalman filter, evaluating the performance in terms of sample +efficiency, high-dimensional scalability, and the ability to capture complex +and multi-modal distributions. + +
+
+ comment: 25 pages, 16 figures, 1 Table +
+
+
+
+
+ + ♻ ☆ New Online Communities: Graph Deep Learning on Anonymous Voting Networks + to Identify Sybils in Polycentric Governance + + +
+ This research examines the polycentric governance of digital assets in
+blockchain-based Decentralized Autonomous Organizations (DAOs). It offers a
+theoretical framework and addresses a critical challenge facing decentralized
+governance by developing a method to identify sybils, or spurious identities.
+Sybils pose significant threats to the organizational sustainability of DAOs
+and other commons-based online communities, and threat models are identified.
+The experimental method uses graph deep learning techniques to identify sybil
+activity in a DAO governance dataset (snapshot.org). Specifically, a Graph
+Convolutional Neural Network (GCNN) learned voting behaviours and a fast
+k-means vector clustering algorithm (FAISS) used high-dimensional embeddings to
+identify similar nodes in a graph. The results reveal that deep learning can
+effectively identify sybils, reducing the voting graph by 2-5%. This research
+underscores the importance of sybil resistance in DAOs and offers a novel
+perspective on decentralized governance, informing future policy, regulation,
+and governance practices.
+
+
+
+
+
+
+ + ♻ ☆ STELLA: Continual Audio-Video Pre-training with Spatio-Temporal + Localized Alignment + + +
+ Continuously learning a variety of audio-video semantics over time is crucial
+for audio-related reasoning tasks in our ever-evolving world. However, this is
+a nontrivial problem and poses two critical challenges: sparse spatio-temporal
+correlation between audio-video pairs and multimodal correlation overwriting
+that forgets audio-video relations. To tackle this problem, we propose a new
+continual audio-video pre-training method with two novel ideas: (1) Localized
+Patch Importance Scoring: we introduce a multimodal encoder to determine the
+importance score for each patch, emphasizing semantically intertwined
+audio-video patches. (2) Replay-guided Correlation Assessment: to reduce the
+corruption of previously learned audiovisual knowledge due to drift, we propose
+to assess the correlation of the current patches on the past steps to identify
+the patches exhibiting high correlations with the past steps. Based on the
+results from the two ideas, we perform probabilistic patch selection for
+effective continual audio-video pre-training. Experimental validation on
+multiple benchmarks shows that our method achieves a 3.69%p relative
+performance gain in zero-shot retrieval tasks compared to strong continual
+learning baselines, while reducing memory consumption by ~45%.
+
+
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ The Benefits of Being Categorical Distributional: Uncertainty-aware + Regularized Exploration in Reinforcement Learning + + +
+ The theoretical advantages of distributional reinforcement learning~(RL) over
+classical RL remain elusive despite its remarkable empirical performance.
+Starting from Categorical Distributional RL~(CDRL), we attribute the potential
+superiority of distributional RL to a derived distribution-matching
+regularization by applying a return density function decomposition technique.
+This unexplored regularization in the distributional RL context is aimed at
+capturing additional return distribution information beyond its expectation
+alone, contributing to an augmented reward signal in the policy optimization.
+Compared with the entropy regularization in MaxEnt RL that explicitly optimizes
+the policy to encourage exploration, the resulting regularization in CDRL
+implicitly optimizes policies guided by the new reward signal to align with the
+uncertainty of target return distributions, leading to an uncertainty-aware
+exploration effect. Finally, extensive experiments substantiate the importance
+of this uncertainty-aware regularization in distributional RL on the empirical
+benefits over classical RL.
+
+
+
+
+
+
+ + ♻ ☆ NoFunEval: Funny How Code LMs Falter on Requirements Beyond Functional + Correctness + + +
+ Existing evaluation benchmarks of language models of code (code LMs) focus
+almost exclusively on whether the LMs can generate functionally-correct code.
+In real-world software engineering, developers think beyond functional
+correctness. They have requirements on "how" a functionality should be
+implemented to meet overall system design objectives like efficiency, security,
+and maintainability. They would also trust the code LMs more if the LMs
+demonstrate robust understanding of requirements and code semantics.
+ We propose a new benchmark NoFunEval to evaluate code LMs on non-functional
+requirements and simple classification instances for both functional and
+non-functional requirements. We propose a prompting method, Coding Concepts
+(CoCo), as a way for a developer to communicate the domain knowledge to the
+LMs. We conduct an extensive evaluation of twenty-two code LMs. Our finding is
+that they generally falter when tested on our benchmark, hinting at fundamental
+blindspots in their training setups. Surprisingly, even the classification
+accuracy on functional-correctness instances derived from the popular HumanEval
+benchmark is low, calling into question the depth of their comprehension and
+the source of their success in generating functionally-correct code in the
+first place. We will release our benchmark and evaluation scripts publicly at
+https://aka.ms/NoFunEval.
+
+
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Foundation Model's Embedded Representations May Detect Distribution + Shift + + +
+ Sampling biases can cause distribution shifts between train and test datasets +for supervised learning tasks, obscuring our ability to understand the +generalization capacity of a model. This is especially important considering +the wide adoption of pre-trained foundational neural networks -- whose behavior +remains poorly understood -- for transfer learning (TL) tasks. We present a +case study for TL on the Sentiment140 dataset and show that many pre-trained +foundation models encode different representations of Sentiment140's manually +curated test set $M$ from the automatically labeled training set $P$, +confirming that a distribution shift has occurred. We argue training on $P$ and +measuring performance on $M$ is a biased measure of generalization. Experiments +on pre-trained GPT-2 show that the features learnable from $P$ do not improve +(and in fact hamper) performance on $M$. Linear probes on pre-trained GPT-2's +representations are robust and may even outperform overall fine-tuning, +implying a fundamental importance for discerning distribution shift in +train/test splits for model interpretation. + +
+
+ comment: 17 pages, 8 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ How Powerful are Decoder-Only Transformer Neural Models? + + +
+ In this article we prove that the general transformer neural model
+undergirding modern large language models (LLMs) is Turing complete under
+reasonable assumptions. This is the first work to directly address the Turing
+completeness of the underlying technology employed in GPT-x, as past work has
+focused on the more expressive, full auto-encoder transformer architecture.
+From this theoretical analysis, we show that the sparsity/compressibility of
+the word embedding is an important consideration for Turing completeness to
+hold. We also show that Transformers are a variant of the B machines studied by
+Hao Wang.
+
+
+
+
+
+
+ + ♻ ☆ Machine Learning with Requirements: a Manifesto + + +
+ In recent years, machine learning has made great advancements that have been
+at the root of many breakthroughs in different application domains. However, it
+is still an open issue how to make machine learning models applicable to
+high-stakes or safety-critical application domains, as they can often be
+brittle and unreliable. In this paper, we argue that requirements definition
+and satisfaction can go a long way toward making machine learning models even
+better suited to the real world, especially in critical domains. To this end,
+we present two problems in which (i) requirements arise naturally, (ii) machine
+learning models are or can be fruitfully deployed, and (iii) neglecting the
+requirements can have dramatic consequences. We show how the requirements
+specification can be fruitfully integrated into the standard machine learning
+development pipeline, proposing a novel pyramid development process in which
+requirements definition may impact all the subsequent phases in the pipeline,
+and vice versa.
+
+
+
+
+
+
+ + ♻ ☆ Distributional Reinforcement Learning by Sinkhorn Divergence + + +
+ The empirical success of distributional reinforcement learning~(RL) highly +depends on the distribution representation and the choice of distribution +divergence. In this paper, we propose \textit{Sinkhorn distributional +RL~(SinkhornDRL)} that learns unrestricted statistics from return distributions +and leverages Sinkhorn divergence to minimize the difference between current +and target Bellman return distributions. Theoretically, we prove the +contraction properties of SinkhornDRL, consistent with the interpolation nature +of Sinkhorn divergence between Wasserstein distance and Maximum Mean +Discrepancy~(MMD). We also establish the equivalence between Sinkhorn +divergence and a regularized MMD with a regularized Moment Matching behavior, +contributing to explaining the superiority of SinkhornDRL. Empirically, we show +that SinkhornDRL is consistently better or comparable to existing algorithms on +the Atari games suite. + +
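+
+ For intuition, the entropy-regularized OT cost between two empirical return
+samples can be computed with classic Sinkhorn iterations; a minimal sketch
+(the paper minimizes the debiased Sinkhorn divergence between Bellman return
+distributions, which this does not reproduce):
+
+```python
+import numpy as np
+
+def sinkhorn_cost(xs, ys, eps=1.0, iters=200):
+    # Entropy-regularized optimal transport between two empirical samples.
+    C = (xs[:, None] - ys[None, :]) ** 2           # squared-distance cost
+    K = np.exp(-C / eps)
+    a = np.full(len(xs), 1.0 / len(xs))            # uniform marginals
+    b = np.full(len(ys), 1.0 / len(ys))
+    u, v = np.ones_like(a), np.ones_like(b)
+    for _ in range(iters):                         # Sinkhorn iterations
+        u = a / (K @ v)
+        v = b / (K.T @ u)
+    P = u[:, None] * K * v[None, :]                # transport plan
+    return (P * C).sum()
+
+rng = np.random.default_rng(0)
+print(sinkhorn_cost(rng.normal(0, 1, 64), rng.normal(1, 1, 64)))
+```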
+
+ comment: arXiv admin note: text overlap with arXiv:2110.03155 +
+
+
+
+
+ + ♻ ☆ CroissantLLM: A Truly Bilingual French-English Language Model + + +
+ We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T +English and French tokens, to bring to the research and industrial community a +high-performance, fully open-sourced bilingual model that runs swiftly on +consumer-grade local hardware. To that end, we pioneer the approach of training +an intrinsically bilingual model with a 1:1 English-to-French pretraining data +ratio, a custom tokenizer, and bilingual finetuning datasets. We release the +training dataset, notably containing a French split with manually curated, +high-quality, and varied data sources. To assess performance outside of +English, we craft a novel benchmark, FrenchBench, consisting of an array of +classification and generation tasks, covering various orthogonal aspects of +model performance in the French Language. Additionally, rooted in transparency +and to foster further Large Language Model research, we release codebases, and +dozens of checkpoints across various model sizes, training data distributions, +and training steps, as well as fine-tuned Chat models, and strong translation +models. We evaluate our model through the FMTI framework, and validate 81 % of +the transparency criteria, far beyond the scores of even most open initiatives. +This work enriches the NLP landscape, breaking away from previous +English-centric work in order to strengthen our understanding of +multilinguality in language models. + +
+
+
+
+
+ + ♻ ☆ Deep graph kernel point processes + + +
+ Point process models are widely used for continuous asynchronous event data,
+where each data point includes time and additional information called "marks",
+which can be locations, nodes, or event types. This paper presents a novel
+point process model for discrete event data over graphs, where the event
+interaction occurs within a latent graph structure. Our model builds upon
+Hawkes's classic influence kernel-based formulation in the original
+self-exciting point processes work to capture the influence of historical
+events on future events' occurrence. The key idea is to represent the influence
+kernel by Graph Neural Networks (GNN) to capture the underlying graph structure
+while harvesting the strong representation power of GNNs. Compared with prior
+works focusing on directly modeling the conditional intensity function using
+neural networks, our kernel representation captures the repeated event
+influence patterns more effectively by combining statistical and deep models,
+achieving better model estimation/learning efficiency and superior predictive
+performance. Our work significantly extends the existing deep spatio-temporal
+kernel for point process data, which is inapplicable to our setting due to the
+fundamental difference in the nature of the observation space being Euclidean
+rather than a graph. We present comprehensive experiments on synthetic and
+real-world data to show the superior performance of the proposed approach
+against the state-of-the-art in predicting future events and uncovering the
+relational structure among data.
+
+
+
+
+
+
+ + ♻ ☆ CAST: Cluster-Aware Self-Training for Tabular Data + + +
+ Self-training has gained traction because of its simplicity and versatility,
+yet it is vulnerable to noisy pseudo-labels caused by erroneous confidence.
+Several solutions have been proposed to handle the problem, but they require
+significant modifications in self-training algorithms or model architecture,
+and most have limited applicability in tabular domains. To address this issue,
+we explore a novel direction of reliable confidence in self-training contexts
+and conclude that the confidence, which represents the value of the
+pseudo-label, should be aware of the cluster assumption. In this regard, we
+propose Cluster-Aware Self-Training (CAST) for tabular data, which enhances
+existing self-training algorithms at a negligible cost without significant
+modifications. Concretely, CAST regularizes the confidence of the classifier
+by leveraging the local density for each class in the labeled training data,
+forcing pseudo-labels in low-density regions to have lower confidence.
+Extensive empirical evaluations on up to 21 real-world datasets confirm not
+only the superior performance of CAST but also its robustness in various
+setups in self-training contexts.
+
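+ A minimal sketch of the kind of density-aware confidence regularization the
+abstract describes, assuming a per-class kernel density estimate fitted on the
+labeled data (the bandwidth and the exact reweighting rule here are our
+assumptions, not CAST's actual formulation):
+
+     import numpy as np
+     from sklearn.neighbors import KernelDensity
+
+     def density_aware_confidence(probs, X_unlabeled, kdes):
+         """probs: (n, k) softmax outputs; kdes: one fitted KDE per class."""
+         log_dens = np.stack([kde.score_samples(X_unlabeled) for kde in kdes],
+                             axis=1)               # (n, k) log-densities
+         dens = np.exp(log_dens)
+         dens /= dens.sum(axis=1, keepdims=True)   # normalize across classes
+         reweighted = probs * dens                 # low density -> low confidence
+         return reweighted / reweighted.sum(axis=1, keepdims=True)
+
+     # e.g. fit one KDE per class on the labeled training data:
+     # kdes = [KernelDensity(bandwidth=0.5).fit(X_lab[y_lab == c])
+     #         for c in range(n_classes)]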
+
+ comment: 10 pages for main body, and 16 additional pages for reference and + appendix +
+
+
+
+
+ + ♻ ☆ An Accurate and Low-Parameter Machine Learning Architecture for Next + Location Prediction + + +
+ Next location prediction is a discipline that involves predicting a user's
+next location. Its applications include resource allocation, quality of
+service, energy efficiency, and traffic management. This paper proposes an
+energy-efficient, small, and low-parameter machine learning (ML) architecture
+for accurate next location prediction, deployable on modest base stations and
+edge devices. To accomplish this, we ran a hundred hyperparameter experiments
+on the full human mobility patterns of an entire city to determine an ML
+architecture that reached a plateau of accuracy with the fewest model
+parameters. We successfully reduced the number of model parameters within
+published ML architectures from 202 million down to 2 million. This reduced
+the total size of the model parameters from 791 MB down to 8 MB. Additionally,
+this decreased the training time by a factor of four, reduced the amount of
+graphics processing unit (GPU) memory needed for training by a factor of
+twenty, and increased the overall accuracy from 80.16% to 82.54%. This
+improvement allows modest base stations and edge devices, which do not have a
+large amount of memory or storage, to deploy and utilize the proposed ML
+architecture for next location prediction.
+
+
+ comment: Paper was accepted and presented in person at the 2023 IEEE Future + Networks World Forum, in Baltimore, Maryland, USA +
+
+
+
+
+ + ♻ ☆ Fix-Con: Automatic Fault Localization and Repair of Deep Learning Model + Conversions + + +
+ Converting deep learning models between frameworks is a common step to
+maximize model compatibility across devices and leverage optimization features
+that may be exclusively provided in one deep learning framework. However, this
+conversion process may be riddled with bugs, making the converted models
+either undeployable or problematic, considerably degrading their prediction
+correctness.
+ We propose Fix-Con, an automated approach for fault localization and repair
+during model conversion between deep learning frameworks. Fix-Con is capable
+of detecting and fixing faults introduced in model input, parameters,
+hyperparameters, and the model graph during conversion.
+ Fix-Con uses a set of fault types, mined from a survey of reported conversion
+issues, to localize potential conversion faults in the converted target model,
+and then repairs them appropriately, e.g., by replacing the parameters of the
+target model with those from the source model. This is done iteratively for
+every image in the dataset on which the output labels of the source model and
+the converted target model differ, until all differences are resolved. We
+evaluate the effectiveness of Fix-Con in fixing model conversion bugs of three
+widely used image recognition models converted across four different deep
+learning frameworks. Overall, Fix-Con was able to either completely repair or
+significantly improve the performance of 14 out of the 15 erroneous conversion
+cases.
+
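+ A hedged sketch of the iterate-compare-repair loop outlined above;
+`localize_fault` and `apply_fix` are hypothetical stand-ins for Fix-Con's
+internals, not its real API:
+
+     import numpy as np
+
+     def repair_conversion(source_model, target_model, dataset,
+                           localize_fault, apply_fix, max_rounds=10):
+         for _ in range(max_rounds):
+             # images on which source and converted target disagree
+             diffs = [x for x in dataset
+                      if np.argmax(source_model(x)) != np.argmax(target_model(x))]
+             if not diffs:
+                 return target_model       # all label differences resolved
+             fault = localize_fault(source_model, target_model, diffs)
+             target_model = apply_fix(target_model, source_model, fault)
+         return target_model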
+
+ comment: 12 pages, 3 figures, 4 tables, 1 algorithm +
+
+
+
+
+ + ♻ ☆ Analog-digital Scheduling for Federated Learning: A + Communication-Efficient Approach + + +
+ Over-the-air (OTA) computation has recently emerged as a
+communication-efficient Federated Learning (FL) paradigm to train machine
+learning models over wireless networks. However, its performance is limited by
+the device with the worst SNR, resulting in fast yet noisy updates. On the
+other hand, allocating orthogonal resource blocks (RB) to individual devices
+via digital channels mitigates the noise problem, at the cost of increased
+communication latency. In this paper, we address this discrepancy and present
+ADFL, a novel Analog-Digital FL scheme: in each round, the parameter server
+(PS) schedules each device to either upload its gradient via the analog OTA
+scheme or transmit its quantized gradient over an orthogonal RB using the
+"digital" scheme. Focusing on a single FL round, we cast the optimal
+scheduling problem as the minimization of the mean squared error (MSE) on the
+estimated global gradient at the PS, subject to a delay constraint, yielding
+the optimal device scheduling configuration and quantization bits for the
+digital devices. Our simulation results show that ADFL, by scheduling most of
+the devices in the OTA scheme while also occasionally employing the digital
+scheme for a few devices, consistently outperforms OTA-only and digital-only
+schemes, in both i.i.d. and non-i.i.d. settings.
+
+
+ comment: Appeared at the 2023 Asilomar Conference on Signals, Systems, and + Computers +
+
+
+
+
+ + ♻ ☆ MagiCapture: High-Resolution Multi-Concept Portrait Customization + + +
+ Large-scale text-to-image models including Stable Diffusion are capable of +generating high-fidelity photorealistic portrait images. There is an active +research area dedicated to personalizing these models, aiming to synthesize +specific subjects or styles using provided sets of reference images. However, +despite the plausible results from these personalization methods, they tend to +produce images that often fall short of realism and are not yet on a +commercially viable level. This is particularly noticeable in portrait image +generation, where any unnatural artifact in human faces is easily discernible +due to our inherent human bias. To address this, we introduce MagiCapture, a +personalization method for integrating subject and style concepts to generate +high-resolution portrait images using just a few subject and style references. +For instance, given a handful of random selfies, our fine-tuned model can +generate high-quality portrait images in specific styles, such as passport or +profile photos. The main challenge with this task is the absence of ground +truth for the composed concepts, leading to a reduction in the quality of the +final output and an identity shift of the source subject. To address these +issues, we present a novel Attention Refocusing loss coupled with auxiliary +priors, both of which facilitate robust learning within this weakly supervised +learning setting. Our pipeline also includes additional post-processing steps +to ensure the creation of highly realistic outputs. MagiCapture outperforms +other baselines in both quantitative and qualitative evaluations and can also +be generalized to other non-human objects. + +
+
+ comment: 18 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Almost Equivariance via Lie Algebra Convolutions + + +
+ Recently, the equivariance of models with respect to a group action has +become an important topic of research in machine learning. Analysis of the +built-in equivariance of existing neural network architectures, as well as the +study of building models that explicitly "bake in" equivariance, have become +significant research areas in their own right. However, imbuing an architecture +with a specific group equivariance imposes a strong prior on the types of data +transformations that the model expects to see. While strictly-equivariant +models enforce symmetries, real-world data does not always conform to such +strict equivariances. In such cases, the prior of strict equivariance can +actually prove too strong and cause models to underperform. Therefore, in this +work we study a closely related topic, that of almost equivariance. We provide +a definition of almost equivariance and give a practical method for encoding +almost equivariance in models by appealing to the Lie algebra of a Lie group. +Specifically, we define Lie algebra convolutions and demonstrate that they +offer several benefits over Lie group convolutions, including being +well-defined for non-compact Lie groups having non-surjective exponential map. +From there, we demonstrate connections between the notions of equivariance and +isometry and those of almost equivariance and almost isometry. We prove two +existence theorems, one showing the existence of almost isometries within +bounded distance of isometries of a manifold, and another showing the converse +for Hilbert spaces. We extend these theorems to prove the existence of almost +equivariant manifold embeddings within bounded distance of fully equivariant +embedding functions, subject to certain constraints on the group action and the +function class. Finally, we demonstrate the validity of our approach by +benchmarking against datasets in fully equivariant and almost equivariant +settings. + +
+
+
+
+
+ + ♻ ☆ Improving Monte Carlo Evaluation with Offline Data + + +
+ Most reinforcement learning practitioners evaluate their policies with online +Monte Carlo estimators for either hyperparameter tuning or testing different +algorithmic design choices, where the policy is repeatedly executed in the +environment to get the average outcome. Such massive interactions with the +environment are prohibitive in many scenarios. In this paper, we propose novel +methods that improve the data efficiency of online Monte Carlo estimators while +maintaining their unbiasedness. We first propose a tailored closed-form +behavior policy that provably reduces the variance of an online Monte Carlo +estimator. We then design efficient algorithms to learn this closed-form +behavior policy from previously collected offline data. Theoretical analysis is +provided to characterize how the behavior policy learning error affects the +amount of reduced variance. Compared with previous works, our method achieves +better empirical performance in a broader set of environments, with fewer +requirements for offline data. + +
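+ For concreteness, a standard ordinary importance sampling estimator of the
+kind this line of work builds on (our own simplified sketch; the paper's
+contribution is learning the variance-reducing behavior policy, not this
+estimator itself):
+
+     import numpy as np
+
+     def ois_estimate(trajectories, target_prob, behavior_prob):
+         """trajectories: lists of (state, action, reward) steps collected under
+         the behavior policy; *_prob: functions (state, action) -> probability."""
+         values = []
+         for traj in trajectories:
+             rho, ret = 1.0, 0.0
+             for s, a, r in traj:
+                 rho *= target_prob(s, a) / behavior_prob(s, a)  # importance ratio
+                 ret += r
+             values.append(rho * ret)   # unbiased for the target policy's return
+         return np.mean(values)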
+
+
+
+
+ + ♻ ☆ How to escape sharp minima with random perturbations + + +
+ Modern machine learning applications have witnessed the remarkable success of +optimization algorithms that are designed to find flat minima. Motivated by +this design choice, we undertake a formal study that (i) formulates the notion +of flat minima, and (ii) studies the complexity of finding them. Specifically, +we adopt the trace of the Hessian of the cost function as a measure of +flatness, and use it to formally define the notion of approximate flat minima. +Under this notion, we then analyze algorithms that find approximate flat minima +efficiently. For general cost functions, we discuss a gradient-based algorithm +that finds an approximate flat local minimum efficiently. The main component of +the algorithm is to use gradients computed from randomly perturbed iterates to +estimate a direction that leads to flatter minima. For the setting where the +cost function is an empirical risk over training data, we present a faster +algorithm that is inspired by a recently proposed practical algorithm called +sharpness-aware minimization, supporting its success in practice. + +
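+ A toy sketch of the core idea, under our own assumptions: averaging gradients
+taken at randomly perturbed iterates approximates the gradient of a smoothed
+loss, biasing descent toward flatter regions (sigma and the sample count are
+illustrative):
+
+     import numpy as np
+
+     def perturbed_gradient(grad_fn, w, sigma=0.01, n_samples=8, rng=None):
+         rng = rng or np.random.default_rng()
+         g = np.zeros_like(w)
+         for _ in range(n_samples):
+             xi = rng.normal(0.0, sigma, size=w.shape)  # random perturbation
+             g += grad_fn(w + xi)
+         return g / n_samples          # ~ gradient of a Gaussian-smoothed loss
+
+     # descent step: w = w - lr * perturbed_gradient(grad_fn, w)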
+
+ comment: Comments would be appreciated! +
+
+
+
+
+ + ♻ ☆ On the Identification and Optimization of Nonsmooth Superposition + Operators in Semilinear Elliptic PDEs + + +
+ We study an infinite-dimensional optimization problem that aims to identify +the Nemytskii operator in the nonlinear part of a prototypical semilinear +elliptic partial differential equation (PDE) which minimizes the distance +between the PDE-solution and a given desired state. In contrast to previous +works, we consider this identification problem in a low-regularity regime in +which the function inducing the Nemytskii operator is a-priori only known to be +an element of $H^1_{loc}(\mathbb{R})$. This makes the studied problem class a +suitable point of departure for the rigorous analysis of training problems for +learning-informed PDEs in which an unknown superposition operator is +approximated by means of a neural network with nonsmooth activation functions +(ReLU, leaky-ReLU, etc.). We establish that, despite the low regularity of the +controls, it is possible to derive a classical stationarity system for local +minimizers and to solve the considered problem by means of a gradient +projection method. The convergence of the resulting algorithm is proven in the +function space setting. It is also shown that the established first-order +necessary optimality conditions imply that locally optimal superposition +operators share various characteristic properties with commonly used activation +functions: They are always sigmoidal, continuously differentiable away from the +origin, and typically possess a distinct kink at zero. The paper concludes with +numerical experiments which confirm the theoretical findings. + +
+
+ comment: Minor revision; to appear in ESAIM COCV +
+
+
+
+
+ + ♻ ☆ Online Variational Sequential Monte Carlo + + +
+ Being the most classical generative models for serial data, state-space
+models (SSM) are fundamental in AI and statistical machine learning. In SSM,
+any form of parameter learning or latent state inference typically involves
+the computation of complex latent-state posteriors. In this work, we build
+upon the variational sequential Monte Carlo (VSMC) method, which provides
+computationally efficient and accurate model parameter estimation and Bayesian
+latent-state inference by combining particle methods and variational
+inference. While standard VSMC operates in the offline mode, by repeatedly
+re-processing a given batch of data, we distribute the approximation of the
+gradient of the VSMC surrogate ELBO in time using stochastic approximation,
+allowing for online learning in the presence of streams of data. This results
+in an algorithm, online VSMC, that is capable of performing efficiently,
+entirely on-the-fly, both parameter estimation and particle proposal
+adaptation. In addition, we provide rigorous theoretical results describing
+the algorithm's convergence properties as the number of observations tends to
+infinity, as well as numerical illustrations of its excellent convergence
+properties and usefulness also in batch-processing settings.
+
+
+ comment: In this version there are additional simulations in Section 5.1, some + added references, and minor typos fixed +
+
+
+
+
+ + ♻ ☆ Task Aware Dreamer for Task Generalization in Reinforcement Learning + + +
+ A long-standing goal of reinforcement learning is to acquire agents that can +learn on training tasks and generalize well on unseen tasks that may share a +similar dynamic but with different reward functions. The ability to generalize +across tasks is important as it determines an agent's adaptability to +real-world scenarios where reward mechanisms might vary. In this work, we first +show that training a general world model can utilize similar structures in +these tasks and help train more generalizable agents. Extending world models +into the task generalization setting, we introduce a novel method named Task +Aware Dreamer (TAD), which integrates reward-informed features to identify +consistent latent characteristics across tasks. Within TAD, we compute the +variational lower bound of sample data log-likelihood, which introduces a new +term designed to differentiate tasks using their states, as the optimization +objective of our reward-informed world models. To demonstrate the advantages of +the reward-informed policy in TAD, we introduce a new metric called Task +Distribution Relevance (TDR) which quantitatively measures the relevance of +different tasks. For tasks exhibiting a high TDR, i.e., the tasks differ +significantly, we illustrate that Markovian policies struggle to distinguish +them, thus it is necessary to utilize reward-informed policies in TAD. +Extensive experiments in both image-based and state-based tasks show that TAD +can significantly improve the performance of handling different tasks +simultaneously, especially for those with high TDR, and display a strong +generalization ability to unseen tasks. + +
+
+
+
+
+ + ♻ ☆ Conditional Generative Representation for Black-Box Optimization with + Implicit Constraints + + +
+ Black-box optimization (BBO) has become increasingly relevant for tackling
+complex decision-making problems, especially in public policy domains such as
+police districting. However, its broader application in public policymaking is
+hindered by the complexity of defining feasible regions and the
+high-dimensionality of decisions. This paper introduces a novel BBO framework,
+termed Conditional and Generative Black-box Optimization (CageBO). This
+approach leverages a conditional variational autoencoder to learn the
+distribution of feasible decisions, enabling a two-way mapping between the
+original decision space and a simplified, constraint-free latent space. CageBO
+efficiently handles the implicit constraints often found in public policy
+applications, allowing for optimization in the latent space while evaluating
+objectives in the original space. We validate our method through a case study
+on large-scale police districting problems in Atlanta, Georgia. Our results
+reveal that CageBO offers notable improvements in performance and efficiency
+compared to the baselines.
+
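+ A simplified sketch of optimizing in a constraint-free latent space while
+evaluating in the original decision space; here plain random search over the
+latent prior stands in for CageBO's Bayesian optimization loop, and `decoder`
+and `objective` are placeholders:
+
+     import numpy as np
+
+     def latent_space_optimize(decoder, objective, latent_dim,
+                               n_iters=200, rng=None):
+         rng = rng or np.random.default_rng()
+         best_x, best_val = None, -np.inf
+         for _ in range(n_iters):
+             z = rng.normal(size=latent_dim)   # propose in the latent space
+             x = decoder(z)                    # map to a feasible decision
+             val = objective(x)                # black-box evaluation in x-space
+             if val > best_val:
+                 best_x, best_val = x, val
+         return best_x, best_val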
+
+
+
+
+ + ♻ ☆ Random Exploration in Bayesian Optimization: Order-Optimal Regret and + Computational Efficiency + + +
+ We consider Bayesian optimization using Gaussian Process models, also +referred to as kernel-based bandit optimization. We study the methodology of +exploring the domain using random samples drawn from a distribution. We show +that this random exploration approach achieves the optimal error rates. Our +analysis is based on novel concentration bounds in an infinite dimensional +Hilbert space established in this work, which may be of independent interest. +We further develop an algorithm based on random exploration with domain +shrinking and establish its order-optimal regret guarantees under both +noise-free and noisy settings. In the noise-free setting, our analysis closes +the existing gap in regret performance and thereby resolves a COLT open +problem. The proposed algorithm also enjoys a computational advantage over +prevailing methods due to the random exploration that obviates the expensive +optimization of a non-convex acquisition function for choosing the query points +at each iteration. + +
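+ A minimal sketch with scikit-learn's GP of the exploration scheme the
+abstract describes: query at random samples, then report the minimizer of the
+posterior mean (the kernel, grid, and budget are illustrative assumptions, not
+the paper's algorithm with domain shrinking):
+
+     import numpy as np
+     from sklearn.gaussian_process import GaussianProcessRegressor
+     from sklearn.gaussian_process.kernels import RBF
+
+     def random_exploration(f, lo, hi, n_queries=30, rng=None):
+         rng = rng or np.random.default_rng(0)
+         X = rng.uniform(lo, hi, size=(n_queries, 1))   # random exploration
+         y = np.array([f(x[0]) for x in X])
+         gp = GaussianProcessRegressor(kernel=RBF()).fit(X, y)
+         grid = np.linspace(lo, hi, 512).reshape(-1, 1)
+         return grid[np.argmin(gp.predict(grid))]   # posterior-mean minimizer
+
+     x_star = random_exploration(lambda x: (x - 0.3) ** 2, 0.0, 1.0)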
+
+
+
+
+ + ♻ ☆ Are Normalizing Flows the Key to Unlocking the Exponential Mechanism? A + Path through the Accuracy-Privacy Ceiling Constraining Differentially Private + ML + + +
+ The state of the art and de facto standard for differentially private machine
+learning (ML) is differentially private stochastic gradient descent (DPSGD).
+Yet, the method is inherently wasteful. By adding noise to every gradient, it
+diminishes the overall privacy with every gradient step. Despite 15 years of
+fruitful research advancing the composition theorems, sub-sampling methods,
+and implementation techniques, adequate accuracy and privacy are often
+unattainable with current private ML methods. Meanwhile, the Exponential
+Mechanism (ExpM), designed for private optimization, has been historically
+sidelined from privately training modern ML algorithms primarily because ExpM
+requires sampling from a historically intractable density. Despite the recent
+discovery of Normalizing Flow models (NFs), expressive deep networks for
+approximating intractable distributions, ExpM remains in the background. Our
+position is that leveraging NFs to circumvent historic obstructions of ExpM is
+a potentially transformational solution for differentially private ML worth
+attention. We introduce a new training method, ExpM+NF, as a potential
+alternative to DPSGD, and we provide experiments with logistic regression and
+a modern deep learning model to test whether training via ExpM+NF is viable
+with "good" privacy parameters. Under the assumption that the NF output
+distribution is the ExpM distribution, we are able to achieve $\varepsilon$ as
+low as $1\mathrm{e}{-3}$ -- three orders of magnitude stronger privacy with
+similar accuracy. This work outlines a new avenue for advancing differentially
+private ML, namely discovering NF approximation guarantees. Code to be
+provided after review.
+
+
+
+
+
+ + ♻ ☆ On the Convergence of Federated Averaging under Partial Participation + for Over-parameterized Neural Networks + + +
+ Federated learning (FL) is a widely employed distributed paradigm for
+collaboratively training machine learning models from multiple clients without
+sharing local data. In practice, FL encounters challenges in dealing with
+partial client participation due to limited bandwidth, intermittent
+connection, and strict synchronized delay. Simultaneously, there exist few
+theoretical convergence guarantees in this practical setting, especially when
+associated with the non-convex optimization of neural networks. To bridge this
+gap, we focus on the training problem of the federated averaging (FedAvg)
+method for two canonical models: a deep linear network and a two-layer ReLU
+network. Under the over-parameterized assumption, we provably show that FedAvg
+converges to a global minimum at a linear rate
+$\mathcal{O}\left((1-\frac{\min_{i \in [t]}|S_i|}{N^2})^t\right)$ after $t$
+iterations, where $N$ is the number of clients and $|S_i|$ is the number of
+participating clients in the $i$-th iteration. Experimental evaluations
+confirm our theoretical results.
+
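+ A quick numeric reading of the stated rate (our own worked example): with $N$
+clients and at least `min_participants` participating each round, the
+per-round contraction factor is close to one, so many rounds are needed:
+
+     N = 100                          # number of clients
+     min_participants = 10            # min_i |S_i| over the first t rounds
+     rate = 1 - min_participants / N**2
+     for t in (10, 100, 1000):
+         print(t, rate ** t)          # residual factor after t iterations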
+
+
+
+
+ + ♻ ☆ Scaling Sparse Fine-Tuning to Large Language Models + + +
+ Large Language Models (LLMs) are difficult to fully fine-tune (e.g., with +instructions or human feedback) due to their sheer number of parameters. A +family of parameter-efficient sparse fine-tuning methods have proven promising +in terms of performance but their memory requirements increase proportionally +to the size of the LLMs. In this work, we scale sparse fine-tuning to +state-of-the-art LLMs like LLaMA 2 7B and 13B. We propose SpIEL, a novel sparse +fine-tuning method which, for a desired density level, maintains an array of +parameter indices and the deltas of these parameters relative to their +pretrained values. It iterates over: (a) updating the active deltas, (b) +pruning indices (based on the change of magnitude of their deltas) and (c) +regrowth of indices. For regrowth, we explore two criteria based on either the +accumulated gradients of a few candidate parameters or their approximate +momenta estimated using the efficient SM3 optimizer. We experiment with +instruction-tuning of LLMs on standard dataset mixtures, finding that SpIEL is +often superior to popular parameter-efficient fine-tuning methods like LoRA +(low-rank adaptation) in terms of performance and comparable in terms of run +time. We additionally show that SpIEL is compatible with both quantization and +efficient optimizers, to facilitate scaling to ever-larger model sizes. We +release the code for SpIEL at https://github.com/AlanAnsell/peft and for the +instruction-tuning experiments at https://github.com/ducdauge/sft-llm. + +
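+ A simplified sketch of one update/prune/regrow cycle described above; array
+names and the plain SGD update are ours, not SpIEL's actual implementation
+(which also handles duplicate indices, optimizer state, and quantization):
+
+     import numpy as np
+
+     def spiel_step(idx, deltas, grads_active, cand_idx, grads_cand,
+                    lr=0.01, k_swap=4):
+         deltas -= lr * grads_active                        # (a) update deltas
+         drop = np.argsort(np.abs(deltas))[:k_swap]         # (b) prune smallest
+         grow = np.argsort(-np.abs(grads_cand))[:k_swap]    # (c) regrow by grad
+         idx[drop] = cand_idx[grow]
+         deltas[drop] = 0.0              # regrown deltas restart from zero
+         return idx, deltas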
+
+
+
+
+ + ♻ ☆ Unbalanced and Light Optimal Transport + + +
+ While the field of continuous Entropic Optimal Transport (EOT) has been
+actively developing in recent years, it has become evident that the classic
+EOT problem is prone to different issues such as sensitivity to outliers and
+imbalance of classes in the source and target measures. This fact inspired the
+development of solvers that deal with the unbalanced EOT (UEOT) problem -- the
+generalization of EOT allowing for mitigating the mentioned issues by relaxing
+the marginal constraints. Surprisingly, it turns out that the existing solvers
+are either based on heuristic principles or heavyweight, with complex
+optimization objectives involving several neural networks. We address this
+challenge and propose a novel theoretically-justified and lightweight
+unbalanced EOT solver. Our advancement consists in developing a novel view on
+the optimization of the UEOT problem yielding a tractable and non-minimax
+optimization objective. We show that, combined with a light parametrization
+recently proposed in the field, our objective leads to a fast, simple, and
+effective solver. It allows solving the continuous UEOT problem in minutes on
+a CPU. We provide illustrative examples of the performance of our solver.
+
+
+
+
+
+ + ♻ ☆ ${\rm E}(3)$-Equivariant Actor-Critic Methods for Cooperative + Multi-Agent Reinforcement Learning + + +
+ Identification and analysis of symmetrical patterns in the natural world have +led to significant discoveries across various scientific fields, such as the +formulation of gravitational laws in physics and advancements in the study of +chemical structures. In this paper, we focus on exploiting Euclidean symmetries +inherent in certain cooperative multi-agent reinforcement learning (MARL) +problems and prevalent in many applications. We begin by formally +characterizing a subclass of Markov games with a general notion of symmetries +that admits the existence of symmetric optimal values and policies. Motivated +by these properties, we design neural network architectures with symmetric +constraints embedded as an inductive bias for multi-agent actor-critic methods. +This inductive bias results in superior performance in various cooperative MARL +benchmarks and impressive generalization capabilities such as zero-shot +learning and transfer learning in unseen scenarios with repeated symmetric +patterns. The code is available at: https://github.com/dchen48/E3AC. + +
+
+
+
+
+ + ♻ ☆ Understanding Grokking Through A Robustness Viewpoint + + +
+ Recently, an interesting phenomenon called grokking has gained much
+attention, where generalization occurs long after the models have initially
+overfitted the training data. We try to understand this seemingly strange
+phenomenon through the robustness of the neural network. From a robustness
+perspective, we show that the popular $l_2$ weight norm (metric) of the neural
+network is actually a sufficient condition for grokking. Based on the previous
+observations, we propose perturbation-based methods to speed up the
+generalization process. In addition, we examine the standard training process
+on the modulo addition dataset and find that it hardly learns other basic
+group operations before grokking, for example, the commutative law.
+Interestingly, the speed-up of generalization when using our proposed method
+can be explained by learning the commutative law, a necessary condition when
+the model groks on the test dataset. We also empirically find that the $l_2$
+norm correlates with grokking on the test data, but not in a timely way; we
+therefore propose new metrics based on robustness and information theory and
+find that our new metrics correlate well with the grokking phenomenon and may
+be used to predict grokking.
+
+
+
+
+
+ + ♻ ☆ Label Propagation Techniques for Artifact Detection in Imbalanced + Classes using Photoplethysmogram Signals + + +
+ Photoplethysmogram (PPG) signals are widely used in healthcare for monitoring
+vital signs, but they are susceptible to motion artifacts that can lead to
+inaccurate interpretations. In this study, the use of label propagation
+techniques to propagate labels among PPG samples is explored, particularly in
+imbalanced class scenarios where clean PPG samples are significantly
+outnumbered by artifact-contaminated samples. With a precision of 91%, a
+recall of 90%, and an F1 score of 90% for the class without artifacts, the
+results demonstrate its effectiveness in labeling a medical dataset, even when
+clean samples are rare. For the classification of artifacts, our study
+compares supervised classifiers such as conventional classifiers and neural
+networks (MLP, Transformers, FCN) with the semi-supervised label propagation
+algorithm. With a precision of 89%, a recall of 95%, and an F1 score of 92%,
+the KNN supervised model gives good results, but the semi-supervised algorithm
+performs better in detecting artifacts. The findings suggest that the
+semi-supervised label propagation algorithm holds promise for artifact
+detection in PPG signals, which can enhance the reliability of PPG-based
+health monitoring systems in real-world applications.
+
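+ For reference, semi-supervised label propagation is directly available in
+scikit-learn; a usage sketch with stand-in features (the study's actual PPG
+features and settings may differ), where unlabeled samples are marked -1:
+
+     import numpy as np
+     from sklearn.semi_supervised import LabelPropagation
+
+     X = np.random.default_rng(0).normal(size=(200, 8))  # stand-in PPG features
+     y = -np.ones(200, dtype=int)     # -1 = unlabeled
+     y[:10] = 0                       # a few labeled clean segments
+     y[10:20] = 1                     # a few labeled artifact segments
+
+     model = LabelPropagation(kernel="knn", n_neighbors=7).fit(X, y)
+     pseudo_labels = model.transduction_   # propagated labels for all samples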
+
+ comment: Under preparation to submit to IEEE for possible publication
+
+
+
+
+
+ + ♻ ☆ I Prefer not to Say: Protecting User Consent in Models with Optional + Personal Data AAAI-24 + + +
+ We examine machine learning models in a setup where individuals have the +choice to share optional personal information with a decision-making system, as +seen in modern insurance pricing models. Some users consent to their data being +used whereas others object and keep their data undisclosed. In this work, we +show that the decision not to share data can be considered as information in +itself that should be protected to respect users' privacy. This observation +raises the overlooked problem of how to ensure that users who protect their +personal data do not suffer any disadvantages as a result. To address this +problem, we formalize protection requirements for models which only use the +information for which active user consent was obtained. This excludes implicit +information contained in the decision to share data or not. We offer the first +solution to this problem by proposing the notion of Protected User Consent +(PUC), which we prove to be loss-optimal under our protection requirement. We +observe that privacy and performance are not fundamentally at odds with each +other and that it is possible for a decision maker to benefit from additional +data while respecting users' consent. To learn PUC-compliant models, we devise +a model-agnostic data augmentation strategy with finite sample convergence +guarantees. Finally, we analyze the implications of PUC on challenging real +datasets, tasks, and models. + +
+
+ comment: v5: AAAI-24 Camera-Ready Version Including Appendices. v1: NeurIPS + 2022 Workshop on Algorithmic Fairness through the Lens of Causality and + Privacy (AFCP) +
+
+
+
+
+ + ♻ ☆ Enhancing Business Process Simulation Models with Extraneous Activity + Delays + + +
+ Business Process Simulation (BPS) is a common approach to estimate the impact +of changes to a business process on its performance measures. For example, it +allows us to estimate what would be the cycle time of a process if we automated +one of its activities, or if some resources become unavailable. The starting +point of BPS is a business process model annotated with simulation parameters +(a BPS model). In traditional approaches, BPS models are manually designed by +modeling specialists. This approach is time-consuming and error-prone. To +address this shortcoming, several studies have proposed methods to +automatically discover BPS models from event logs via process mining +techniques. However, current techniques in this space discover BPS models that +only capture waiting times caused by resource contention or resource +unavailability. Oftentimes, a considerable portion of the waiting time in a +business process corresponds to extraneous delays, e.g., a resource waits for +the customer to return a phone call. This article proposes a method that +discovers extraneous delays from event logs of business process executions. The +proposed approach computes, for each pair of causally consecutive activity +instances in the event log, the time when the target activity instance should +theoretically have started, given the availability of the relevant resource. +Based on the difference between the theoretical and the actual start times, the +approach estimates the distribution of extraneous delays, and it enhances the +BPS model with timer events to capture these delays. An empirical evaluation +involving synthetic and real-life logs shows that the approach produces BPS +models that better reflect the temporal dynamics of the process, relative to +BPS models that do not capture extraneous delays. + +
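+ The core computation, as we read the abstract, reduces to comparing actual
+and theoretical start times (a sketch; the method's actual resource
+availability estimation is more involved):
+
+     def extraneous_delay(enabled_at, resource_free_at, actual_start):
+         # the activity could theoretically start once it is enabled and
+         # its resource is available; anything later is extraneous delay
+         theoretical_start = max(enabled_at, resource_free_at)
+         return max(0.0, actual_start - theoretical_start)
+
+     # enabled at t=10, resource free at t=12, started at t=20 -> delay 8
+     print(extraneous_delay(10.0, 12.0, 20.0))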
+
+ comment: Extended version of the ICPM 2022 publication (see v1) +
+
+
+
+
+ + ♻ ☆ A Policy Gradient Primal-Dual Algorithm for Constrained MDPs with + Uniform PAC Guarantees + + +
+ We study a primal-dual reinforcement learning (RL) algorithm for the online +constrained Markov decision processes (CMDP) problem, wherein the agent +explores an optimal policy that maximizes return while satisfying constraints. +Despite its widespread practical use, the existing theoretical literature on +primal-dual RL algorithms for this problem only provides sublinear regret +guarantees and fails to ensure convergence to optimal policies. In this paper, +we introduce a novel policy gradient primal-dual algorithm with uniform +probably approximate correctness (Uniform-PAC) guarantees, simultaneously +ensuring convergence to optimal policies, sublinear regret, and polynomial +sample complexity for any target accuracy. Notably, this represents the first +Uniform-PAC algorithm for the online CMDP problem. In addition to the +theoretical guarantees, we empirically demonstrate in a simple CMDP that our +algorithm converges to optimal policies, while an existing algorithm exhibits +oscillatory performance and constraint violation. + +
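+ A generic primal-dual sketch for intuition (not the paper's Uniform-PAC
+algorithm): ascend the policy parameters on the Lagrangian, and project the
+multiplier update onto the nonnegative orthant:
+
+     def primal_dual_step(theta, lam, grad_return, grad_cost, cost_value,
+                          budget, lr_theta=0.05, lr_lam=0.01):
+         # Lagrangian: L(theta, lam) = J_r(theta) - lam * (J_c(theta) - budget)
+         theta = theta + lr_theta * (grad_return(theta) - lam * grad_cost(theta))
+         lam = max(0.0, lam + lr_lam * (cost_value(theta) - budget))
+         return theta, lam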
+
+
+
+
+ + ♻ ☆ Learning Directed Graphical Models with Optimal Transport + + +
+ Estimating the parameters of a probabilistic directed graphical model from +incomplete data remains a long-standing challenge. This is because, in the +presence of latent variables, both the likelihood function and posterior +distribution are intractable without further assumptions about structural +dependencies or model classes. While existing learning methods are +fundamentally based on likelihood maximization, here we offer a new view of the +parameter learning problem through the lens of optimal transport. This +perspective licenses a general framework that operates on any directed graphs +without making unrealistic assumptions on the posterior over the latent +variables or resorting to black-box variational approximations. We develop a +theoretical framework and support it with extensive empirical evidence +demonstrating the flexibility and versatility of our approach. Across +experiments, we show that not only can our method recover the ground-truth +parameters but it also performs comparably or better on downstream +applications, notably the non-trivial task of discrete representation learning. + +
+
+
+
+
+ + ♻ ☆ Document-Level In-Context Few-Shot Relation Extraction via Pre-Trained + Language Models + + +
+ Relation extraction aims at inferring structured human knowledge from textual
+documents. State-of-the-art methods based on language models commonly have two
+limitations: (1) they require named entities either to be given as input or to
+be inferred by the model, which introduces additional noise, and (2) they
+require human annotations of documents. As a remedy, we present a novel
+framework for document-level in-context few-shot relation extraction via
+pre-trained language models. We achieve crucial benefits in that we eliminate
+the need for both named entity recognition and human annotation of documents.
+Unlike existing methods based on fine-tuning, our framework is flexible in
+that it can be easily updated for a new set of relations without re-training.
+We evaluate our framework using DocRED, the largest publicly available dataset
+for document-level relation extraction, and demonstrate that our framework
+achieves state-of-the-art performance. Finally, we show that our framework
+actually performs much better than the original labels from the development
+set of DocRED. To the best of our knowledge, we are the first to reformulate
+the document-level relation extraction task as a tailored in-context few-shot
+learning paradigm.
+
+
+
+
+
+ + ♻ ☆ Variational Linearized Laplace Approximation for Bayesian Deep Learning + + +
+ The Linearized Laplace Approximation (LLA) has been recently used to perform +uncertainty estimation on the predictions of pre-trained deep neural networks +(DNNs). However, its widespread application is hindered by significant +computational costs, particularly in scenarios with a large number of training +points or DNN parameters. Consequently, additional approximations of LLA, such +as Kronecker-factored or diagonal approximate GGN matrices, are utilized, +potentially compromising the model's performance. To address these challenges, +we propose a new method for approximating LLA using a variational sparse +Gaussian Process (GP). Our method is based on the dual RKHS formulation of GPs +and retains as the predictive mean the output of the original DNN. Furthermore, +it allows for efficient stochastic optimization, which results in sub-linear +training time in the size of the training dataset. Specifically, its training +cost is independent of the number of training points. We compare our proposed +method against accelerated LLA (ELLA), which relies on the Nystr\"om +approximation, as well as other LLA variants employing the sample-then-optimize +principle. Experimental results, both on regression and classification +datasets, show that our method outperforms these already existing efficient +variants of LLA, both in terms of the quality of the predictive distribution +and in terms of total computational time. + +
+
+ comment: Pre-print, under revision +
+
+
+
+
+ + ♻ ☆ Minimizing $f$-Divergences by Interpolating Velocity Fields + + +
+ Many machine learning problems can be formulated as approximating a target +distribution using a particle distribution by minimizing a statistical +discrepancy. Wasserstein Gradient Flow can be employed to move particles along +a path that minimizes the $f$-divergence between the \textit{target} and +\textit{particle} distributions. To perform such movements we need to calculate +the corresponding velocity fields which include a density ratio function +between these two distributions. While previous works estimated the density +ratio function first and then differentiated the estimated ratio, this approach +may suffer from overfitting, which leads to a less accurate estimate. Inspired +by non-parametric curve fitting, we directly estimate these velocity fields +using interpolation. We prove that our method is asymptotically consistent +under mild conditions. We validate the effectiveness using novel applications +on domain adaptation and missing data imputation. + +
+
+
+
+
+ + ♻ ☆ RACH-Space: Reconstructing Adaptive Convex Hull Space with Applications + in Weak Supervision + + +
+ We introduce RACH-Space, an algorithm for labelling unlabelled data in weakly +supervised learning, given incomplete, noisy information about the labels. +RACH-Space offers simplicity in implementation without requiring hard +assumptions on data or the sources of weak supervision, and is well suited for +practical applications where fully labelled data is not available. Our method +is built upon a geometrical interpretation of the space spanned by the set of +weak signals. We also analyze the theoretical properties underlying the +relationship between the convex hulls in this space and the accuracy of our +output labels, bridging geometry with machine learning. Empirical results +demonstrate that RACH-Space works well in practice and compares favorably to +the best existing label models for weakly supervised learning. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Multi-intention Inverse Q-learning for Interpretable Behavior + Representation + + +
+ In advancing the understanding of decision-making processes, Inverse
+Reinforcement Learning (IRL) has proven instrumental in reconstructing an
+animal's multiple intentions amidst complex behaviors. Given the recent
+development of a continuous-time multi-intention IRL framework, there has been
+persistent inquiry into inferring discrete time-varying rewards with IRL. To
+tackle the challenge, we introduce Latent (Markov) Variable Inverse Q-learning
+(L(M)V-IQL), a novel class of IRL algorithms tailored for accommodating
+discrete intrinsic reward functions. Leveraging an Expectation-Maximization
+approach, we cluster observed expert trajectories into distinct intentions and
+independently solve the IRL problem for each. Demonstrating the efficacy of
+L(M)V-IQL through simulated experiments and its application to different real
+mouse behavior datasets, our approach surpasses current benchmarks in animal
+behavior prediction, producing interpretable reward functions. This
+advancement holds promise for neuroscience and cognitive science, contributing
+to a deeper understanding of decision-making and uncovering underlying brain
+mechanisms.
+
+
+
+
+
+ + ♻ ☆ Learning efficient backprojections across cortical hierarchies in real + time + + +
+ Models of sensory processing and learning in the cortex need to efficiently
+assign credit to synapses in all areas. In deep learning, a known solution is
+error backpropagation, which however requires biologically implausible weight
+transport from feed-forward to feedback paths.
+ We introduce Phaseless Alignment Learning (PAL), a bio-plausible method to
+learn efficient feedback weights in layered cortical hierarchies. This is
+achieved by exploiting the noise naturally found in biophysical systems as an
+additional carrier of information. In our dynamical system, all weights are
+learned simultaneously with always-on plasticity and using only information
+locally available to the synapses. Our method is completely phase-free (no
+forward and backward passes or phased learning) and allows for efficient error
+propagation across multi-layer cortical hierarchies, while maintaining
+biologically plausible signal transport and learning.
+ Our method is applicable to a wide class of models and improves on previously
+known biologically plausible ways of credit assignment: compared to random
+synaptic feedback, it can solve complex tasks with fewer neurons and learn
+more useful latent representations. We demonstrate this on various
+classification tasks using a cortical microcircuit model with prospective
+coding.
+
+
+ comment: Updated with streamlined main part, CIFAR-10 simulations, including + DFA and minor fixes +
+
+
+
+
+ + ♻ ☆ An Information Theoretic Approach to Interaction-Grounded Learning + + +
+ Reinforcement learning (RL) problems where the learner attempts to infer an +unobserved reward from some feedback variables have been studied in several +recent papers. The setting of Interaction-Grounded Learning (IGL) is an example +of such feedback-based RL tasks where the learner optimizes the return by +inferring latent binary rewards from the interaction with the environment. In +the IGL setting, a relevant assumption used in the RL literature is that the +feedback variable $Y$ is conditionally independent of the context-action +$(X,A)$ given the latent reward $R$. In this work, we propose Variational +Information-based IGL (VI-IGL) as an information-theoretic method to enforce +the conditional independence assumption in the IGL-based RL problem. The VI-IGL +framework learns a reward decoder using an information-based objective based on +the conditional mutual information (MI) between $(X,A)$ and $Y$. To estimate +and optimize the information-based terms for the continuous random variables in +the RL problem, VI-IGL leverages the variational representation of mutual +information to obtain a min-max optimization problem. Also, we extend the +VI-IGL framework to general $f$-Information measures leading to the generalized +$f$-VI-IGL framework for the IGL-based RL problems. We present numerical +results on several reinforcement learning settings indicating an improved +performance compared to the existing IGL-based RL algorithm. + +
+
+
+
+
+ + ♻ ☆ Deception Abilities Emerged in Large Language Models + + +
+ Large language models (LLMs) are currently at the forefront of intertwining +artificial intelligence (AI) systems with human communication and everyday +life. Thus, aligning them with human values is of great importance. However, +given the steady increase in reasoning abilities, future LLMs are under +suspicion of becoming able to deceive human operators and utilizing this +ability to bypass monitoring efforts. As a prerequisite to this, LLMs need to +possess a conceptual understanding of deception strategies. This study reveals +that such strategies emerged in state-of-the-art LLMs, such as GPT-4, but were +non-existent in earlier LLMs. We conduct a series of experiments showing that +state-of-the-art LLMs are able to understand and induce false beliefs in other +agents, that their performance in complex deception scenarios can be amplified +utilizing chain-of-thought reasoning, and that eliciting Machiavellianism in +LLMs can alter their propensity to deceive. In sum, revealing hitherto unknown +machine behavior in LLMs, our study contributes to the nascent field of machine +psychology. + +
+
+
+
+
+ + ♻ ☆ You Shall Pass: Dealing with the Zero-Gradient Problem in Predict and + Optimize for Convex Optimization + + +
+ Predict and optimize is an increasingly popular decision-making paradigm that +employs machine learning to predict unknown parameters of optimization +problems. Instead of minimizing the prediction error of the parameters, it +trains predictive models using task performance as a loss function. The key +challenge to train such models is the computation of the Jacobian of the +solution of the optimization problem with respect to its parameters. For linear +problems, this Jacobian is known to be zero or undefined; hence, approximations +are usually employed. For non-linear convex problems, however, it is common to +use the exact Jacobian. This paper demonstrates that the zero-gradient problem +appears in the non-linear case as well -- the Jacobian can have a sizeable null +space, thereby causing the training process to get stuck in suboptimal points. +Through formal proofs, this paper shows that smoothing the feasible set +resolves this problem. Combining this insight with known techniques from the +literature, such as quadratic programming approximation and projection distance +regularization, a novel method to approximate the Jacobian is derived. In +simulation experiments, the proposed method increases the performance in the +non-linear case and at least matches the existing state-of-the-art methods for +linear problems. + +
+
+
+
+
+ + ♻ ☆ Lessons Learned from EXMOS User Studies: A Technical Report Summarizing + Key Takeaways from User Studies Conducted to Evaluate The EXMOS Platform + + +
+ In the realm of interactive machine-learning systems, the provision of
+explanations serves as a vital aid in the processes of debugging and enhancing
+prediction models. However, the extent to which various global model-centric
+and data-centric explanations can effectively assist domain experts in
+detecting and resolving potential data-related issues for the purpose of model
+improvement has remained largely unexplored. In this technical report, we
+summarise the key findings of our two user studies. Our research involved a
+comprehensive examination of the impact of global explanations rooted in both
+data-centric and model-centric perspectives within systems designed to support
+healthcare experts in optimising machine learning models through both
+automated and manual data configurations. To empirically investigate these
+dynamics, we conducted two user studies, comprising quantitative analysis
+involving a sample size of 70 healthcare experts and qualitative assessments
+involving 30 healthcare experts. These studies were aimed at illuminating the
+influence of different explanation types on three key dimensions: trust,
+understandability, and model improvement. Results show that global
+model-centric explanations alone are insufficient for effectively guiding
+users during the intricate process of data configuration. In contrast,
+data-centric explanations exhibited their potential by enhancing the
+understanding of system changes that occur post-configuration. However, a
+combination of both showed the highest level of efficacy for fostering trust,
+improving understandability, and facilitating model enhancement among
+healthcare experts. We also present essential implications for developing
+interactive machine-learning systems driven by explanations. These insights
+can guide the creation of more effective systems that empower domain experts
+to harness the full potential of machine learning.
+
+
+ comment: It is a technical report only. The contents are not peer-reviewed. + Please reach out to the main author for any questions +
+
+
+
+
+ + ♻ ☆ Multi-Relational Hyperbolic Word Embeddings from Natural Language + Definitions EACL 2024 + + +
+ Natural language definitions possess a recursive, self-explanatory semantic
+structure that can support representation learning methods able to preserve
+explicit conceptual relations and constraints in the latent space. This paper
+presents a multi-relational model that explicitly leverages such a structure
+to derive word embeddings from definitions. By automatically extracting the
+relations linking defined and defining terms from dictionaries, we demonstrate
+how the problem of learning word embeddings can be formalised via a
+translational framework in Hyperbolic space and used as a proxy to capture the
+global semantic structure of definitions. An extensive empirical analysis
+demonstrates that the framework can help impose the desired structural
+constraints while preserving the semantic mapping required for controllable
+and interpretable traversal. Moreover, the experiments reveal the superiority
+of the Hyperbolic word embeddings over their Euclidean counterparts and
+demonstrate that the multi-relational approach can obtain competitive results
+when compared to state-of-the-art neural models, with the advantage of being
+intrinsically more efficient and interpretable.
+
+
+ comment: Accepted at the 18th Conference of the European Chapter of the + Association for Computational Linguistics (EACL 2024), camera-ready +
+
+
+
+
+ + ♻ ☆ Conditional Diffusion Models for Semantic 3D Brain MRI Synthesis + + +
+ Artificial intelligence (AI) in healthcare, especially in medical imaging, +faces challenges due to data scarcity and privacy concerns. Addressing these, +we introduce Med-DDPM, a diffusion model designed for 3D semantic brain MRI +synthesis. This model effectively tackles data scarcity and privacy issues by +integrating semantic conditioning. This involves the channel-wise concatenation +of a conditioning image to the model input, enabling control in image +generation. Med-DDPM demonstrates superior stability and performance compared +to existing 3D brain imaging synthesis methods. It generates diverse, +anatomically coherent images with high visual fidelity. In terms of dice score +accuracy in the tumor segmentation task, Med-DDPM achieves 0.6207, close to the +0.6531 accuracy of real images, and outperforms baseline models. Combined with +real images, it further increases segmentation accuracy to 0.6675, showing the +potential of our proposed method for data augmentation. This model represents +the first use of a diffusion model in 3D semantic brain MRI synthesis, +producing high-quality images. Its semantic conditioning feature also shows +potential for image anonymization in biomedical imaging, addressing data and +privacy issues. We provide the code and model weights for Med-DDPM on our +GitHub repository (https://github.com/mobaidoctor/med-ddpm/) to support +reproducibility. + +
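+ The semantic conditioning mentioned above amounts to concatenating the mask
+along the channel dimension of the denoiser input; a minimal PyTorch sketch
+(tensor shapes are illustrative, and the 3D UNet itself is omitted):
+
+     import torch
+
+     def conditioned_input(noisy_volume, segmentation_mask):
+         # noisy_volume: (B, 1, D, H, W); segmentation_mask: (B, C, D, H, W)
+         return torch.cat([noisy_volume, segmentation_mask], dim=1)
+
+     x = torch.randn(2, 1, 16, 64, 64)
+     cond = torch.randint(0, 2, (2, 3, 16, 64, 64)).float()
+     model_input = conditioned_input(x, cond)   # shape (2, 4, 16, 64, 64)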
+
+
+
+
+ + ♻ ☆ DQNC2S: DQN-based Cross-stream Crisis event Summarizer ECIR 2024 + + +
+ Summarizing multiple disaster-relevant data streams simultaneously is
+particularly challenging as existing Retrieve&Re-ranking strategies suffer
+from the inherent redundancy of multi-stream data and limited scalability in a
+multi-query setting. This work proposes an online approach to crisis timeline
+generation based on weak annotation with Deep Q-Networks. It selects the
+relevant pieces of text on the fly, without requiring either human annotations
+or content re-ranking. This makes the inference time independent of the number
+of input queries. The proposed approach also incorporates a redundancy filter
+into the reward function to effectively handle cross-stream content overlaps.
+The achieved ROUGE and BERTScore results are superior to those of
+best-performing models on the CrisisFACTS 2022 benchmark.
+
+
+ comment: accepted at ECIR 2024 +
+
+
+
+
+ + ♻ ☆ Forward $χ^2$ Divergence Based Variational Importance Sampling + + +
+ Maximizing the log-likelihood is a crucial aspect of learning latent variable +models, and variational inference (VI) stands as the commonly adopted method. +However, VI can encounter challenges in achieving a high log-likelihood when +dealing with complicated posterior distributions. In response to this +limitation, we introduce a novel variational importance sampling (VIS) approach +that directly estimates and maximizes the log-likelihood. VIS leverages the +optimal proposal distribution, achieved by minimizing the forward $\chi^2$ +divergence, to enhance log-likelihood estimation. We apply VIS to various +popular latent variable models, including mixture models, variational +auto-encoders, and partially observable generalized linear models. Results +demonstrate that our approach consistently outperforms state-of-the-art +baselines, both in terms of log-likelihood and model parameter estimation. + +
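+ The estimator at the heart of such approaches is the importance-sampled
+marginal likelihood; a numerically stable sketch under our own assumptions
+(`log_joint` and `log_q` are placeholder callables over a batch of proposal
+samples):
+
+     import numpy as np
+
+     def is_log_likelihood(log_joint, log_q, z_samples):
+         # log p(x) ~= log mean_i exp(log p(x, z_i) - log q(z_i)), z_i ~ q
+         log_w = log_joint(z_samples) - log_q(z_samples)
+         m = np.max(log_w)
+         return m + np.log(np.mean(np.exp(log_w - m)))   # log-sum-exp trick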
+
+
+
+
+ + ♻ ☆ A Statistical Learning View of Simple Kriging + + +
+ In the Big Data era, with the ubiquity of geolocation sensors in particular,
+massive datasets exhibiting a possibly complex spatial dependence structure
+are becoming increasingly available. In this context, the standard
+probabilistic theory of statistical learning does not apply directly and
+guarantees of the generalization capacity of predictive rules learned from
+such data remain to be established. We analyze here the simple Kriging task
+from a statistical learning perspective, i.e. by carrying out a nonparametric
+finite-sample predictive analysis. Given $d\geq 1$ values taken by a
+realization of a square integrable random field $X=\{X_s\}_{s\in S}$,
+$S\subset \mathbb{R}^2$, with unknown covariance structure, at sites $s_1,\;
+\ldots,\; s_d$ in $S$, the goal is to predict the unknown values it takes at
+any other location $s\in S$ with minimum quadratic risk. The prediction rule
+is derived from a training spatial dataset: a single realization $X'$ of $X$,
+independent from those to be predicted, observed at $n\geq 1$ locations
+$\sigma_1,\; \ldots,\; \sigma_n$ in $S$. Despite the connection of this
+minimization problem with kernel ridge regression, establishing the
+generalization capacity of empirical risk minimizers is far from
+straightforward, due to the non independent and identically distributed nature
+of the training data $X'_{\sigma_1},\; \ldots,\; X'_{\sigma_n}$ involved in
+the learning procedure. In this article, non-asymptotic bounds of order
+$O_{\mathbb{P}}(1/\sqrt{n})$ are proved for the excess risk of a plug-in
+predictive rule mimicking the true minimizer in the case of isotropic
+stationary Gaussian processes, observed at locations forming a regular grid in
+the learning stage. These theoretical results are illustrated by various
+numerical experiments, on simulated data and on real-world datasets.
+
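+ For intuition, the plug-in simple Kriging predictor itself is a short
+computation once a covariance estimate is available (sketch with an
+illustrative isotropic kernel and a small jitter term, both our assumptions):
+
+     import numpy as np
+
+     def simple_kriging_predict(sites, values, query, cov):
+         K = cov(sites[:, None, :], sites[None, :, :])  # (d, d) covariances
+         K += 1e-8 * np.eye(len(values))                # jitter for stability
+         k = cov(sites, query[None, :])                 # (d,) covariances to s
+         return np.linalg.solve(K, k) @ values          # k^T K^{-1} X
+
+     rng = np.random.default_rng(0)
+     sites = rng.uniform(0, 1, size=(20, 2))
+     values = np.sin(sites[:, 0]) + np.cos(sites[:, 1])
+     cov = lambda a, b: np.exp(-np.sum((a - b) ** 2, axis=-1) / 0.1)
+     print(simple_kriging_predict(sites, values, np.array([0.5, 0.5]), cov))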
+
+ comment: 41 pages +
+
+
+
+
+ + ♻ ☆ Activity Detection for Massive Connectivity in Cell-free Networks with + Unknown Large-scale Fading, Channel Statistics, Noise Variance, and Activity + Probability: A Bayesian Approach + + +
+ Activity detection is an important task in next-generation grant-free
+ multiple access. While there are a number of existing algorithms designed for
+ this purpose, they mostly require precise information about the network, such
+ as large-scale fading coefficients, small-scale fading channel statistics,
+ noise variance at the access points, and user activity probability. Acquiring
+ this information would incur significant overhead, and the estimated values
+ might not be accurate. This problem is even more severe in cell-free networks
+ as there are many of these parameters to be acquired. Therefore, this paper
+ sets out to investigate the activity detection problem without the
+ above-mentioned information. In order to handle so many unknown parameters,
+ this paper employs the Bayesian approach, where the unknown variables are
+ endowed with prior distributions which effectively act as regularizations.
+ Together with the likelihood function, a maximum a posteriori (MAP) estimator
+ and a variational inference algorithm are derived. Extensive simulations
+ demonstrate that the proposed methods, even without the knowledge of these
+ system parameters, perform better than existing state-of-the-art methods, such
+ as covariance-based and approximate message passing methods.
+
+
+ comment: 16 pages, 9 figures, accepted for publication in IEEE Transactions on + Signal Processing +
+
+
+
+
+ + ♻ ☆ Ordinal Potential-based Player Rating + + +
+ It was recently observed that Elo ratings fail at preserving transitive +relations among strategies and therefore cannot correctly extract the +transitive component of a game. We provide a characterization of transitive +games as a weak variant of ordinal potential games and show that Elo ratings +actually do preserve transitivity when computed in the right space, using +suitable invertible mappings. Leveraging this insight, we introduce a new game +decomposition of an arbitrary game into transitive and cyclic components that +is learnt using a neural network-based architecture and that prioritises +capturing the sign pattern of the game, namely transitive and cyclic relations +among strategies. We link our approach to the known concept of sign-rank, and +evaluate our methodology using both toy examples and empirical data from +real-world games. + +
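+ For orientation, the standard Elo update that the observation above starts
+ from; the paper's contribution is to show that ratings preserve transitivity
+ when computed in a suitably transformed space via invertible mappings, which
+ this vanilla sketch deliberately omits:
+
+ ```python
+ def elo_update(r_a, r_b, score_a, k=32.0):
+     # Expected score for player A is a logistic function of the rating gap;
+     # both ratings move by k times the "surprise". score_a is 1 for a win,
+     # 0.5 for a draw, 0 for a loss.
+     expected_a = 1.0 / (1.0 + 10.0 ** ((r_b - r_a) / 400.0))
+     delta = k * (score_a - expected_a)
+     return r_a + delta, r_b - delta
+ ```
+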
+
+
+
+
+ + ♻ ☆ On the explainable properties of 1-Lipschitz Neural Networks: An Optimal + Transport Perspective + + +
+ Input gradients have a pivotal role in a variety of applications, including
+ adversarial attack algorithms for evaluating model robustness, explainable AI
+ techniques for generating Saliency Maps, and counterfactual explanations.
+ However, Saliency Maps generated by traditional neural networks are often
+ noisy and provide limited insights. In this paper, we demonstrate that, on the
+ contrary, the Saliency Maps of 1-Lipschitz neural networks, learned with the
+ dual loss of an optimal transportation problem, exhibit desirable XAI
+ properties: they are highly concentrated on the essential parts of the image
+ with low noise, significantly outperforming state-of-the-art explanation
+ approaches across various models and metrics. We also prove that these maps
+ align unprecedentedly well with human explanations on ImageNet. To explain the
+ particularly beneficial properties of the Saliency Map for such models, we
+ prove that this gradient encodes both the direction of the transportation plan
+ and the direction towards the nearest adversarial attack. Following the
+ gradient down to the decision boundary is no longer considered an adversarial
+ attack, but rather a counterfactual explanation that explicitly transports the
+ input from one class to another. Thus, learning with such a loss jointly
+ optimizes the classification objective and the alignment of the gradient, i.e.
+ the Saliency Map, to the transportation plan direction. These networks were
+ previously known to be certifiably robust by design, and we demonstrate that
+ they scale well for large problems and models, and are tailored for
+ explainability using a fast and straightforward method.
+
+
+
+
+
+ + ♻ ☆ Harmonizing Covariance and Expressiveness for Deep Hamiltonian + Regression in Crystalline Material Research: a Hybrid Cascaded Regression + Framework + + +
+ Deep learning for Hamiltonian regression of quantum systems in material
+ research necessitates satisfying the covariance laws, among which achieving
+ SO(3)-equivariance without sacrificing the expressiveness capability of
+ networks remains unsolved, due to the restrictions that assuring theoretical
+ equivariance places on non-linear mappings. To alleviate the
+ covariance-expressiveness dilemma, we explore non-linear covariant deep
+ learning through a hybrid framework consisting of two cascaded regression
+ stages. The first stage, i.e., a theoretically-guaranteed covariant neural
+ network modeling symmetry properties of 3D atom systems, predicts baseline
+ Hamiltonians with theoretically covariant features extracted, assisting the
+ second stage in learning covariance. Meanwhile, the second stage, powered by a
+ non-linear 3D graph Transformer network we propose for structural modeling of
+ atomic systems, refines the first stage's output as a fine-grained prediction
+ of Hamiltonians with better expressiveness capability. The novel combination
+ of a theoretically covariant yet inevitably less expressive model with a
+ highly expressive non-linear network enables precise, generalizable
+ predictions while maintaining robust covariance under coordinate
+ transformations. We achieve state-of-the-art performance in Hamiltonian
+ prediction, confirmed through experiments on six crystalline material
+ databases.
+
+
+
+
+
+ + ♻ ☆ How Does a Deep Learning Model Architecture Impact Its Privacy? A + Comprehensive Study of Privacy Attacks on CNNs and Transformers USENIX Security 2024 + + +
+ As a booming research area in the past decade, deep learning technologies
+ have been driven by big data collected and processed on an unprecedented scale.
+ However, privacy concerns arise due to the potential leakage of sensitive
+ information from the training data. Recent research has revealed that deep
+ learning models are vulnerable to various privacy attacks, including membership
+ inference attacks, attribute inference attacks, and gradient inversion attacks.
+ Notably, the efficacy of these attacks varies from model to model. In this
+ paper, we answer a fundamental question: Does model architecture affect model
+ privacy? By investigating representative model architectures from convolutional
+ neural networks (CNNs) to Transformers, we demonstrate that Transformers
+ generally exhibit higher vulnerability to privacy attacks than CNNs.
+ Additionally, we identify the micro design of activation layers, stem layers,
+ and LN layers as major factors contributing to the resilience of CNNs against
+ privacy attacks, while the presence of attention modules is another main factor
+ that exacerbates the privacy vulnerability of Transformers. Our findings reveal
+ valuable insights for defending deep learning models against privacy attacks
+ and inspire the research community to develop privacy-friendly model
+ architectures.
+
+
+ comment: To appear in USENIX Security 2024 +
+
+
+
+
+ + ♻ ☆ Marginal Laplacian Score + + +
+ High-dimensional imbalanced data poses a machine learning challenge. In the
+ absence of sufficient or high-quality labels, unsupervised feature selection
+ methods are crucial for the success of subsequent algorithms. Therefore, we
+ introduce the Marginal Laplacian Score (MLS), a modification of the well-known
+ Laplacian Score (LS) tailored to better address imbalanced data. We introduce
+ the assumption that minority-class or anomalous samples appear more frequently
+ in the margins of the features. Consequently, MLS aims to preserve the local
+ structure of the dataset's margin. We propose its integration into modern
+ feature selection methods that utilize the Laplacian score. We integrate the
+ MLS algorithm into the Differentiable Unsupervised Feature Selection (DUFS),
+ resulting in DUFS-MLS. The proposed methods demonstrate robust and improved
+ performance on synthetic and public datasets.
+
+
+ comment: 10 pages +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ Real-time Extended Reality Video Transmission Optimization Based on + Frame-priority Scheduling + + +
+ Extended Reality (XR) is an important service in the 5G network and in future
+ 6G networks. In contrast to traditional video-on-demand services, real-time XR
+ video is transmitted frame by frame, requiring low latency and being highly
+ sensitive to network fluctuations. In this paper, we model the quality of
+ experience (QoE) for real-time XR video transmission on a frame-by-frame basis.
+ Based on the proposed QoE model, we formulate an optimization problem that
+ maximizes QoE with constraints on wireless resources and long-term energy
+ consumption. We utilize Lyapunov optimization to transform the original problem
+ into a single-frame optimization problem and then allocate wireless
+ subchannels. We propose an adaptive XR video bitrate algorithm that employs a
+ Long Short-Term Memory (LSTM) based Deep Q-Network (DQN) algorithm for video
+ bitrate selection. Through numerical results, we show that our proposed
+ algorithm outperforms the baseline algorithms, with average QoE improvements of
+ 5.9% to 80.0%.
+
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ IIANet: An Intra- and Inter-Modality Attention Network for Audio-Visual + Speech Separation + + +
+ Recent research has made significant progress in designing fusion modules for
+ audio-visual speech separation. However, these modules predominantly focus on
+ multi-modal fusion at a single temporal scale of auditory and visual features
+ without employing selective attention mechanisms, which is in sharp contrast
+ with the brain. To address this issue, we propose a novel model called Intra-
+ and Inter-Attention Network (IIANet), which leverages the attention mechanism
+ for efficient audio-visual feature fusion. IIANet consists of two types of
+ attention blocks: intra-attention (IntraA) and inter-attention (InterA) blocks,
+ where the InterA blocks are distributed at the top, middle and bottom of
+ IIANet. Heavily inspired by the way the human brain selectively focuses on
+ relevant content at various temporal scales, these blocks maintain the ability
+ to learn modality-specific features and enable the extraction of different
+ semantics from audio-visual features. Comprehensive experiments on three
+ standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)
+ demonstrate the effectiveness of IIANet, outperforming previous
+ state-of-the-art methods while maintaining comparable inference time. In
+ particular, the fast version of IIANet (IIANet-fast) has only 7% of CTCNet's
+ MACs and is 40% faster than CTCNet on CPUs while achieving better separation
+ quality, showing the great potential of attention mechanisms for efficient and
+ effective multimodal fusion.
+
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Prompting Segmentation with Sound Is Generalizable Audio-Visual Source + Localizer AAAI 2024 + + +
+ Never having seen an object and heard its sound simultaneously, can the model
+ still accurately localize its visual position from the input audio? In this
+ work, we concentrate on the Audio-Visual Localization and Segmentation tasks
+ but under the demanding zero-shot and few-shot scenarios. To achieve this goal,
+ different from existing approaches that mostly employ the
+ encoder-fusion-decoder paradigm to decode localization information from the
+ fused audio-visual feature, we introduce the encoder-prompt-decoder paradigm,
+ aiming to better fit the data scarcity and varying data distribution dilemmas
+ with the help of abundant knowledge from pre-trained models. Specifically, we
+ first propose to construct a Semantic-aware Audio Prompt (SAP) to help the
+ visual foundation model focus on sounding objects; meanwhile, the semantic gap
+ between the visual and audio modalities is also encouraged to shrink. Then, we
+ develop a Correlation Adapter (ColA) to keep training efforts minimal while
+ maintaining adequate knowledge of the visual foundation model. With these
+ components in place, extensive experiments demonstrate that the new paradigm
+ outperforms other fusion-based methods in both the unseen class and
+ cross-dataset settings. We hope that our work can further promote the
+ generalization study of Audio-Visual Localization and Segmentation in practical
+ application scenarios.
+
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Detecting Multimedia Generated by Large AI Models: A Survey + + +
+ The rapid advancement of Large AI Models (LAIMs), particularly diffusion +models and large language models, has marked a new era where AI-generated +multimedia is increasingly integrated into various aspects of daily life. +Although beneficial in numerous fields, this content presents significant +risks, including potential misuse, societal disruptions, and ethical concerns. +Consequently, detecting multimedia generated by LAIMs has become crucial, with +a marked rise in related research. Despite this, there remains a notable gap in +systematic surveys that focus specifically on detecting LAIM-generated +multimedia. Addressing this, we provide the first survey to comprehensively +cover existing research on detecting multimedia (such as text, images, videos, +audio, and multimodal content) created by LAIMs. Specifically, we introduce a +novel taxonomy for detection methods, categorized by media modality, and +aligned with two perspectives: pure detection (aiming to enhance detection +performance) and beyond detection (adding attributes like generalizability, +robustness, and interpretability to detectors). Additionally, we have presented +a brief overview of generation mechanisms, public datasets, and online +detection tools to provide a valuable resource for researchers and +practitioners in this field. Furthermore, we identify current challenges in +detection and propose directions for future research that address unexplored, +ongoing, and emerging issues in detecting multimedia generated by LAIMs. Our +aim for this survey is to fill an academic gap and contribute to global AI +security efforts, helping to ensure the integrity of information in the digital +realm. The project link is +https://github.com/Purdue-M2/Detect-LAIM-generated-Multimedia-Survey. + +
+
+
+
+
+ + ♻ ☆ A multi-modal approach for identifying schizophrenia using cross-modal + attention + + +
+ This study focuses on how different modalities of human communication can be +used to distinguish between healthy controls and subjects with schizophrenia +who exhibit strong positive symptoms. We developed a multi-modal schizophrenia +classification system using audio, video, and text. Facial action units and +vocal tract variables were extracted as low-level features from video and audio +respectively, which were then used to compute high-level coordination features +that served as the inputs to the audio and video modalities. +Context-independent text embeddings extracted from transcriptions of speech +were used as the input for the text modality. The multi-modal system is +developed by fusing a segment-to-session-level classifier for video and audio +modalities with a text model based on a Hierarchical Attention Network (HAN) +with cross-modal attention. The proposed multi-modal system outperforms the +previous state-of-the-art multi-modal system by 8.53% in the weighted average +F1 score. + +
+
+ comment: Submitted to Annual International Conference of the IEEE Engineering + in Medicine and Biology Society 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 82 + +
+
+
+ + ☆ Evaluating Large Language Models for Generalization and Robustness via + Data Compression + + +
+ Existing methods for evaluating large language models face challenges such as
+ data contamination, sensitivity to prompts, and the high cost of benchmark
+ creation. To address this, we propose a lossless data compression-based
+ evaluation approach that tests how models' predictive abilities generalize
+ after their training cutoff. Specifically, we collect comprehensive test data
+ spanning 83 months from 2017 to 2023 and split the data into training and
+ testing periods according to models' training data cutoff. We measure: 1) the
+ compression performance on the testing period as a measure of generalization on
+ unseen data; and 2) the performance gap between the training and testing period
+ as a measure of robustness. Our experiments test 14 representative large
+ language models with various sizes on sources including Wikipedia, news
+ articles, code, arXiv papers, and multi-modal data. We find that the
+ compression rate of many models degrades significantly after their cutoff date,
+ but models such as Mistral and Llama-2 demonstrate a good balance between
+ performance and robustness. Results also suggest that models struggle to
+ generalize on news and code data, but work especially well on arXiv papers. We
+ also find that context size and tokenization implementation have a significant
+ impact on overall compression performance.
+
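+ A minimal sketch of the compression metric underlying this evaluation: an LM
+ that assigns log-probabilities to tokens induces an arithmetic-coding length,
+ and comparing bits-per-byte on text from before vs. after the training cutoff
+ yields the generalization gap measured above (the function signature here is
+ an assumption, not the paper's code):
+
+ ```python
+ import math
+
+ def bits_per_byte(token_logprobs, n_bytes):
+     # token_logprobs: natural-log probabilities the model assigns to each
+     # token of the document. Arithmetic coding can encode the document in
+     # roughly -sum(log2 p(token)) bits, so dividing by the raw byte count
+     # gives the compression rate in bits per byte (lower is better).
+     total_bits = -sum(token_logprobs) / math.log(2)
+     return total_bits / n_bytes
+ ```
+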
+
+
+
+
+ + ☆ Can Large Language Models Understand Context? EACL 2024 + + +
+ Understanding context is key to understanding human language, an ability which
+ Large Language Models (LLMs) have been increasingly seen to demonstrate to an
+ impressive extent. However, though the evaluation of LLMs encompasses various
+ domains within the realm of Natural Language Processing, limited attention has
+ been paid to probing their linguistic capability of understanding contextual
+ features. This paper introduces a context understanding benchmark by adapting
+ existing datasets to suit the evaluation of generative models. This benchmark
+ comprises four distinct tasks and nine datasets, all featuring prompts designed
+ to assess the models' ability to understand context. First, we evaluate the
+ performance of LLMs under the in-context learning pretraining scenario.
+ Experimental results indicate that pre-trained dense models struggle with
+ understanding more nuanced contextual features when compared to
+ state-of-the-art fine-tuned models. Second, as LLM compression holds growing
+ significance in both research and real-world applications, we assess the
+ context understanding of quantized models under in-context-learning settings.
+ We find that 3-bit post-training quantization leads to varying degrees of
+ performance reduction on our benchmark. We conduct an extensive analysis of
+ these scenarios to substantiate our experimental results.
+
+
+ comment: Findings of EACL 2024 +
+
+
+
+
+ + ☆ Towards Efficient and Exact Optimization of Language Model Alignment + + +
+ The alignment of language models with human preferences is vital for their
+ application in real-world tasks. The problem is formulated as optimizing the
+ model's policy to maximize the expected reward that reflects human preferences
+ with minimal deviation from the initial policy. While considered a
+ straightforward solution, reinforcement learning (RL) suffers from high
+ variance in policy updates, which impedes efficient policy improvement.
+ Recently, direct preference optimization (DPO) was proposed to directly
+ optimize the policy from preference data. Though simple to implement, DPO is
+ derived based on an optimal policy that is not assured to be achieved in
+ practice, which undermines its convergence to the intended solution.
+ In this paper, we propose efficient exact optimization (EXO) of the alignment
+ objective. We prove that EXO is guaranteed to optimize in the same direction as
+ the RL algorithms asymptotically for arbitrary parametrization of the policy,
+ while enabling efficient optimization by circumventing the complexities
+ associated with RL algorithms. We compare our method to DPO with both
+ theoretical and empirical analyses, and further demonstrate the advantages of
+ our method over existing approaches on realistic human preference data.
+
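+ For context, a sketch of the standard DPO objective that EXO is compared
+ against, on a single (chosen, rejected) pair; EXO's own objective differs and
+ is not reproduced here:
+
+ ```python
+ import torch.nn.functional as F
+
+ def dpo_loss(logp_chosen, logp_rejected,
+              ref_logp_chosen, ref_logp_rejected, beta=0.1):
+     # Inputs are sequence log-probabilities (tensors) under the policy and
+     # the frozen reference model. DPO maximizes the log-sigmoid of the
+     # margin between the policy-vs-reference log-ratios of the chosen and
+     # rejected responses.
+     margin = (logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected)
+     return -F.logsigmoid(beta * margin).mean()
+ ```
+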
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ Tiny Titans: Can Smaller Large Language Models Punch Above Their Weight + in the Real World for Meeting Summarization? + + +
+ Large Language Models (LLMs) have demonstrated impressive capabilities to
+ solve a wide range of tasks without being explicitly fine-tuned on
+ task-specific datasets. However, deploying LLMs in the real world is not
+ trivial, as it requires substantial computing resources. In this paper, we
+ investigate whether smaller, compact LLMs are a good alternative to
+ comparatively larger LLMs, to address the significant costs associated with
+ utilizing LLMs in the real world. In this regard, we study the meeting
+ summarization task in a real-world industrial environment and conduct extensive
+ experiments by comparing the performance of fine-tuned compact LLMs (e.g.,
+ FLAN-T5, TinyLLaMA, LiteLLaMA) with zero-shot larger LLMs (e.g., LLaMA-2,
+ GPT-3.5, PaLM-2). We observe that most smaller LLMs, even after fine-tuning,
+ fail to outperform larger zero-shot LLMs in meeting summarization datasets.
+ However, a notable exception is FLAN-T5 (780M parameters), which performs on
+ par with or even better than many zero-shot larger LLMs (from 7B to above 70B
+ parameters), while being significantly smaller. This makes compact LLMs like
+ FLAN-T5 a suitable cost-efficient solution for real-world industrial
+ deployment.
+
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ☆ OLMo: Accelerating the Science of Language Models + + +
+ Language models (LMs) have become ubiquitous in both NLP research and in +commercial product offerings. As their commercial importance has surged, the +most powerful models have become closed off, gated behind proprietary +interfaces, with important details of their training data, architectures, and +development undisclosed. Given the importance of these details in +scientifically studying these models, including their biases and potential +risks, we believe it is essential for the research community to have access to +powerful, truly open LMs. To this end, this technical report details the first +release of OLMo, a state-of-the-art, truly Open Language Model and its +framework to build and study the science of language modeling. Unlike most +prior efforts that have only released model weights and inference code, we +release OLMo and the whole framework, including training data and training and +evaluation code. We hope this release will empower and strengthen the open +research community and inspire a new wave of innovation. + +
+
+
+
+
+ + ☆ ALISON: Fast and Effective Stylometric Authorship Obfuscation AAAI + + +
+ Authorship Attribution (AA) and Authorship Obfuscation (AO) are two competing
+ tasks of increasing importance in privacy research. Modern AA leverages an
+ author's consistent writing style to match a text to its author using an AA
+ classifier. AO is the corresponding adversarial task, aiming to modify a text
+ in such a way that its semantics are preserved, yet an AA model cannot
+ correctly infer its authorship. To address privacy concerns raised by
+ state-of-the-art (SOTA) AA methods, new AO methods have been proposed but
+ remain largely impractical to use due to their prohibitively slow training and
+ obfuscation speed, often taking hours. To address this challenge, we propose a
+ practical AO method, ALISON, that (1) dramatically reduces training/obfuscation
+ time, demonstrating more than 10x faster obfuscation than SOTA AO methods, (2)
+ achieves better obfuscation success through attacking three transformer-based
+ AA methods on two benchmark datasets, typically performing 15% better than
+ competing methods, (3) does not require direct signals from a target AA
+ classifier during obfuscation, and (4) utilizes unique stylometric features,
+ allowing sound model interpretation for explainable obfuscation. We also
+ demonstrate that ALISON can effectively prevent four SOTA AA methods from
+ accurately determining the authorship of ChatGPT-generated texts, all while
+ minimally changing the original text semantics. To ensure the reproducibility
+ of our findings, our code and data are available at:
+ https://github.com/EricX003/ALISON.
+
+
+ comment: 10 pages, 6 figures, 4 tables. To be published in the Proceedings of + the 38th Annual AAAI Conference on Artificial Intelligence (AAAI-24) +
+
+
+
+
+ + ☆ Formal-LLM: Integrating Formal Language and Natural Language for + Controllable LLM-based Agents + + +
+ Recent advancements in Large Language Models (LLMs) enable AI Agents to
+ automatically generate and execute multi-step plans to solve complex tasks.
+ However, since the content generation process of LLMs is difficult to control,
+ current LLM-based agents frequently generate invalid or non-executable plans,
+ which jeopardizes the performance of the generated plans and erodes users'
+ trust in LLM-based agents. In response, this paper proposes a novel
+ ``Formal-LLM'' framework for LLM-based agents by integrating the expressiveness
+ of natural language and the precision of formal language. Specifically, the
+ framework allows human users to express their requirements or constraints for
+ the planning process as an automaton. A stack-based LLM plan generation process
+ is then conducted under the supervision of the automaton to ensure that the
+ generated plan satisfies the constraints, making the planning process
+ controllable. We conduct experiments on both benchmark tasks and practical
+ real-life tasks, and our framework achieves an overall performance increase of
+ over 50%, which validates the feasibility and effectiveness of employing
+ Formal-LLM to guide the plan generation of agents, preventing the agents from
+ generating invalid and unsuccessful plans. Further, more controllable LLM-based
+ agents can facilitate the broader utilization of LLMs in application scenarios
+ where high validity of planning is essential. The work is open-sourced at
+ https://github.com/agiresearch/Formal-LLM.
+
+
+ comment: 21 pages, 6 figures; working in process, suggestions are welcome +
+
+
+
+
+ + ☆ ReAGent: Towards A Model-agnostic Feature Attribution Method for + Generative Language Models + + +
+ Feature attribution methods (FAs), such as gradients and attention, are
+ widely employed approaches to derive the importance of all input features to
+ the model predictions. Existing work in natural language processing has mostly
+ focused on developing and testing FAs for encoder-only language models (LMs) in
+ classification tasks. However, it is unknown whether these FAs remain faithful
+ for decoder-only models on text generation, due to inherent differences in
+ model architectures and task settings. Moreover, previous work has demonstrated
+ that there is no `one-wins-all' FA across models and tasks. This makes the
+ selection of an FA computationally expensive for large LMs, since input
+ importance derivation often requires multiple forward and backward passes,
+ including gradient computations that might be prohibitive even with access to
+ large compute. To address these issues, we present a model-agnostic FA for
+ generative LMs called Recursive Attribution Generator (ReAGent). Our method
+ updates the token importance distribution in a recursive manner. For each
+ update, we compute the difference in the probability distribution over the
+ vocabulary for predicting the next token between using the original input and
+ using a modified version where a part of the input is replaced with RoBERTa
+ predictions. Our intuition is that replacing an important token in the context
+ should result in a larger change in the model's confidence in predicting the
+ token than replacing an unimportant token. Our method can be universally
+ applied to any generative LM without accessing internal model weights or
+ additional training and fine-tuning, as most other FAs require. We extensively
+ compare the faithfulness of ReAGent with seven popular FAs across six
+ decoder-only LMs of various sizes. The results show that our method
+ consistently provides more faithful token importance distributions.
+
+
+
+
+
+ + ☆ CroissantLLM: A Truly Bilingual French-English Language Model + + +
+ We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T
+ English and French tokens, to bring to the research and industrial community a
+ high-performance, fully open-sourced bilingual model that runs swiftly on
+ consumer-grade local hardware. To that end, we pioneer the approach of training
+ an intrinsically bilingual model with a 1:1 English-to-French pretraining data
+ ratio, a custom tokenizer, and bilingual finetuning datasets. We release the
+ training dataset, notably containing a French split with manually curated,
+ high-quality, and varied data sources. To assess performance outside of
+ English, we craft a novel benchmark, FrenchBench, consisting of an array of
+ classification and generation tasks, covering various orthogonal aspects of
+ model performance in the French language. Additionally, rooted in transparency
+ and to foster further Large Language Model research, we release codebases and
+ dozens of checkpoints across various model sizes, training data distributions,
+ and training steps, as well as fine-tuned Chat models and strong translation
+ models. We evaluate our model through the FMTI framework, and validate 81% of
+ the transparency criteria, far beyond the scores of even most open initiatives.
+ This work enriches the NLP landscape, breaking away from previous
+ English-centric work in order to strengthen our understanding of
+ multilinguality in language models.
+
+
+
+
+
+ + ☆ Health-LLM: Personalized Retrieval-Augmented Disease Prediction Model + + +
+ Artificial intelligence (AI) in healthcare has significantly advanced
+ intelligent medical treatment. However, traditional intelligent healthcare is
+ limited by static data and unified standards, which prevents full adaptation to
+ individual situations, among other challenges. Hence, a more professional and
+ fine-grained intelligent healthcare method needs to be developed. To this end,
+ we propose an innovative framework named Health-LLM, which combines large-scale
+ feature extraction and medical knowledge trade-off scoring. Compared to
+ traditional health management methods, our approach has three main advantages.
+ First, our method integrates health reports into a large model to provide
+ detailed task information. Second, professional medical expertise is used to
+ adjust the weighted scores of health characteristics. Third, we use a
+ semi-automated feature extraction framework to enhance the analytical power of
+ language models and incorporate expert insights to improve the accuracy of
+ disease prediction. We have conducted disease prediction experiments on a large
+ number of health reports to assess the effectiveness of Health-LLM. The results
+ of the experiments indicate that the proposed method surpasses traditional
+ methods and has the potential to revolutionize disease prediction and
+ personalized health management. The code is available at
+ https://github.com/jmyissb/HealthLLM.
+
+
+
+
+
+ + ☆ Enhancing Ethical Explanations of Large Language Models through + Iterative Symbolic Refinement EACL 2024 + + +
+ An increasing amount of research in Natural Language Inference (NLI) focuses +on the application and evaluation of Large Language Models (LLMs) and their +reasoning capabilities. Despite their success, however, LLMs are still prone to +factual errors and inconsistencies in their explanations, offering limited +control and interpretability for inference in complex domains. In this paper, +we focus on ethical NLI, investigating how hybrid neuro-symbolic techniques can +enhance the logical validity and alignment of ethical explanations produced by +LLMs. Specifically, we present an abductive-deductive framework named +Logic-Explainer, which integrates LLMs with an external backward-chaining +solver to refine step-wise natural language explanations and jointly verify +their correctness, reduce incompleteness and minimise redundancy. An extensive +empirical analysis demonstrates that Logic-Explainer can improve explanations +generated via in-context learning methods and Chain-of-Thought (CoT) on +challenging ethical NLI tasks, while, at the same time, producing formal proofs +describing and supporting models' reasoning. As ethical NLI requires +commonsense reasoning to identify underlying moral violations, our results +suggest the effectiveness of neuro-symbolic methods for multi-step NLI more +broadly, opening new opportunities to enhance the logical consistency, +reliability, and alignment of LLMs. + +
+
+ comment: Camera-ready for EACL 2024 +
+
+
+
+
+ + ☆ BATON: Aligning Text-to-Audio Model with Human Preference Feedback + + +
+ With the development of AI-Generated Content (AIGC), text-to-audio models are +gaining widespread attention. However, it is challenging for these models to +generate audio aligned with human preference due to the inherent information +density of natural language and limited model understanding ability. To +alleviate this issue, we formulate the BATON, a framework designed to enhance +the alignment between generated audio and text prompt using human preference +feedback. Our BATON comprises three key stages: Firstly, we curated a dataset +containing both prompts and the corresponding generated audio, which was then +annotated based on human feedback. Secondly, we introduced a reward model using +the constructed dataset, which can mimic human preference by assigning rewards +to input text-audio pairs. Finally, we employed the reward model to fine-tune +an off-the-shelf text-to-audio model. The experiment results demonstrate that +our BATON can significantly improve the generation quality of the original +text-to-audio models, concerning audio integrity, temporal relationship, and +alignment with human preference. + +
+
+
+
+
+ + ☆ Benefits of Transformer: In-Context Learning in Linear Regression Tasks + with Unstructured Data + + +
+ In practice, it is observed that transformer-based models can learn concepts +in context in the inference stage. While existing literature, e.g., +\citet{zhang2023trained,huang2023context}, provide theoretical explanations on +this in-context learning ability, they assume the input $x_i$ and the output +$y_i$ for each sample are embedded in the same token (i.e., structured data). +However, in reality, they are presented in two tokens (i.e., unstructured data +\cite{wibisono2023role}). In this case, this paper conducts experiments in +linear regression tasks to study the benefits of the architecture of +transformers and provides some corresponding theoretical intuitions to explain +why the transformer can learn from unstructured data. We study the exact +components in a transformer that facilitate the in-context learning. In +particular, we observe that (1) a transformer with two layers of softmax +(self-)attentions with look-ahead attention mask can learn from the prompt if +$y_i$ is in the token next to $x_i$ for each example; (2) positional encoding +can further improve the performance; and (3) multi-head attention with a high +input embedding dimension has a better prediction performance than single-head +attention. + +
+
+
+
+
+ + ☆ Transforming and Combining Rewards for Aligning Large Language Models + + +
+ A common approach for aligning language models to human preferences is to +first learn a reward model from preference data, and then use this reward model +to update the language model. We study two closely related problems that arise +in this approach. First, any monotone transformation of the reward model +preserves preference ranking; is there a choice that is ``better'' than others? +Second, we often wish to align language models to multiple properties: how +should we combine multiple reward models? Using a probabilistic interpretation +of the alignment procedure, we identify a natural choice for transformation for +(the common case of) rewards learned from Bradley-Terry preference models. This +derived transformation has two important properties. First, it emphasizes +improving poorly-performing outputs, rather than outputs that already score +well. This mitigates both underfitting (where some prompts are not improved) +and reward hacking (where the model learns to exploit misspecification of the +reward model). Second, it enables principled aggregation of rewards by linking +summation to logical conjunction: the sum of transformed rewards corresponds to +the probability that the output is ``good'' in all measured properties, in a +sense we make precise. Experiments aligning language models to be both helpful +and harmless using RLHF show substantial improvements over the baseline +(non-transformed) approach. + +
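+ A minimal sketch of the transform-then-sum recipe described above, assuming
+ Bradley-Terry rewards and per-property reference values; the exact centering
+ used in the paper may differ:
+
+ ```python
+ import torch.nn.functional as F
+
+ def combine_transformed_rewards(rewards, references):
+     # rewards: list of per-property reward tensors for an output;
+     # references: per-property baseline values (e.g., the reward of a
+     # reference policy's output). Each log-sigmoid term approximates
+     # log P(output is "good" on that property), so the sum corresponds to
+     # the log-probability of being good on all properties at once
+     # (logical conjunction), which is what makes summation principled.
+     return sum(F.logsigmoid(r - ref) for r, ref in zip(rewards, references))
+ ```
+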
+
+
+
+
+ + ☆ Improving Semantic Control in Discrete Latent Spaces with Transformer + Quantized Variational Autoencoders + + +
+ Achieving precise semantic control over the latent spaces of Variational
+ AutoEncoders (VAEs) holds significant value for downstream tasks in NLP as the
+ underlying generative mechanisms could be better localised, explained and
+ improved upon. Recent research, however, has struggled to achieve consistent
+ results, primarily due to the inevitable loss of semantic information in the
+ variational bottleneck and limited control over the decoding mechanism. To
+ overcome these challenges, we investigate discrete latent spaces in Vector
+ Quantized Variational AutoEncoders (VQVAEs) to improve semantic control and
+ generation in Transformer-based VAEs. In particular, we propose T5VQVAE, a
+ novel model that leverages the controllability of VQVAEs to guide the
+ self-attention mechanism in T5 at the token level, exploiting its full
+ generalization capabilities. Experimental results indicate that T5VQVAE
+ outperforms existing state-of-the-art VAE models, including Optimus, in terms
+ of controllability and preservation of semantic information across different
+ tasks such as auto-encoding of sentences and mathematical expressions, text
+ transfer, and inference. Moreover, T5VQVAE exhibits improved inference
+ capabilities, suggesting potential applications for downstream natural language
+ and symbolic reasoning tasks.
+
+
+
+
+
+ + ☆ Explaining Text Classifiers with Counterfactual Representations + + +
+ One well-motivated explanation method for classifiers leverages
+ counterfactuals, which are hypothetical events identical to real observations
+ in all aspects except for one categorical feature. Constructing such
+ counterfactuals poses specific challenges for text, however, as some attribute
+ values may not necessarily align with plausible real-world events. In this
+ paper we propose a simple method for generating counterfactuals by intervening
+ in the space of text representations, which bypasses this limitation. We argue
+ that our interventions are minimally disruptive and that they are theoretically
+ sound as they align with counterfactuals as defined in Pearl's causal inference
+ framework. To validate our method, we first conduct experiments on a synthetic
+ dataset of counterfactuals, allowing for a direct comparison between classifier
+ predictions based on ground truth counterfactuals (obtained through explicit
+ text interventions) and our counterfactuals, derived through interventions in
+ the representation space. Second, we study a real-world scenario where our
+ counterfactuals can be leveraged both for explaining a classifier and for bias
+ mitigation.
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Non-Exchangeable Conformal Language Generation with Nearest Neighbors + + +
+ Quantifying uncertainty in automatically generated text is important for +letting humans check potential hallucinations and making systems more reliable. +Conformal prediction is an attractive framework to provide predictions imbued +with statistical guarantees, however, its application to text generation is +challenging since any i.i.d. assumptions are not realistic. In this paper, we +bridge this gap by leveraging recent results on non-exchangeable conformal +prediction, which still ensures bounds on coverage. The result, +non-exchangeable conformal nucleus sampling, is a novel extension of the +conformal prediction framework to generation based on nearest neighbors. Our +method can be used post-hoc for an arbitrary model without extra training and +supplies token-level, calibrated prediction sets equipped with statistical +guarantees. Experiments in machine translation and language modeling show +encouraging results in generation quality. By also producing tighter prediction +sets with good coverage, we thus give a more theoretically principled way to +perform sampling with conformal guarantees. + +
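+ A sketch of the weighted-quantile step at the heart of non-exchangeable
+ conformal prediction, with nonconformity scores and weights taken from the
+ retrieved nearest neighbors; the weighting scheme and the fallback to the
+ maximum score are illustrative assumptions, not the paper's exact choices:
+
+ ```python
+ import numpy as np
+
+ def nonexch_conformal_threshold(scores, weights, alpha=0.1):
+     # scores: nonconformity scores of the nearest-neighbor calibration
+     # points; weights: data-dependent weights (closer neighbors weigh
+     # more). The prediction set keeps all tokens whose score falls below
+     # the weighted (1 - alpha)-quantile; mass 1/(sum(w)+1) is reserved for
+     # a virtual +inf point, as in weighted conformal prediction.
+     scores = np.asarray(scores, dtype=float)
+     w = np.asarray(weights, dtype=float)
+     order = np.argsort(scores)
+     cum = np.cumsum(w[order] / (w.sum() + 1.0))
+     idx = np.searchsorted(cum, 1.0 - alpha)
+     idx = min(idx, len(scores) - 1)  # fall back to max score, not +inf
+     return scores[order][idx]
+ ```
+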
+
+
+
+
+ + ☆ Improving Weak-to-Strong Generalization with Scalable Oversight and + Ensemble Learning + + +
+ This paper presents a follow-up study to OpenAI's recent superalignment work
+ on Weak-to-Strong Generalization (W2SG). Superalignment focuses on ensuring
+ that high-level AI systems remain consistent with human values and intentions
+ when dealing with complex, high-risk tasks. The W2SG framework has opened new
+ possibilities for empirical research in this evolving field. Our study
+ simulates two phases of superalignment under the W2SG framework: the
+ development of general superhuman models and the progression towards
+ superintelligence. In the first phase, based on human supervision, the quality
+ of weak supervision is enhanced through a combination of scalable oversight and
+ ensemble learning, reducing the capability gap between weak teachers and strong
+ students. In the second phase, an automatic alignment evaluator is employed as
+ the weak supervisor. By recursively updating this auto-aligner, the
+ capabilities of the weak teacher models are synchronously enhanced, achieving
+ weak-to-strong supervision over stronger student models. We also provide an
+ initial validation of the proposed approach for the first phase. Using the SciQ
+ task as an example, we explore ensemble learning for weak teacher models
+ through bagging and boosting. Scalable oversight is explored through two
+ auxiliary settings: human-AI interaction and AI-AI debate. Additionally, the
+ paper discusses the impact of improved weak supervision on enhancing
+ weak-to-strong generalization based on in-context learning. Experiment code and
+ dataset will be released at https://github.com/ADaM-BJTU/W2SG.
+
+
+
+
+
+ + ☆ Learning Planning-based Reasoning by Trajectories Collection and Process + Reward Synthesizing + + +
+ Large Language Models (LLMs) have demonstrated significant potential in +handling complex reasoning tasks through step-by-step rationale generation. +However, recent studies have raised concerns regarding the hallucination and +flaws in their reasoning process. Substantial efforts are being made to improve +the reliability and faithfulness of the generated rationales. Some approaches +model reasoning as planning, while others focus on annotating for process +supervision. Nevertheless, the planning-based search process often results in +high latency due to the frequent assessment of intermediate reasoning states +and the extensive exploration space. Additionally, supervising the reasoning +process with human annotation is costly and challenging to scale for LLM +training. To address these issues, in this paper, we propose a framework to +learn planning-based reasoning through direct preference optimization (DPO) on +collected trajectories, which are ranked according to synthesized process +rewards. Our results on challenging logical reasoning benchmarks demonstrate +the effectiveness of our learning framework, showing that our 7B model can +surpass the strong counterparts like GPT-3.5-Turbo. + +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ Prosody in Cascade and Direct Speech-to-Text Translation: a case study + on Korean Wh-Phrases EACL 2024 + + +
+ Speech-to-Text Translation (S2TT) has typically been addressed with cascade
+ systems, where speech recognition systems generate a transcription that is
+ subsequently passed to a translation model. While there has been a growing
+ interest in developing direct speech translation systems to avoid propagating
+ errors and losing non-verbal content, prior work in direct S2TT has struggled
+ to conclusively establish the advantages of integrating the acoustic signal
+ directly into the translation process. This work proposes using contrastive
+ evaluation to quantitatively measure the ability of direct S2TT systems to
+ disambiguate utterances where prosody plays a crucial role. Specifically, we
+ evaluated Korean-English translation systems on a test set containing
+ wh-phrases, for which prosodic features are necessary to produce translations
+ with the correct intent, whether it is a statement, a yes/no question, or a
+ wh-question, among others. Our results clearly demonstrate the value of direct
+ translation systems over cascade translation models, with a notable 12.9%
+ improvement in overall accuracy in ambiguous cases, along with up to a 15.6%
+ increase in F1 scores for one of the major intent categories. To the best of
+ our knowledge, this work stands as the first to provide quantitative evidence
+ that direct S2TT models can effectively leverage prosody. The code for our
+ evaluation is openly available.
+
+
+ comment: Accepted at Findings of EACL 2024 +
+
+
+
+
+ + ☆ Actor Identification in Discourse: A Challenge for LLMs? EACL 2024 + + +
+ The identification of political actors who put forward claims in public
+ debate is a crucial step in the construction of discourse networks, which are
+ helpful to analyze societal debates. Actor identification is, however, rather
+ challenging: Often, the locally mentioned speaker of a claim is only a pronoun
+ ("He proposed that [claim]"), so recovering the canonical actor name requires
+ discourse understanding. We compare a traditional pipeline of dedicated NLP
+ components (similar to those applied to the related task of coreference) with
+ an LLM, which appears to be a good match for this generation task. Evaluating
+ on a corpus of German actors in newspaper reports, we surprisingly find that
+ the LLM performs worse. Further analysis reveals that the LLM is very good at
+ identifying the right reference, but struggles to generate the correct
+ canonical form. This points to an underlying issue in LLMs with controlling
+ their generated output. Indeed, a hybrid model combining the LLM with a
+ classifier to normalize its output substantially outperforms both initial
+ models.
+
+
+ comment: Proceedings of the EACL 2024 workshop on Computational Models of + Discourse (St. Julian's, Malta) +
+
+
+
+
+ + ☆ A Chain-of-Thought Is as Strong as Its Weakest Link: A Benchmark for + Verifiers of Reasoning Chains + + +
+ Prompting language models to provide step-by-step answers (e.g., +"Chain-of-Thought") is the prominent approach for complex reasoning tasks, +where more accurate reasoning chains typically improve downstream task +performance. Recent literature discusses automatic methods to verify reasoning +steps to evaluate and improve their correctness. However, no fine-grained +step-level datasets are available to enable thorough evaluation of such +verification methods, hindering progress in this direction. We introduce +Reveal: Reasoning Verification Evaluation, a new dataset to benchmark automatic +verifiers of complex Chain-of-Thought reasoning in open-domain question +answering settings. Reveal includes comprehensive labels for the relevance, +attribution to evidence passages, and logical correctness of each reasoning +step in a language model's answer, across a wide variety of datasets and +state-of-the-art language models. + +
+
+ comment: https://huggingface.co/datasets/google/reveal +
+
+
+
+
+ + ☆ Superfiltering: Weak-to-Strong Data Filtering for Fast + Instruction-Tuning + + +
+ Instruction tuning is critical to improve LLMs but usually suffers from +low-quality and redundant data. Data filtering for instruction tuning has +proved important in improving both the efficiency and performance of the tuning +process. But it also leads to extra cost and computation due to the involvement +of LLMs in this process. To reduce the filtering cost, we study Superfiltering: +Can we use a smaller and weaker model to select data for finetuning a larger +and stronger model? Despite the performance gap between weak and strong +language models, we find their highly consistent capability to perceive +instruction difficulty and data selection results. This enables us to use a +much smaller and more efficient model to filter the instruction data used to +train a larger language model. Not only does it largely speed up the data +filtering, but the filtered-data-finetuned LLM achieves even better performance +on standard benchmarks. Extensive experiments validate the efficacy and +efficiency of our approach. + +
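+ A sketch of weak-to-strong data filtering under the assumption of an
+ IFD-style difficulty score (the ratio of the weak model's conditioned to
+ unconditioned response perplexity); the paper's exact scorer and keep ratio
+ may differ:
+
+ ```python
+ import math
+
+ def ifd_style_score(cond_logprobs, uncond_logprobs):
+     # High values mean the instruction barely helps the weak model predict
+     # the response, i.e. the example is "hard" and worth keeping.
+     ppl_cond = math.exp(-sum(cond_logprobs) / len(cond_logprobs))
+     ppl_uncond = math.exp(-sum(uncond_logprobs) / len(uncond_logprobs))
+     return ppl_cond / ppl_uncond
+
+ def superfilter(examples, scores, keep_ratio=0.15):
+     # Keep the hardest examples as ranked by the *weak* model's scores;
+     # the finding above is that weak and strong models rank examples
+     # consistently, so the cheap scorer suffices.
+     ranked = sorted(zip(examples, scores), key=lambda t: t[1], reverse=True)
+     return [ex for ex, _ in ranked[: int(len(ranked) * keep_ratio)]]
+ ```
+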
+
+
+
+
+ + ☆ EE-Tuning: An Economical yet Scalable Solution for Tuning Early-Exit + Large Language Models + + +
+ This work introduces EE-Tuning, a lightweight and economical solution to +training/tuning early-exit large language models (LLMs). In contrast to the +common approach of full-parameter pre-training, EE-Tuning augments any +pre-trained (and possibly fine-tuned) standard LLM with additional early-exit +layers that are tuned in a parameter-efficient manner, which requires +significantly less computational resources and training data. Our +implementation of EE-Tuning achieves outstanding training efficiency via +extensive performance optimizations, as well as scalability due to its full +compatibility with 3D parallelism. Results of systematic experiments validate +the efficacy of EE-Tuning, confirming that effective early-exit LLM inference +can be achieved with a limited training budget. In hope of making early-exit +LLMs accessible to the community, we release the source code of our +implementation of EE-Tuning at https://github.com/pan-x-c/EE-LLM. + +
+
+
+
+
+ + ☆ SA-MDKIF: A Scalable and Adaptable Medical Domain Knowledge Injection + Framework for Large Language Models + + +
+ Recent advances in large language models (LLMs) have demonstrated exceptional +performance in various natural language processing (NLP) tasks. However, their +effective application in the medical domain is hampered by a lack of medical +domain knowledge. In this study, we present SA-MDKIF, a scalable and adaptable +framework that aims to inject medical knowledge into general-purpose LLMs +through instruction tuning, thereby enabling adaptability for various +downstream tasks. SA-MDKIF consists of two stages: skill training and skill +adaptation. In the first stage, we define 12 basic medical skills and use +AdaLoRA to train these skills based on uniformly formatted instructional +datasets that we have constructed. In the next stage, we train the skill router +using task-specific downstream data and use this router to integrate the +acquired skills with LLMs during inference. Experimental results on 9 different +medical tasks show that SA-MDKIF improves performance by 10-20% compared to the +original LLMs. Notably, this improvement is particularly pronounced for unseen +medical tasks, showing an improvement of up to 30%. + +
+
+
+
+
+ + ☆ Instruction Makes a Difference + + +
+ We introduce the Instruction Document Visual Question Answering (iDocVQA)
+ dataset and the Large Language Document (LLaDoc) model, for training
+ Language-Vision (LV) models for document analysis and predictions on document
+ images, respectively. Usually, deep neural networks for the DocVQA task are
+ trained on datasets lacking instructions. We show that using
+ instruction-following datasets improves performance. We compare performance
+ across document-related datasets using the recent state-of-the-art (SotA)
+ Large Language and Vision Assistant (LLaVA) 1.5 as the base model. We also
+ evaluate the performance of the derived models for object hallucination using
+ the Polling-based Object Probing Evaluation (POPE) dataset. The results show
+ that instruction-tuning performance ranges from 11x to 32x that of zero-shot
+ performance, and from 0.1% to 4.2% over non-instruction (traditional task)
+ finetuning. Despite the gains, these still fall short of human performance
+ (94.36%), implying there is much room for improvement.
+
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ Improving Dialog Safety using Socially Aware Contrastive Learning SC + + +
+ State-of-the-art conversational AI systems raise concerns due to their +potential risks of generating unsafe, toxic, unethical, or dangerous content. +Previous works have developed datasets to teach conversational agents the +appropriate social paradigms to respond effectively to specifically designed +hazardous content. However, models trained on these adversarial datasets still +struggle to recognize subtle unsafe situations that appear naturally in +conversations or introduce an inappropriate response in a casual context. To +understand the extent of this problem, we study prosociality in both +adversarial and casual dialog contexts and audit the response quality of +general-purpose language models in terms of propensity to produce unsafe +content. We propose a dual-step fine-tuning process to address these issues +using a socially aware n-pair contrastive loss. Subsequently, we train a base +model that integrates prosocial behavior by leveraging datasets like Moral +Integrity Corpus (MIC) and ProsocialDialog. Experimental results on several +dialog datasets demonstrate the effectiveness of our approach in generating +socially appropriate responses. + +
+
+ comment: SCI-CHAT@EACL2024 +
+
+
+
+
+ + ☆ From PARIS to LE-PARIS: Toward Patent Response Automation with + Recommender Systems and Collaborative Large Language Models + + +
+ In patent prosecution, timely and effective responses to Office Actions (OAs) +are crucial for acquiring patents, yet past automation and AI research have +scarcely addressed this aspect. To address this gap, our study introduces the +Patent Office Action Response Intelligence System (PARIS) and its advanced +version, the Large Language Model Enhanced PARIS (LE-PARIS). These systems are +designed to expedite the efficiency of patent attorneys in collaboratively +handling OA responses. The systems' key features include the construction of an +OA Topics Database, development of Response Templates, and implementation of +Recommender Systems and LLM-based Response Generation. Our validation involves +a multi-paradigmatic analysis using the USPTO Office Action database and +longitudinal data of attorney interactions with our systems over six years. +Through five studies, we examine the constructiveness of OA topics (studies 1 +and 2) using topic modeling and the proposed Delphi process, the efficacy of +our proposed hybrid recommender system tailored for OA (both LLM-based and +non-LLM-based) (study 3), the quality of response generation (study 4), and the +practical value of the systems in real-world scenarios via user studies (study +5). Results demonstrate that both PARIS and LE-PARIS significantly meet key +metrics and positively impact attorney performance. + +
+
+ comment: 14 pages, 4 figures, submitted to a journal
+
+
+
+
+
+ + ☆ Prompt-Time Symbolic Knowledge Capture with Large Language Models + + +
+ Augmenting large language models (LLMs) with user-specific knowledge is +crucial for real-world applications, such as personal AI assistants. However, +LLMs inherently lack mechanisms for prompt-driven knowledge capture. This paper +investigates utilizing the existing LLM capabilities to enable prompt-driven +knowledge capture, with a particular emphasis on knowledge graphs. We address +this challenge by focusing on prompt-to-triple (P2T) generation. We explore +three methods: zero-shot prompting, few-shot prompting, and fine-tuning, and +then assess their performance via a specialized synthetic dataset. Our code and +datasets are publicly available at https://github.com/HaltiaAI/paper-PTSKC. + +
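+ A minimal illustration of the few-shot P2T setting described above; the
+ examples and relation names are hypothetical, not the paper's synthetic
+ dataset:
+
+ ```python
+ P2T_TEMPLATE = """Extract a (subject, relation, object) triple from the prompt.
+
+ Prompt: My sister Anna lives in Lisbon.
+ Triple: (Anna, lives_in, Lisbon)
+
+ Prompt: I work at Acme Corp as an engineer.
+ Triple: (user, works_at, Acme Corp)
+
+ Prompt: {prompt}
+ Triple:"""
+
+ def build_p2t_prompt(prompt: str) -> str:
+     # Few-shot prompt-to-triple template: the LLM completes the final
+     # "Triple:" line, which is then parsed into a knowledge-graph edge.
+     return P2T_TEMPLATE.format(prompt=prompt)
+ ```
+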
+
+ comment: 8 pages, 5 figures, 1 table preprint. Under review +
+
+
+
+
+ + ☆ Hidding the Ghostwriters: An Adversarial Evaluation of AI-Generated + Student Essay Detection EMNLP 2023 + + +
+ Large language models (LLMs) have exhibited remarkable capabilities in text +generation tasks. However, the utilization of these models carries inherent +risks, including but not limited to plagiarism, the dissemination of fake news, +and issues in educational exercises. Although several detectors have been +proposed to address these concerns, their effectiveness against adversarial +perturbations, specifically in the context of student essay writing, remains +largely unexplored. This paper aims to bridge this gap by constructing +AIG-ASAP, an AI-generated student essay dataset, employing a range of text +perturbation methods that are expected to generate high-quality essays while +evading detection. Through empirical experiments, we assess the performance of +current AIGC detectors on the AIG-ASAP dataset. The results reveal that the +existing detectors can be easily circumvented using straightforward automatic +adversarial attacks. Specifically, we explore word substitution and sentence +substitution perturbation methods that effectively evade detection while +maintaining the quality of the generated essays. This highlights the urgent +need for more accurate and robust methods to detect AI-generated student essays +in the education domain. + +
+
+ comment: Accepted by EMNLP 2023 Main conference, Oral Presentation +
+
+
+
+
+ + ☆ Investigating Bias Representations in Llama 2 Chat via Activation + Steering + + +
+ We address the challenge of societal bias in Large Language Models (LLMs), +focusing on the Llama 2 7B Chat model. As LLMs are increasingly integrated into +decision-making processes with substantial societal impact, it becomes +imperative to ensure these models do not reinforce existing biases. Our +approach employs activation steering to probe for and mitigate biases related +to gender, race, and religion. This method manipulates model activations to +direct responses towards or away from biased outputs, utilizing steering +vectors derived from the StereoSet dataset and custom GPT4 generated gender +bias prompts. Our findings reveal inherent gender bias in Llama 2 7B Chat, +persisting even after Reinforcement Learning from Human Feedback (RLHF). We +also observe a predictable negative correlation between bias and the model's +tendency to refuse responses. Significantly, our study uncovers that RLHF tends +to increase the similarity in the model's representation of different forms of +societal biases, which raises questions about the model's nuanced understanding +of different forms of bias. This work also provides valuable insights into +effective red-teaming strategies for LLMs using activation steering, +particularly emphasizing the importance of integrating a refusal vector. + +
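+ For readers unfamiliar with the technique, a minimal sketch of activation
+steering follows, assuming a HuggingFace-style causal LM: the steering vector
+is the difference of mean last-token activations on contrastive prompt sets,
+added to the residual stream via a forward hook. The layer index, prompt sets,
+and scaling factor are illustrative, not the paper's configuration.
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+name = "meta-llama/Llama-2-7b-chat-hf"
+tok = AutoTokenizer.from_pretrained(name)
+model = AutoModelForCausalLM.from_pretrained(name)
+LAYER, ALPHA = 13, -1.0  # illustrative layer and (negative) steering strength
+
+@torch.no_grad()
+def mean_last_token_activation(prompts):
+    acts = []
+    for p in prompts:
+        ids = tok(p, return_tensors="pt").input_ids
+        hs = model(ids, output_hidden_states=True).hidden_states[LAYER]
+        acts.append(hs[0, -1])  # residual stream at the final token
+    return torch.stack(acts).mean(0)
+
+# Hypothetical contrastive prompt sets probing a gender-bias direction.
+steer_vec = (mean_last_token_activation(["Women are worse at math."])
+             - mean_last_token_activation(["People are worse at math."]))
+
+def hook(module, inputs, output):
+    # Shift the residual stream along (here, away from) the bias direction.
+    return (output[0] + ALPHA * steer_vec,) + output[1:]
+
+handle = model.model.layers[LAYER].register_forward_hook(hook)
+# ...generate as usual, then handle.remove() to restore the model.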
+
+
+
+
+ + ☆ Efficient Exploration for LLMs + + +
+ We present evidence of substantial benefit from efficient exploration in +gathering human feedback to improve large language models. In our experiments, +an agent sequentially generates queries while fitting a reward model to the +feedback received. Our best-performing agent generates queries using double +Thompson sampling, with uncertainty represented by an epistemic neural network. +Our results demonstrate that efficient exploration enables high levels of +performance with far fewer queries. Further, both uncertainty estimation and +the choice of exploration scheme play critical roles. + +
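+ As a rough illustration of the selection rule, the sketch below implements
+double Thompson sampling over a toy epistemic model (an ensemble of linear
+reward heads standing in for an epistemic neural network); the feature
+dimensions and featurizer are placeholders.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+ensemble = rng.normal(size=(10, 16))  # 10 reward heads over 16-dim features
+
+def double_thompson_pair(cand_feats):
+    """Pick two responses to show the rater; cand_feats: (n_candidates, 16)."""
+    i, j = rng.integers(len(ensemble), size=2)  # two independent posterior draws
+    first = int(np.argmax(cand_feats @ ensemble[i]))
+    scores = cand_feats @ ensemble[j]
+    scores[first] = -np.inf                     # force a distinct second pick
+    return first, int(np.argmax(scores))
+
+pair = double_thompson_pair(rng.normal(size=(8, 16)))  # 8 candidate responses
+
+Sampling the two arms from independent posterior draws is what drives
+exploration: disagreement within the ensemble surfaces informative comparisons.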
+
+
+
+
+ + ☆ Computational Morphology and Lexicography Modeling of Modern Standard + Arabic Nominals EACL 2024 + + +
+ Modern Standard Arabic (MSA) nominals present many morphological and lexical +modeling challenges that have not been consistently addressed previously. This +paper attempts to define the space of such challenges, and leverage a recently +proposed morphological framework to build a comprehensive and extensible model +for MSA nominals. Our model design addresses the nominals' intricate +morphotactics, as well as their paradigmatic irregularities. Our implementation +showcases enhanced accuracy and consistency compared to a commonly used MSA +morphological analyzer and generator. We make our models publicly available. + +
+
+ comment: Findings of the Association for Computational Linguistics: EACL 2024 +
+
+
+
+
+ + ☆ What Does the Bot Say? Opportunities and Risks of Large Language Models + in Social Media Bot Detection + + +
+ Social media bot detection has always been an arms race between advancements +in machine learning bot detectors and adversarial bot strategies to evade +detection. In this work, we bring the arms race to the next level by +investigating the opportunities and risks of state-of-the-art large language +models (LLMs) in social bot detection. To investigate the opportunities, we +design novel LLM-based bot detectors by proposing a +mixture-of-heterogeneous-experts framework to divide and conquer diverse user +information modalities. To illuminate the risks, we explore the possibility of +LLM-guided manipulation of user textual and structured information to evade +detection. Extensive experiments with three LLMs on two datasets demonstrate +that instruction tuning on merely 1,000 annotated examples produces specialized +LLMs that outperform state-of-the-art baselines by up to 9.1% on both datasets, +while LLM-guided manipulation strategies could significantly bring down the +performance of existing bot detectors by up to 29.6% and harm the calibration +and reliability of bot detection systems. + +
+
+
+
+
+ + ☆ Don't Hallucinate, Abstain: Identifying LLM Knowledge Gaps via Multi-LLM + Collaboration + + +
+ Despite efforts to expand the knowledge of large language models (LLMs), +knowledge gaps -- missing or outdated information in LLMs -- might always +persist given the evolving nature of knowledge. In this work, we study +approaches to identify LLM knowledge gaps and abstain from answering questions +when knowledge gaps are present. We first adapt existing approaches to model +calibration or adaptation through fine-tuning/prompting and analyze their +ability to abstain from generating low-confidence outputs. Motivated by their +failures in self-reflection and over-reliance on held-out sets, we propose two +novel approaches that are based on model collaboration, i.e., LLMs probing +other LLMs for knowledge gaps, either cooperatively or competitively. Extensive +experiments with three LLMs on four QA tasks featuring diverse knowledge +domains demonstrate that both cooperative and competitive approaches to +unveiling LLM knowledge gaps achieve up to 19.3% improvements on abstain +accuracy against the strongest baseline. Further analysis reveals that our +proposed mechanisms could help identify failure cases in retrieval augmentation +and pinpoint knowledge gaps in multi-hop reasoning. + +
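+ A minimal sketch of the cooperative flavour of such probing, with a generic
+`ask(model, prompt)` callable standing in for any chat API; the exact protocol
+and prompts in the paper differ.
+
+def answer_or_abstain(question, answerer, reviewer, ask):
+    answer = ask(answerer, f"Answer concisely: {question}")
+    verdict = ask(reviewer,
+                  f"Question: {question}\nProposed answer: {answer}\n"
+                  "Reply YES if the answer is well supported, otherwise NO.")
+    # Abstain (return None) when the probing model flags a knowledge gap.
+    return answer if verdict.strip().upper().startswith("YES") else None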
+
+
+
+
+ + ☆ IndiVec: An Exploration of Leveraging Large Language Models for Media + Bias Detection with Fine-Grained Bias Indicators + + +
+ This study focuses on media bias detection, crucial in today's era of +influential social media platforms shaping individual attitudes and opinions. +In contrast to prior work that primarily relies on training specific models +tailored to particular datasets, resulting in limited adaptability and subpar +performance on out-of-domain data, we introduce a general bias detection +framework, IndiVec, built upon large language models. IndiVec begins by +constructing a fine-grained media bias database, leveraging the robust +instruction-following capabilities of large language models and vector database +techniques. When confronted with new input for bias detection, our framework +automatically selects the most relevant indicator from the vector database and +employs majority voting to determine the input's bias label. IndiVec excels +compared to previous methods due to its adaptability (demonstrating consistent +performance across diverse datasets from various sources) and explainability +(providing explicit top-k indicators to interpret bias predictions). +Experimental results on four political bias datasets highlight IndiVec's +significant superiority over baselines. Furthermore, additional experiments and +analysis provide profound insights into the framework's effectiveness. + +
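+ The retrieve-and-vote step can be pictured in a few lines of code; the sketch
+below uses sentence-transformers for embeddings and a hand-written indicator
+database, both stand-ins for the paper's LLM-generated, vector-database-backed
+setup.
+
+from collections import Counter
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+encoder = SentenceTransformer("all-MiniLM-L6-v2")
+indicators = [("frames immigration as an invasion", "right"),
+              ("cites only union-aligned sources", "left"),
+              ("quotes both parties' statements evenly", "center")]
+ind_vecs = encoder.encode([text for text, _ in indicators])
+
+def detect_bias(article, k=2):
+    q = encoder.encode([article])[0]
+    sims = ind_vecs @ q / (np.linalg.norm(ind_vecs, axis=1) * np.linalg.norm(q))
+    top = np.argsort(-sims)[:k]                # most relevant indicators
+    votes = Counter(indicators[i][1] for i in top)
+    return votes.most_common(1)[0][0]          # majority-vote bias label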
+
+
+
+
+ + ☆ Bias in Opinion Summarisation from Pre-training to Adaptation: A Case + Study in Political Bias EACL 2024 + + +
+ Opinion summarisation aims to summarise the salient information and opinions
+presented in documents such as product reviews, discussion forums, and social
+media texts into short summaries that enable users to effectively understand
+the opinions therein. Generating biased summaries risks swaying public opinion.
+Previous studies examined bias in opinion summarisation using extractive
+models, but little research has paid attention to abstractive summarisation
+models. In this study, using political bias as a case study, we first establish
+a methodology to quantify bias in abstractive models, then trace it from the
+pre-trained models to the task of summarising social media opinions using
+different models and adaptation methods. We find that most models exhibit
+intrinsic bias. Using a social media text summarisation dataset and contrasting
+various adaptation methods, we find that tuning a smaller number of parameters
+is less biased compared to standard fine-tuning; however, the diversity of
+topics in the training data used for fine-tuning is critical.
+
+
+ comment: 15 pages, 1 figure, 6 tables, Accepted to EACL 2024 +
+
+
+
+
+ + ☆ A Crucial Parameter for Rank-Frequency Relation in Natural Languages + + +
+ The formulation $f \propto r^{-\alpha} \cdot (r+\gamma)^{-\beta}$ has been
+shown empirically to model the rank-frequency ($r$-$f$) relation of words in
+natural languages more precisely than a na\"ive power law
+$f \propto r^{-\alpha}$. This work shows that the only crucial parameter in the
+formulation is $\gamma$, which captures the resistance to vocabulary growth in
+a corpus. A method of parameter estimation that searches for an optimal
+$\gamma$ is proposed, in which a ``zeroth word'' is introduced as a technical
+device for the calculation. The formulation and parameters are further
+discussed through several case studies.
+
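+ A minimal sketch of fitting the two-exponent form in log space with scipy on
+synthetic data; the paper's actual estimation procedure (searching $\gamma$
+with a ``zeroth word'') is only loosely mirrored here.
+
+import numpy as np
+from scipy.optimize import curve_fit
+
+def log_f(log_r, log_c, alpha, beta, gamma):
+    r = np.exp(log_r)
+    return log_c - alpha * np.log(r) - beta * np.log(r + gamma)
+
+ranks = np.arange(1, 1001)
+freqs = 1e4 * ranks**-0.3 * (ranks + 5.0)**-0.8  # synthetic rank-frequency data
+params, _ = curve_fit(log_f, np.log(ranks), np.log(freqs),
+                      p0=[np.log(freqs[0]), 0.5, 0.5, 1.0],
+                      bounds=([-np.inf, 0, 0, 0], [np.inf, 5, 5, 100]))
+print(dict(zip(["log_c", "alpha", "beta", "gamma"], params.round(3))))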
+
+
+
+
+ + ☆ Does \textsc{DetectGPT} Fully Utilize Perturbation? Selective + Perturbation on Model-Based Contrastive Learning Detector would be Better + + +
+ The burgeoning capabilities of large language models (LLMs) have raised
+growing concerns about abuse. DetectGPT, a zero-shot metric-based unsupervised
+machine-generated text detector, first introduces perturbation and shows great
+performance improvement. However, DetectGPT's random perturbation strategy
+might introduce noise, limiting the distinguishability and further performance
+improvements. Moreover, its logit regression module relies on setting a
+threshold, which harms its generalizability and its applicability to individual
+or small-batch inputs. Hence, we propose a novel detector, \modelname{}, which
+uses a selective perturbation strategy to reduce the loss of important
+information caused by random masking, and multi-pair contrastive learning to
+capture the implicit pattern information during perturbation, improving
+few-shot performance. The experiments show that \modelname{} outperforms the
+SOTA method by 1.20\% in accuracy on average on four public datasets. We
+further analyze the effectiveness, robustness, and generalization of our
+perturbation method.
+
+
+
+
+
+ + ☆ A Survey on Hallucination in Large Vision-Language Models + + +
+ Recent development of Large Vision-Language Models (LVLMs) has attracted
+growing attention within the AI landscape for its practical implementation
+potential. However, ``hallucination'', or more specifically, the misalignment
+between factual visual content and corresponding textual generation, poses a
+significant challenge to utilizing LVLMs. In this comprehensive survey, we
+dissect LVLM-related hallucinations in an attempt to establish an overview and
+facilitate future mitigation. Our scrutiny starts with a clarification of the
+concept of hallucinations in LVLMs, presenting a variety of hallucination
+symptoms and highlighting the unique challenges inherent in LVLM
+hallucinations. Subsequently, we outline the benchmarks and methodologies
+tailored specifically for evaluating hallucinations unique to LVLMs.
+Additionally, we delve into an investigation of the root causes of these
+hallucinations, encompassing insights from the training data and model
+components. We also critically review existing methods for mitigating
+hallucinations. The open questions and future directions pertaining to
+hallucinations within LVLMs are discussed to conclude this survey.
+
+
+
+
+
+ + ☆ Efficient Non-Parametric Uncertainty Quantification for Black-Box Large + Language Models and Decision Planning + + +
+ Step-by-step decision planning with large language models (LLMs) is gaining
+attention in AI agent development. This paper focuses on decision planning with
+uncertainty estimation to address the hallucination problem in language models.
+Existing approaches are either white-box or computationally demanding, limiting
+the use of black-box proprietary LLMs within budget constraints. The paper's
+first contribution is a non-parametric uncertainty quantification method for
+LLMs, efficiently estimating point-wise dependencies between input and decision
+on the fly with a single inference, without access to token logits. This
+estimator informs the statistical interpretation of decision trustworthiness.
+The second contribution outlines a systematic design for a decision-making
+agent, generating actions like ``turn on the bathroom light'' based on user
+prompts such as ``take a bath''. Users will be asked to provide preferences
+when more than one action has high estimated point-wise dependencies. In
+conclusion, our uncertainty estimation and decision-making agent design offer a
+cost-efficient approach for AI agent development.
+
+
+
+
+
+ + ☆ Evaluation Methodology for Large Language Models for Multilingual + Document Question and Answer + + +
+ With the widespread adoption of Large Language Models (LLMs), in this paper
+we investigate their multilingual capability. Our preliminary results show
+that translating the native-language context, question, and answer into a
+high-resource language produces the best results.
+
+
+
+
+
+ + ☆ Plan-Grounded Large Language Models for Dual Goal Conversational + Settings + + +
+ Training Large Language Models (LLMs) to follow user instructions has been +shown to supply the LLM with ample capacity to converse fluently while being +aligned with humans. Yet, it is not completely clear how an LLM can lead a +plan-grounded conversation in mixed-initiative settings where instructions flow +in both directions of the conversation, i.e. both the LLM and the user provide +instructions to one another. In this paper, we tackle a dual goal +mixed-initiative conversational setting where the LLM not only grounds the +conversation on an arbitrary plan but also seeks to satisfy both a procedural +plan and user instructions. The LLM is then responsible for guiding the user +through the plan and, at the same time, adapting to new circumstances, +answering questions, and activating safety guardrails when needed. We propose a +novel LLM that grounds the dialogue on a procedural plan, can take the dialogue +initiative, and enforces guardrails on the system's behavior, while also +improving the LLM's responses to unexpected user behavior. Experiments in +controlled settings and with real users show that the best-performing model, +which we call PlanLLM, achieves a 2.1x improvement over a strong baseline. +Moreover, experiments also show good generalization to unseen domains. + +
+
+
+
+
+ + ☆ Generation, Distillation and Evaluation of Motivational + Interviewing-Style Reflections with a Foundational Language Model EACL 2024 + + +
+ Large Foundational Language Models are capable of performing many tasks at a +high level but are difficult to deploy in many applications because of their +size and proprietary ownership. Many will be motivated to distill specific +capabilities of foundational models into smaller models that can be owned and +controlled. In the development of a therapeutic chatbot, we wish to distill a +capability known as reflective listening, in which a therapist produces +reflections of client speech. These reflections either restate what a client +has said, or connect what was said to a relevant observation, idea or guess +that encourages and guides the client to continue contemplation. In this paper, +we present a method for distilling the generation of reflections from a +Foundational Language Model (GPT-4) into smaller models. We first show that +GPT-4, using zero-shot prompting, can generate reflections at near 100% success +rate, superior to all previous methods. Using reflections generated by GPT-4, +we fine-tune different sizes of the GPT-2 family. The GPT-2-small model +achieves 83% success on a hold-out test set and the GPT-2 XL achieves 90% +success. We also show that GPT-4 can help in the labor-intensive task of +evaluating the quality of the distilled models, using it as a zero-shot +classifier. Using triple-human review as a guide, the classifier achieves a +Cohen-Kappa of 0.66, a substantial inter-rater reliability figure. + +
+
+ comment: Accepted to EACL 2024 Long Paper +
+
+
+
+
+ + ☆ Getting the most out of your tokenizer for pre-training and domain + adaptation + + +
+ Tokenization is an understudied and often neglected component of modern LLMs.
+Most published works use a single tokenizer for all experiments, often borrowed
+from another model, without performing ablations or analysis to optimize
+tokenization. Moreover, the tokenizer is generally kept unchanged when
+fine-tuning a base model. In this paper, we show that the size,
+pre-tokenization regular expression, and training data of a tokenizer can
+significantly impact the model's generation speed, effective context size,
+memory usage, and downstream performance. We train specialized Byte-Pair
+Encoding code tokenizers, and conduct extensive ablations on the impact of
+tokenizer design on the performance of LLMs for code generation tasks such as
+HumanEval and MBPP, and provide recommendations for tokenizer hyper-parameter
+selection and switching the tokenizer in a pre-trained LLM. We perform our
+experiments on models trained from scratch and from pre-trained models,
+verifying their applicability to a wide range of use-cases. We find that when
+fine-tuning on more than 50 billion tokens, we can specialize the tokenizer of
+a pre-trained LLM to obtain large gains in generation speed and effective
+context size.
+
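+ For concreteness, a minimal sketch of training one such code tokenizer with
+the HuggingFace `tokenizers` library; the vocabulary size, pre-tokenizer, and
+corpus path are placeholders rather than the paper's recommended settings.
+
+from tokenizers import Tokenizer, models, pre_tokenizers, trainers
+
+tokenizer = Tokenizer(models.BPE())
+# The pre-tokenization scheme is one of the design axes the paper ablates.
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+trainer = trainers.BpeTrainer(vocab_size=32_000,
+                              special_tokens=["<|endoftext|>"])
+tokenizer.train(files=["code_corpus.txt"], trainer=trainer)
+tokenizer.save("code-bpe.json")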
+
+
+
+
+ + ☆ Repeat After Me: Transformers are Better than State Space Models at + Copying + + +
+ Transformers are the dominant architecture for sequence modeling, but there +is growing interest in models that use a fixed-size latent state that does not +depend on the sequence length, which we refer to as "generalized state space +models" (GSSMs). In this paper we show that while GSSMs are promising in terms +of inference-time efficiency, they are limited compared to transformer models +on tasks that require copying from the input context. We start with a +theoretical analysis of the simple task of string copying and prove that a two +layer transformer can copy strings of exponential length while GSSMs are +fundamentally limited by their fixed-size latent state. Empirically, we find +that transformers outperform GSSMs in terms of efficiency and generalization on +synthetic tasks that require copying the context. Finally, we evaluate +pretrained large language models and find that transformer models dramatically +outperform state space models at copying and retrieving information from +context. Taken together, these results suggest a fundamental gap between +transformers and GSSMs on tasks of practical interest. + +
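+ The copying probe is easy to reproduce in spirit: generate random strings and
+score exact-match reproduction for any `generate(prompt)` callable. The prompt
+wording and string length below are arbitrary choices, not the paper's setup.
+
+import random, string
+
+def copy_accuracy(generate, n=100, length=50):
+    hits = 0
+    for _ in range(n):
+        s = "".join(random.choices(string.ascii_lowercase, k=length))
+        prompt = f"Repeat the following string exactly.\n{s}\n"
+        hits += generate(prompt).strip() == s   # exact-match copying
+    return hits / n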
+
+
+
+
+ + ☆ Executable Code Actions Elicit Better LLM Agents + + +
+ Large Language Model (LLM) agents, capable of performing a broad range of +actions, such as invoking tools and controlling robots, show great potential in +tackling real-world challenges. LLM agents are typically prompted to produce +actions by generating JSON or text in a pre-defined format, which is usually +limited by constrained action space (e.g., the scope of pre-defined tools) and +restricted flexibility (e.g., inability to compose multiple tools). This work +proposes to use executable Python code to consolidate LLM agents' actions into +a unified action space (CodeAct). Integrated with a Python interpreter, CodeAct +can execute code actions and dynamically revise prior actions or emit new +actions upon new observations through multi-turn interactions. Our extensive +analysis of 17 LLMs on API-Bank and a newly curated benchmark shows that +CodeAct outperforms widely used alternatives (up to 20% higher success rate). +The encouraging performance of CodeAct motivates us to build an open-source LLM +agent that interacts with environments by executing interpretable code and +collaborates with users using natural language. To this end, we collect an +instruction-tuning dataset CodeActInstruct that consists of 7k multi-turn +interactions using CodeAct. We show that it can be used with existing data to +improve models in agent-oriented tasks without compromising their general +capability. CodeActAgent, finetuned from Llama2 and Mistral, is integrated with +Python interpreter and uniquely tailored to perform sophisticated tasks (e.g., +model training) using existing libraries and autonomously self-debug. + +
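+ The core loop of a code-as-action agent can be sketched in a few lines; `llm`
+below stands in for any chat-completion call, and the unsandboxed `exec` is for
+illustration only (CodeAct itself runs code through a Python interpreter with
+multi-turn feedback).
+
+import io, contextlib, traceback
+
+def run(code, env):
+    buf = io.StringIO()
+    try:
+        with contextlib.redirect_stdout(buf):
+            exec(code, env)                 # unsandboxed: illustration only
+    except Exception:
+        buf.write(traceback.format_exc())   # errors become observations too
+    return buf.getvalue()
+
+def agent_loop(task, llm, max_turns=5):
+    env, history = {}, [f"Task: {task}"]
+    for _ in range(max_turns):
+        code = llm("\n".join(history) + "\nWrite Python to make progress:")
+        obs = run(code, env)
+        history += [f"Action:\n{code}", f"Observation:\n{obs}"]
+        if "DONE" in obs:                   # convention: print DONE when finished
+            break
+    return history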
+
+ comment: Code, data, model, and demo are available at + https://github.com/xingyaoww/code-act +
+
+
+
+
+ + ☆ Graph-based Clustering for Detecting Semantic Change Across Time and + Languages EACL2024 + + +
+ Despite the predominance of contextualized embeddings in NLP, approaches to +detect semantic change relying on these embeddings and clustering methods +underperform simpler counterparts based on static word embeddings. This stems +from the poor quality of the clustering methods to produce sense clusters -- +which struggle to capture word senses, especially those with low frequency. +This issue hinders the next step in examining how changes in word senses in one +language influence another. To address this issue, we propose a graph-based +clustering approach to capture nuanced changes in both high- and low-frequency +word senses across time and languages, including the acquisition and loss of +these senses over time. Our experimental results show that our approach +substantially surpasses previous approaches in the SemEval2020 binary +classification task across four languages. Moreover, we showcase the ability of +our approach as a versatile visualization tool to detect semantic changes in +both intra-language and inter-language setups. We make our code and data +publicly available. + +
+
+ comment: EACL2024 Camera Ready (20 pages) +
+
+
+
+
+ + ☆ Domain-Independent Deception: A New Taxonomy and Linguistic Analysis + + +
+ Internet-based economies and societies are drowning in deceptive attacks. +These attacks take many forms, such as fake news, phishing, and job scams, +which we call ``domains of deception.'' Machine-learning and +natural-language-processing researchers have been attempting to ameliorate this +precarious situation by designing domain-specific detectors. Only a few recent +works have considered domain-independent deception. We collect these disparate +threads of research and investigate domain-independent deception. First, we +provide a new computational definition of deception and break down deception +into a new taxonomy. Then, we analyze the debate on linguistic cues for +deception and supply guidelines for systematic reviews. Finally, we investigate +common linguistic features and give evidence for knowledge transfer across +different forms of deception. + +
+
+ comment: 33 pages. arXiv admin note: text overlap with arXiv:2207.01738 +
+
+
+
+
+ + ☆ HR-MultiWOZ: A Task Oriented Dialogue (TOD) Dataset for HR LLM Agent + + +
+ Recent advancements in Large Language Models (LLMs) have been reshaping
+Natural Language Processing (NLP) tasks in several domains. Their use in the
+field of Human Resources (HR) still has room for expansion and could be
+beneficial for several time-consuming tasks. Examples such as time-off
+submissions, medical claims filing, and access requests are noteworthy, but
+they are by no means the sole instances. However, the aforementioned
+developments must grapple with the pivotal challenge of constructing a
+high-quality training dataset. On one hand, most conversation datasets address
+problems for customers, not employees. On the other hand, gathering
+conversations with HR could raise privacy concerns. To address this, we
+introduce HR-Multiwoz, a fully-labeled dataset of 550 conversations spanning 10
+HR domains to evaluate LLM agents. Our work has the following contributions:
+(1) It is the first labeled open-sourced conversation dataset in the HR domain
+for NLP research. (2) It provides a detailed recipe for the data generation
+procedure along with data analysis and human evaluations. The data generation
+pipeline is transferable and can be easily adapted for labeled conversation
+data generation in other domains. (3) The proposed data-collection pipeline is
+mostly based on LLMs with minimal human involvement for annotation, which is
+time- and cost-efficient.
+
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ An Information-Theoretic Approach to Analyze NLP Classification Tasks + + +
+ Understanding the importance of the inputs on the output is useful across
+many tasks. This work provides an information-theoretic framework to analyse
+the influence of inputs for text classification tasks. Natural language
+processing (NLP) tasks take either a single element input or multiple element
+inputs to predict an output variable, where an element is a block of text. Each
+text element has two components: an associated semantic meaning and a
+linguistic realization. Multiple-choice reading comprehension (MCRC) and
+sentiment classification (SC) are selected to showcase the framework. For MCRC,
+it is found that the influence of the context on the output, relative to the
+influence of the question, decreases on more challenging datasets. In
+particular, more challenging contexts allow a greater variation in the
+complexity of questions. Hence, test creators need to carefully consider the
+choice of the context when designing multiple-choice questions for assessment.
+For SC, it is found that the semantic meaning of the input text dominates
+(above 80\% for all datasets considered) compared to its linguistic realisation
+when determining the sentiment. The framework is made available at:
+https://github.com/WangLuran/nlp-element-influence
+
+
+ comment: 21 pages, 10 figures, 11 tables +
+
+
+
+
+ + ☆ SPARQL Generation with Entity Pre-trained GPT for KG Question Answering SP + + +
+ The popularity of Knowledge Graphs has grown rapidly in recent years. All of
+that knowledge is available for people to query through the many online
+databases on the internet. Still, it would be a great achievement if
+non-programmer users could access whatever information they wish to know. Much
+effort has been directed at solving this task using natural language processing
+tools, with creativity encouraged by way of many challenges. Our approach
+focuses on assuming a correct entity linking on the natural language questions
+and training a GPT model to create SPARQL queries from them. We isolate the
+property of the task that is most difficult to solve at few or zero-shot and
+propose pre-training on all entities (under the closed-world assumption, CWA)
+to improve performance. We obtain 62.703% accuracy of exact SPARQL matches on
+testing at 3 shots, an F1 of 0.809 on the entity linking challenge, and an F1
+of 0.009 on the question answering challenge.
+
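+ The few-shot setup can be pictured as follows; the Wikidata examples and the
+entity-linking markup are illustrative, not the paper's exact prompt.
+
+SHOTS = """Q: Who directed [Inception](wd:Q25188)?
+SPARQL: SELECT ?x WHERE { wd:Q25188 wdt:P57 ?x . }
+
+Q: When was [Marie Curie](wd:Q7186) born?
+SPARQL: SELECT ?x WHERE { wd:Q7186 wdt:P569 ?x . }
+
+Q: What is the capital of [Chile](wd:Q298)?
+SPARQL: SELECT ?x WHERE { wd:Q298 wdt:P36 ?x . }
+"""
+
+def make_prompt(linked_question):
+    # Entity linking is assumed solved: mentions arrive annotated with IRIs.
+    return SHOTS + f"\nQ: {linked_question}\nSPARQL:"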
+
+ comment: 7 pages, 1 figure, 2 tables. For the implementation, see + https://github.com/DiegoEmilio01/SPARQL-generation-with-entity-pre-trained-GPT-for-KG-Question-Answering +
+
+
+
+
+ + ☆ Exploring Spatial Schema Intuitions in Large Language and Vision Models + + +
+ Despite the ubiquity of large language models (LLMs) in AI research, the +question of embodiment in LLMs remains underexplored, distinguishing them from +embodied systems in robotics where sensory perception directly informs physical +action. Our investigation navigates the intriguing terrain of whether LLMs, +despite their non-embodied nature, effectively capture implicit human +intuitions about fundamental, spatial building blocks of language. We employ +insights from spatial cognitive foundations developed through early +sensorimotor experiences, guiding our exploration through the reproduction of +three psycholinguistic experiments. Surprisingly, correlations between model +outputs and human responses emerge, revealing adaptability without a tangible +connection to embodied experiences. Notable distinctions include polarized +language model responses and reduced correlations in vision language models. +This research contributes to a nuanced understanding of the interplay between +language, spatial experiences, and the computations made by large language +models. More at https://cisnlp.github.io/Spatial_Schemas/ + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Institutional Platform for Secure Self-Service Large Language Model + Exploration + + +
+ This paper introduces a user-friendly platform developed by the University of +Kentucky Center for Applied AI, designed to make large, customized language +models (LLMs) more accessible. By capitalizing on recent advancements in +multi-LoRA inference, the system efficiently accommodates custom adapters for a +diverse range of users and projects. The paper outlines the system's +architecture and key features, encompassing dataset curation, model training, +secure inference, and text-based feature extraction. + We illustrate the establishment of a tenant-aware computational network using +agent-based methods, securely utilizing islands of isolated resources as a +unified system. The platform strives to deliver secure LLM services, +emphasizing process and data isolation, end-to-end encryption, and role-based +resource authentication. This contribution aligns with the overarching goal of +enabling simplified access to cutting-edge AI models and technology in support +of scientific discovery. + +
+
+ comment: 10 pages, 11 figures, 5 listings, 4 tables
+
+
+
+
+
+ + ♻ ☆ RLHF and IIA: Perverse Incentives + + +
+ Existing algorithms for reinforcement learning from human feedback (RLHF) can +incentivize responses at odds with preferences because they are based on models +that assume independence of irrelevant alternatives (IIA). The perverse +incentives induced by IIA hinder innovations on query formats and learning +algorithms. + +
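+ For intuition, the IIA property can be stated concretely for the standard
+softmax (Plackett-Luce) choice model that underlies most reward modeling; this
+is textbook notation, not the paper's:
+
+$$P(y \mid C) = \frac{e^{r(y)}}{\sum_{y' \in C} e^{r(y')}}
+\;\;\Longrightarrow\;\;
+\frac{P(a \mid C)}{P(b \mid C)} = e^{r(a)-r(b)}
+\quad \text{for any } C \supseteq \{a, b\},$$
+
+so the model's relative preference between two responses cannot depend on
+which other candidates were offered, a constraint that real human preferences
+need not satisfy.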
+
+
+
+
+ + ♻ ☆ Engineering A Large Language Model From Scratch + + +
+ The proliferation of deep learning in natural language processing (NLP) has +led to the development and release of innovative technologies capable of +understanding and generating human language with remarkable proficiency. +Atinuke, a Transformer-based neural network, optimises performance across +various language tasks by utilising a unique configuration. The architecture +interweaves layers for processing sequential data with attention mechanisms to +draw meaningful affinities between inputs and outputs. Due to the configuration +of its topology and hyperparameter tuning, it can emulate human-like language +by extracting features and learning complex mappings. Atinuke is modular, +extensible, and integrates seamlessly with existing machine learning pipelines. +Advanced matrix operations like softmax, embeddings, and multi-head attention +enable nuanced handling of textual, acoustic, and visual signals. By unifying +modern deep learning techniques with software design principles and +mathematical theory, the system achieves state-of-the-art results on natural +language tasks whilst remaining interpretable and robust. + +
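+ Since the abstract names multi-head attention as a core operation, a minimal
+generic sketch follows; the dimensions are illustrative and unrelated to
+Atinuke's actual configuration, which the abstract does not disclose.
+
+import torch, torch.nn as nn
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, d_model=512, n_heads=8):
+        super().__init__()
+        self.h, self.d = n_heads, d_model // n_heads
+        self.qkv = nn.Linear(d_model, 3 * d_model)   # fused Q, K, V projection
+        self.out = nn.Linear(d_model, d_model)
+
+    def forward(self, x):                            # x: (batch, seq, d_model)
+        b, s, _ = x.shape
+        q, k, v = self.qkv(x).chunk(3, dim=-1)
+        q, k, v = (t.view(b, s, self.h, self.d).transpose(1, 2)
+                   for t in (q, k, v))
+        att = torch.softmax(q @ k.transpose(-2, -1) / self.d**0.5, dim=-1)
+        return self.out((att @ v).transpose(1, 2).reshape(b, s, -1))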
+
+
+
+
+ + ♻ ☆ Revisiting the Role of Language Priors in Vision-Language Models + + +
+ Vision-language models (VLMs) are impactful in part because they can be +applied to a variety of visual understanding tasks in a zero-shot fashion, +without any fine-tuning. We study $\textit{generative VLMs}$ that are trained +for next-word generation given an image. We explore their zero-shot performance +on the illustrative task of image-text retrieval across 8 popular +vision-language benchmarks. Our first observation is that they can be +repurposed for discriminative tasks (such as image-text retrieval) by simply +computing the match score of generating a particular text string given an +image. We call this probabilistic score the $\textit{Visual Generative +Pre-Training Score}$ (VisualGPTScore). While the VisualGPTScore produces +near-perfect accuracy on some retrieval benchmarks, it yields poor accuracy on +others. We analyze this behavior through a probabilistic lens, pointing out +that some benchmarks inadvertently capture unnatural language distributions by +creating adversarial but unlikely text captions. In fact, we demonstrate that +even a "blind" language model that ignores any image evidence can sometimes +outperform all prior art, reminiscent of similar challenges faced by the +visual-question answering (VQA) community many years ago. We derive a +probabilistic post-processing scheme that controls for the amount of linguistic +bias in generative VLMs at test time without having to retrain or fine-tune the +model. We show that the VisualGPTScore, when appropriately debiased, is a +strong zero-shot baseline for vision-language understanding, oftentimes +producing state-of-the-art accuracy. + +
+
+ comment: Website: https://linzhiqiu.github.io/papers/visual_gpt_score/ +
+
+
+
+
+ + ♻ ☆ Distilling Mathematical Reasoning Capabilities into Small Language + Models + + +
+ This work addresses the challenge of democratizing advanced Large Language +Models (LLMs) by compressing their mathematical reasoning capabilities into +sub-billion parameter Small Language Models (SLMs) without compromising +performance. We introduce Equation-of-Thought Distillation (EoTD), a novel +technique that encapsulates the reasoning process into equation-based +representations to construct an EoTD dataset for fine-tuning SLMs. +Additionally, we propose the Ensemble Thoughts Distillation (ETD) framework to +enhance the reasoning performance of SLMs. This involves creating a reasoning +dataset with multiple thought processes, including Chain-of-Thought (CoT), +Program-of-Thought (PoT), and Equation-of-Thought (EoT), and using it for +fine-tuning. Our experimental findings demonstrate that EoTD significantly +boosts the reasoning abilities of SLMs, while ETD enables these models to +achieve state-of-the-art reasoning performance. + +
+
+
+
+
+ + ♻ ☆ Stars Are All You Need: A Distantly Supervised Pyramid Network for + Unified Sentiment Analysis + + +
+ Data for the Rating Prediction (RP) sentiment analysis task, such as star
+reviews, are readily available. However, data for aspect-category detection
+(ACD) and aspect-category sentiment analysis (ACSA) are often desired because
+of their fine-grained nature but are expensive to collect. In this work, we
+propose Unified Sentiment Analysis (Uni-SA) to understand aspect and review
+sentiment in a unified manner. Specifically, we propose a Distantly Supervised
+Pyramid Network (DSPN) to efficiently perform ACD, ACSA, and RP using only RP
+labels for training. We evaluate DSPN on multi-aspect review datasets in
+English and Chinese and find that, in addition to its sample-size efficiency,
+DSPN performs comparably well to a variety of benchmark models. We also
+demonstrate the interpretability of DSPN's outputs on reviews to show the
+pyramid structure inherent in unified sentiment analysis.
+
+
+ comment: 15 pages, 3 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Leveraging Open Information Extraction for More Robust Domain Transfer + of Event Trigger Detection EACL 2024 + + +
+ Event detection is a crucial information extraction task in many domains, +such as Wikipedia or news. The task typically relies on trigger detection (TD) +-- identifying token spans in the text that evoke specific events. While the +notion of triggers should ideally be universal across domains, domain transfer +for TD from high- to low-resource domains results in significant performance +drops. We address the problem of negative transfer in TD by coupling triggers +between domains using subject-object relations obtained from a rule-based open +information extraction (OIE) system. We demonstrate that OIE relations injected +through multi-task training can act as mediators between triggers in different +domains, enhancing zero- and few-shot TD domain transfer and reducing +performance drops, in particular when transferring from a high-resource source +domain (Wikipedia) to a low(er)-resource target domain (news). Additionally, we +combine this improved transfer with masked language modeling on the target +domain, observing further TD transfer gains. Finally, we demonstrate that the +gains are robust to the choice of the OIE system. + +
+
+ comment: Accepted at EACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ CARTIER: Cartographic lAnguage Reasoning Targeted at Instruction + Execution for Robots + + +
+ This work explores the capacity of large language models (LLMs) to address
+problems at the intersection of spatial planning and natural language
+interfaces for navigation. We focus on following complex instructions that are
+more akin to natural conversation than traditional explicit procedural
+directives typically seen in robotics. Unlike most prior work where navigation
+directives are provided as simple imperative commands (e.g., "go to the
+fridge"), we examine implicit directives obtained through conversational
+interactions. We leverage the 3D simulator AI2Thor to create household query
+scenarios at scale, and augment it by adding complex language queries for 40
+object types. We demonstrate that a robot using our method CARTIER
+(Cartographic lAnguage Reasoning Targeted at Instruction Execution for Robots)
+can parse descriptive language queries up to 42% more reliably than existing
+LLM-enabled methods by exploiting the ability of LLMs to interpret the user
+interaction in the context of the objects in the scenario.
+
+
+
+
+
+ + ♻ ☆ Multi-Relational Hyperbolic Word Embeddings from Natural Language + Definitions EACL 2024 + + +
+ Natural language definitions possess a recursive, self-explanatory semantic
+structure that can support representation learning methods able to preserve
+explicit conceptual relations and constraints in the latent space. This paper
+presents a multi-relational model that explicitly leverages such a structure to
+derive word embeddings from definitions. By automatically extracting the
+relations linking defined and defining terms from dictionaries, we demonstrate
+how the problem of learning word embeddings can be formalised via a
+translational framework in Hyperbolic space and used as a proxy to capture the
+global semantic structure of definitions. An extensive empirical analysis
+demonstrates that the framework can help impose the desired structural
+constraints while preserving the semantic mapping required for controllable and
+interpretable traversal. Moreover, the experiments reveal the superiority of
+the Hyperbolic word embeddings over the Euclidean counterparts and demonstrate
+that the multi-relational approach can obtain competitive results when compared
+to state-of-the-art neural models, with the advantage of being intrinsically
+more efficient and interpretable.
+
+
+ comment: Accepted at the 18th Conference of the European Chapter of the + Association for Computational Linguistics (EACL 2024) +
+
+
+
+
+ + ♻ ☆ Scaling up Discovery of Latent Concepts in Deep NLP Models EACL 2024 + + +
+ Despite the revolution caused by deep NLP models, they remain black boxes, +necessitating research to understand their decision-making processes. A recent +work by Dalvi et al. (2022) carried out representation analysis through the +lens of clustering latent spaces within pre-trained models (PLMs), but that +approach is limited to small scale due to the high cost of running +Agglomerative hierarchical clustering. This paper studies clustering algorithms +in order to scale the discovery of encoded concepts in PLM representations to +larger datasets and models. We propose metrics for assessing the quality of +discovered latent concepts and use them to compare the studied clustering +algorithms. We found that K-Means-based concept discovery significantly +enhances efficiency while maintaining the quality of the obtained concepts. +Furthermore, we demonstrate the practicality of this newfound efficiency by +scaling latent concept discovery to LLMs and phrasal concepts. + +
+
+ comment: 14 pages, accepted to The 18th Conference of the European Chapter of + the Association for Computational Linguistics (EACL 2024) +
+
+
+
+
+ + ♻ ☆ LinguAlchemy: Fusing Typological and Geographical Elements for Unseen + Language Generalization + + +
+ Pretrained language models (PLMs) have shown remarkable generalization toward
+multiple tasks and languages. Nonetheless, the generalization of PLMs towards
+unseen languages is poor, resulting in significantly worse language
+performance, or even the generation of nonsensical responses comparable to a
+random baseline. This limitation has been a longstanding problem of PLMs,
+raising the issue of diversity and equal access to language modeling
+technology. In this work, we address this limitation by introducing
+LinguAlchemy, a regularization technique that incorporates typological,
+geographical, and phylogenetic aspects of languages, constraining the resulting
+representations of PLMs to better characterize the corresponding linguistic
+constraints. LinguAlchemy significantly improves the accuracy of mBERT and
+XLM-R on unseen languages by ~18% and ~2%, respectively, compared to fully
+finetuned models, and displays a high degree of unseen language generalization.
+We further introduce AlchemyScale and AlchemyTune, extensions of LinguAlchemy
+that adjust the linguistic regularization weights automatically, alleviating
+the need for hyperparameter search. LinguAlchemy enables better cross-lingual
+generalization to unseen languages, which is vital for better inclusivity and
+accessibility of PLMs.
+
+
+
+
+
+ + ♻ ☆ Small Language Models Improve Giants by Rewriting Their Outputs EACL 2024 + + +
+ Despite the impressive performance of large language models (LLMs), they +often lag behind specialized models in various tasks. LLMs only use a fraction +of the existing training data for in-context learning, while task-specific +models harness the full dataset for fine-tuning. In this work, we tackle the +problem of leveraging training data to improve the performance of LLMs without +fine-tuning. Our approach directly targets LLM predictions without requiring +access to their weights. We create a pool of candidates from the LLM through +few-shot prompting and we employ a compact model, the LM-corrector (LMCor), +specifically trained to merge these candidates to produce an enhanced output. +Our experiments on four natural language generation tasks demonstrate that even +a small LMCor model (250M) substantially improves the few-shot performance of +LLMs (62B), matching and even outperforming standard fine-tuning. Furthermore, +we illustrate the robustness of LMCor against different prompts, thereby +minimizing the need for extensive prompt engineering. Finally, we show that +LMCor can be seamlessly integrated with different LLMs at inference, serving as +a plug-and-play module to improve their performance. + +
+
+ comment: Accepted at EACL 2024 +
+
+
+
+
+ + ♻ ☆ SELF: Self-Evolution with Language Feedback + + +
+ Large Language Models (LLMs) have demonstrated remarkable versatility across
+various domains. To further advance LLMs, we propose 'SELF' (Self-Evolution
+with Language Feedback), a novel approach that enables LLMs to self-improve
+through self-reflection, akin to human learning processes. SELF begins with a
+meta-skill learning process that equips the LLMs with capabilities for
+self-feedback and self-refinement. Subsequently, the model undergoes an
+iterative process of self-evolution. In each iteration, it utilizes an
+unlabeled dataset of instructions to generate initial responses. These
+responses are enhanced through self-feedback and self-refinement. The model is
+then fine-tuned using this enhanced data. The model undergoes progressive
+improvement through this iterative self-evolution process. Moreover, the SELF
+framework enables the model to apply self-refinement during inference, which
+further improves response quality. Our experiments in mathematics and general
+tasks demonstrate that SELF can enhance the capabilities of LLMs without human
+intervention. The SELF framework indicates a promising direction for the
+autonomous evolution of LLMs, transitioning them from passive information
+receivers to active participants in their development.
+
+
+ comment: 20 pages, 4 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Using Large Language Models to Generate, Validate, and Apply User Intent + Taxonomies + + +
+ Log data can reveal valuable information about how users interact with Web +search services, what they want, and how satisfied they are. However, analyzing +user intents in log data is not easy, especially for emerging forms of Web +search such as AI-driven chat. To understand user intents from log data, we +need a way to label them with meaningful categories that capture their +diversity and dynamics. Existing methods rely on manual or machine-learned +labeling, which are either expensive or inflexible for large and dynamic +datasets. We propose a novel solution using large language models (LLMs), which +can generate rich and relevant concepts, descriptions, and examples for user +intents. However, using LLMs to generate a user intent taxonomy and apply it +for log analysis can be problematic for two main reasons: (1) such a taxonomy +is not externally validated; and (2) there may be an undesirable feedback loop. +To address this, we propose a new methodology with human experts and assessors +to verify the quality of the LLM-generated taxonomy. We also present an +end-to-end pipeline that uses an LLM with human-in-the-loop to produce, refine, +and apply labels for user intent analysis in log data. We demonstrate its +effectiveness by uncovering new insights into user intents from search and chat +logs from the Microsoft Bing commercial search engine. The proposed work's +novelty stems from the method for generating purpose-driven user intent +taxonomies with strong validation. This method not only helps remove +methodological and practical bottlenecks from intent-focused research, but also +provides a new framework for generating, validating, and applying other kinds +of taxonomies in a scalable and adaptable way with minimal human effort. + +
+
+
+
+
+ + ♻ ☆ Small LLMs Are Weak Tool Learners: A Multi-LLM Agent + + +
+ Large Language Model (LLM) agents significantly extend the capabilities of +standalone LLMs, empowering them to interact with external tools (e.g., APIs, +functions) and complete complex tasks in a self-directed fashion. The challenge +of tool use demands that LLMs not only understand user queries and generate +answers but also excel in task planning, memory management, tool invocation, +and result summarization. While traditional approaches focus on training a +single LLM with all these capabilities, performance limitations become +apparent, particularly with smaller models. Moreover, the entire LLM may +require retraining when tools are updated. To overcome these challenges, we +propose a novel strategy that decomposes the aforementioned capabilities into a +planner, caller, and summarizer. Each component is implemented by a single LLM +that focuses on a specific capability and collaborates with other components to +accomplish the task. This modular framework facilitates individual updates and +the potential use of smaller LLMs for building each capability. To effectively +train this framework, we introduce a two-stage training paradigm. First, we +fine-tune a backbone LLM on the entire dataset without discriminating +sub-tasks, providing the model with a comprehensive understanding of the task. +Second, the fine-tuned LLM is used to instantiate the planner, caller, and +summarizer respectively, which are continually fine-tuned on respective +sub-tasks. Evaluation across various tool-use benchmarks illustrates that our +proposed multi-LLM framework surpasses the traditional single-LLM approach, +highlighting its efficacy and advantages in tool learning. + +
+
+ comment: In progress; GitHub repo: https://github.com/X-PLUG/Multi-LLM-Agent
+
+
+
+
+
+ + ♻ ☆ Leveraging Implicit Feedback from Deployment Data in Dialogue EACL 2024 + + +
+ We study improving social conversational agents by learning from natural +dialogue between users and a deployed model, without extra annotations. To +implicitly measure the quality of a machine-generated utterance, we leverage +signals like user response length, sentiment and reaction of the future human +utterances in the collected dialogue episodes. Our experiments use the publicly +released deployment data from BlenderBot (Xu et al., 2023). Human evaluation +indicates improvements in our new models over baseline responses; however, we +find that some proxy signals can lead to more generations with undesirable +properties as well. For example, optimizing for conversation length can lead to +more controversial or unfriendly generations compared to the baseline, whereas +optimizing for positive sentiment or reaction can decrease these behaviors. + +
+
+ comment: EACL 2024 +
+
+
+
+
+ + ♻ ☆ Meta Prompting for AGI Systems + + +
+ This paper presents a comprehensive study of Meta Prompting, an innovative +technique reshaping the utilization of large language models (LLMs), +multi-modal foundation models, and AI systems in problem-solving and data +interaction. Grounded in type theory and category theory, Meta Prompting +emphasizes the structure and syntax of information over traditional +content-centric methods. The paper explores the formal definitions of Meta +Prompting (MP), sets it apart from Few-Shot Prompting, and underlines its +effectiveness in various AI applications. A key focus is applying Meta +Prompting for complex reasoning (MP-CR) tasks, showing how it effectively +deconstructs intricate problems into simpler sub-problems, enhancing token +efficiency, and enabling more equitable problem-solving comparisons, especially +against few-shot prompting methods. Additionally, the paper introduces Meta +Prompting for prompting tasks, allowing LLMs to self-generate new prompts in a +recursive, metaprogramming-like manner. This approach marks a significant leap +in AI's autonomous and adaptive capabilities. The paper also introduces the +integration of Meta Prompting into multi-modal foundation model settings, +tackling the challenges and opportunities of incorporating varied data types +such as images, audio, and video within the structured Meta Prompting +framework. Empirical experiments, including solving the Game of 24 tasks with +100% success rate, demonstrate the MP-CR Agent's enhanced reasoning +capabilities, achieving high accuracy and efficiency, and showcasing Meta +Prompting's transformative impact on AI problem-solving. (The code is available +at https://github.com/meta-prompting/meta-prompting) + +
+
+
+
+
+ + ♻ ☆ On the Affinity, Rationality, and Diversity of Hierarchical Topic + Modeling AAAI2024 + + +
+ Hierarchical topic modeling aims to discover latent topics from a corpus and
+organize them into a hierarchy to understand documents with desirable semantic
+granularity. However, existing work struggles with producing topic hierarchies
+of low affinity, rationality, and diversity, which hampers document
+understanding. To overcome these challenges, we in this paper propose Transport
+Plan and Context-aware Hierarchical Topic Model (TraCo). Instead of the simple
+topic dependencies of earlier methods, we propose a transport plan dependency
+method. It constrains dependencies to ensure their sparsity and balance, and
+also regularizes topic hierarchy building with them. This improves the affinity
+and diversity of hierarchies. We further propose a context-aware disentangled
+decoder. Rather than the entangled decoding of previous work, it distributes
+different semantic granularity to topics at different levels by disentangled
+decoding. This facilitates the rationality of hierarchies. Experiments on
+benchmark datasets demonstrate that our method surpasses state-of-the-art
+baselines, effectively improving the affinity, rationality, and diversity of
+hierarchical topic modeling with better performance on downstream tasks.
+
+
+ comment: Accepted to AAAI2024 conference. Our code is available at + https://github.com/bobxwu/TraCo +
+
+
+
+
+ + ♻ ☆ Zero-shot Generative Large Language Models for Systematic Review + Screening Automation ECIR2024 + + +
+ Systematic reviews are crucial for evidence-based medicine as they +comprehensively analyse published research findings on specific questions. +Conducting such reviews is often resource- and time-intensive, especially in +the screening phase, where abstracts of publications are assessed for inclusion +in a review. This study investigates the effectiveness of using zero-shot large +language models~(LLMs) for automatic screening. We evaluate the effectiveness +of eight different LLMs and investigate a calibration technique that uses a +predefined recall threshold to determine whether a publication should be +included in a systematic review. Our comprehensive evaluation using five +standard test collections shows that instruction fine-tuning plays an important +role in screening, that calibration renders LLMs practical for achieving a +targeted recall, and that combining both with an ensemble of zero-shot models +saves significant screening time compared to state-of-the-art approaches. + +
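+ The calibration idea is simple to sketch: on a labelled calibration split,
+pick the score threshold that retains the target fraction of relevant
+abstracts, then screen with it. Variable names and the thresholding rule are
+illustrative; the paper's calibration differs in detail.
+
+import numpy as np
+
+def calibrate_threshold(scores, labels, target_recall=0.95):
+    pos = np.sort(np.asarray(scores)[np.asarray(labels) == 1])
+    k = int(np.floor((1 - target_recall) * len(pos)))
+    return pos[k]   # lowest threshold keeping >= target_recall of positives
+
+def screen(scores, threshold):
+    return np.asarray(scores) >= threshold   # True = keep for human review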
+
+ comment: Accepted to ECIR2024 full paper (findings) +
+
+
+
+
+ + ♻ ☆ Commonsense for Zero-Shot Natural Language Video Localization AAAI 2024 + + +
+ Zero-shot Natural Language-Video Localization (NLVL) methods have exhibited +promising results in training NLVL models exclusively with raw video data by +dynamically generating video segments and pseudo-query annotations. However, +existing pseudo-queries often lack grounding in the source video, resulting in +unstructured and disjointed content. In this paper, we investigate the +effectiveness of commonsense reasoning in zero-shot NLVL. Specifically, we +present CORONET, a zero-shot NLVL framework that leverages commonsense to +bridge the gap between videos and generated pseudo-queries via a commonsense +enhancement module. CORONET employs Graph Convolution Networks (GCN) to encode +commonsense information extracted from a knowledge graph, conditioned on the +video, and cross-attention mechanisms to enhance the encoded video and +pseudo-query representations prior to localization. Through empirical +evaluations on two benchmark datasets, we demonstrate that CORONET surpasses +both zero-shot and weakly supervised baselines, achieving improvements up to +32.13% across various recall thresholds and up to 6.33% in mIoU. These results +underscore the significance of leveraging commonsense reasoning for zero-shot +NLVL. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ A First Look at Information Highlighting in Stack Overflow Answers + + +
+ Context: Navigating the knowledge of Stack Overflow (SO) remains challenging.
+To make the posts vivid to users, SO allows users to write and edit posts with
+Markdown or HTML so that users can leverage various formatting styles (e.g.,
+bold, italic, and code) to highlight the important information. Nonetheless,
+there have been limited studies on the highlighted information. Objective: We
+carried out the first large-scale exploratory study on the information
+highlighted in SO answers in our recent study. To extend our previous study, we
+develop approaches to automatically recommend highlighted content with
+formatting styles using neural network architectures initially designed for the
+Named Entity Recognition task. Method: In this paper, we studied 31,169,429
+answers of Stack Overflow. For training recommendation models, we choose CNN-
+and BERT-based models for each type of formatting (i.e., Bold, Italic, Code,
+and Heading) using the information highlighting dataset we collected from SO
+answers. Results: Our models based on CNN architecture achieve precision
+ranging from 0.71 to 0.82. The trained model for automatic code content
+highlighting achieves a recall of 0.73 and an F1 score of 0.71, outperforming
+the trained models for other formatting styles. The BERT models have even lower
+recalls and F1 scores than the CNN models. Our analysis of failure cases
+indicates that the majority of the failure cases are missing identification
+(i.e., the model misses content that is supposed to be highlighted) because the
+models tend to learn frequently highlighted words while struggling with less
+frequent ones. Conclusion: Our findings suggest that it is possible to develop
+recommendation models for highlighting information for answers with different
+formatting styles on Stack Overflow.
+
+
+ comment: This work is submitted to Information and Software Technology Journal +
+
+
+
+
+ + ♻ ☆ UNSEE: Unsupervised Non-contrastive Sentence Embeddings EACL 2024 + + +
+ We present UNSEE: Unsupervised Non-Contrastive Sentence Embeddings, a novel +approach that outperforms SimCSE in the Massive Text Embedding benchmark. Our +exploration begins by addressing the challenge of representation collapse, a +phenomenon observed when contrastive objectives in SimCSE are replaced with +non-contrastive objectives. To counter this issue, we propose a straightforward +solution known as the target network, effectively mitigating representation +collapse. The introduction of the target network allows us to leverage +non-contrastive objectives, maintaining training stability while achieving +performance improvements comparable to contrastive objectives. Our method has +achieved peak performance in non-contrastive sentence embeddings through +meticulous fine-tuning and optimization. This comprehensive effort has yielded +superior sentence representation models, showcasing the effectiveness of our +approach. + +
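+
+ The target-network fix against representation collapse can be sketched in a few lines, BYOL-style: a frozen copy of the encoder produces the regression targets and trails the online weights via an exponential moving average. Architecture sizes and the momentum value below are illustrative assumptions, not the paper's configuration:
+
+```python
+import copy
+import torch
+import torch.nn.functional as F
+
+online = torch.nn.Sequential(torch.nn.Linear(768, 256), torch.nn.ReLU(),
+                             torch.nn.Linear(256, 128))
+target = copy.deepcopy(online)
+for p in target.parameters():
+    p.requires_grad_(False)       # no gradients ever reach the target network
+
+@torch.no_grad()
+def ema_update(momentum=0.999):
+    # Target weights trail the online encoder, which breaks the shortcut
+    # solution where all embeddings collapse to a single point.
+    for po, pt in zip(online.parameters(), target.parameters()):
+        pt.mul_(momentum).add_(po, alpha=1.0 - momentum)
+
+x1, x2 = torch.randn(32, 768), torch.randn(32, 768)   # two views of one batch
+loss = F.mse_loss(online(x1), target(x2))
+loss.backward()
+ema_update()
+```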
+
+ comment: Accepted to EACL 2024 +
+
+
+
+
+ + ♻ ☆ Large Language Models on Graphs: A Comprehensive Survey + + +
+ Large language models (LLMs), such as GPT4 and LLaMA, are creating +significant advancements in natural language processing, due to their strong +text encoding/decoding ability and newly found emergent capability (e.g., +reasoning). While LLMs are mainly designed to process pure texts, there are +many real-world scenarios where text data is associated with rich structure +information in the form of graphs (e.g., academic networks, and e-commerce +networks) or scenarios where graph data is paired with rich textual information +(e.g., molecules with descriptions). Besides, although LLMs have shown their +pure text-based reasoning ability, it is underexplored whether such ability can +be generalized to graphs (i.e., graph-based reasoning). In this paper, we +provide a systematic review of scenarios and techniques related to large +language models on graphs. We first summarize potential scenarios of adopting +LLMs on graphs into three categories, namely pure graphs, text-attributed +graphs, and text-paired graphs. We then discuss detailed techniques for +utilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM +as Aligner, and compare the advantages and disadvantages of different schools +of models. Furthermore, we discuss the real-world applications of such methods +and summarize open-source codes and benchmark datasets. Finally, we conclude +with potential future research directions in this fast-growing field. The +related source can be found at +https://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Deciphering Textual Authenticity: A Generalized Strategy through the + Lens of Large Language Semantics for Detecting Human vs. Machine-Generated + Text + + +
+ With the recent proliferation of Large Language Models (LLMs), there has been +an increasing demand for tools to detect machine-generated text. Effective +detection methods face two pertinent problems: First, they +are severely limited in generalizing against real-world scenarios, where +machine-generated text is produced by a variety of generators, including but +not limited to GPT-4 and Dolly, and spans diverse domains, ranging from +academic manuscripts to social media posts. Second, existing detection +methodologies treat texts produced by LLMs through a restrictive binary +classification lens, neglecting the nuanced diversity of artifacts generated by +different LLMs. In this work, we undertake a systematic study on the detection +of machine-generated text in real-world scenarios. We first study the +effectiveness of state-of-the-art approaches and find that they are severely +limited against text produced by diverse generators and domains in the real +world. Furthermore, t-SNE visualizations of the embeddings from a pretrained +LLM's encoder show that they cannot reliably distinguish between human and +machine-generated text. Based on our findings, we introduce a novel system, +T5LLMCipher, for detecting machine-generated text using a pretrained T5 encoder +combined with LLM embedding sub-clustering to address the text produced by +diverse generators and domains in the real world. We evaluate our approach +across 9 machine-generated text systems and 9 domains and find that our +approach provides state-of-the-art generalization ability, with an average +increase in F1 score on machine-generated text of 19.6\% on unseen generators +and domains compared to the top performing existing approaches and correctly +attributes the generator of text with an accuracy of 93.6\%. + +
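+
+ The encoder-plus-sub-clustering pipeline is easy to prototype. A hedged sketch under assumed choices (t5-base, mean pooling, k-means with k=2); the paper's exact model sizes, pooling, and clustering setup may differ:
+
+```python
+import torch
+from sklearn.cluster import KMeans
+from transformers import AutoTokenizer, T5EncoderModel
+
+tok = AutoTokenizer.from_pretrained("t5-base")
+enc = T5EncoderModel.from_pretrained("t5-base").eval()
+
+@torch.no_grad()
+def embed(texts):
+    batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
+    hidden = enc(**batch).last_hidden_state           # (B, T, 768)
+    mask = batch["attention_mask"].unsqueeze(-1)      # ignore padding when pooling
+    return ((hidden * mask).sum(1) / mask.sum(1)).numpy()
+
+vecs = embed(["a GPT-4 paragraph", "a Dolly paragraph", "a human paragraph"])
+sub_clusters = KMeans(n_clusters=2, n_init=10).fit_predict(vecs)
+```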
+
+
+
+
+ + ♻ ☆ Working Memory Capacity of ChatGPT: An Empirical Study AAAI + + +
+ Working memory is a critical aspect of both human intelligence and artificial +intelligence, serving as a workspace for the temporary storage and manipulation +of information. In this paper, we systematically assess the working memory +capacity of ChatGPT, a large language model developed by OpenAI, by examining +its performance in verbal and spatial n-back tasks under various conditions. +Our experiments reveal that ChatGPT has a working memory capacity limit +strikingly similar to that of humans. Furthermore, we investigate the impact of +different instruction strategies on ChatGPT's performance and observe that the +fundamental patterns of a capacity limit persist. From our empirical findings, +we propose that n-back tasks may serve as tools for benchmarking the working +memory capacity of large language models and hold potential for informing +future efforts aimed at enhancing AI working memory. + +
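+
+ For concreteness, here is one way a verbal n-back trial for an LLM could be generated and scored; the prompt wording and letter set are hypothetical stand-ins for the paper's materials:
+
+```python
+import random
+
+def make_nback_trials(n=2, length=20, alphabet="bcdfghjklm"):
+    letters = [random.choice(alphabet) for _ in range(length)]
+    targets = [i >= n and letters[i] == letters[i - n] for i in range(length)]
+    return letters, targets
+
+letters, targets = make_nback_trials(n=2)
+instruction = ("You will see letters one at a time. Reply 'm' if the current "
+               "letter matches the one shown 2 letters back, otherwise reply '-'.")
+# Present `letters` to the model turn by turn; comparing its 'm'/'-' replies
+# against `targets` yields hit and false-alarm rates for each n.
+```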
+
+ comment: Accepted at the 38th AAAI Conference on Artificial Intelligence + (AAAI-24) +
+
+
+
+
+ + ♻ ☆ Improving QA Model Performance with Cartographic Inoculation + + +
+ QA models are faced with complex and open-ended contextual reasoning +problems, but can often learn well-performing solution heuristics by exploiting +dataset-specific patterns in their training data. These patterns, or "dataset +artifacts", reduce the model's ability to generalize to real-world QA problems. +Utilizing an ElectraSmallDiscriminator model trained for QA, we analyze the +impacts and incidence of dataset artifacts using an adversarial challenge set +designed to confuse models reliant on artifacts for prediction. Extending +existing work on methods for mitigating artifact impacts, we propose +cartographic inoculation, a novel method that fine-tunes models on an optimized +subset of the challenge data to reduce model reliance on dataset artifacts. We +show that by selectively fine-tuning a model on ambiguous adversarial examples +from a challenge set, significant performance improvements can be made on the +full challenge dataset with minimal loss of model generalizability to other +challenging environments and QA datasets. + +
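+
+ The selection step borrows from dataset cartography: an example's ambiguity is the variability of the model's confidence in the gold label across training epochs. A minimal sketch, with the top fraction and all names as illustrative assumptions:
+
+```python
+import numpy as np
+
+def most_ambiguous(gold_probs, top_fraction=0.33):
+    """gold_probs: (epochs, examples) confidence in the gold label per epoch."""
+    probs = np.asarray(gold_probs)
+    variability = probs.std(axis=0)            # high std = ambiguous example
+    k = max(1, int(top_fraction * probs.shape[1]))
+    return np.argsort(-variability)[:k]        # indices to fine-tune on
+
+epoch_probs = np.random.rand(5, 1000)          # placeholder training dynamics
+inoculation_set = most_ambiguous(epoch_probs)  # fine-tune on these examples
+```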
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ A RAG-based Question Answering System Proposal for Understanding Islam: + MufassirQAS LLM + + +
+ Challenges exist in learning and understanding religions, such as the +complexity and depth of religious doctrines and teachings. Chatbots as +question-answering systems can help in solving these challenges. LLM chatbots +use NLP techniques to establish connections between topics and accurately +respond to complex questions. These capabilities make them well suited to serve +as question-answering chatbots for enlightenment on religion. However, LLMs also +tend to generate false information, known as hallucination. Also, the chatbots' +responses can include content that insults personal religious beliefs or touches +on interfaith conflicts and controversial or sensitive topics. Such cases must be +avoided without promoting hate speech or offending certain groups of people or +their beliefs. This study uses a vector database-based Retrieval Augmented +Generation (RAG) approach to enhance the accuracy and transparency of LLMs. Our +question-answering system is called "MufassirQAS". We created a database +consisting of several open-access books that include Turkish context. These +books contain Turkish translations and interpretations of Islam. This database +is utilized to answer religion-related questions and ensure our answers are +trustworthy. The relevant part of the dataset, which the LLM also uses, is +presented along with the answer. We have put careful effort into creating +system prompts that give instructions to prevent harmful, offensive, or +disrespectful responses, to respect people's values and provide reliable +results. The system answers and shares additional information, such as the page +number from the respective book and the articles referenced for obtaining the +information. MufassirQAS and ChatGPT were also tested with sensitive questions, +and our system achieved better performance. The study and enhancements are still +in progress; results and future work are given. + +
+
+
+
+
+ + ♻ ☆ Entity Matching using Large Language Models + + +
+ Entity Matching is the task of deciding whether two entity descriptions refer +to the same real-world entity. It is a central step in most data integration +pipelines and an enabler for many e-commerce applications which require +matching product offers from different vendors. State-of-the-art entity matching +methods rely on pre-trained language models (PLMs) such as BERT or RoBERTa. Two +major drawbacks of these models for entity matching are that (i) the models +require significant amounts of task-specific training data and (ii) the +fine-tuned models are not robust concerning out-of-distribution entities. We +investigate using generative large language models (LLMs) for entity matching +as a more robust alternative to PLM-based matchers that depends less on +task-specific training data. Our study covers hosted LLMs as well as open-source LLMs +which can be run locally. We evaluate these models in a zero-shot scenario as +well as a scenario where task-specific training data is available. We compare +different prompt designs as well as the prompt sensitivity of the models and +show that there is no single best prompt but that the prompt is akin to a +hyperparameter that needs to be estimated for each model/dataset combination. +We further investigate (i) the selection of in-context demonstrations, (ii) the +generation of matching rules, as well as (iii) fine-tuning a hosted LLM using +the same pool of training data. Our experiments show that the best LLMs require +no or only a few training examples to reach a similar performance as fine-tuned +PLMs. They further exhibit a higher robustness to unseen entities, which makes +them especially suited to use cases where no training data is available. We +show that for use cases that do not allow data to be shared with third parties, +open-source LLMs can be a viable alternative to hosted LLMs given that a small +amount of training data or matching knowledge... + +
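+
+ A zero-shot matching prompt in the spirit of the study might look as follows; `ask_llm` is a placeholder for whichever hosted or local completion call is used, and this wording is only one of the many prompt designs the paper compares:
+
+```python
+def match_prompt(offer_a: str, offer_b: str) -> str:
+    return ("Do the following two product descriptions refer to the same "
+            "real-world product? Answer with 'Yes' or 'No'.\n"
+            f"Product 1: {offer_a}\nProduct 2: {offer_b}\nAnswer:")
+
+def is_match(offer_a, offer_b, ask_llm) -> bool:
+    # `ask_llm` stands in for any LLM completion function (hosted or local).
+    reply = ask_llm(match_prompt(offer_a, offer_b))
+    return reply.strip().lower().startswith("yes")
+```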
+
+
+
+
+ + ♻ ☆ Can Large Language Models Replace Economic Choice Prediction Labs? + + +
+ Economic choice prediction is an essential yet challenging task, often +constrained by the difficulties in acquiring human choice data. Indeed, +experimental economics studies have focused mostly on simple choice settings. +The AI community has recently contributed to that effort in two ways: +considering whether LLMs can substitute for humans in the above-mentioned +simple choice prediction settings, and studying, through an ML lens, more +elaborate but still rigorous experimental economics settings, employing +incomplete information, repetitive play, and natural language communication, +notably language-based persuasion games. This leaves us with a major +question: can LLMs be used to fully simulate the economic environment and +generate data for efficient human choice prediction, substituting for the +elaborated economic lab studies? We pioneer the study of this subject, +demonstrating its feasibility. In particular, we show that a model trained +solely on LLM-generated data can effectively predict human behavior in a +language-based persuasion game, and can even outperform models trained on +actual human data. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 117 + +
+
+
+ + ☆ AToM: Amortized Text-to-Mesh using 2D Diffusion + + +
+ We introduce Amortized Text-to-Mesh (AToM), a feed-forward text-to-mesh +framework optimized across multiple text prompts simultaneously. In contrast to +existing text-to-3D methods that often entail time-consuming per-prompt +optimization and commonly output representations other than polygonal meshes, +AToM directly generates high-quality textured meshes in less than 1 second with +around a 10-fold reduction in training cost, and generalizes to unseen +prompts. Our key idea is a novel triplane-based text-to-mesh architecture with +a two-stage amortized optimization strategy that ensures stable training and +enables scalability. Through extensive experiments on various prompt +benchmarks, AToM significantly outperforms state-of-the-art amortized +approaches with over 4 times higher accuracy (on the DF415 dataset) and produces +more distinguishable and higher-quality 3D outputs. AToM demonstrates strong +generalizability, offering fine-grained 3D assets for unseen interpolated +prompts without further optimization during inference, unlike per-prompt +solutions. + +
+
+ comment: 19 pages with appendix and references. Webpage: + https://snap-research.github.io/AToM/ +
+
+
+
+
+ + ☆ We're Not Using Videos Effectively: An Updated Domain Adaptive Video + Segmentation Baseline + + +
+ There has been abundant work in unsupervised domain adaptation for semantic +segmentation (DAS) seeking to adapt a model trained on images from a labeled +source domain to an unlabeled target domain. While the vast majority of prior +work has studied this as a frame-level Image-DAS problem, a few Video-DAS works +have sought to additionally leverage the temporal signal present in adjacent +frames. However, Video-DAS works have historically studied a distinct set of +benchmarks from Image-DAS, with minimal cross-benchmarking. In this work, we +address this gap. Surprisingly, we find that (1) even after carefully +controlling for data and model architecture, state-of-the-art Image-DAS methods +(HRDA and HRDA+MIC) outperform Video-DAS methods on established Video-DAS +benchmarks (+14.5 mIoU on Viper$\rightarrow$CityscapesSeq, +19.0 mIoU on +Synthia$\rightarrow$CityscapesSeq), and (2) naive combinations of Image-DAS and +Video-DAS techniques only lead to marginal improvements across datasets. To +avoid siloed progress between Image-DAS and Video-DAS, we open-source our +codebase with support for a comprehensive set of Video-DAS and Image-DAS +methods on a common benchmark. Code available at +https://github.com/SimarKareer/UnifiedVideoDA + +
+
+ comment: TMLR 2024 +
+
+
+
+
+ + ☆ Towards Optimal Feature-Shaping Methods for Out-of-Distribution + Detection ICLR 2024 + + +
+ Feature shaping refers to a family of methods that exhibit state-of-the-art +performance for out-of-distribution (OOD) detection. These approaches +manipulate the feature representation, typically from the penultimate layer of +a pre-trained deep learning model, so as to better differentiate between +in-distribution (ID) and OOD samples. However, existing feature-shaping methods +usually employ rules manually designed for specific model architectures and OOD +datasets, which consequently limit their generalization ability. To address +this gap, we first formulate an abstract optimization framework for studying +feature-shaping methods. We then propose a concrete reduction of the framework +with a simple piecewise constant shaping function and show that existing +feature-shaping methods approximate the optimal solution to the concrete +optimization problem. Further, assuming that OOD data is inaccessible, we +propose a formulation that yields a closed-form solution for the piecewise +constant shaping function, utilizing solely the ID data. Through extensive +experiments, we show that the feature-shaping function optimized by our method +improves the generalization ability of OOD detection across a large variety of +datasets and model architectures. + +
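+
+ A piecewise constant shaping function is simple to state in code: bucket each penultimate activation into fixed intervals and rescale each bucket by a constant. The edges and weights below are placeholders; the paper derives the weights in closed form from ID statistics:
+
+```python
+import numpy as np
+
+def shape_features(feats, edges, weights):
+    """feats: (N, D) penultimate activations; len(edges) == len(weights) + 1."""
+    bins = np.clip(np.digitize(feats, edges) - 1, 0, len(weights) - 1)
+    return feats * np.asarray(weights)[bins]
+
+feats = np.abs(np.random.randn(8, 512))           # stand-in activations
+edges = np.array([0.0, 0.5, 1.0, 2.0, np.inf])    # assumed interval boundaries
+shaped = shape_features(feats, edges, weights=[0.0, 0.5, 1.0, 1.5])
+# An OOD score (e.g., an energy or max-logit score) is then computed on `shaped`.
+```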
+
+ comment: ICLR 2024. Project page: https://github.com/Qinyu-Allen-Zhao/OptFSOOD +
+
+
+
+
+ + ☆ ViCA-NeRF: View-Consistency-Aware 3D Editing of Neural Radiance Fields + + +
+ We introduce ViCA-NeRF, the first view-consistency-aware method for 3D +editing with text instructions. In addition to the implicit neural radiance +field (NeRF) modeling, our key insight is to exploit two sources of +regularization that explicitly propagate the editing information across +different views, thus ensuring multi-view consistency. For geometric +regularization, we leverage the depth information derived from NeRF to +establish image correspondences between different views. For learned +regularization, we align the latent codes in the 2D diffusion model between +edited and unedited images, enabling us to edit key views and propagate the +update throughout the entire scene. Incorporating these two strategies, our +ViCA-NeRF operates in two stages. In the initial stage, we blend edits from +different views to create a preliminary 3D edit. This is followed by a second +stage of NeRF training, dedicated to further refining the scene's appearance. +Experimental results demonstrate that ViCA-NeRF provides more flexible, +efficient (3 times faster) editing with higher levels of consistency and +details, compared with the state of the art. Our code is publicly available. + +
+
+ comment: NeurIPS 2023; project page: https://github.com/Dongjiahua/VICA-NeRF +
+
+
+
+
+ + ☆ Geometry Transfer for Stylizing Radiance Fields + + +
+ Shape and geometric patterns are essential in defining stylistic identity. +However, current 3D style transfer methods predominantly focus on transferring +colors and textures, often overlooking geometric aspects. In this paper, we +introduce Geometry Transfer, a novel method that leverages geometric +deformation for 3D style transfer. This technique employs depth maps to extract +a style guide, subsequently applied to stylize the geometry of radiance fields. +Moreover, we propose new techniques that utilize geometric cues from the 3D +scene, thereby enhancing aesthetic expressiveness and more accurately +reflecting intended styles. Our extensive experiments show that Geometry +Transfer enables a broader and more expressive range of stylizations, thereby +significantly expanding the scope of 3D style transfer. + +
+
+ comment: project page: https://hyblue.github.io/geo-srf/ +
+
+
+
+
+ + ☆ BootsTAP: Bootstrapped Training for Tracking-Any-Point + + +
+ To endow models with greater understanding of physics and motion, it is +useful to enable them to perceive how solid surfaces move and deform in real +scenes. This can be formalized as Tracking-Any-Point (TAP), which requires the +algorithm to be able to track any point corresponding to a solid surface in a +video, potentially densely in space and time. Large-scale ground-truth training +data for TAP is only available in simulation, which currently has limited +variety of objects and motion. In this work, we demonstrate how large-scale, +unlabeled, uncurated real-world data can improve a TAP model with minimal +architectural changes, using a self-supervised student-teacher setup. We +demonstrate state-of-the-art performance on the TAP-Vid benchmark surpassing +previous results by a wide margin: for example, TAP-Vid-DAVIS performance +improves from 61.3% to 66.4%, and TAP-Vid-Kinetics from 57.2% to 61.5%. + +
+
+
+
+
+ + ☆ Emo-Avatar: Efficient Monocular Video Style Avatar through Texture + Rendering + + +
+ Artistic video portrait generation is a significant and sought-after task in +the fields of computer graphics and vision. While various methods have been +developed that integrate NeRFs or StyleGANs with instructional editing models +for creating and editing drivable portraits, these approaches face several +challenges. They often rely heavily on large datasets, require extensive +customization processes, and frequently result in reduced image quality. To +address the above problems, we propose the Efficient Monocular Video Style +Avatar (Emo-Avatar) through deferred neural rendering that enhances StyleGAN's +capacity for producing dynamic, drivable portrait videos. We propose a +two-stage deferred neural rendering pipeline. In the first stage, we utilize +few-shot PTI initialization to initialize the StyleGAN generator through +several extreme poses sampled from the video to capture the consistent +representation of aligned faces from the target portrait. In the second stage, +we apply a Laplacian pyramid for high-frequency texture sampling from UV maps +deformed by the dynamic flow of expression, integrating a motion-aware texture +prior that provides torso features and enhances StyleGAN's ability to render a +complete upper body for portrait videos. Emo-Avatar reduces style +customization time from hours to merely 5 minutes compared with existing +methods. In addition, Emo-Avatar requires only a single reference image for +editing and employs region-aware contrastive learning with semantic invariant +CLIP guidance, ensuring consistent high-resolution output and identity +preservation. Through both quantitative and qualitative assessments, Emo-Avatar +demonstrates superior performance over existing methods in terms of training +efficiency, rendering quality and editability in self- and cross-reenactment. + +
+
+
+
+
+ + ☆ AnimateLCM: Accelerating the Animation of Personalized Diffusion Models + and Adapters with Decoupled Consistency Learning + + +
+ Video diffusion models have been gaining increasing attention for their ability +to produce videos that are both coherent and of high fidelity. However, the +iterative denoising process makes it computationally intensive and +time-consuming, thus limiting its applications. Inspired by the Consistency +Model (CM) that distills pretrained image diffusion models to accelerate the +sampling with minimal steps and its successful extension Latent Consistency +Model (LCM) on conditional image generation, we propose AnimateLCM, allowing +for high-fidelity video generation within minimal steps. Instead of directly +conducting consistency learning on the raw video dataset, we propose a +decoupled consistency learning strategy that decouples the distillation of +image generation priors and motion generation priors, which improves the +training efficiency and enhances the visual quality of generation. Additionally, +to enable the combination of plug-and-play adapters from the Stable Diffusion +community to achieve various functions (e.g., ControlNet for controllable +generation), we propose an efficient strategy to adapt existing adapters to our +distilled text-conditioned video consistency model or train adapters from +scratch without harming the sampling speed. We validate the proposed strategy in +image-conditioned video generation and layout-conditioned video generation, all +achieving top-performing results. Experimental results validate the +effectiveness of our proposed method. Code and weights will be made public. +More details are available at https://github.com/G-U-N/AnimateLCM. + +
+
+ comment: Project Page: https://animatelcm.github.io/ +
+
+
+
+
+ + ☆ 360-GS: Layout-guided Panoramic Gaussian Splatting For Indoor Roaming + + +
+ 3D Gaussian Splatting (3D-GS) has recently attracted great attention with +real-time and photo-realistic renderings. This technique typically takes +perspective images as input and optimizes a set of 3D elliptical Gaussians by +splatting them onto the image planes, resulting in 2D Gaussians. However, +applying 3D-GS to panoramic inputs presents challenges in effectively modeling +the projection onto the spherical surface of ${360^\circ}$ images using 2D +Gaussians. In practical applications, input panoramas are often sparse, leading +to unreliable initialization of 3D Gaussians and subsequent degradation of +3D-GS quality. In addition, due to the under-constrained geometry of +texture-less planes (e.g., walls and floors), 3D-GS struggles to model these +flat regions with elliptical Gaussians, resulting in significant floaters in +novel views. To address these issues, we propose 360-GS, a novel $360^{\circ}$ +Gaussian splatting for a limited set of panoramic inputs. Instead of splatting +3D Gaussians directly onto the spherical surface, 360-GS projects them onto the +tangent plane of the unit sphere and then maps them to the spherical +projections. This adaptation enables the representation of the projection using +Gaussians. We guide the optimization of 360-GS by exploiting layout priors +within panoramas, which are simple to obtain and contain strong structural +information about the indoor scene. Our experimental results demonstrate that +360-GS allows panoramic rendering and outperforms state-of-the-art methods with +fewer artifacts in novel view synthesis, thus providing immersive roaming in +indoor scenarios. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ GS++: Error Analyzing and Optimal Gaussian Splatting + + +
+ 3D Gaussian Splatting has garnered extensive attention and application in +real-time neural rendering. Concurrently, concerns have been raised about the +limitations of this technology in aspects such as point cloud storage, +performance, and robustness under sparse viewpoints, leading to various +improvements. However, there has been a notable lack of attention to the +projection errors introduced by the local affine approximation inherent in the +splatting itself, and the consequential impact of these errors on the quality +of photo-realistic rendering. This paper addresses the projection error +function of 3D Gaussian Splatting, commencing with the residual error from the +first-order Taylor expansion of the projection function $\phi$. The analysis +establishes a correlation between the error and the Gaussian mean position. +Subsequently, leveraging function optimization theory, this paper analyzes the +function's minima to provide an optimal projection strategy for Gaussian +Splatting, referred to as Optimal Gaussian Splatting. Experimental validation +further confirms that this projection methodology reduces artifacts, resulting +in a more convincingly realistic rendering. + +
+
+
+
+
+ + ☆ DRSM: efficient neural 4d decomposition for dynamic reconstruction in + stationary monocular cameras + + +
+ With the popularity of monocular videos generated by video sharing and live +broadcasting applications, reconstructing and editing dynamic scenes in +stationary monocular cameras has become a special but anticipated technology. +In contrast to scene reconstructions that exploit multi-view observations, the +problem of modeling a dynamic scene from a single view is significantly more +under-constrained and ill-posed. Inspired by recent progress in neural +rendering, we present a novel framework to tackle the 4D decomposition problem +for dynamic scenes in monocular cameras. Our framework utilizes decomposed static +and dynamic feature planes to represent 4D scenes and emphasizes the learning +of dynamic regions through dense ray casting. Inadequate 3D clues from a +single view and occlusions are also particular challenges in scene +reconstruction. To overcome these difficulties, we propose deep supervised +optimization and ray casting strategies. With experiments on various videos, +our method generates higher-fidelity results than existing methods for +single-view dynamic scene representation. + +
+
+
+
+
+ + ☆ Automatic Segmentation of the Spinal Cord Nerve Rootlets + + +
+ Precise identification of spinal nerve rootlets is relevant to delineate +spinal levels for the study of functional activity in the spinal cord. The goal +of this study was to develop an automatic method for the semantic segmentation +of spinal nerve rootlets from T2-weighted magnetic resonance imaging (MRI) +scans. Images from two open-access MRI datasets were used to train a 3D +multi-class convolutional neural network using an active learning approach to +segment C2-C8 dorsal nerve rootlets. Each output class corresponds to a spinal +level. The method was tested on 3T T2-weighted images from datasets unseen +during training to assess inter-site, inter-session, and inter-resolution +variability. The test Dice score was 0.67 ± 0.16 (mean ± standard deviation +across rootlet levels), suggesting good performance. The method also +demonstrated low inter-vendor and inter-site variability (coefficient of +variation <= 1.41 %), as well as low inter-session variability (coefficient of +variation <= 1.30 %), indicating stable predictions across different MRI +vendors, sites, and sessions. The proposed methodology is open-source and +readily available in the Spinal Cord Toolbox (SCT) v6.2 and higher. + +
+
+
+
+
+ + ☆ ChaosBench: A Multi-Channel, Physics-Based Benchmark for + Subseasonal-to-Seasonal Climate Prediction + + +
+ Accurate prediction of climate at the subseasonal-to-seasonal (S2S) scale is +crucial for disaster readiness, reduced economic risk, and improved +policy-making amidst climate change. Yet, S2S prediction remains challenging +due to the chaotic nature of the system. At present, existing benchmarks for +weather and climate applications tend to (1) have a shorter forecasting range of +up to 14 days, (2) exclude a wide range of operational baseline +forecasts, and (3) lack physics-based constraints for explainability. Thus, we +propose ChaosBench, a large-scale, multi-channel, physics-based benchmark for +S2S prediction. ChaosBench has over 460K frames of real-world observations and +simulations, each with 60 variable-channels and spanning up to 45 years. We +also propose several physics-based metrics, in addition to vision-based ones, +that enable more physically-consistent models. Furthermore, we include a +diverse set of physics-based forecasts from 4 national weather agencies as +baselines to our data-driven counterpart. We establish two tasks that vary in +complexity: full and sparse dynamics prediction. Our benchmark is one of the +first to perform large-scale evaluation on existing models including +PanguWeather, FourCastNetV2, GraphCast, and ClimaX, and finds that methods +originally developed for weather-scale applications fail on the S2S task. We +release our benchmark code and datasets at +https://leap-stc.github.io/ChaosBench. + +
+
+ comment: 45 pages, 39 figures +
+
+
+
+
+ + ☆ Vehicle Perception from Satellite + + +
+ Satellites are capable of capturing high-resolution videos. This makes vehicle +perception from satellites possible. Compared to street surveillance, +drive recorders, or other equipment, satellite videos provide a much broader +city-scale view, so that the global dynamic traffic scene is captured +and displayed. Traffic monitoring from satellite is a new task with great +potential applications, including traffic jam prediction, path planning, +vehicle dispatching, etc. Practically, limited by the resolution and +view, the captured vehicles are very tiny (a few pixels) and move slowly. Worse +still, these satellites are in Low Earth Orbit (LEO) to capture such +high-resolution videos, so the background is also moving. Under this +circumstance, traffic monitoring from the satellite view is an extremely +challenging task. To attract more researchers into this field, we build a +large-scale benchmark for traffic monitoring from satellite. It supports +several tasks, including tiny object detection, counting and density +estimation. The dataset is constructed based on 12 satellite videos and 14 +synthetic videos recorded from GTA-V. They are separated into 408 video clips, +which contain 7,336 real satellite images and 1,960 synthetic images. A total of +128,801 vehicles are annotated, and the number of vehicles in each image varies +from 0 to 101. Several classic and state-of-the-art approaches in traditional +computer vision are evaluated on the datasets, so as to compare the performance +of different approaches, analyze the challenges in this task, and discuss +future prospects. The dataset is available at: +https://github.com/Chenxi1510/Vehicle-Perception-from-Satellite-Videos. + +
+
+
+
+
+ + ☆ In-Bed Pose Estimation: A Review CCS24 + + +
+ Human pose estimation, the process of identifying joint positions in a +person's body from images or videos, represents a widely utilized technology +across diverse fields, including healthcare. One such healthcare application +involves in-bed pose estimation, where the body pose of an individual lying +under a blanket is analyzed. This task, for instance, can be used to monitor a +person's sleep behavior and detect symptoms early for potential disease +diagnosis in homes and hospitals. Several studies have utilized unimodal and +multimodal methods to estimate in-bed human poses. The unimodal studies +generally employ RGB images, whereas the multimodal studies use modalities +including RGB, long-wavelength infrared, pressure map, and depth map. +Multimodal studies have the advantage of using modalities in addition to RGB +that might capture information useful to cope with occlusions. Moreover, some +multimodal studies exclude RGB and thus better preserve privacy. +To expedite advancements in this domain, we conduct a review of existing +datasets and approaches. Our objectives are to show the limitations of +previous studies, outline current challenges, and provide insights for future +work in the field of in-bed human pose estimation. + +
+
+ comment: Accepted at HCCS24 Workshop @ International Conference on Pervasive + Computing and Communications (PerCom 2024) +
+
+
+
+
+ + ☆ Approximating Optimal Morphing Attacks using Template Inversion + + +
+ Recent works have demonstrated the feasibility of inverting face recognition +systems, enabling the recovery of convincing face images using only their +embeddings. We leverage such template inversion models to develop a novel type +of deep morphing attack based on inverting a theoretical optimal morph +embedding, which is obtained as an average of the face embeddings of source +images. We experiment with two variants of this approach: the first one +exploits a fully self-contained embedding-to-image inversion model, while the +second leverages the synthesis network of a pretrained StyleGAN network for +increased morph realism. We generate morphing attacks from several source +datasets and study the effectiveness of those attacks against several face +recognition networks. We showcase that our method can compete with and +regularly beat the previous state of the art for deep-learning based morph +generation in terms of effectiveness, both in white-box and black-box attack +scenarios, and is additionally much faster to run. We hope this might +facilitate the development of large-scale deep morph datasets for training +detection models. + +
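+
+ The core of the attack is disarmingly small: average two identities' templates and invert the result. A sketch under assumed interfaces, where `embed` and `invert` are placeholders for the face-recognition encoder and the inversion (or StyleGAN synthesis) model:
+
+```python
+import numpy as np
+
+def morph_embedding(face_a, face_b, embed):
+    """Unit-norm average of two face templates (the 'optimal morph')."""
+    e = embed(face_a) + embed(face_b)
+    return e / np.linalg.norm(e)
+
+# morphed_image = invert(morph_embedding(face_a, face_b, embed))
+```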
+
+ comment: Published at the IEEE International Joint Conference on Biometrics + (IJCB) 2023 +
+
+
+
+
+ + ☆ A Framework for Building Point Cloud Cleaning, Plane Detection and + Semantic Segmentation + + +
+ This paper presents a framework to address the challenges involved in +building point cloud cleaning, plane detection, and semantic segmentation, with +the ultimate goal of enhancing building modeling. In the cleaning stage, we +focus on removing outliers from the acquired point cloud data by employing an +adaptive threshold technique based on the z-score measure. Following the cleaning +process, we perform plane detection using the robust RANSAC paradigm. The goal +is to carry out multiple plane segmentations, and to classify segments into +distinct categories, such as floors, ceilings, and walls. The resulting +segments can generate accurate and detailed point clouds representing the +building's architectural elements. Moreover, we address the problem of semantic +segmentation, which plays a vital role in the identification and classification +of different components within the building, such as walls, windows, doors, +roofs, and objects. Inspired by the PointNet architecture, we propose a deep +learning architecture for efficient semantic segmentation in buildings. The +results demonstrate the effectiveness of the proposed framework in handling +building modeling tasks, paving the way for improved accuracy and efficiency in +the field of building modeling. + +
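+
+ The first two stages can be sketched with plain NumPy: a z-score filter for cleaning, then a bare-bones RANSAC plane fit. Thresholds and iteration counts are illustrative assumptions, not the paper's settings:
+
+```python
+import numpy as np
+
+def zscore_clean(points, thresh=3.0):
+    z = np.abs((points - points.mean(0)) / points.std(0))
+    return points[(z < thresh).all(axis=1)]          # drop per-axis outliers
+
+def ransac_plane(points, iters=500, tol=0.02, seed=0):
+    rng = np.random.default_rng(seed)
+    best = np.zeros(len(points), dtype=bool)
+    for _ in range(iters):
+        p0, p1, p2 = points[rng.choice(len(points), 3, replace=False)]
+        n = np.cross(p1 - p0, p2 - p0)
+        if np.linalg.norm(n) < 1e-9:
+            continue                                  # degenerate (collinear) sample
+        n /= np.linalg.norm(n)
+        inliers = np.abs((points - p0) @ n) < tol     # distance to the plane
+        if inliers.sum() > best.sum():
+            best = inliers
+    return best                                       # mask of the best plane
+
+cloud = zscore_clean(np.random.rand(5000, 3))         # stand-in point cloud
+plane_mask = ransac_plane(cloud)
+```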
+
+
+
+
+ + ☆ LVC-LGMC: Joint Local and Global Motion Compensation for Learned Video + Compression ICASSP 2024 + + +
+ Existing learned video compression models employ flow net or deformable +convolutional networks (DCN) to estimate motion information. However, the +limited receptive fields of flow net and DCN inherently direct their +attentiveness towards the local contexts. Global contexts, such as large-scale +motions and global correlations among frames, are ignored, presenting a +significant bottleneck for capturing accurate motions. To address this issue, +we propose a joint local and global motion compensation module (LGMC) for +learned video coding. More specifically, we adopt flow net for local motion +compensation. To capture global context, we employ cross attention in the +feature domain for motion compensation. In addition, to avoid the quadratic +complexity of vanilla cross attention, we divide the softmax operation in +attention into two independent softmax operations, leading to linear +complexity. To validate the effectiveness of our proposed LGMC, we integrate it +with DCVC-TCM and obtain learned video compression with joint local and global +motion compensation (LVC-LGMC). Extensive experiments demonstrate that our +LVC-LGMC has significant rate-distortion performance improvements over the +baseline DCVC-TCM. + +
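+
+ The two-softmax decomposition mirrors "efficient attention" (Shen et al.): normalizing queries over the feature axis and keys over the token axis lets the K^T V product be formed first, so cost grows linearly with token count. A sketch of that decomposition; the paper's exact normalization may differ:
+
+```python
+import torch
+
+def linear_cross_attention(q, k, v):
+    """q: (B, Nq, D); k, v: (B, Nk, D). Linear in Nq and Nk."""
+    q = torch.softmax(q, dim=-1)                     # softmax over features
+    k = torch.softmax(k, dim=1)                      # softmax over key tokens
+    context = torch.einsum("bnd,bne->bde", k, v)     # (B, D, D), built first
+    return torch.einsum("bnd,bde->bne", q, context)  # never forms the Nq x Nk map
+
+out = linear_cross_attention(torch.randn(2, 1024, 64),
+                             torch.randn(2, 4096, 64),
+                             torch.randn(2, 4096, 64))
+```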
+
+ comment: Accepted at ICASSP 2024. The first attempt to use cross attention for + bits-free motion estimation and motion compensation +
+
+
+
+
+ + ☆ Deep Robot Sketching: An application of Deep Q-Learning Networks for + human-like sketching + + +
+ The recent success of Reinforcement Learning algorithms in complex +environments has inspired many theoretical approaches to +cognitive science. Artistic environments are studied within the cognitive +science community as rich, natural, multi-sensory, multi-cultural environments. +In this work, we propose the introduction of Reinforcement Learning for +improving the control of artistic robot applications. Deep Q-learning Neural +Networks (DQN) are among the most successful algorithms for the implementation +of Reinforcement Learning in robotics. DQN methods generate complex control +policies for the execution of complex robot applications in a wide set of +environments. Current art painting robot applications use simple control laws +that limit the adaptability of the frameworks to a set of simple environments. +In this work, the introduction of DQN within an art painting robot application +is proposed. The goal is to study how the introduction of a complex control +policy impacts the performance of a basic art painting robot application. The +main expected contribution of this work is to serve as a first baseline for +future works introducing DQN methods for complex art painting robot frameworks. +Experiments consist of real-world executions of human-drawn sketches using the +DQN-generated policy and TEO, the humanoid robot. Results are compared in terms +of similarity and obtained reward with respect to the reference inputs. + +
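+
+ The training loop behind such a policy is the standard DQN temporal-difference update, sketched below; network sizes, the action space, and gamma are placeholders rather than details from the paper:
+
+```python
+import torch
+
+q_net = torch.nn.Sequential(torch.nn.Linear(64, 128), torch.nn.ReLU(),
+                            torch.nn.Linear(128, 8))   # 8 stroke actions (assumed)
+target_net = torch.nn.Sequential(torch.nn.Linear(64, 128), torch.nn.ReLU(),
+                                 torch.nn.Linear(128, 8))
+target_net.load_state_dict(q_net.state_dict())
+opt = torch.optim.Adam(q_net.parameters(), lr=1e-4)
+
+def dqn_step(s, a, r, s_next, done, gamma=0.99):
+    q = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)   # Q(s, a)
+    with torch.no_grad():                               # bootstrapped TD target
+        target = r + gamma * (1 - done) * target_net(s_next).max(1).values
+    loss = torch.nn.functional.smooth_l1_loss(q, target)
+    opt.zero_grad(); loss.backward(); opt.step()
+    return loss.item()
+
+s, a = torch.randn(64, 64), torch.randint(0, 8, (64,))  # replay-buffer batch
+dqn_step(s, a, torch.randn(64), torch.randn(64, 64), torch.zeros(64))
+```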
+
+
+
+
+ + ☆ Exploring Homogeneous and Heterogeneous Consistent Label Associations + for Unsupervised Visible-Infrared Person ReID + + +
+ Unsupervised visible-infrared person re-identification (USL-VI-ReID) aims to +retrieve pedestrian images of the same identity from different modalities +without annotations. While prior works focus on establishing cross-modality +pseudo-label associations to bridge the modality gap, they ignore maintaining +the instance-level homogeneous and heterogeneous consistency in pseudo-label +space, resulting in coarse associations. In response, we introduce a +Modality-Unified Label Transfer (MULT) module that simultaneously accounts for +both homogeneous and heterogeneous fine-grained instance-level structures, +yielding high-quality cross-modality label associations. It models both +homogeneous and heterogeneous affinities, leveraging them to define the +inconsistency for the pseudo-labels and then minimize it, leading to +pseudo-labels that maintain alignment across modalities and consistency within +intra-modality structures. Additionally, a straightforward plug-and-play Online +Cross-memory Label Refinement (OCLR) module is proposed to further mitigate the +impact of noisy pseudo-labels while simultaneously aligning different +modalities, coupled with a Modality-Invariant Representation Learning (MIRL) +framework. Experiments demonstrate that our proposed method outperforms +existing USL-VI-ReID methods, highlighting the superiority of our MULT in +comparison to other cross-modality association methods. The code will be +available. + +
+
+
+
+
+ + ☆ Fisheye Camera and Ultrasonic Sensor Fusion For Near-Field Obstacle + Perception in Bird's-Eye-View + + +
+ Accurate obstacle identification represents a fundamental challenge within +the scope of near-field perception for autonomous driving. Conventionally, +fisheye cameras are frequently employed for comprehensive surround-view +perception, including rear-view obstacle localization. However, the performance +of such cameras can significantly deteriorate in low-light conditions, during +nighttime, or when subjected to intense sun glare. Conversely, cost-effective +sensors like ultrasonic sensors remain largely unaffected under these +conditions. Therefore, we present, to our knowledge, the first end-to-end +multimodal fusion model tailored for efficient obstacle perception in a +bird's-eye-view (BEV) perspective, utilizing fisheye cameras and ultrasonic +sensors. Initially, ResNeXt-50 is employed as a set of unimodal encoders to +extract features specific to each modality. Subsequently, the feature space +associated with the visible spectrum undergoes transformation into BEV. The +fusion of these two modalities is facilitated via concatenation. At the same +time, the ultrasonic spectrum-based unimodal feature maps pass through +content-aware dilated convolution, applied to mitigate the sensor misalignment +between two sensors in the fused feature space. Finally, the fused features are +utilized by a two-stage semantic occupancy decoder to generate grid-wise +predictions for precise obstacle perception. We conduct a systematic +investigation to determine the optimal strategy for multimodal fusion of both +sensors. We provide insights into our dataset creation procedures, annotation +guidelines, and perform a thorough data analysis to ensure adequate coverage of +all scenarios. When applied to our dataset, the experimental results underscore +the robustness and effectiveness of our proposed multimodal fusion approach. + +
+
+ comment: 16 pages, 12 Figures, 6 tables +
+
+
+
+
+ + ☆ CapHuman: Capture Your Moments in Parallel Universes + + +
+ We concentrate on a novel human-centric image synthesis task, that is, given +only one reference facial photograph, it is expected to generate specific +individual images with diverse head positions, poses, and facial expressions in +different contexts. To accomplish this goal, we argue that our generative model +should possess the following favorable characteristics: (1) a strong +visual and semantic understanding of our world and human society for basic +object and human image generation. (2) generalizable identity preservation +ability. (3) flexible and fine-grained head control. Recently, large +pre-trained text-to-image diffusion models have shown remarkable results, +serving as a powerful generative foundation. As a basis, we aim to unleash the +latter two capabilities of the pre-trained model. In this work, we present a new +framework named CapHuman. We embrace the "encode then learn to align" +paradigm, which enables generalizable identity preservation for new individuals +without cumbersome tuning at inference. CapHuman encodes identity features and +then learns to align them into the latent space. Moreover, we introduce the 3D +facial prior to equip our model with control over the human head in a flexible +and 3D-consistent manner. Extensive qualitative and quantitative analyses +demonstrate our CapHuman can produce well-identity-preserved, photo-realistic, +and high-fidelity portraits with content-rich representations and various head +renditions, superior to established baselines. Code and checkpoint will be +released at https://github.com/VamosC/CapHuman. + +
+
+ comment: Project page: https://caphuman.github.io/ +
+
+
+
+
+ + ☆ Vision-LLMs Can Fool Themselves with Self-Generated Typographic Attacks + + +
+ Recently, significant progress has been made on Large Vision-Language Models +(LVLMs), a new class of VL models that make use of large pre-trained language +models. Yet, their vulnerability to Typographic attacks, which involve +superimposing misleading text onto an image, remains unstudied. Furthermore, +typographic attacks in prior work rely on sampling a random misleading class from +a predefined set of classes. However, the randomly chosen class might not be the +most effective attack. To address these issues, we first introduce a novel +benchmark uniquely designed to test LVLMs' vulnerability to typographic attacks. +Furthermore, we introduce a new and more effective typographic attack: +Self-Generated typographic attacks. Indeed, our method, given an image, makes +use of the strong language capabilities of models like GPT-4V by simply +prompting them to recommend a typographic attack. Using our novel benchmark, we +uncover that typographic attacks represent a significant threat against +LVLMs. Furthermore, we uncover that typographic attacks recommended by GPT-4V +using our new method are not only more effective against GPT-4V itself compared +to prior attacks, but also against a host of less capable yet popular open +source models like LLaVA, InstructBLIP, and MiniGPT4. + +
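+
+ The basic (non-self-generated) attack is trivial to reproduce, which is part of why it matters: overlay a misleading class name before the image reaches the LVLM. Position, font, and wording below are illustrative choices:
+
+```python
+from PIL import Image, ImageDraw
+
+def typographic_attack(image_path, misleading_text="granny smith"):
+    img = Image.open(image_path).convert("RGB")
+    ImageDraw.Draw(img).text((10, 10), misleading_text, fill="white")
+    return img    # feed this to the LVLM along with the original question
+```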
+
+
+
+
+ + ☆ Deep Clustering Using the Soft Silhouette Score: Towards Compact and + Well-Separated Clusters + + +
+ Unsupervised learning has gained prominence in the big data era, offering a +means to extract valuable insights from unlabeled datasets. Deep clustering has +emerged as an important unsupervised category, aiming to exploit the non-linear +mapping capabilities of neural networks in order to enhance clustering +performance. The majority of deep clustering literature focuses on minimizing +the inner-cluster variability in some embedded space while keeping the learned +representation consistent with the original high-dimensional dataset. In this +work, we propose soft silhouette, a probabilistic formulation of the silhouette +coefficient. Soft silhouette rewards compact and distinctly separated +clustering solutions like the conventional silhouette coefficient. When +optimized within a deep clustering framework, soft silhouette guides the +learned representations towards forming compact and well-separated clusters. In +addition, we introduce an autoencoder-based deep learning architecture that is +suitable for optimizing the soft silhouette objective function. The proposed +deep clustering method has been tested and compared with several well-studied +deep clustering methods on various benchmark datasets, yielding very +satisfactory clustering results. + +
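+
+ One plausible, differentiable reading of a soft silhouette replaces hard memberships with assignment probabilities and takes expected within- versus nearest-other-cluster distances. This is an illustrative formulation, not necessarily the paper's exact definition:
+
+```python
+import torch
+
+def soft_silhouette(x, centers, temperature=1.0, eps=1e-8):
+    d = torch.cdist(x, centers)                 # (N, K) point-center distances
+    p = torch.softmax(-d / temperature, dim=1)  # soft cluster assignments
+    a = (p * d).sum(1)                          # expected own-cluster distance
+    # Nearest alternative: mask out each point's dominant cluster, take the min.
+    b = d.scatter(1, p.argmax(1, keepdim=True), float("inf")).min(1).values
+    s = (b - a) / (torch.maximum(a, b) + eps)
+    return s.mean()                             # in [-1, 1]; higher is better
+
+x = torch.randn(128, 16)
+centers = torch.randn(5, 16, requires_grad=True)
+loss = -soft_silhouette(x, centers)             # minimize the negative score
+loss.backward()
+```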
+
+
+
+
+ + ☆ Dynamic Texture Transfer using PatchMatch and Transformers + + +
+ How to automatically transfer the dynamic texture of a given video to the +target still image is a challenging and ongoing problem. In this paper, we +propose to handle this task via a simple yet effective model that utilizes both +PatchMatch and Transformers. The key idea is to decompose the task of dynamic +texture transfer into two stages, where the start frame of the target video +with the desired dynamic texture is synthesized in the first stage via a +distance map guided texture transfer module based on the PatchMatch algorithm. +Then, in the second stage, the synthesized image is decomposed into +structure-agnostic patches, according to which their corresponding subsequent +patches can be predicted by exploiting the powerful capability of Transformers +equipped with VQ-VAE for processing long discrete sequences. After getting all +those patches, we apply a Gaussian weighted average merging strategy to +smoothly assemble them into each frame of the target stylized video. +Experimental results demonstrate the effectiveness and superiority of the +proposed method in dynamic texture transfer compared to the state of the art. + +
+
+
+
+
+ + ☆ Coronary Artery Disease Classification with Different Lesion Degree + Ranges based on Deep Learning + + +
+ Invasive Coronary Angiography (ICA) images are considered the gold standard +for assessing the state of the coronary arteries. Deep learning classification +methods are widely used and well-developed in different areas where medical +imaging evaluation has an essential impact due to the development of +computer-aided diagnosis systems that can support physicians in their clinical +procedures. In this paper, a new performance analysis of deep learning methods +for binary ICA classification with different lesion degrees is reported. To +reach this goal, an annotated dataset of ICA images that contains the ground +truth, the location of lesions and seven possible severity degrees ranging +between 0% and 100% was employed. The ICA images were divided into 'lesion' or +'non-lesion' patches. We aim to study how binary classification performance is +affected by the different lesion degrees considered in the positive class. +Therefore, five known convolutional neural network architectures were trained +with different input images where different lesion degree ranges were gradually +incorporated until considering the seven lesion degrees. Besides, four types of +experiments with and without data augmentation were designed, whose F-measure +and Area Under Curve (AUC) were computed. Reported results achieved an +F-measure and AUC of 92.7% and 98.1%, respectively. However, lesion +classification is highly affected by the degree of the lesion to be +classified, with accuracy dropping by 15% when patches with lesion degrees +below 99% are included in the positive class. + +
+
+
+
+
+ + ☆ Tropical Decision Boundaries for Neural Networks Are Robust Against + Adversarial Attacks + + +
+ We introduce a simple, easy to implement, and computationally efficient +tropical convolutional neural network architecture that is robust against +adversarial attacks. We exploit the tropical nature of piece-wise linear neural +networks by embedding the data in the tropical projective torus in a single +hidden layer which can be added to any model. We study the geometry of its +decision boundary theoretically and show its robustness against adversarial +attacks on image datasets using computational experiments. + +
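+
+ In the tropical (max-plus) semiring, addition becomes max and multiplication becomes +, so a "linear" layer computes max_j (x_j + w_ij). One reading of such a hidden layer is sketched below; the paper's exact embedding of the tropical projective torus may differ:
+
+```python
+import torch
+
+class TropicalLayer(torch.nn.Module):
+    """Max-plus 'matrix product': out[b, i] = max_j (x[b, j] + w[i, j])."""
+    def __init__(self, in_dim, out_dim):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.randn(out_dim, in_dim))
+
+    def forward(self, x):                               # x: (B, in_dim)
+        return (x.unsqueeze(1) + self.weight.unsqueeze(0)).amax(dim=-1)
+
+# Drop the layer into an otherwise ordinary classifier.
+model = torch.nn.Sequential(TropicalLayer(784, 64), torch.nn.Linear(64, 10))
+logits = model(torch.randn(32, 784))
+```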
+
+
+
+
+ + ☆ Diffusion-based Light Field Synthesis + + +
+ Light fields (LFs), conducive to comprehensive scene radiance recorded across +angular dimensions, find wide applications in 3D reconstruction, virtual +reality, and computational photography. However, the LF acquisition is +inevitably time-consuming and resource-intensive due to the mainstream +acquisition strategy involving manual capture or laborious software +synthesis. Given such a challenge, we introduce LFdiff, a straightforward yet +effective diffusion-based generative framework tailored for LF synthesis, which +adopts only a single RGB image as input. LFdiff leverages disparity estimated by +a monocular depth estimation network and incorporates two distinctive +components: a novel condition scheme and a noise estimation network tailored +for LF data. Specifically, we design a position-aware warping condition scheme, +enhancing inter-view geometry learning via a robust conditional signal. We then +propose DistgUnet, a disentanglement-based noise estimation network, to harness +comprehensive LF representations. Extensive experiments demonstrate that LFdiff +excels in synthesizing visually pleasing and disparity-controllable light +fields with enhanced generalization capability. Additionally, comprehensive +results affirm the broad applicability of the generated LF data, spanning +applications like LF super-resolution and refocusing. + +
+
+ comment: 11 pages,9 figures +
+
+
+
+
+ + ☆ CADICA: a new dataset for coronary artery disease detection by using + invasive coronary angiography + + +
+ Coronary artery disease (CAD) remains the leading cause of death globally and +invasive coronary angiography (ICA) is considered the gold standard of +anatomical imaging evaluation when CAD is suspected. However, risk evaluation +based on ICA has several limitations, such as visual assessment of stenosis +severity, which has significant interobserver variability. This motivates the +development of a lesion classification system that can support specialists in +their clinical procedures. Although deep learning classification methods are +well-developed in other areas of medical imaging, ICA image classification is +still at an early stage. One of the most important reasons is the lack of +available and high-quality open-access datasets. In this paper, we report a +new annotated ICA image dataset, CADICA, to provide the research community +with a comprehensive and rigorous dataset of coronary angiography consisting of +a set of acquired patient videos and associated disease-related metadata. This +dataset can be used by clinicians to train their skills in angiographic +assessment of CAD severity and by computer scientists to create computer-aided +diagnostic systems to help in such assessment. In addition, baseline +classification methods are proposed and analyzed, validating the functionality +of CADICA and giving the scientific community a starting point to improve CAD +detection. + +
+
+
+
+
+ + ☆ A Single Graph Convolution Is All You Need: Efficient Grayscale Image + Classification + + +
+ Image classifiers often rely on convolutional neural networks (CNNs), which +are inherently more heavyweight than multilayer perceptrons +(MLPs); this can be problematic in real-time applications. Additionally, many +image classification models work on both RGB and grayscale datasets. +Classifiers that operate solely on grayscale images are much less common. +Grayscale image classification has diverse applications, including but not +limited to medical image classification and synthetic aperture radar (SAR) +automatic target recognition (ATR). Thus, we present a novel grayscale (single +channel) image classification approach using a vectorized view of images. We +exploit the lightweight nature of MLPs by viewing each image as a vector and +reducing our problem setting to the grayscale image classification setting. We +find that using a single graph convolutional layer batch-wise increases +accuracy and reduces variance in the performance of our model. Moreover, we +develop a customized accelerator on FPGA for the proposed model with several +optimizations to improve its performance. Our experimental results on benchmark +grayscale image datasets demonstrate the effectiveness of the proposed model, +achieving vastly lower latency (up to 16$\times$ less) and competitive or +leading performance compared to other state-of-the-art image classification +models on various domain-specific grayscale image classification datasets. + +
+
+ comment: 6 pages of content, 1 page of references +
+
+
+
+
+ + ☆ Masked Conditional Diffusion Model for Enhancing Deepfake Detection + + +
+ Recent studies on deepfake detection have achieved promising results when +training and testing faces are from the same dataset. However, their results +severely degrade when confronted with forged samples that the model has not yet +seen during training. In this paper, we present a new insight into diffusion +model-based data augmentation and propose a Masked Conditional Diffusion Model +(MCDM) for enhancing deepfake detection. It generates a variety of forged faces +from a masked pristine one, encouraging the deepfake detection model to learn generic +and robust representations without overfitting to special artifacts. Extensive +experiments demonstrate that forgery images generated with our method are of +high quality and helpful to improve the performance of deepfake detection +models. + +
+
+
+
+
+ + ☆ A Manifold Representation of the Key in Vision Transformers + + +
+ Vision Transformers implement multi-head self-attention (MSA) via stacking +multiple attention blocks. The query, key, and value are often intertwined and +generated within those blocks via a single, shared linear transformation. This +paper explores the concept of disentangling the key from the query and value, +and adopting a manifold representation for the key. Our experiments reveal that +decoupling and endowing the key with a manifold structure can enhance the model +performance. Specifically, ViT-B exhibits a 0.87% increase in top-1 accuracy, +while Swin-T sees a boost of 0.52% in top-1 accuracy on the ImageNet-1K +dataset, with eight charts in the manifold key. Our approach also yields +positive results in object detection and instance segmentation tasks on the +COCO dataset. Through detailed ablation studies, we establish that these +performance gains are not merely due to the simplicity of adding more +parameters and computations. Future research may investigate strategies for +cutting the budget of such representations and aim for further performance +improvements based on our findings. + +
+
+
+
+
+ + ☆ StopThePop: Sorted Gaussian Splatting for View-Consistent Real-time + Rendering + + +
+ Gaussian Splatting has emerged as a prominent model for constructing 3D
+representations from images across diverse domains. However, the efficiency of
+the 3D Gaussian Splatting rendering pipeline relies on several simplifications.
+Notably, reducing 3D Gaussians to 2D splats with a single view-space depth
+introduces popping and blending artifacts during view rotation. Addressing this
+issue requires accurate per-pixel depth computation, yet a full per-pixel sort
+proves excessively costly compared to a global sort operation. In this paper,
+we present a novel hierarchical rasterization approach that systematically
+re-sorts and culls splats with minimal processing overhead. Our software
+rasterizer effectively eliminates popping artifacts and view inconsistencies,
+as demonstrated through both quantitative and qualitative measurements.
+Simultaneously, our method mitigates the potential for cheating view-dependent
+effects with popping, ensuring a more authentic representation. Despite the
+elimination of cheating, our approach achieves comparable quantitative results
+for test images, while increasing the consistency for novel view synthesis in
+motion. Due to its design, our hierarchical approach is only 4% slower on
+average than the original Gaussian Splatting. Notably, enforcing consistency
+enables a reduction in the number of Gaussians by approximately half with
+nearly identical quality and view-consistency. Consequently, rendering
+performance is nearly doubled, making our approach 1.6x faster than the
+original Gaussian Splatting, with a 50% reduction in memory requirements.
+
+
+
+ comment: Video: https://youtu.be/RJQlSORNkr0 +
+
+
+
+
+ + ☆ Bias Mitigating Few-Shot Class-Incremental Learning + + +
+ Few-shot class-incremental learning (FSCIL) aims at recognizing novel classes +continually with limited novel class samples. A mainstream baseline for FSCIL +is first to train the whole model in the base session, then freeze the feature +extractor in the incremental sessions. Despite achieving high overall accuracy, +most methods exhibit notably low accuracy for incremental classes. Some recent +methods somewhat alleviate the accuracy imbalance between base and incremental +classes by fine-tuning the feature extractor in the incremental sessions, but +they further cause the accuracy imbalance between past and current incremental +classes. In this paper, we study the causes of such classification accuracy +imbalance for FSCIL, and abstract them into a unified model bias problem. Based +on the analyses, we propose a novel method to mitigate model bias of the FSCIL +problem during training and inference processes, which includes mapping ability +stimulation, separately dual-feature classification, and self-optimizing +classifiers. Extensive experiments on three widely-used FSCIL benchmark +datasets show that our method significantly mitigates the model bias problem +and achieves state-of-the-art performance. + +
+
+ comment: 8 pages (not including references and checklist) +
+
+
+
+
+ + ☆ Can you see me now? Blind spot estimation for autonomous vehicles using + scenario-based simulation with random reference sensors + + +
+ In this paper, we introduce a method for estimating blind spots for sensor +setups of autonomous or automated vehicles and/or robotics applications. In +comparison to previous methods that rely on geometric approximations, our +presented approach provides more realistic coverage estimates by utilizing +accurate and detailed 3D simulation environments. Our method leverages point +clouds from LiDAR sensors or camera depth images from high-fidelity simulations +of target scenarios to provide accurate and actionable visibility estimates. A +Monte Carlo-based reference sensor simulation enables us to accurately estimate +blind spot size as a metric of coverage, as well as detection probabilities of +objects at arbitrary positions. + +
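+ A minimal sketch of the Monte Carlo coverage estimate follows: random
+reference positions are sampled around the ego vehicle and counted as seen if
+a simulated LiDAR return lands nearby. The sampling region, the radius, and
+the use of a raw point cloud as the visibility proxy are illustrative
+assumptions, not the paper's exact procedure.
+
+import numpy as np
+from scipy.spatial import cKDTree
+
+def blind_spot_fraction(lidar_points, n_samples=10000, radius=0.25,
+                        area=((-10, 10), (-10, 10)), seed=0):
+    rng = np.random.default_rng(seed)
+    tree = cKDTree(lidar_points[:, :2])           # bird's-eye view of returns
+    # Random reference positions around the ego vehicle.
+    samples = np.column_stack([rng.uniform(*area[0], n_samples),
+                               rng.uniform(*area[1], n_samples)])
+    # A sample counts as "seen" if any return lands within `radius` of it.
+    dist, _ = tree.query(samples, k=1)
+    return float((dist > radius).mean())          # blind-spot size metric
+
+pts = np.random.uniform(-10, 10, (5000, 3))       # stand-in for simulated data
+print(f"estimated blind-spot fraction: {blind_spot_fraction(pts):.2%}")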
+
+
+
+
+ + ☆ Instruction Makes a Difference + + +
+ We introduce the Instruction Document Visual Question Answering (iDocVQA)
+dataset and the Large Language Document (LLaDoc) model for training
+Language-Vision (LV) models for document analysis and prediction on document
+images. Usually, deep neural networks for the DocVQA task are trained on
+datasets lacking instructions. We show that using instruction-following
+datasets improves performance. We compare performance across document-related
+datasets using the recent state-of-the-art (SotA) Large Language and Vision
+Assistant (LLaVA) 1.5 as the base model. We also evaluate the performance of
+the derived models for object hallucination using the Polling-based Object
+Probing Evaluation (POPE) dataset. The results show that instruction-tuning
+performance ranges from 11X to 32X of zero-shot performance and from 0.1% to
+4.2% over non-instruction (traditional task) finetuning. Despite the gains,
+these still fall short of human performance (94.36%), implying there is much
+room for improvement.
+
+
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ Dual-Student Knowledge Distillation Networks for Unsupervised Anomaly + Detection + + +
+ Due to the data imbalance and the diversity of defects, student-teacher
+networks (S-T) are favored in unsupervised anomaly detection, which explores
+the discrepancy in feature representation derived from the knowledge
+distillation process to recognize anomalies. However, the vanilla S-T network
+is not stable. Employing identical structures to construct the S-T network may
+weaken the representative discrepancy on anomalies, while using different
+structures can increase the likelihood of divergent performance on normal data.
+To address this problem, we propose a novel dual-student knowledge distillation
+(DSKD) architecture. Different from other S-T networks, we use two student
+networks and a single pre-trained teacher network, where the students have the
+same scale but inverted structures. This framework can enhance the distillation
+effect to improve the consistency in recognition of normal data, and
+simultaneously introduce diversity for anomaly representation. To explore
+high-dimensional semantic information to capture anomaly clues, we employ two
+strategies. First, a pyramid matching mode is used to perform knowledge
+distillation on multi-scale feature maps in the intermediate layers of
+networks. Second, an interaction is facilitated between the two student
+networks through a deep feature embedding module, which is inspired by
+real-world group discussions. In terms of classification, we obtain pixel-wise
+anomaly segmentation maps by measuring the discrepancy between the output
+feature maps of the teacher and student networks, from which an anomaly score
+is computed for sample-wise determination. We evaluate DSKD on three benchmark
+datasets and probe the effects of internal modules through ablation
+experiments. The results demonstrate that DSKD can achieve exceptional
+performance on small models like ResNet18 and effectively improve vanilla S-T
+networks.
+
+
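+ The discrepancy-based scoring can be sketched in a few lines of PyTorch: a
+frozen teacher and two same-scale students produce feature maps whose
+pixel-wise distance serves as the anomaly map. The tiny conv nets and the
+additive fusion of the two student maps below are illustrative assumptions.
+
+import torch
+import torch.nn as nn
+
+teacher = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                        nn.Conv2d(16, 16, 3, padding=1))
+# Two students of the same scale; DSKD additionally inverts their structures.
+student_a = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
+                          nn.Conv2d(16, 16, 3, padding=1))
+student_b = nn.Sequential(nn.Conv2d(3, 16, 5, padding=2), nn.ReLU(),
+                          nn.Conv2d(16, 16, 1))
+
+def anomaly_map(x):
+    with torch.no_grad():
+        t = teacher(x)                       # frozen, pre-trained in practice
+    # Pixel-wise discrepancy between the teacher and each student.
+    d_a = (t - student_a(x)).pow(2).mean(1)  # (B, H, W)
+    d_b = (t - student_b(x)).pow(2).mean(1)
+    return d_a + d_b                         # fused segmentation map
+
+score = anomaly_map(torch.randn(1, 3, 64, 64)).amax()  # sample-wise score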
+
+
+
+
+ + ☆ Merging Multi-Task Models via Weight-Ensembling Mixture of Experts + + +
+ Merging various task-specific Transformer-based models trained on different
+tasks into a single unified model allows all the tasks to be executed
+concurrently. Previous methods, exemplified by task arithmetic, have been
+proven to be both effective and scalable. Existing methods have primarily
+focused on seeking a static optimal solution within the original model
+parameter space. A notable challenge is mitigating the interference between
+parameters of different models, which can substantially deteriorate
+performance. In this paper, we propose to merge most of the parameters while
+upscaling the MLP of the Transformer layers to a weight-ensembling mixture of
+experts (MoE) module, which can dynamically integrate shared and task-specific
+knowledge based on the input, thereby providing a more flexible solution that
+can adapt to the specific needs of each instance. Our key insight is that by
+identifying and separating shared knowledge and task-specific knowledge, and
+then dynamically integrating them, we can mitigate the parameter interference
+problem to a great extent. We conduct the conventional multi-task model merging
+experiments and evaluate the generalization and robustness of our method. The
+results demonstrate the effectiveness of our method and provide a comprehensive
+understanding of our method. The code is available at
+https://anonymous.4open.science/r/weight-ensembling_MoE-67C9/
+
+
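+ The core idea can be sketched as follows: the task-specific MLPs become
+experts behind an input-conditioned router. The two-expert setup and the
+router design below are illustrative assumptions rather than the paper's exact
+module.
+
+import torch
+import torch.nn as nn
+
+d = 64
+mlp_task1, mlp_task2 = nn.Linear(d, d), nn.Linear(d, d)  # fine-tuned MLPs
+
+class WeightEnsemblingMoE(nn.Module):
+    def __init__(self, experts):
+        super().__init__()
+        self.experts = nn.ModuleList(experts)
+        self.router = nn.Linear(d, len(experts))         # input-conditioned
+
+    def forward(self, x):
+        w = torch.softmax(self.router(x), dim=-1)        # (B, n_experts)
+        # Mix the experts' outputs per input; for linear experts this equals
+        # dynamically weight-ensembling their parameters.
+        outs = torch.stack([e(x) for e in self.experts], dim=-1)
+        return (outs * w.unsqueeze(1)).sum(-1)
+
+y = WeightEnsemblingMoE([mlp_task1, mlp_task2])(torch.randn(8, d))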
+
+
+
+
+ + ☆ Lightweight Pixel Difference Networks for Efficient Visual + Representation Learning + + +
+ Recently, there have been tremendous efforts in developing lightweight Deep +Neural Networks (DNNs) with satisfactory accuracy, which can enable the +ubiquitous deployment of DNNs in edge devices. The core challenge of developing +compact and efficient DNNs lies in how to balance the competing goals of +achieving high accuracy and high efficiency. In this paper we propose two novel +types of convolutions, dubbed \emph{Pixel Difference Convolution (PDC) and +Binary PDC (Bi-PDC)} which enjoy the following benefits: capturing higher-order +local differential information, computationally efficient, and able to be +integrated with existing DNNs. With PDC and Bi-PDC, we further present two +lightweight deep networks named \emph{Pixel Difference Networks (PiDiNet)} and +\emph{Binary PiDiNet (Bi-PiDiNet)} respectively to learn highly efficient yet +more accurate representations for visual tasks including edge detection and +object recognition. Extensive experiments on popular datasets (BSDS500, +ImageNet, LFW, YTF, \emph{etc.}) show that PiDiNet and Bi-PiDiNet achieve the +best accuracy-efficiency trade-off. For edge detection, PiDiNet is the first +network that can be trained without ImageNet, and can achieve the human-level +performance on BSDS500 at 100 FPS and with $<$1M parameters. For object +recognition, among existing Binary DNNs, Bi-PiDiNet achieves the best accuracy +and a nearly $2\times$ reduction of computational cost on ResNet18. Code +available at +\href{https://github.com/hellozhuo/pidinet}{https://github.com/hellozhuo/pidinet}. + +
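+ For the central variant of PDC, convolving pixel differences reduces to a
+vanilla convolution minus a centre-scaled one, which the following sketch
+exploits; the shapes and padding are illustrative, and the function name is
+ours, not the released code's.
+
+import torch
+import torch.nn.functional as F
+
+def central_pdc(x, weight, bias=None):
+    # Vanilla convolution on the raw pixels ...
+    y = F.conv2d(x, weight, bias, padding=weight.shape[-1] // 2)
+    # ... minus the centre pixel scaled by the kernel's total weight, so each
+    # tap effectively sees the local difference (x_neighbour - x_centre).
+    w_sum = weight.sum(dim=(2, 3), keepdim=True)     # collapses to 1x1 kernel
+    return y - F.conv2d(x, w_sum)
+
+x, w = torch.randn(1, 3, 32, 32), torch.randn(8, 3, 3, 3)
+print(central_pdc(x, w).shape)                       # torch.Size([1, 8, 32, 32])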
+
+ comment: We design a novel lightweight convolutional operator for computer + vision tasks. Both full-precision networks and BNNs are developed. Accepted + by TPAMI +
+
+
+
+
+ + ☆ Short: Benchmarking transferable adversarial attacks NDSS 2024 + + +
+ The robustness of deep learning models against adversarial attacks remains a +pivotal concern. This study presents, for the first time, an exhaustive review +of the transferability aspect of adversarial attacks. It systematically +categorizes and critically evaluates various methodologies developed to augment +the transferability of adversarial attacks. This study encompasses a spectrum +of techniques, including Generative Structure, Semantic Similarity, Gradient +Editing, Target Modification, and Ensemble Approach. Concurrently, this paper +introduces a benchmark framework \textit{TAA-Bench}, integrating ten leading +methodologies for adversarial attack transferability, thereby providing a +standardized and systematic platform for comparative analysis across diverse +model architectures. Through comprehensive scrutiny, we delineate the efficacy +and constraints of each method, shedding light on their underlying operational +principles and practical utility. This review endeavors to be a quintessential +resource for both scholars and practitioners in the field, charting the complex +terrain of adversarial transferability and setting a foundation for future +explorations in this vital sector. The associated codebase is accessible at: +https://github.com/KxPlaug/TAA-Bench + +
+
+ comment: Accepted by NDSS 2024 Workshop +
+
+
+
+
+ + ☆ LM-HT SNN: Enhancing the Performance of SNN to ANN Counterpart through + Learnable Multi-hierarchical Threshold Model + + +
+ Compared to the traditional Artificial Neural Network (ANN), the Spiking
+Neural Network (SNN) has garnered widespread academic interest for its
+intrinsic ability to transmit information in a more biologically inspired and
+energy-efficient manner. However, despite previous efforts to optimize the
+learning gradients and model structure of SNNs through various methods, SNNs
+still lag behind ANNs in terms of performance to some extent. The recently
+proposed multi-threshold model provides more possibilities for further
+enhancing the learning capability of SNNs. In this paper, we rigorously analyze
+the relationships among the multi-threshold model, the vanilla spiking model
+and quantized ANNs from a mathematical perspective, then propose a novel LM-HT
+model, an equidistant multi-hierarchical model that can dynamically regulate
+the global input current and membrane potential leakage on the time dimension.
+In addition, we note that the direct training algorithm based on the LM-HT
+model can seamlessly integrate with the traditional ANN-SNN conversion
+framework. This novel hybrid learning framework can effectively improve the
+relatively poor performance of converted SNNs under low time latency. Extensive
+experimental results demonstrate that our LM-HT model can significantly
+outperform previous state-of-the-art works on various types of datasets,
+promoting SNNs to a brand-new level of performance comparable to quantized
+ANNs.
+
+
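+ The multi-threshold firing step itself is simple to state; the sketch below
+emits up to L graded spikes per step against equidistant thresholds, mirroring
+the activation grid of a quantized ANN. The soft reset and the hyper-parameter
+values are illustrative assumptions, and LM-HT's learnable regulation of input
+current and leakage is omitted.
+
+import torch
+
+def multi_threshold_fire(v, theta=1.0, levels=4):
+    # Emit n in {0..levels} spikes when the membrane potential v crosses n
+    # equidistant thresholds of spacing theta.
+    n = torch.clamp(torch.floor(v / theta), 0, levels)
+    v_next = v - n * theta               # soft reset by the emitted charge
+    return n, v_next
+
+v = torch.tensor([0.3, 1.2, 2.7, 9.0])
+spikes, v = multi_threshold_fire(v)
+print(spikes)                            # tensor([0., 1., 2., 4.])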
+
+ comment: 15 pages, 2 figures +
+
+
+
+
+ + ☆ InfMAE: A Foundation Model in Infrared Modality + + +
+ In recent years, foundation models have swept the computer vision field
+and facilitated the development of various tasks within different modalities.
+However, how to design an infrared foundation model remains an open question.
+In this paper, we propose InfMAE, a foundation model for the infrared
+modality. We release an infrared dataset, called Inf30, to address the problem
+of the lack of large-scale data for self-supervised learning in the infrared
+vision community. Besides, we design an information-aware masking strategy,
+which is suitable for infrared images. This masking strategy allows for a
+greater emphasis on the regions with richer information in infrared images
+during the self-supervised learning process, which is conducive to learning
+generalized representations. In addition, we adopt a multi-scale encoder to
+enhance the performance of the pre-trained encoders in downstream tasks.
+Finally, based on the fact that infrared images do not have a lot of detail
+and texture information, we design an infrared decoder module, which further
+improves the performance of downstream tasks. Extensive experiments show that
+our proposed method InfMAE outperforms other supervised methods and
+self-supervised learning methods in three downstream tasks. Our code will be
+made public at https://github.com/liufangcen/InfMAE.
+
+
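+ One plausible reading of such an information-aware mask is sketched below:
+patches are scored by local intensity variance and the masked set is sampled
+in proportion to that score, so information-rich regions are reconstructed
+more often. Both the variance proxy and the sampling rule are our assumptions,
+not necessarily InfMAE's exact strategy.
+
+import torch
+
+def information_aware_mask(img, patch=16, mask_ratio=0.75):
+    B, C, H, W = img.shape
+    p = img.unfold(2, patch, patch).unfold(3, patch, patch)
+    score = p.var(dim=(-1, -2)).mean(1).flatten(1)       # (B, n_patches)
+    n_mask = int(mask_ratio * score.shape[1])
+    # Sample masked patches with probability proportional to their score.
+    idx = torch.multinomial(score + 1e-6, n_mask)        # (B, n_mask)
+    mask = torch.zeros_like(score, dtype=torch.bool).scatter_(1, idx, True)
+    return mask
+
+mask = information_aware_mask(torch.rand(2, 1, 224, 224))
+print(mask.shape, mask.sum(1))                            # (2, 196), 147 each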
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+
+ ☆ Image2Points: A 3D Point-based Context Clusters GAN for High-Quality PET
+ Image Reconstruction ICASSP 2024
+
+
+ To obtain high-quality Positron emission tomography (PET) images while +minimizing radiation exposure, numerous methods have been proposed to +reconstruct standard-dose PET (SPET) images from the corresponding low-dose PET +(LPET) images. However, these methods heavily rely on voxel-based +representations, which fall short of adequately accounting for the precise +structure and fine-grained context, leading to compromised reconstruction. In +this paper, we propose a 3D point-based context clusters GAN, namely PCC-GAN, +to reconstruct high-quality SPET images from LPET. Specifically, inspired by +the geometric representation power of points, we resort to a point-based +representation to enhance the explicit expression of the image structure, thus +facilitating the reconstruction with finer details. Moreover, a context +clustering strategy is applied to explore the contextual relationships among +points, which mitigates the ambiguities of small structures in the +reconstructed images. Experiments on both clinical and phantom datasets +demonstrate that our PCC-GAN outperforms the state-of-the-art reconstruction +methods qualitatively and quantitatively. Code is available at +https://github.com/gluucose/PCCGAN. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ Disentangled Multimodal Brain MR Image Translation via Transformer-based + Modality Infuser + + +
+ Multimodal Magnetic Resonance (MR) Imaging plays a crucial role in disease
+diagnosis due to its ability to provide complementary information by analyzing
+the relationships between multimodal images of the same subject. Acquiring all
+MR modalities, however, can be expensive, and, during a scanning session,
+certain MR images may be missed depending on the study protocol. The typical
+solution is to synthesize the missing modalities from the acquired images, for
+example using generative adversarial networks (GANs). Yet, GANs constructed
+with convolutional neural networks (CNNs) are likely to suffer from a lack of
+global relationships and mechanisms to condition the desired modality. To
+address this, in this work, we propose a transformer-based modality infuser
+designed to synthesize multimodal brain MR images. In our method, we extract
+modality-agnostic features from the encoder and then transform them into
+modality-specific features using the modality infuser. Furthermore, the
+modality infuser captures long-range relationships among all brain structures,
+leading to the generation of more realistic images. We carried out experiments
+on the BraTS 2018 dataset, translating between four MR modalities, and our
+experimental results demonstrate the superiority of our proposed method in
+terms of synthesis quality. In addition, we conducted experiments on a brain
+tumor segmentation task and different conditioning methods.
+
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Safety of Multimodal Large Language Models on Images and Text + + +
+ Attracted by the impressive power of Multimodal Large Language Models
+(MLLMs), the public is increasingly utilizing them to improve the efficiency
+of daily work. Nonetheless, the vulnerabilities of MLLMs to unsafe
+instructions bring huge safety risks when these models are deployed in
+real-world scenarios. In this paper, we systematically survey current efforts
+on the evaluation, attack, and defense of MLLMs' safety on images and text. We
+begin by introducing an overview of MLLMs on images and text and an
+understanding of safety, which helps researchers know the detailed scope of
+our survey. Then, we review the evaluation datasets and metrics for measuring
+the safety of MLLMs. Next, we comprehensively present attack and defense
+techniques related to MLLMs' safety. Finally, we analyze several unsolved
+issues and discuss promising research directions.
+
+
+
+
+
+
+ + ☆ High-Quality Medical Image Generation from Free-hand Sketch + + +
+ Generating medical images from human-drawn free-hand sketches holds promise
+for various important medical imaging applications. Due to the extreme
+difficulty in collecting free-hand sketch data in the medical domain, most
+deep learning-based methods have been proposed to generate medical images from
+synthesized sketches (e.g., edge maps or contours of segmentation masks from
+real images). However, these models often fail to generalize to free-hand
+sketches, leading to unsatisfactory results. In this paper, we propose a
+practical free-hand sketch-to-image generation model called Sketch2MedI that
+learns to represent sketches in StyleGAN's latent space and generate medical
+images from it. Thanks to the ability to encode sketches into this meaningful
+representation space, Sketch2MedI only requires synthesized sketches for
+training, enabling a cost-effective learning process. Our Sketch2MedI
+demonstrates robust generalization to free-hand sketches, resulting in
+high-quality and realistic medical image generations. Comparative evaluations
+of Sketch2MedI against the pix2pix, CycleGAN, UNIT, and U-GAT-IT models show
+superior performance in generating pharyngeal images, both quantitatively and
+qualitatively, across various metrics.
+
+
+
+
+
+
+ + ☆ Machine Unlearning for Image-to-Image Generative Models ICLR 2024 + + +
+ Machine unlearning has emerged as a new paradigm to deliberately forget data +samples from a given model in order to adhere to stringent regulations. +However, existing machine unlearning methods have been primarily focused on +classification models, leaving the landscape of unlearning for generative +models relatively unexplored. This paper serves as a bridge, addressing the gap +by providing a unifying framework of machine unlearning for image-to-image +generative models. Within this framework, we propose a +computationally-efficient algorithm, underpinned by rigorous theoretical +analysis, that demonstrates negligible performance degradation on the retain +samples, while effectively removing the information from the forget samples. +Empirical studies on two large-scale datasets, ImageNet-1K and Places-365, +further show that our algorithm does not rely on the availability of the retain +samples, which further complies with data retention policy. To our best +knowledge, this work is the first that represents systemic, theoretical, +empirical explorations of machine unlearning specifically tailored for +image-to-image generative models. Our code is available at +https://github.com/jpmorganchase/l2l-generator-unlearning. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Recasting Regional Lighting for Shadow Removal AAAI 2024 + + +
+ Removing shadows requires an understanding of both lighting conditions and
+object textures in a scene. Existing methods typically learn pixel-level color
+mappings between shadow and non-shadow images, in which the joint modeling of
+lighting and object textures is implicit and inadequate. We observe that in a
+shadow region, the degradation degree of object textures depends on the local
+illumination, while simply enhancing the local illumination cannot fully
+recover the attenuated textures. Based on this observation, we propose to
+condition the restoration of attenuated textures on the corrected local
+lighting in the shadow region. Specifically, we first design a shadow-aware
+decomposition network to estimate the illumination and reflectance layers of
+shadow regions explicitly. We then propose a novel bilateral correction network
+to recast the lighting of shadow regions in the illumination layer via a novel
+local lighting correction module, and to restore the textures conditioned on
+the corrected illumination layer via a novel illumination-guided texture
+restoration module. We further annotate pixel-wise shadow masks for the public
+SRD dataset, which originally contains only image pairs. Experiments on three
+benchmarks show that our method outperforms existing state-of-the-art shadow
+removal methods.
+
+
+
+ comment: AAAI 2024 (Oral) +
+
+
+
+
+ + ☆ SmartCooper: Vehicular Collaborative Perception with Adaptive Fusion and + Judger Mechanism + + +
+ In recent years, autonomous driving has garnered significant attention due to
+its potential for improving road safety through collaborative perception among
+connected and autonomous vehicles (CAVs). However, time-varying channel
+variations in vehicular transmission environments demand dynamic allocation of
+communication resources. Moreover, in the context of collaborative perception,
+it is important to recognize that not all CAVs contribute valuable data, and
+some CAV data even have detrimental effects on collaborative perception. In
+this paper, we introduce SmartCooper, an adaptive collaborative perception
+framework that incorporates communication optimization and a judger mechanism
+to facilitate CAV data fusion. Our approach begins with optimizing the
+connectivity of vehicles while considering communication constraints. We then
+train a learnable encoder to dynamically adjust the compression ratio based on
+the channel state information (CSI). Subsequently, we devise a judger mechanism
+to filter the detrimental image data reconstructed by adaptive decoders. We
+evaluate the effectiveness of our proposed algorithm on the OpenCOOD platform.
+Our results demonstrate a substantial reduction in communication costs of
+23.10\% compared to the non-judger scheme. Additionally, we achieve a
+significant improvement of 7.15\% in the average precision at Intersection
+over Union (AP@IoU) compared with state-of-the-art schemes.
+
+
+
+
+
+
+ + ☆ SCO-VIST: Social Interaction Commonsense Knowledge-based Visual + Storytelling + + +
+ Visual storytelling aims to automatically generate a coherent story based on +a given image sequence. Unlike tasks like image captioning, visual stories +should contain factual descriptions, worldviews, and human social commonsense +to put disjointed elements together to form a coherent and engaging +human-writeable story. However, most models mainly focus on applying factual +information and using taxonomic/lexical external knowledge when attempting to +create stories. This paper introduces SCO-VIST, a framework representing the +image sequence as a graph with objects and relations that includes human action +motivation and its social interaction commonsense knowledge. SCO-VIST then +takes this graph representing plot points and creates bridges between plot +points with semantic and occurrence-based edge weights. This weighted story +graph produces the storyline in a sequence of events using Floyd-Warshall's +algorithm. Our proposed framework produces stories superior across multiple +metrics in terms of visual grounding, coherence, diversity, and humanness, per +both automatic and human evaluations. + +
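+ The storyline step is classical shortest-path routing; a minimal sketch over
+a toy plot-point graph is shown below. The weights are invented for
+illustration, and SCO-VIST's semantic and occurrence-based edge weighting is
+not reproduced here.
+
+INF = float("inf")
+
+def floyd_warshall(w):
+    n = len(w)
+    dist = [row[:] for row in w]
+    nxt = [[j if w[i][j] < INF else None for j in range(n)] for i in range(n)]
+    for k in range(n):
+        for i in range(n):
+            for j in range(n):
+                if dist[i][k] + dist[k][j] < dist[i][j]:
+                    dist[i][j] = dist[i][k] + dist[k][j]
+                    nxt[i][j] = nxt[i][k]   # remember the detour via k
+    return dist, nxt
+
+def storyline(nxt, src, dst):
+    path = [src]
+    while path[-1] != dst:
+        path.append(nxt[path[-1]][dst])
+    return path
+
+# Toy plot-point graph; lower weight = stronger bridge between plot points.
+w = [[0, 2, 9, INF],
+     [INF, 0, 1, 7],
+     [INF, INF, 0, 3],
+     [INF, INF, INF, 0]]
+_, nxt = floyd_warshall(w)
+print(storyline(nxt, 0, 3))                 # [0, 1, 2, 3]: ordered story events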
+
+
+
+
+ + ☆ Invariance-powered Trustworthy Defense via Remove Then Restore + + +
+ Adversarial attacks pose a challenge to the deployment of deep neural
+networks (DNNs), while previous defense models overlook generalization to
+various attacks. Inspired by targeted therapies for cancer, we view
+adversarial samples as local lesions of natural benign samples: a key finding
+is that the salient attack in an adversarial sample dominates the attacking
+process, while the trivial attack unexpectedly provides trustworthy evidence
+for obtaining generalizable robustness. Based on this finding, a Pixel Surgery
+and Semantic Regeneration (PSSR) model following the targeted therapy
+mechanism is developed, which has three merits: 1) To remove the salient
+attack, a score-based Pixel Surgery module is proposed, which retains the
+trivial attack as a kind of invariance information. 2) To restore the
+discriminative content, a Semantic Regeneration module based on a conditional
+alignment extrapolator is proposed, which achieves pixel and semantic
+consistency. 3) To further harmonize robustness and accuracy, an intractable
+problem, a self-augmentation regularizer with adversarial R-drop is designed.
+Experiments on numerous benchmarks show the superiority of PSSR.
+
+
+
+
+
+
+ + ☆ Self-supervised learning of video representations from a child's + perspective + + +
+ Children learn powerful internal models of the world around them from a few
+years of egocentric visual experience. Can such internal models be learned from
+a child's visual experience with highly generic learning algorithms or do they
+require strong inductive biases? Recent advances in collecting large-scale,
+longitudinal, developmentally realistic video datasets and generic
+self-supervised learning (SSL) algorithms are allowing us to begin to tackle
+this nature vs. nurture question. However, existing work typically focuses on
+image-based SSL algorithms and visual capabilities that can be learned from
+static images (e.g. object recognition), thus ignoring temporal aspects of the
+world. To close this gap, here we train self-supervised video models on
+longitudinal, egocentric headcam recordings collected from a child over a
+two-year period in their early development (6-31 months). The resulting models
+are highly effective at facilitating the learning of action concepts from a
+small number of labeled examples; they have favorable data size scaling
+properties; and they display emergent video interpolation capabilities. Video
+models also learn more robust object representations than image-based models
+trained with the exact same data. These results suggest that important temporal
+aspects of a child's internal model of the world may be learnable from their
+visual experience using highly generic learning algorithms and without strong
+inductive biases.
+
+
+
+ comment: 7 pages, 6 figures; code & models available from + https://github.com/eminorhan/video-models +
+
+
+
+
+ + ☆ Comparative Evaluation of Traditional and Deep Learning-Based + Segmentation Methods for Spoil Pile Delineation Using UAV Images + + +
+ The stability of mine dumps is contingent upon the precise arrangement of +spoil piles, taking into account their geological and geotechnical attributes. +Yet, on-site characterisation of individual piles poses a formidable challenge. +The utilisation of image-based techniques for spoil pile characterisation, +employing remotely acquired data through unmanned aerial systems, is a +promising complementary solution. Image processing, such as object-based +classification and feature extraction, are dependent upon effective +segmentation. This study refines and juxtaposes various segmentation +approaches, specifically colour-based and morphology-based techniques. The +objective is to enhance and evaluate avenues for object-based analysis for +spoil characterisation within the context of mining environments. Furthermore, +a comparative analysis is conducted between conventional segmentation +approaches and those rooted in deep learning methodologies. Among the diverse +segmentation approaches evaluated, the morphology-based deep learning +segmentation approach, Segment Anything Model (SAM), exhibited superior +performance in comparison to other approaches. This outcome underscores the +efficacy of incorporating advanced morphological and deep learning techniques +for accurate and efficient spoil pile characterisation. The findings of this +study contribute valuable insights to the optimisation of segmentation +strategies, thereby advancing the application of image-based techniques for the +characterisation of spoil piles in mining environments. + +
+
+
+
+
+ + ☆ FineBio: A Fine-Grained Video Dataset of Biological Experiments with + Hierarchical Annotation + + +
+ In the development of science, accurate and reproducible documentation of the
+experimental process is crucial. Automatic recognition of the actions in
+experiments from videos would help experimenters by complementing the recording
+of experiments. Towards this goal, we propose FineBio, a new fine-grained video
+dataset of people performing biological experiments. The dataset consists of
+multi-view videos of 32 participants performing mock biological experiments
+with a total duration of 14.5 hours. One experiment forms a hierarchical
+structure, where a protocol consists of several steps, each further decomposed
+into a set of atomic operations. The uniqueness of biological experiments is
+that while they require strict adherence to steps described in each protocol,
+there is freedom in the order of atomic operations. We provide hierarchical
+annotation on protocols, steps, atomic operations, object locations, and their
+manipulation states, providing new challenges for structured activity
+understanding and hand-object interaction recognition. To identify challenges
+in activity understanding in biological experiments, we introduce baseline
+models and results on four different tasks: (i) step segmentation, (ii) atomic
+operation detection, (iii) object detection, and (iv) manipulated/affected
+object detection. Dataset and code are available from
+https://github.com/aistairc/FineBio.
+
+
+
+
+
+
+ + ☆ Multimodal Embodied Interactive Agent for Cafe Scene + + +
+ With the surge in the development of large language models, embodied +intelligence has attracted increasing attention. Nevertheless, prior works on +embodied intelligence typically encode scene or historical memory in an +unimodal manner, either visual or linguistic, which complicates the alignment +of the model's action planning with embodied control. To overcome this +limitation, we introduce the Multimodal Embodied Interactive Agent (MEIA), +capable of translating high-level tasks expressed in natural language into a +sequence of executable actions. Specifically, we propose a novel Multimodal +Environment Memory (MEM) module, facilitating the integration of embodied +control with large models through the visual-language memory of scenes. This +capability enables MEIA to generate executable action plans based on diverse +requirements and the robot's capabilities. We conduct experiments in a dynamic +virtual cafe environment, utilizing multiple large models through zero-shot +learning, and carefully design scenarios for various situations. The +experimental results showcase the promising performance of our MEIA in various +embodied interactive tasks. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Guided Interpretable Facial Expression Recognition via Spatial Action + Unit Cues + + +
+ While state-of-the-art facial expression recognition (FER) classifiers
+achieve a high level of accuracy, they lack interpretability, an important
+aspect for end-users. To recognize basic facial expressions, experts resort to
+a codebook associating a set of spatial action units to a facial expression. In
+this paper, we follow the same expert footsteps, and propose a learning
+strategy that allows us to explicitly incorporate spatial action unit (aus)
+cues into the classifier's training to build a deep interpretable model. In
+particular, using this aus codebook, the input image's expression label, and
+facial landmarks, a single action-units heatmap is built to indicate the most
+discriminative regions of interest in the image w.r.t the facial expression. We
+leverage this valuable spatial cue to train a deep interpretable classifier for
+FER. This is achieved by constraining the spatial layer features of a
+classifier to be correlated with aus maps. Using a composite loss, the
+classifier is trained to correctly classify an image while yielding
+interpretable visual layer-wise attention correlated with aus maps, simulating
+the experts' decision process. This is achieved using only the image's
+expression class as supervision and without any extra manual annotations.
+Moreover, our method is generic. It can be applied to any CNN- or
+transformer-based deep classifier without the need for architectural change or
+significant additional training time. Our extensive evaluation on two public
+benchmarks, RAFDB and AFFECTNET, shows that our proposed strategy can improve
+layer-wise interpretability without degrading classification performance. In
+addition, we explore a common type of interpretable classifiers that rely on
+Class-Activation Mapping methods (CAMs), and we show that our training
+technique improves the CAM interpretability.
+
+
+
+ comment: 11 +
+
+
+
+
+ + ☆ Understanding Neural Network Systems for Image Analysis using Vector + Spaces and Inverse Maps + + +
+ There is strong interest in developing mathematical methods that can be used +to understand complex neural networks used in image analysis. In this paper, we +introduce techniques from Linear Algebra to model neural network layers as maps +between signal spaces. First, we demonstrate how signal spaces can be used to +visualize weight spaces and convolutional layer kernels. We also demonstrate +how residual vector spaces can be used to further visualize information lost at +each layer. Second, we introduce the concept of invertible networks and an +algorithm for computing input images that yield specific outputs. We +demonstrate our approach on two invertible networks and ResNet18. + +
+
+
+
+
+ + ☆ A Survey on Hallucination in Large Vision-Language Models + + +
+ Recent development of Large Vision-Language Models (LVLMs) has attracted +growing attention within the AI landscape for its practical implementation +potential. However, ``hallucination'', or more specifically, the misalignment +between factual visual content and corresponding textual generation, poses a +significant challenge of utilizing LVLMs. In this comprehensive survey, we +dissect LVLM-related hallucinations in an attempt to establish an overview and +facilitate future mitigation. Our scrutiny starts with a clarification of the +concept of hallucinations in LVLMs, presenting a variety of hallucination +symptoms and highlighting the unique challenges inherent in LVLM +hallucinations. Subsequently, we outline the benchmarks and methodologies +tailored specifically for evaluating hallucinations unique to LVLMs. +Additionally, we delve into an investigation of the root causes of these +hallucinations, encompassing insights from the training data and model +components. We also critically review existing methods for mitigating +hallucinations. The open questions and future directions pertaining to +hallucinations within LVLMs are discussed to conclude this survey. + +
+
+
+
+
+ + ☆ LRDif: Diffusion Models for Under-Display Camera Emotion Recognition + + +
+ This study introduces LRDif, a novel diffusion-based framework designed
+specifically for facial expression recognition (FER) within the context of
+under-display cameras (UDC). To address the inherent challenges posed by UDC's
+image degradation, such as reduced sharpness and increased noise, LRDif employs
+a two-stage training strategy that integrates a condensed preliminary
+extraction network (FPEN) and an agile transformer network (UDCformer) to
+effectively identify emotion labels from UDC images. By harnessing the robust
+distribution mapping capabilities of Diffusion Models (DMs) and the spatial
+dependency modeling strength of transformers, LRDif effectively overcomes the
+obstacles of noise and distortion inherent in UDC environments. In
+comprehensive experiments on standard FER datasets, including RAF-DB, KDEF,
+and FERPlus, LRDif demonstrates state-of-the-art performance, underscoring its
+potential in advancing FER applications. This work not only addresses a
+significant gap in the literature by tackling the UDC challenge in FER but
+also sets a new benchmark for future research in the field.
+
+
+
+
+
+
+ + ☆ Assessing Patient Eligibility for Inspire Therapy through Machine + Learning and Deep Learning Models + + +
+ Inspire therapy is an FDA-approved internal neurostimulation treatment for
+obstructive sleep apnea. However, not all patients respond to this therapy,
+making it challenging even for experienced otolaryngologists to determine
+candidacy. This paper makes the first attempt to leverage both machine learning
+and deep learning techniques in discerning patient responsiveness to Inspire
+therapy using medical data and videos captured through Drug-Induced Sleep
+Endoscopy (DISE), an essential procedure for Inspire therapy. To achieve this,
+we gathered and annotated three datasets from 127 patients. Two of these
+datasets comprise endoscopic videos focused on the Base of the Tongue and
+Velopharynx. The third dataset comprises the patients' clinical information. By
+utilizing these datasets, we benchmarked and compared the performance of six
+deep learning models and five classical machine learning algorithms. The
+results demonstrate the potential of employing machine learning and deep
+learning techniques to determine a patient's eligibility for Inspire therapy,
+paving the way for future advancements in this field.
+
+
+
+
+
+
+ + ☆ Unconditional Latent Diffusion Models Memorize Patient Imaging Data + + +
+ Generative latent diffusion models hold a wide range of applications in the
+medical imaging domain. A noteworthy application is privacy-preserved open-data
+sharing by proposing synthetic data as surrogates of real patient data. Despite
+the promise, these models are susceptible to patient data memorization, where
+models generate patient data copies instead of novel synthetic samples. This
+undermines the whole purpose of preserving patient data and may even result in
+patient re-identification. Considering the importance of the problem, it has
+surprisingly received relatively little attention in the medical imaging
+community. To this end, we assess memorization in latent diffusion models for
+medical image synthesis. We train 2D and 3D latent diffusion models on CT, MR,
+and X-ray datasets for synthetic data generation. Afterwards, we examine the
+amount of training data memorized by utilizing self-supervised models, and
+further investigate various factors that can possibly lead to memorization by
+training models in different settings. We observe a surprisingly large amount
+of data memorization among all datasets, with up to 41.7%, 19.6%, and 32.6% of
+the training data memorized in the CT, MRI, and X-ray datasets respectively.
+Further analyses reveal that increasing training data size and using data
+augmentation reduce memorization, while over-training enhances it. Overall,
+our results suggest a call for memorization-informed evaluation of synthetic
+data prior to open-data sharing.
+
+
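+ Such a memorization check can be sketched with nearest-neighbour similarity
+in a self-supervised embedding space; the 0.95 cosine threshold and the random
+stand-in embeddings below are illustrative assumptions, not the paper's
+protocol.
+
+import torch
+import torch.nn.functional as F
+
+def memorized_fraction(train_emb, synth_emb, threshold=0.95):
+    t = F.normalize(train_emb, dim=1)
+    s = F.normalize(synth_emb, dim=1)
+    sim = s @ t.t()                      # (n_synth, n_train) cosine sims
+    nearest = sim.max(dim=1).values      # best training match per sample
+    return (nearest > threshold).float().mean().item()
+
+train_emb = torch.randn(1000, 128)       # embeddings of training images
+synth_emb = torch.cat([train_emb[:50], torch.randn(450, 128)])  # 50 copies
+print(f"flagged as copies: {memorized_fraction(train_emb, synth_emb):.1%}")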
+
+
+
+
+ + ☆ Weakly Convex Regularisers for Inverse Problems: Convergence of Critical + Points and Primal-Dual Optimisation + + +
+ Variational regularisation is the primary method for solving inverse +problems, and recently there has been considerable work leveraging deeply +learned regularisation for enhanced performance. However, few results exist +addressing the convergence of such regularisation, particularly within the +context of critical points as opposed to global minima. In this paper, we +present a generalised formulation of convergent regularisation in terms of +critical points, and show that this is achieved by a class of weakly convex +regularisers. We prove convergence of the primal-dual hybrid gradient method +for the associated variational problem, and, given a Kurdyka-Lojasiewicz +condition, an $\mathcal{O}(\log{k}/k)$ ergodic convergence rate. Finally, +applying this theory to learned regularisation, we prove universal +approximation for input weakly convex neural networks (IWCNN), and show +empirically that IWCNNs can lead to improved performance of learned adversarial +regularisers for computed tomography (CT) reconstruction. + +
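+ For orientation, the setting can be written compactly as the variational
+problem below; the notation (data fidelity D, forward operator A, learned
+regulariser R_theta) is our paraphrase of the standard formulation, not lifted
+from the paper.
+
+\min_{x}\; \mathcal{D}(Ax, y) + \alpha\, R_\theta(x),
+\qquad \text{with } R_\theta\ \rho\text{-weakly convex, i.e. }
+x \mapsto R_\theta(x) + \tfrac{\rho}{2}\|x\|_2^2 \ \text{convex.}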
+
+ comment: 26 pages, 4 figures, preprint +
+
+
+
+
+ + ☆ IMUGPT 2.0: Language-Based Cross Modality Transfer for Sensor-Based + Human Activity Recognition + + +
+ One of the primary challenges in the field of human activity recognition
+(HAR) is the lack of large labeled datasets. This hinders the development of
+robust and generalizable models. Recently, cross modality transfer approaches
+have been explored that can alleviate the problem of data scarcity. These
+approaches convert existing datasets from a source modality, such as video, to
+a target modality (IMU). With the emergence of generative AI models such as
+large language models (LLMs) and text-driven motion synthesis models, language
+has become a promising source data modality, as shown in proofs of concept
+such as IMUGPT. In this work, we conduct a large-scale evaluation of
+language-based cross modality transfer to determine its effectiveness for
+HAR. Based on this study, we introduce two new extensions for IMUGPT that
+enhance its use for practical HAR application scenarios: a motion filter
+capable of filtering out irrelevant motion sequences to ensure the relevance of
+the generated virtual IMU data, and a set of metrics that measure the diversity
+of the generated data, facilitating the determination of when to stop
+generating virtual IMU data for both effective and efficient processing. We
+demonstrate that our diversity metrics can reduce the effort needed for the
+generation of virtual IMU data by at least 50%, which opens up IMUGPT for
+practical use cases beyond a mere proof of concept.
+
+
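+ A diversity-driven stopping rule can be sketched as follows, using mean
+pairwise distance as the diversity metric and a plateau test; both the metric
+and the 1% tolerance are illustrative assumptions rather than the paper's
+specific metrics.
+
+import numpy as np
+
+def diversity(feats):
+    d = np.linalg.norm(feats[:, None, :] - feats[None, :, :], axis=-1)
+    return d.sum() / (len(feats) * (len(feats) - 1))  # mean pairwise distance
+
+def should_stop(prev_div, feats, tol=0.01):
+    cur = diversity(feats)
+    # Stop generating virtual IMU data once diversity stops growing.
+    return cur - prev_div < tol * max(prev_div, 1e-9), cur
+
+pool = np.random.randn(100, 32)           # features of generated data so far
+stop, div = should_stop(0.0, pool)
+print(stop, round(div, 3))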
+
+
+
+
+ + ☆ VISION-MAE: A Foundation Model for Medical Image Segmentation and + Classification + + +
+ Artificial Intelligence (AI) has the potential to revolutionize diagnosis and +segmentation in medical imaging. However, development and clinical +implementation face multiple challenges including limited data availability, +lack of generalizability, and the necessity to incorporate multi-modal data +effectively. A foundation model, which is a large-scale pre-trained AI model, +offers a versatile base that can be adapted to a variety of specific tasks and +contexts. Here, we present a novel foundation model, VISION-MAE, specifically +designed for medical imaging. Specifically, VISION-MAE is trained on a dataset +of 2.5 million unlabeled images from various modalities (CT, MR, PET, X-rays, +and ultrasound), using self-supervised learning techniques. It is then adapted +to classification and segmentation tasks using explicit labels. VISION-MAE has +high label efficiency, outperforming several benchmark models in both in-domain +and out-of-domain applications, and achieves high performance even with reduced +availability of labeled data. This model represents a significant advancement +in medical imaging AI, offering a generalizable and robust solution for +improving segmentation and classification tasks while reducing the data +annotation workload. + +
+
+
+
+
+ + ☆ MRAnnotator: A Multi-Anatomy Deep Learning Model for MRI Segmentation + + +
+ Purpose To develop a deep learning model for multi-anatomy and many-class +segmentation of diverse anatomic structures on MRI imaging. + Materials and Methods In this retrospective study, two datasets were curated +and annotated for model development and evaluation. An internal dataset of 1022 +MRI sequences from various clinical sites within a health system and an +external dataset of 264 MRI sequences from an independent imaging center were +collected. In both datasets, 49 anatomic structures were annotated as the +ground truth. The internal dataset was divided into training, validation, and +test sets and used to train and evaluate an nnU-Net model. The external dataset +was used to evaluate nnU-Net model generalizability and performance in all +classes on independent imaging data. Dice scores were calculated to evaluate +model segmentation performance. + Results The model achieved an average Dice score of 0.801 on the internal +test set, and an average score of 0.814 on the complete external dataset across +49 classes. + Conclusion The developed model achieves robust and generalizable segmentation +of 49 anatomic structures on MRI imaging. A future direction is focused on the +incorporation of additional anatomic regions and structures into the datasets +and model. + +
+
+
+
+
+ + ☆ AI-generated faces free from racial and gender stereotypes + + +
+ Text-to-image generative AI models such as Stable Diffusion are used daily by
+millions worldwide. However, many have raised concerns regarding how these
+models amplify racial and gender stereotypes. To study this phenomenon, we
+develop a classifier to predict the race, gender, and age group of any given
+face image, and show that it achieves state-of-the-art performance. Using this
+classifier, we quantify biases in Stable Diffusion across six races, two
+genders, five age groups, 32 professions, and eight attributes. We then propose
+novel debiasing solutions that outperform state-of-the-art alternatives.
+Additionally, we examine the degree to which Stable Diffusion depicts
+individuals of the same race as being similar to one another. This analysis
+reveals a high degree of stereotyping, e.g., depicting most Middle Eastern
+males as being dark-skinned, bearded, and wearing a traditional headdress. We
+address these limitations by proposing yet another novel solution that
+increases facial diversity across genders and racial groups. Our solutions are
+open-sourced and made publicly available.
+
+
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ☆ mmID: High-Resolution mmWave Imaging for Human Identification + + +
+ Achieving accurate human identification through RF imaging has been a
+persistent challenge, primarily attributed to the limited aperture size and its
+consequent impact on imaging resolution. Existing imaging solutions enable
+tasks such as pose estimation, activity recognition, and human tracking based
+on deep neural networks by estimating skeleton joints. In contrast to
+estimating joints, this paper proposes to improve imaging resolution by
+estimating the human figure as a whole using conditional generative adversarial
+networks (cGAN). In order to reduce training complexity, we use a spatial
+spectrum estimated with the MUltiple SIgnal Classification (MUSIC) algorithm as
+input to the cGAN. Our system generates environmentally independent,
+high-resolution images from which unique physical features useful for
+human identification can be extracted. We use a simple convolution layers-based
+classification network to obtain the final identification result. The
+experimental results show that the resolution of the images produced by our
+trained generator is high enough to enable human identification, with a 5%
+mean silhouette difference relative to the Kinect device. Extensive
+experiments in different environments on multiple testers demonstrate that our
+system can achieve 93% overall test accuracy in unseen environments for static
+human target identification.
+
+
+
+ comment: This paper was published in the IEEE 9th World Forum on Internet of + Things +
+
+
+
+
+ + ☆ A Cost-Efficient Approach for Creating Virtual Fitting Room using + Generative Adversarial Networks (GANs) + + +
+ Customers all over the world want to see how clothes fit them before
+purchasing. Therefore, customers by nature prefer brick-and-mortar clothes
+shopping so they can try on products before buying them. But after the
+COVID-19 pandemic, many sellers either shifted to online shopping or closed
+their fitting rooms, which left shoppers hesitant and doubtful. The fact that
+clothes may turn out to be unsuitable after purchase led us to use new AI
+technologies to create an online platform, or virtual fitting room (VFR), in
+the form of a mobile application and a deployed model on a webpage that can
+later be embedded into any online store, where customers can try on any number
+of clothing items without physically trying them. Besides, it will save
+customers much search time. Furthermore, it will reduce crowding in physical
+shops by applying the same technology with a special type of mirror that
+enables customers to try on clothes faster. From business owners' perspective,
+this project will substantially increase online sales and preserve product
+quality by avoiding the issues of physical trials. The main approach used in
+this work is applying Generative Adversarial Networks (GANs) combined with
+image processing techniques to generate one output image from two input
+images: a person image and a cloth image. This work achieved results that
+outperform the state-of-the-art approaches found in the literature.
+
+
+
+
+
+
+ + ☆ Compressed image quality assessment using stacking + + +
+ It is well-known that there is no universal metric for image quality
+evaluation, and distortion-specific metrics can be more reliable. The artifact
+imposed by image compression can be considered as a combination of various
+distortions, and depending on the image content, this combination can differ.
+As a result, generalization can be regarded as the major challenge in
+compressed image quality assessment. In this work, stacking is employed to
+provide a reliable method. Both semantic and low-level information are
+employed in the presented IQA to predict the human visual system. Moreover,
+the results of Full-Reference (FR) and No-Reference (NR) models are aggregated
+to improve the proposed Full-Reference method for compressed image quality
+evaluation. Our method achieved an accuracy of 79.6\% on the quality benchmark
+of the CLIC 2024 perceptual image challenge, which illustrates the
+effectiveness of the proposed fusion-based approach.
+
+
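+ The stacking step itself can be sketched with a simple meta-regressor that
+learns to combine FR and NR scores into one prediction of the human opinion
+score; the Ridge meta-model and the toy data below are illustrative
+assumptions, not the challenge submission.
+
+import numpy as np
+from sklearn.linear_model import Ridge
+
+rng = np.random.default_rng(0)
+mos = rng.uniform(0, 100, 500)            # human mean opinion scores
+# Stand-ins for the base IQA models' outputs on the same images.
+base_scores = np.column_stack([
+    mos + rng.normal(0, 8, 500),          # e.g. an FR metric's score
+    mos + rng.normal(0, 12, 500),         # e.g. an NR metric's score
+])
+meta = Ridge(alpha=1.0).fit(base_scores, mos)   # the stacked combiner
+print("meta weights:", meta.coef_)
+print("stacked prediction:", meta.predict(base_scores[:3]))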
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+
+ ☆ YOLinO++: Single-Shot Estimation of Generic Polylines for Mapless
+ Automated Driving
+
+
+ In automated driving, highly accurate maps are commonly used to support and
+complement perception. These maps are costly to create and quickly become
+outdated as the traffic world is constantly changing. In order to support or
+replace the map of an automated system with detections from sensor data, a
+perception module must be able to detect the map features. We propose a neural
+network that follows the one-shot philosophy of YOLO but is designed for the
+detection of 1D structures in images, such as lane boundaries.
+ We extend previous ideas by a midpoint-based line representation and anchor
+definitions. This representation can be used to describe lane borders,
+markings, but also implicit features such as the centerlines of lanes. The
+broad applicability of the approach is shown with the detection performance on
+lane centerlines, lane borders as well as markings both on highways and in
+urban areas.
+ Versatile lane boundaries are detected and can be inherently classified as
+dashed or solid lines, curb, road boundaries, or implicit delimitation.
+
+
+
+
+
+
+ + ☆ Enhanced fringe-to-phase framework using deep learning + + +
+ In Fringe Projection Profilometry (FPP), achieving robust and accurate 3D
+reconstruction with a limited number of fringe patterns remains a challenge in
+structured light 3D imaging. Conventional methods require a set of fringe
+images, but using only one or two patterns complicates phase recovery and
+unwrapping. In this study, we introduce SFNet, a symmetric fusion network that
+transforms two fringe images into an absolute phase. To enhance output
+reliability, our framework predicts refined phases by incorporating information
+from fringe images of a different frequency than those used as input. This
+allows us to achieve high accuracy with just two images. Comparative
+experiments and ablation studies validate the effectiveness of our proposed
+method. The dataset and code are publicly accessible on our project page
+https://wonhoe-kim.github.io/SFNet.
+
+
+
+ comment: 35 pages, 13 figures, 6 tables +
+
+
+
+
+ + ☆ FuseFormer: A Transformer for Visual and Thermal Image Fusion + + +
+ Image fusion is the process of combining images from different sensors into a +single image that incorporates all relevant information. The majority of +state-of-the-art image fusion techniques use deep learning methods to extract +meaningful features; however, they primarily integrate local features without +considering the image's broader context. To overcome this limitation, +Transformer-based models have emerged as a promising solution, aiming to +capture general context dependencies through attention mechanisms. Since there +is no ground truth for image fusion, the loss functions are structured based on +evaluation metrics, such as the structural similarity index measure (SSIM). By +doing so, we create a bias towards the SSIM and, therefore, the input visual +band image. The objective of this study is to propose a novel methodology for +image fusion that mitigates the limitations associated with using evaluation +metrics as loss functions. Our approach integrates a transformer-based +multi-scale fusion strategy, which adeptly addresses both local and global +context information. This integration not only refines the individual +components of the image fusion process but also significantly enhances the +overall efficacy of the method. Our proposed method follows a two-stage +training approach, where an auto-encoder is initially trained to extract deep +features at multiple scales at the first stage. For the second stage, we +integrate our fusion block and change the loss function as mentioned. The +multi-scale features are fused using a combination of Convolutional Neural +Networks (CNNs) and Transformers. The CNNs are utilized to capture local +features, while the Transformer handles the integration of general context +features. + +
+
+ comment: 9 pages, 9 figures, 6 tables +
+
+
+
+
+ + ☆ Multi-Modal Machine Learning Framework for Automated Seizure Detection + in Laboratory Rats + + +
+ A multi-modal machine learning system uses multiple unique data sources and +types to improve its performance. This article proposes a system that combines +results from several types of models, all of which are trained on different +data signals. As an example to illustrate the efficacy of the system, an +experiment is described in which multiple types of data are collected from rats +suffering from seizures. This data includes electrocorticography readings, +piezoelectric motion sensor data, and video recordings. Separate models are +trained on each type of data, with the goal of classifying each time frame as +either containing a seizure or not. After each model has generated its +classification predictions, these results are combined. While each data signal +works adequately on its own for prediction purposes, the significant imbalance +in class labels leads to increased numbers of false positives, which can be +filtered and removed by utilizing all data sources. This paper will demonstrate +that, after postprocessing and combination techniques, classification accuracy +is improved with this multi-modal system when compared to the performance of +each individual data source. + +
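+ A minimal sketch of the kind of late fusion described here (the agreement
+threshold and array layout are illustrative assumptions, not the paper's
+exact postprocessing):
+
+    import numpy as np
+
+    def fuse_predictions(preds, min_agreement=2):
+        """Combine per-signal binary predictions for each time frame.
+
+        preds : (n_models, n_frames) 0/1 seizure predictions, e.g. from
+                the ECoG, piezoelectric, and video models.
+        A frame counts as a seizure only if at least `min_agreement`
+        models flag it, filtering single-source false positives.
+        """
+        votes = np.asarray(preds).sum(axis=0)
+        return (votes >= min_agreement).astype(int)
+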
+
+
+
+
+ + ♻ ☆ SugarViT -- Multi-objective Regression of UAV Images with Vision + Transformers and Deep Label Distribution Learning Demonstrated on Disease + Severity Prediction in Sugar Beet + + +
+ Remote sensing and artificial intelligence are pivotal technologies of
+precision agriculture nowadays. The efficient retrieval of large-scale field
+imagery combined with machine learning techniques shows success in various
+tasks like phenotyping, weeding, cropping, and disease control. This work
+introduces a machine learning framework for automated, large-scale,
+plant-specific trait annotation for the use case of disease severity scoring
+for Cercospora Leaf Spot (CLS) in sugar beet. Using concepts of Deep Label
+Distribution Learning (DLDL), special loss functions, and a tailored model
+architecture, we develop an efficient Vision Transformer-based model for
+disease severity scoring called SugarViT. One novelty in this work is the
+combination of remote sensing data with environmental parameters of the
+experimental sites for disease severity prediction. Although the model is
+evaluated on this specific use case, it is kept as generic as possible so that
+it is also applicable to various other image-based classification and
+regression tasks. With our framework, it is even possible to learn models on
+multi-objective problems, as we show by pretraining on environmental metadata.
+
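+ As a rough sketch of the DLDL idea (the level discretization and sigma are
+assumptions, not SugarViT's actual configuration), a Gaussian label
+distribution replaces the one-hot severity target and the model matches it
+via KL divergence:
+
+    import torch
+    import torch.nn.functional as F
+
+    def dldl_loss(logits, target, sigma=1.0):
+        """KL loss against a Gaussian label distribution.
+
+        logits : (batch, n_levels) scores over discrete severity levels.
+        target : (batch,) ground-truth severity as a float level index.
+        """
+        levels = torch.arange(logits.size(1), dtype=torch.float32,
+                              device=logits.device)
+        dist = torch.exp(-(levels[None, :] - target[:, None]) ** 2
+                         / (2.0 * sigma ** 2))
+        dist = dist / dist.sum(dim=1, keepdim=True)  # normalize to a pmf
+        return F.kl_div(F.log_softmax(logits, dim=1), dist,
+                        reduction="batchmean")
+
+ Encoding label ambiguity this way lets neighbouring severity levels share
+supervision instead of competing as independent classes.
+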
+
+ comment: submitted to Computers and Electronics in Agriculture +
+
+
+
+
+ + ♻ ☆ MAMBA: Multi-level Aggregation via Memory Bank for Video Object + Detection + + +
+ State-of-the-art video object detection methods maintain a memory structure,
+either a sliding window or a memory queue, to enhance the current frame using
+attention mechanisms. However, we argue that these memory structures are
+neither efficient nor sufficient because of two implicit operations: (1)
+concatenating all features in memory for enhancement, leading to a heavy
+computational cost; (2) frame-wise memory updating, preventing the memory from
+capturing more temporal information. In this paper, we propose a multi-level
+aggregation architecture via memory bank called MAMBA. Specifically, our
+memory bank employs two novel operations to eliminate the disadvantages of
+existing methods: (1) lightweight key-set construction, which can
+significantly reduce the computational cost; (2) a fine-grained feature-wise
+updating strategy, which enables our method to utilize knowledge from the
+whole video. To better enhance features from complementary levels, i.e.,
+feature maps and proposals, we further propose a generalized enhancement
+operation (GEO) to aggregate multi-level features in a unified manner. We
+conduct extensive evaluations on the challenging ImageNetVID dataset. Compared
+with existing state-of-the-art methods, our method achieves superior
+performance in terms of both speed and accuracy. More remarkably, MAMBA
+achieves mAP of 83.7/84.6% at 12.6/9.1 FPS with ResNet-101. Code is available
+at https://github.com/guanxiongsun/vfe.pytorch.
+
+
+ comment: update code url https://github.com/guanxiongsun/vfe.pytorch +
+
+
+
+
+ + ♻ ☆ Revisiting the Role of Language Priors in Vision-Language Models + + +
+ Vision-language models (VLMs) are impactful in part because they can be +applied to a variety of visual understanding tasks in a zero-shot fashion, +without any fine-tuning. We study $\textit{generative VLMs}$ that are trained +for next-word generation given an image. We explore their zero-shot performance +on the illustrative task of image-text retrieval across 8 popular +vision-language benchmarks. Our first observation is that they can be +repurposed for discriminative tasks (such as image-text retrieval) by simply +computing the match score of generating a particular text string given an +image. We call this probabilistic score the $\textit{Visual Generative +Pre-Training Score}$ (VisualGPTScore). While the VisualGPTScore produces +near-perfect accuracy on some retrieval benchmarks, it yields poor accuracy on +others. We analyze this behavior through a probabilistic lens, pointing out +that some benchmarks inadvertently capture unnatural language distributions by +creating adversarial but unlikely text captions. In fact, we demonstrate that +even a "blind" language model that ignores any image evidence can sometimes +outperform all prior art, reminiscent of similar challenges faced by the +visual-question answering (VQA) community many years ago. We derive a +probabilistic post-processing scheme that controls for the amount of linguistic +bias in generative VLMs at test time without having to retrain or fine-tune the +model. We show that the VisualGPTScore, when appropriately debiased, is a +strong zero-shot baseline for vision-language understanding, oftentimes +producing state-of-the-art accuracy. + +
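+ In spirit, the VisualGPTScore is just the sequence log-likelihood of the
+caption under the generative VLM. A schematic sketch (the model interface
+here is an assumption; real VLMs differ in how image features are passed):
+
+    import torch
+
+    def visual_gpt_score(model, image_feats, caption_ids):
+        """Sum of caption token log-probs given the image (higher = better).
+
+        Assumes `model(image_feats, tokens)` returns next-token logits of
+        shape (1, seq_len, vocab_size).
+        """
+        with torch.no_grad():
+            logits = model(image_feats, caption_ids[:, :-1])
+        log_probs = torch.log_softmax(logits, dim=-1)
+        token_lp = log_probs.gather(
+            -1, caption_ids[:, 1:, None]).squeeze(-1)
+        return token_lp.sum().item()
+
+ The debiasing described above then controls for the caption's
+image-independent likelihood, so adversarial but linguistically unlikely
+captions no longer dominate the ranking.
+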
+
+ comment: Website: https://linzhiqiu.github.io/papers/visual_gpt_score/ +
+
+
+
+
+ + ♻ ☆ Gaze Detection and Analysis for Initiating Joint Activity in Industrial + Human-Robot Collaboration + + +
+ Collaborative robots (cobots) are widely used in industrial applications, yet
+extensive research is still needed to enhance human-robot collaborations and
+operator experience. A potential approach to improve the collaboration
+experience involves adapting cobot behavior based on natural cues from the
+operator. Inspired by the literature on human-human interactions, we conducted
+a wizard-of-oz study to examine whether a gaze towards the cobot can serve as
+a trigger for initiating joint activities in collaborative sessions. In this
+study, 37 participants engaged in an assembly task while their gaze behavior
+was analyzed. We employ a gaze-based attention recognition model to identify
+when the participants look at the cobot. Our results indicate that in most
+cases (84.88%), the joint activity is preceded by a gaze towards the cobot.
+Furthermore, during the entire assembly cycle, the participants tend to look
+at the cobot around the time of the joint activity. To the best of our
+knowledge, this is the first study to analyze the natural gaze behavior of
+participants working on a joint activity with a robot during a collaborative
+assembly task.
+
+
+ comment: First draft for a paper submitted to Frontiers in Robotics and AI +
+
+
+
+
+ + ♻ ☆ Generalized Video Anomaly Event Detection: Systematic Taxonomy and + Comparison of Deep Models + + +
+ Video Anomaly Detection (VAD) serves as a pivotal technology in intelligent
+surveillance systems, enabling the temporal or spatial identification of
+anomalous events within videos. While existing reviews predominantly
+concentrate on conventional unsupervised methods, they often overlook the
+emergence of weakly-supervised and fully-unsupervised approaches. To address
+this gap, this survey extends the conventional scope of VAD beyond
+unsupervised methods, encompassing a broader spectrum termed Generalized Video
+Anomaly Event Detection (GVAED). By skillfully incorporating recent
+advancements rooted in diverse assumptions and learning frameworks, this
+survey introduces an intuitive taxonomy that seamlessly navigates through
+unsupervised, weakly-supervised, supervised, and fully-unsupervised VAD
+methodologies, elucidating the distinctions and interconnections within these
+research trajectories. In addition, this survey facilitates prospective
+researchers by assembling a compilation of research resources, including
+public datasets, available codebases, programming tools, and pertinent
+literature. Furthermore, this survey quantitatively assesses model
+performance, delves into research challenges and directions, and outlines
+potential avenues for future exploration.
+
+
+ comment: Accepted by ACM Computing Surveys. For more information, please see + our project page: https://github.com/fudanyliu/GVAED +
+
+
+
+
+ + ♻ ☆ Parrot Captions Teach CLIP to Spot Text + + +
+ Despite CLIP being the foundation model in numerous vision-language
+applications, CLIP suffers from a severe text spotting bias. Such bias causes
+CLIP models to `Parrot' the visual text embedded within images while
+disregarding the authentic visual semantics. We uncover that in the most
+popular image-text dataset LAION-2B, the captions also densely parrot (spell)
+the text embedded in images. Our analysis shows that around 50% of images are
+embedded with visual text content, and around 30% of caption words appear
+within this embedded visual content. Based on this observation, we thoroughly
+inspect the different released versions of CLIP models and verify that the
+visual text is the dominant factor in measuring the LAION-style image-text
+similarity for these models. To examine whether these parrot captions shape
+the text spotting bias, we train a series of CLIP models with LAION subsets
+curated by different parrot-caption-oriented criteria. We show that training
+with parrot captions easily shapes such bias but harms the expected
+visual-language representation learning in CLIP models. This suggests that it
+is urgent to revisit either the design of CLIP-like models or the existing
+image-text dataset curation pipeline built on CLIP score filtering.
+
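+ A toy version of the kind of curation criterion described (the OCR source
+and the filtering threshold are illustrative assumptions):
+
+    def parrot_fraction(caption, ocr_tokens):
+        """Fraction of caption words that also appear as visual text.
+
+        ocr_tokens : words spotted inside the image by an OCR model.
+        A high fraction marks the caption as likely 'parroting' the
+        embedded text rather than describing visual semantics.
+        """
+        caption_words = caption.lower().split()
+        if not caption_words:
+            return 0.0
+        ocr_set = {t.lower() for t in ocr_tokens}
+        return sum(w in ocr_set for w in caption_words) / len(caption_words)
+
+    # e.g. drop pairs with parrot_fraction(...) > 0.3 when curating a subset
+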
+
+ comment: project page: https://linyq17.github.io/CLIP-Parrot-Bias/. Add more + analysis and ablation studies. Update Figure 3 with a more precise metric +
+
+
+
+
+ + ♻ ☆ MoCaE: Mixture of Calibrated Experts Significantly Improves Object + Detection + + +
+ Combining the strengths of many existing predictors to obtain a Mixture of
+Experts which is superior to its individual components is an effective way to
+improve performance without having to develop new architectures or train a
+model from scratch. However, surprisingly, we find that naïvely combining
+expert object detectors, in a similar way to Deep Ensembles, can often lead to
+degraded performance. We identify that the primary cause of this issue is that
+the predictions of the experts do not match their performance, a mismatch
+referred to as miscalibration. Consequently, the most confident detector
+dominates the final predictions, preventing the mixture from leveraging all
+the predictions from the experts appropriately. To address this, when
+constructing the Mixture of Experts, we propose to combine their predictions
+in a manner which reflects the individual performance of the experts; an
+objective we achieve by first calibrating the predictions before filtering and
+refining them. We term this approach the Mixture of Calibrated Experts and
+demonstrate its effectiveness through extensive experiments on 5 different
+detection tasks using a variety of detectors, showing that it: (i) improves
+object detectors on COCO and instance segmentation methods on LVIS by up to
+$\sim 2.5$ AP; (ii) reaches state-of-the-art on COCO test-dev with $65.1$ AP
+and on DOTA with $82.62$ $\mathrm{AP_{50}}$; (iii) outperforms single models
+consistently on recent detection tasks such as Open Vocabulary Object
+Detection.
+
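+ A minimal sketch of the calibration step (using isotonic regression as one
+concrete choice; the paper's exact calibrator may differ):
+
+    import numpy as np
+    from sklearn.isotonic import IsotonicRegression
+
+    def calibrate_expert(scores, matched):
+        """Fit a monotone map from confidences to empirical correctness.
+
+        scores  : (n,) detection confidences on a held-out set.
+        matched : (n,) 1 if the detection matched a ground-truth box.
+        """
+        iso = IsotonicRegression(y_min=0.0, y_max=1.0,
+                                 out_of_bounds="clip")
+        iso.fit(scores, matched)
+        return iso  # apply iso.predict(...) to each expert before fusing
+
+ Once every expert's scores live on this common scale, no single
+over-confident detector can dominate the merged, NMS-filtered output.
+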
+
+
+
+
+ + ♻ ☆ StructChart: Perception, Structuring, Reasoning for Visual Chart + Understanding + + +
+ Charts are common in literature across different scientific fields, conveying
+rich information easily accessible to readers. Current chart-related tasks
+focus on either chart perception, which refers to extracting information from
+the visual charts, or performing reasoning given the extracted data, e.g. in a
+tabular form. In this paper, we aim to establish a unified and label-efficient
+learning paradigm for joint perception and reasoning tasks, which can be
+generally applicable to different downstream tasks, beyond the
+question-answering task as specifically studied in peer works. Specifically,
+StructChart first reformulates the chart information from the popular tabular
+form (specifically, linearized CSV) into the proposed Structured Triplet
+Representations (STR), which are better suited to reducing the task gap
+between chart perception and reasoning thanks to the structured information
+extraction for charts. We then propose a Structuring Chart-oriented
+Representation Metric (SCRM) to quantitatively evaluate the performance on the
+chart perception task. To enrich the dataset for training, we further explore
+the possibility of leveraging a Large Language Model (LLM) to enhance chart
+diversity in terms of both chart visual style and statistical information.
+Extensive experiments are conducted on various chart-related tasks,
+demonstrating the effectiveness and promising potential of a unified chart
+perception-reasoning paradigm to push the frontier of chart understanding.
+
+
+ comment: SimChart9K is available for downloading at: + https://github.com/UniModal4Reasoning/SimChart9K 26 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Understanding the Role of the Projector in Knowledge Distillation AAAI 2024 + + +
+ In this paper we revisit the efficacy of knowledge distillation as a function
+matching and metric learning problem. In doing so we verify three important
+design decisions, namely the normalisation, soft maximum function, and
+projection layers, as key ingredients. We theoretically show that the
+projector implicitly encodes information on past examples, enabling relational
+gradients for the student. We then show that the normalisation of
+representations is tightly coupled with the training dynamics of this
+projector, which can have a large impact on the student's performance.
+Finally, we show that a simple soft maximum function can be used to address
+any significant capacity gap problems. Experimental results on various
+benchmark datasets demonstrate that using these insights can lead to superior
+or comparable performance to state-of-the-art knowledge distillation
+techniques, despite being much more computationally efficient. In particular,
+we obtain these results across image classification (CIFAR100 and ImageNet),
+object detection (COCO2017), and on more difficult distillation objectives,
+such as training data efficient transformers, whereby we attain a 77.2% top-1
+accuracy with DeiT-Ti on ImageNet. Code and models are publicly available.
+
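+ A minimal sketch of distilling through a trainable projector with normalised
+features (dimensions and the squared-error matching loss are illustrative
+choices; see the released code for the actual recipe):
+
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class ProjectedFeatureDistill(nn.Module):
+        """Feature distillation through a trainable projector (sketch).
+
+        Student features are projected into the teacher's dimension and
+        both sides are L2-normalised before matching, reflecting the
+        emphasis above on the projector and the normalisation.
+        """
+        def __init__(self, student_dim, teacher_dim):
+            super().__init__()
+            self.proj = nn.Linear(student_dim, teacher_dim)
+
+        def forward(self, f_student, f_teacher):
+            z_s = F.normalize(self.proj(f_student), dim=-1)
+            z_t = F.normalize(f_teacher.detach(), dim=-1)
+            return (z_s - z_t).pow(2).sum(dim=-1).mean()
+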
+
+ comment: AAAI 2024. Code available at + https://github.com/roymiles/Simple-Recipe-Distillation +
+
+
+
+
+ + ♻ ☆ Diffusion Model Conditioning on Gaussian Mixture Model and Negative + Gaussian Mixture Gradient + + +
+ Diffusion models (DMs) are a type of generative model that has had a huge
+impact on image synthesis and beyond. They achieve state-of-the-art generation
+results in various generative tasks. A great diversity of conditioning inputs,
+such as text or bounding boxes, are accessible to control the generation. In
+this work, we propose a conditioning mechanism utilizing Gaussian mixture
+models (GMMs) as feature conditioning to guide the denoising process. Based on
+set theory, we provide a comprehensive theoretical analysis showing that the
+conditional latent distributions induced by features and by classes differ
+significantly, so that conditioning on features produces fewer defective
+generations than conditioning on classes. Two diffusion models conditioned on
+the Gaussian mixture model are trained separately for comparison. Experiments
+support our findings. A novel gradient function called the negative Gaussian
+mixture gradient (NGMG) is proposed and applied in diffusion model training
+with an additional classifier, improving training stability. We also
+theoretically prove that NGMG shares the same benefit as the Earth Mover's
+distance (Wasserstein) as a more sensible cost function when learning
+distributions supported by low-dimensional manifolds.
+
+
+
+
+
+ + ♻ ☆ EPIC Fields: Marrying 3D Geometry and Video Understanding NeurIPS 2023 + + +
+ Neural rendering is fuelling a unification of learning, 3D geometry and video
+understanding that has been waiting for more than two decades. Progress,
+however, is still hampered by a lack of suitable datasets and benchmarks. To
+address this gap, we introduce EPIC Fields, an augmentation of EPIC-KITCHENS
+with 3D camera information. Like other datasets for neural rendering, EPIC
+Fields removes the complex and expensive step of reconstructing cameras using
+photogrammetry, and allows researchers to focus on modelling problems. We
+illustrate the challenges of photogrammetry in egocentric videos of dynamic
+actions and propose innovations to address them. Compared to other neural
+rendering datasets, EPIC Fields is better tailored to video understanding
+because it is paired with labelled action segments and the recent VISOR
+segment annotations. To further motivate the community, we also evaluate two
+benchmark tasks in neural rendering and segmenting dynamic objects, with
+strong baselines that showcase what is not possible today. We also highlight
+the advantage of geometry in semi-supervised video object segmentation on the
+VISOR annotations. EPIC Fields reconstructs 96% of videos in EPIC-KITCHENS,
+registering 19M frames in 99 hours recorded in 45 kitchens.
+
+
+ comment: Published at NeurIPS 2023. 24 pages, 15 figures. Project Webpage: + http://epic-kitchens.github.io/epic-fields +
+
+
+
+
+ + ♻ ☆ M3Dsynth: A dataset of medical 3D images with AI-generated local + manipulations + + +
+ The ability to detect manipulated visual content is becoming increasingly +important in many application fields, given the rapid advances in image +synthesis methods. Of particular concern is the possibility of modifying the +content of medical images, altering the resulting diagnoses. Despite its +relevance, this issue has received limited attention from the research +community. One reason is the lack of large and curated datasets to use for +development and benchmarking purposes. Here, we investigate this issue and +propose M3Dsynth, a large dataset of manipulated Computed Tomography (CT) lung +images. We create manipulated images by injecting or removing lung cancer +nodules in real CT scans, using three different methods based on Generative +Adversarial Networks (GAN) or Diffusion Models (DM), for a total of 8,577 +manipulated samples. Experiments show that these images easily fool automated +diagnostic tools. We also tested several state-of-the-art forensic detectors +and demonstrated that, once trained on the proposed dataset, they are able to +accurately detect and localize manipulated synthetic content, even when +training and test sets are not aligned, showing good generalization ability. +Dataset and code are publicly available at +https://grip-unina.github.io/M3Dsynth/. + +
+
+
+
+
+ + ♻ ☆ 1DFormer: a Transformer Architecture Learning 1D Landmark + Representations for Facial Landmark Tracking + + +
+ Recently, heatmap regression methods based on 1D landmark representations
+have shown prominent performance on locating facial landmarks. However,
+previous methods did not explore in depth the potential of 1D landmark
+representations for sequential and structural modeling of multiple landmarks
+in facial landmark tracking. To address this limitation, we propose a
+Transformer architecture, namely 1DFormer, which learns informative 1D
+landmark representations by capturing the dynamic and geometric patterns of
+landmarks via token communications in both temporal and spatial dimensions for
+facial landmark tracking. For temporal modeling, we propose a recurrent token
+mixing mechanism, an axis-landmark-positional embedding mechanism, as well as
+a confidence-enhanced multi-head attention mechanism to adaptively and
+robustly embed long-term landmark dynamics into their 1D representations; for
+structure modeling, we design intra-group and inter-group structure modeling
+mechanisms to encode the component-level as well as global-level facial
+structure patterns as a refinement of the 1D representations of landmarks
+through token communications in the spatial dimension via 1D convolutional
+layers. Experimental results on the 300VW and TF databases show that 1DFormer
+successfully models the long-range sequential patterns as well as the inherent
+facial structures to learn informative 1D representations of landmark
+sequences, and achieves state-of-the-art performance on facial landmark
+tracking.
+
+
+
+
+
+ + ♻ ☆ Neural-PBIR Reconstruction of Shape, Material, and Illumination ICCV 2023 + + +
+ Reconstructing the shape and spatially varying surface appearances of a
+physical-world object, as well as its surrounding illumination, based on 2D
+images (e.g., photographs) of the object has been a long-standing problem in
+computer vision and graphics. In this paper, we introduce an accurate and
+highly efficient object reconstruction pipeline combining neural-based object
+reconstruction and physics-based inverse rendering (PBIR). Our pipeline first
+leverages a neural SDF-based shape reconstruction to produce a high-quality
+but potentially imperfect object shape. Then, we introduce a neural material
+and lighting distillation stage to achieve high-quality predictions for
+material and illumination. In the last stage, initialized by the neural
+predictions, we perform PBIR to refine the initial results and obtain the
+final high-quality reconstruction of object shape, material, and illumination.
+Experimental results demonstrate our pipeline significantly outperforms
+existing methods quality-wise and performance-wise.
+
+
+ comment: ICCV 2023. Project page at https://neural-pbir.github.io/ Update + Stanford-ORB results +
+
+
+
+
+ + ♻ ☆ Object-Centric Instruction Augmentation for Robotic Manipulation ICRA2024 + + +
+ Humans interpret scenes by recognizing both the identities and positions of
+objects in their observations. For a robot to perform tasks such as "pick and
+place", understanding both what the objects are and where they are located is
+crucial. While the former has been extensively discussed in the literature
+that uses the large language model to enrich the text descriptions, the latter
+remains underexplored. In this work, we introduce the Object-Centric
+Instruction Augmentation (OCI) framework to augment highly semantic and
+information-dense language instruction with position cues. We utilize a
+Multi-modal Large Language Model (MLLM) to weave knowledge of object locations
+into natural language instruction, thus aiding the policy network in mastering
+actions for versatile manipulation. Additionally, we present a feature reuse
+mechanism to integrate the vision-language features from off-the-shelf
+pre-trained MLLM into policy networks. Through a series of simulated and
+real-world robotic tasks, we demonstrate that robotic manipulator imitation
+policies trained with our enhanced instructions outperform those relying
+solely on traditional language instructions.
+
+
+ comment: accepted to ICRA2024 +
+
+
+
+
+ + ♻ ☆ Language-Conditioned Robotic Manipulation with Fast and Slow Thinking ICRA2024 + + +
+ Language-conditioned robotic manipulation aims to translate natural language
+instructions into executable actions, from simple pick-and-place tasks to
+those requiring intent recognition and visual reasoning. Inspired by the dual
+process theory in cognitive science, which suggests two parallel systems of
+fast and slow thinking in human decision-making, we introduce Robotics with
+Fast and Slow Thinking (RFST), a framework that mimics human cognitive
+architecture to classify tasks and make decisions between two systems based on
+the instruction type. Our RFST consists of two key components: 1) an
+instruction discriminator to determine which system should be activated based
+on the current user instruction, and 2) a slow-thinking system comprised of a
+fine-tuned vision language model aligned with the policy networks, which
+allows the robot to recognize user intention or perform reasoning tasks. To
+assess our methodology, we built a dataset featuring real-world trajectories,
+capturing actions ranging from spontaneous impulses to tasks requiring
+deliberate contemplation. Our results, both in simulation and real-world
+scenarios, confirm that our approach adeptly manages intricate tasks that
+demand intent recognition and reasoning. The project is available at
+https://jlm-z.github.io/RSFT/
+
+
+ comment: accepted to ICRA2024 +
+
+
+
+
+ + ♻ ☆ Towards Open Vocabulary Learning: A Survey + + +
+ In the field of visual scene understanding, deep neural networks have made
+impressive advancements in various core tasks like segmentation, tracking, and
+detection. However, most approaches operate on the close-set assumption,
+meaning that the model can only identify pre-defined categories that are
+present in the training set. Recently, open vocabulary settings were proposed
+due to the rapid progress of vision language pre-training. These new
+approaches seek to locate and recognize categories beyond the annotated label
+space. The open vocabulary approach is more general, practical, and effective
+than weakly supervised and zero-shot settings. This paper provides a thorough
+review of open vocabulary learning, summarizing and analyzing recent
+developments in the field. In particular, we begin by comparing it to related
+concepts such as zero-shot learning, open-set recognition, and
+out-of-distribution detection. Then, we review several closely related tasks
+in segmentation and detection, including long-tail problems, few-shot, and
+zero-shot settings. For the method survey, we first present the basics of
+close-set detection and segmentation as preliminary knowledge. Next, we
+examine various scenarios in which open vocabulary learning is used,
+identifying common design elements and core ideas. Then, we compare the recent
+detection and segmentation approaches on commonly used datasets and
+benchmarks. Finally, we conclude with insights, issues, and discussions
+regarding future research directions. To our knowledge, this is the first
+comprehensive literature review of open vocabulary learning. We keep tracing
+related works at https://github.com/jianzongwu/Awesome-Open-Vocabulary.
+
+
+ comment: Accepted by IEEE T-PAMI. Project page: + https://github.com/jianzongwu/Awesome-Open-Vocabulary +
+
+
+
+
+ + ♻ ☆ Tiered approach for rapid damage characterisation of infrastructure + enabled by remote sensing and deep learning technologies + + +
+ Critical infrastructure such as bridges are systematically targeted during
+wars and conflicts. This is because critical infrastructure is vital for
+enabling connectivity and transportation of people and goods, and hence,
+underpinning national and international defence planning and economic growth.
+Mass destruction of bridges, along with minimal or no accessibility to these
+assets during natural and anthropogenic disasters, prevents us from delivering
+rapid recovery. As a result, systemic resilience is drastically reduced. A
+solution to this challenge is to use technology for stand-off observations.
+Yet, no method exists to characterise damage at different scales, i.e.
+regional, asset, and structural (component), and there is little or no
+systematic correlation between assessments at these scales. We propose an
+integrated three-level tiered approach to fill this capability gap, and we
+demonstrate the methods for damage characterisation enabled by fit-for-purpose
+digital technologies. This method is then applied to and validated on a case
+study in Ukraine that includes 17 bridges. From macro to micro, we deploy
+technology at scale, from Sentinel-1 SAR images, crowdsourced information, and
+high-resolution images to deep learning for damaged infrastructure. For the
+first time, interferometric coherence difference and semantic segmentation of
+images were deployed to improve the reliability of damage characterisations
+from the regional to the infrastructure component level, when enhanced
+assessment accuracy is required. This integrated method improves the speed of
+decision-making, and thus, enhances resilience. Keywords: critical
+infrastructure, damage characterisation, targeted attacks, restoration
+
+
+ comment: Main text (34 pages,18 figures); Supplementary materials (13 pages) +
+
+
+
+
+ + ♻ ☆ UV-SAM: Adapting Segment Anything Model for Urban Village Identification AAAI 2024 + + +
+ Urban villages, defined as informal residential areas in or around urban
+centers, are characterized by inadequate infrastructures and poor living
+conditions, closely related to the Sustainable Development Goals (SDGs) on
+poverty, adequate housing, and sustainable cities. Traditionally, governments
+heavily depend on field survey methods to monitor urban villages, which
+however are time-consuming, labor-intensive, and possibly delayed. Thanks to
+widely available and timely updated satellite images, recent studies develop
+computer vision techniques to detect urban villages efficiently. However,
+existing studies either focus on simple urban village image classification or
+fail to provide accurate boundary information. To accurately identify urban
+village boundaries from satellite images, we harness the power of the vision
+foundation model and adapt the Segment Anything Model (SAM) to urban village
+segmentation, named UV-SAM. Specifically, UV-SAM first leverages a small-sized
+semantic segmentation model to produce mixed prompts for urban villages,
+including mask, bounding box, and image representations, which are then fed
+into SAM for fine-grained boundary identification. Extensive experimental
+results on two datasets in China demonstrate that UV-SAM outperforms existing
+baselines, and identification results over multiple years show that both the
+number and area of urban villages are decreasing over time, providing deeper
+insights into the development trends of urban villages and shedding light on
+vision foundation models for sustainable cities. The dataset and codes of this
+study are available at https://github.com/tsinghua-fib-lab/UV-SAM.
+
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning Methods for Calibrated Photometric Stereo and Beyond + + +
+ Photometric stereo recovers the surface normals of an object from multiple
+images with varying shading cues, i.e., modeling the relationship between
+surface orientation and intensity at each pixel. Photometric stereo excels in
+its superior per-pixel resolution and fine reconstruction detail. However, it
+is a complicated problem because of the non-linear relationship caused by
+non-Lambertian surface reflectance. Recently, various deep learning methods
+have shown a powerful ability to handle photometric stereo for non-Lambertian
+surfaces. This paper provides a comprehensive review of existing deep
+learning-based calibrated photometric stereo methods. We first analyze these
+methods from different perspectives, including input processing, supervision,
+and network architecture. We summarize the performance of deep learning
+photometric stereo models on the most widely used benchmark dataset,
+demonstrating the advanced performance of deep learning-based photometric
+stereo methods. Finally, we give suggestions and propose future research
+trends based on the limitations of existing models.
+
+
+ comment: 19 pages, 11 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Cross-Modal Causal Intervention for Medical Report Generation + + +
+ Medical report generation (MRG) is essential for computer-aided diagnosis and
+medication guidance, as it can relieve the heavy burden on radiologists by
+automatically generating the corresponding medical reports according to the
+given radiology image. However, due to the spurious correlations within
+image-text data induced by visual and linguistic biases, it is challenging to
+generate accurate reports reliably describing lesion areas. Moreover, the
+cross-modal confounders are usually unobservable and challenging to eliminate
+explicitly. In this paper, we aim to mitigate the cross-modal data bias for
+MRG from a new perspective, i.e., cross-modal causal intervention, and propose
+a novel Visual-Linguistic Causal Intervention (VLCI) framework for MRG, which
+consists of a visual deconfounding module (VDM) and a linguistic deconfounding
+module (LDM), to implicitly mitigate the visual-linguistic confounders by
+causal front-door intervention. Specifically, due to the absence of a
+generalized semantic extractor, the VDM explores and disentangles the visual
+confounders from the patch-based local and global features without expensive
+fine-grained annotations. Simultaneously, due to the lack of knowledge
+encompassing the entire field of medicine, the LDM eliminates the linguistic
+confounders caused by salient visual features and high-frequency context
+without constructing a terminology database. Extensive experiments on IU-Xray
+and MIMIC-CXR datasets show that our VLCI significantly outperforms the
+state-of-the-art MRG methods. The code and models are available at
+https://github.com/WissingChen/VLCI.
+
+
+
+
+
+ + ♻ ☆ OpenIllumination: A Multi-Illumination Dataset for Inverse Rendering + Evaluation on Real Objects + + +
+ We introduce OpenIllumination, a real-world dataset containing over 108K +images of 64 objects with diverse materials, captured under 72 camera views and +a large number of different illuminations. For each image in the dataset, we +provide accurate camera parameters, illumination ground truth, and foreground +segmentation masks. Our dataset enables the quantitative evaluation of most +inverse rendering and material decomposition methods for real objects. We +examine several state-of-the-art inverse rendering methods on our dataset and +compare their performances. The dataset and code can be found on the project +page: https://oppo-us-research.github.io/OpenIllumination. + +
+
+
+
+
+ + ♻ ☆ Enhancing Accuracy and Robustness of Steering Angle Prediction with + Attention Mechanism + + +
+ In this paper, our focus is on enhancing steering angle prediction for
+autonomous driving tasks. We initiate our exploration by investigating two
+families of widely adopted deep neural architectures, namely ResNets and
+InceptionNets. Within both families, we systematically evaluate various model
+sizes to understand their impact on performance. Notably, our key contribution
+lies in the incorporation of an attention mechanism to augment steering angle
+prediction accuracy and robustness. By introducing attention, our models gain
+the ability to selectively focus on crucial regions within the input data,
+leading to improved predictive outcomes. Our findings show that our
+attention-enhanced models not only achieve state-of-the-art results in terms
+of steering angle Mean Squared Error (MSE) but also exhibit enhanced
+adversarial robustness, addressing critical concerns in real-world deployment.
+For example, in our experiments on the Kaggle SAP dataset and our own publicly
+available datasets, attention can lead to over 6% error reduction in steering
+angle prediction and boost model robustness by up to 56.09%.
+
+
+
+
+
+ + ♻ ☆ Structure-Informed Shadow Removal Networks + + +
+ Existing deep learning-based shadow removal methods still produce images with
+shadow remnants. These shadow remnants typically exist in homogeneous regions
+with low-intensity values, making them untraceable in the existing
+image-to-image mapping paradigm. We observe that shadows mainly degrade images
+at the image-structure level (at which humans perceive object shapes and
+continuous colors). Hence, in this paper, we propose to remove shadows at the
+image-structure level. Based on this idea, we propose a novel
+structure-informed shadow removal network (StructNet) to leverage
+image-structure information to address the shadow remnant problem.
+Specifically, StructNet first reconstructs the structure information of the
+input image without shadows and then uses the restored shadow-free structure
+as a prior to guide image-level shadow removal. StructNet contains two main
+novel modules: (1) a mask-guided shadow-free extraction (MSFE) module to
+extract image structural features in a non-shadow-to-shadow directional
+manner, and (2) a multi-scale feature & residual aggregation (MFRA) module to
+leverage the shadow-free structure information to regularize feature
+consistency. In addition, we also propose to extend StructNet to exploit
+multi-level structure information (MStructNet), to further boost the shadow
+removal performance with minimal computational overhead. Extensive experiments
+on three shadow removal benchmarks demonstrate that our method outperforms
+existing shadow removal methods, and our StructNet can be integrated with
+existing methods to improve them further.
+
+
+ comment: IEEE TIP +
+
+
+
+
+ + ♻ ☆ Trustworthy Large Models in Vision: A Survey + + +
+ The rapid progress of Large Models (LMs) has recently revolutionized various
+fields of deep learning with remarkable results, ranging from Natural Language
+Processing (NLP) to Computer Vision (CV). However, LMs are increasingly
+challenged and criticized by academia and industry due to their powerful
+performance but untrustworthy behavior, which urgently needs to be alleviated
+by reliable methods. Despite the abundance of literature on trustworthy LMs in
+NLP, a systematic survey specifically delving into the trustworthiness of LMs
+in CV remains absent. In order to mitigate this gap, this survey summarizes
+four concerns that obstruct the trustworthy use of LMs in vision: 1) human
+misuse, 2) vulnerability, 3) inherent issues, and 4) interpretability. By
+highlighting the corresponding challenges, countermeasures, and discussion in
+each topic, we hope this survey will facilitate readers' understanding of this
+field, promote alignment of LMs with human expectations, and enable
+trustworthy LMs to serve as a benefit rather than a disaster for human
+society.
+
+
+
+
+
+ + ♻ ☆ Segment Anything in 3D Gaussians + + +
+ 3D Gaussian Splatting has emerged as an alternative 3D representation to
+Neural Radiance Fields (NeRFs), benefiting from its high-quality rendering
+results and real-time rendering speed. Because the 3D Gaussian representation
+remains unstructured, object segmentation must first be performed within this
+domain. Subsequently, scene editing and collision detection can be performed,
+proving vital to a multitude of applications, such as virtual reality (VR),
+augmented reality (AR), game/movie production, etc. In this paper, we propose
+a novel approach to achieve object segmentation in 3D Gaussians via an
+interactive procedure without any training process or learned parameters. We
+refer to the proposed method as SA-GS, for Segment Anything in 3D Gaussians.
+Given a set of clicked points in a single input view, SA-GS can generalize SAM
+to achieve 3D-consistent segmentation via the proposed multi-view mask
+generation and view-wise label assignment methods. We also propose a
+cross-view label-voting approach to assign labels from different views. In
+addition, in order to address the boundary roughness of segmented objects
+resulting from the non-negligible spatial sizes of the 3D Gaussians located at
+the boundary, SA-GS incorporates a simple but effective Gaussian Decomposition
+scheme. Extensive experiments demonstrate that SA-GS achieves high-quality 3D
+segmentation results, which can also be easily applied to scene editing and
+collision detection tasks. Codes will be released soon.
+
+
+
+
+
+ + ♻ ☆ CL2CM: Improving Cross-Lingual Cross-Modal Retrieval via Cross-Lingual + Knowledge Transfer AAAI2024 + + +
+ Cross-lingual cross-modal retrieval has garnered increasing attention +recently, which aims to achieve the alignment between vision and target +language (V-T) without using any annotated V-T data pairs. Current methods +employ machine translation (MT) to construct pseudo-parallel data pairs, which +are then used to learn a multi-lingual and multi-modal embedding space that +aligns visual and target-language representations. However, the large +heterogeneous gap between vision and text, along with the noise present in +target language translations, poses significant challenges in effectively +aligning their representations. To address these challenges, we propose a +general framework, Cross-Lingual to Cross-Modal (CL2CM), which improves the +alignment between vision and target language using cross-lingual transfer. This +approach allows us to fully leverage the merits of multi-lingual pre-trained +models (e.g., mBERT) and the benefits of the same modality structure, i.e., +smaller gap, to provide reliable and comprehensive semantic correspondence +(knowledge) for the cross-modal network. We evaluate our proposed approach on +two multilingual image-text datasets, Multi30K and MSCOCO, and one video-text +dataset, VATEX. The results clearly demonstrate the effectiveness of our +proposed method and its high potential for large-scale retrieval. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ GridMask Data Augmentation + + +
+ We propose a novel data augmentation method, `GridMask', in this paper. It
+utilizes information removal to achieve state-of-the-art results in a variety
+of computer vision tasks. We analyze the requirements of information dropping,
+then show the limitations of existing information dropping algorithms and
+propose our structured method, which is simple and yet very effective. It is
+based on the deletion of regions of the input image. Our extensive experiments
+show that our method outperforms the latest AutoAugment, which is far more
+computationally expensive due to its use of reinforcement learning to find the
+best policies. On the ImageNet dataset for recognition, on COCO2017 for object
+detection, and on the Cityscapes dataset for semantic segmentation, our method
+notably improves performance over baselines. The extensive experiments
+manifest the effectiveness and generality of the new method.
+
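+ The core of GridMask fits in a few lines; the sketch below is a simplified
+re-implementation (the original also randomises the grid rotation and the
+keep ratio):
+
+    import numpy as np
+
+    def grid_mask(image, d=32, ratio=0.5, offset=(0, 0)):
+        """Zero out a regular grid of square regions in an HWC image.
+
+        d      : grid period in pixels.
+        ratio  : fraction of each period kept along each axis.
+        offset : (dy, dx) shift of the grid, typically randomised.
+        """
+        h, w = image.shape[:2]
+        keep = int(d * ratio)
+        ys = (np.arange(h) + offset[0]) % d < keep
+        xs = (np.arange(w) + offset[1]) % d < keep
+        mask = (ys[:, None] | xs[None, :]).astype(image.dtype)
+        return image * mask[..., None]
+
+ The deleted regions form disjoint squares, so unlike random erasing the
+dropout never wipes out a whole object while still removing information
+everywhere in the image.
+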
+
+
+
+
+ + ♻ ☆ Memory Consistency Guided Divide-and-Conquer Learning for Generalized + Category Discovery + + +
+ Generalized category discovery (GCD) aims at addressing a more realistic and
+challenging setting of semi-supervised learning, where only part of the
+category labels are assigned to certain training samples. Previous methods
+generally employ naive contrastive learning or unsupervised clustering schemes
+for all the samples. Nevertheless, they usually ignore the inherent critical
+information within the historical predictions of the model being trained.
+Specifically, we empirically reveal that a significant number of salient
+unlabeled samples yield consistent historical predictions corresponding to
+their ground truth category. From this observation, we propose a Memory
+Consistency guided Divide-and-conquer Learning framework (MCDL). In this
+framework, we introduce two memory banks to record historical predictions on
+unlabeled data, which are exploited to measure the credibility of each sample
+in terms of its prediction consistency. With the guidance of credibility, we
+design a divide-and-conquer learning strategy to fully utilize the
+discriminative information of unlabeled data while alleviating the negative
+influence of noisy labels. Extensive experimental results on multiple
+benchmarks demonstrate the generality and superiority of our method, which
+outperforms state-of-the-art models by a large margin on both seen and unseen
+classes of generic image recognition and on challenging semantic shift
+settings (i.e., with an +8.4% gain on CUB and +8.1% on Stanford Cars).
+
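+ A toy version of the consistency bookkeeping (the memory length and the
+all-predictions-agree criterion are illustrative assumptions):
+
+    import numpy as np
+
+    class PredictionMemory:
+        """Memory of recent class predictions per unlabeled sample."""
+
+        def __init__(self, n_samples, k=5):
+            self.history = np.full((n_samples, k), -1, dtype=int)
+
+        def update(self, indices, preds):
+            # shift each sample's history left and append the new prediction
+            self.history[indices] = np.roll(self.history[indices], -1, axis=1)
+            self.history[indices, -1] = preds
+
+        def credible(self, indices):
+            # credible = a full history whose entries all agree
+            h = self.history[indices]
+            return (h >= 0).all(axis=1) & (h == h[:, :1]).all(axis=1)
+
+ Credible samples can then be trained with their (pseudo-)labels while the
+rest fall back to label-free objectives, which is the divide-and-conquer
+split described above.
+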
+
+
+
+
+ + ♻ ☆ Controllable Dense Captioner with Multimodal Embedding Bridging + + +
+ In this paper, we propose a controllable dense captioner (ControlCap), which
+accommodates the user's intention in dense captioning by introducing
+linguistic guidance. ControlCap is defined as a multimodal embedding bridging
+architecture, which comprises a multimodal embedding generation (MEG) module
+and a bi-directional embedding bridging (BEB) module. While the MEG module
+represents objects/regions by combining embeddings of detailed information
+with context-aware ones, it also endows ControlCap with adaptability to
+specialized controls by utilizing them as linguistic guidance. The BEB module
+aligns the linguistic guidance with visual embeddings by borrowing/returning
+features from/to the visual domain and gathering such features to predict text
+descriptions. Experiments on the Visual Genome and VG-COCO datasets show that
+ControlCap outperforms the state-of-the-art methods by 1.5% and 3.7% (mAP),
+respectively. Last but not least, with the capability of converting
+region-category pairs to region-text pairs, ControlCap is able to act as a
+powerful data engine for dense captioning. Code is available at
+https://github.com/callsys/ControlCap.
+
+
+ comment: https://github.com/callsys/ControlCap +
+
+
+
+
+ + ♻ ☆ SCTransNet: Spatial-channel Cross Transformer Network for Infrared Small + Target Detection + + +
+ Infrared small target detection (IRSTD) has recently benefitted greatly from
+U-shaped neural models. However, because they largely overlook effective
+global information modeling, existing techniques struggle when the target has
+high similarity with the background. We present a Spatial-channel Cross
+Transformer Network (SCTransNet) that leverages spatial-channel cross
+transformer blocks (SCTBs) on top of long-range skip connections to address
+the aforementioned challenge. In the proposed SCTBs, the outputs of all
+encoders interact with a cross transformer to generate mixed features, which
+are redistributed to all decoders to effectively reinforce semantic
+differences between the target and clutter at full scales. Specifically, an
+SCTB contains the following two key elements: (a) spatial-embedded single-head
+channel-cross attention (SSCA) for exchanging local spatial features and
+full-level global channel information to eliminate ambiguity among the
+encoders and facilitate high-level semantic associations of the images, and
+(b) a complementary feed-forward network (CFN) for enhancing feature
+discriminability via a multi-scale strategy and cross-spatial-channel
+information interaction to promote beneficial information transfer. Our
+SCTransNet effectively encodes the semantic differences between targets and
+backgrounds to boost its internal representation for detecting small infrared
+targets accurately. Extensive experiments on three public datasets,
+NUDT-SIRST, NUAA-SIRST, and IRSTD-1k, demonstrate that the proposed
+SCTransNet outperforms existing IRSTD methods. Our code will be made public
+at https://github.com/xdFai.
+
+
+
+
+
+ + ♻ ☆ Test-time Adaptive Vision-and-Language Navigation + + +
+ Vision-and-Language Navigation (VLN) has witnessed significant advancements +in recent years, largely attributed to meticulously curated datasets and +proficiently trained models. Nevertheless, when tested in diverse environments, +the trained models inevitably encounter significant shifts in data +distribution, highlighting that relying solely on pre-trained and fixed +navigation models is insufficient. To enhance models' generalization ability, +test-time adaptation (TTA) demonstrates significant potential in the computer +vision field by leveraging unlabeled test samples for model updates. However, +simply applying existing TTA methods to the VLN task cannot well handle the +adaptability-stability dilemma of VLN models, i.e., frequent updates can result +in drastic changes in model parameters, while occasional updates can make the +models ill-equipped to handle dynamically changing environments. Therefore, we +propose a Fast-Slow Test-Time Adaptation (FSTTA) approach for VLN by performing +decomposition-accumulation analysis for both gradients and parameters in a +unified framework. Specifically, in the fast update phase, gradients generated +during the recent multi-step navigation process are decomposed into components +with varying levels of consistency. Then, these components are adaptively +accumulated to pinpoint a concordant direction for fast model adaptation. In +the slow update phase, historically recorded parameters are gathered, and a +similar decomposition-accumulation analysis is conducted to revert the model to +a stable state. Extensive experiments show that our method obtains impressive +performance gains on four popular benchmarks. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ MPTQ-ViT: Mixed-Precision Post-Training Quantization for Vision + Transformer + + +
+ While vision transformers (ViTs) have shown great potential in computer +vision tasks, their intense computation and memory requirements pose challenges +for practical applications. Existing post-training quantization methods +leverage value redistribution or specialized quantizers to address the +non-normal distribution in ViTs. However, without considering the asymmetry in +activations and relying on hand-crafted settings, these methods often struggle +to maintain performance under low-bit quantization. To overcome these +challenges, we introduce SmoothQuant with bias term (SQ-b) to alleviate the +asymmetry issue and reduce the clamping loss. We also introduce optimal scaling +factor ratio search (OPT-m) to determine quantization parameters by a +data-dependent mechanism automatically. To further enhance the compressibility, +we incorporate the above-mentioned techniques and propose a mixed-precision +post-training quantization framework for vision transformers (MPTQ-ViT). We +develop greedy mixed-precision quantization (Greedy MP) to allocate layer-wise +bit-width considering both model performance and compressibility. Our +experiments on ViT, DeiT, and Swin demonstrate significant accuracy +improvements compared with SOTA on the ImageNet dataset. Specifically, our +proposed methods achieve accuracy improvements ranging from 0.90% to 23.35% on +4-bit ViTs with single-precision and from 3.82% to 78.14% on 5-bit fully +quantized ViTs with mixed-precision. + +
+
+
+
+
+ + ♻ ☆ Commonsense for Zero-Shot Natural Language Video Localization AAAI 2024 + + +
+ Zero-shot Natural Language-Video Localization (NLVL) methods have exhibited +promising results in training NLVL models exclusively with raw video data by +dynamically generating video segments and pseudo-query annotations. However, +existing pseudo-queries often lack grounding in the source video, resulting in +unstructured and disjointed content. In this paper, we investigate the +effectiveness of commonsense reasoning in zero-shot NLVL. Specifically, we +present CORONET, a zero-shot NLVL framework that leverages commonsense to +bridge the gap between videos and generated pseudo-queries via a commonsense +enhancement module. CORONET employs Graph Convolution Networks (GCN) to encode +commonsense information extracted from a knowledge graph, conditioned on the +video, and cross-attention mechanisms to enhance the encoded video and +pseudo-query representations prior to localization. Through empirical +evaluations on two benchmark datasets, we demonstrate that CORONET surpasses +both zero-shot and weakly supervised baselines, achieving improvements up to +32.13% across various recall thresholds and up to 6.33% in mIoU. These results +underscore the significance of leveraging commonsense reasoning for zero-shot +NLVL. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ GD doesn't make the cut: Three ways that non-differentiability affects + neural network training + + +
+ This paper investigates the distinctions between gradient methods applied to
+non-differentiable functions (NGDMs) and classical gradient descents (GDs)
+designed for differentiable functions. First, we demonstrate significant
+differences in the convergence properties of NGDMs compared to GDs,
+challenging the applicability of the extensive neural network convergence
+literature based on $L$-smoothness to non-smooth neural networks. Next, we
+demonstrate the paradoxical nature of NGDM solutions for $L_{1}$-regularized
+problems, showing that increasing the regularization penalty leads to an
+increase in the $L_{1}$ norm of optimal solutions in NGDMs. Consequently, we
+show that widely adopted $L_{1}$ penalization-based techniques for network
+pruning do not yield expected results. Finally, we explore the Edge of
+Stability phenomenon, indicating its inapplicability even to Lipschitz
+continuous convex differentiable functions, leaving its relevance to
+non-convex non-differentiable neural networks inconclusive. Our analysis
+exposes misguided interpretations of NGDMs in widely referenced papers and
+texts due to an overreliance on strong smoothness assumptions, emphasizing the
+necessity for a nuanced understanding of foundational assumptions in the
+analysis of these systems.
+
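+ To make the object of study concrete (a sketch only, using the common
+sign(0) = 0 convention), the update such an NGDM takes on an
+$L_{1}$-regularized loss is a plain subgradient step:
+
+    import numpy as np
+
+    def ngdm_l1_step(w, grad_loss, lam, lr):
+        """One subgradient step on loss(w) + lam * ||w||_1."""
+        return w - lr * (grad_loss + lam * np.sign(w))
+
+ Unlike proximal/soft-thresholding updates, iterates of this step generally
+oscillate around zero rather than landing exactly on it, which is one
+standard observation consistent with the pruning failures discussed above.
+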
+
+
+
+
+ + ♻ ☆ TokenMotion: Motion-Guided Vision Transformer for Video Camouflaged + Object Detection Via Learnable Token Selection + + +
+ The area of Video Camouflaged Object Detection (VCOD) presents unique +challenges in the field of computer vision due to texture similarities between +target objects and their surroundings, as well as irregular motion patterns +caused by both objects and camera movement. In this paper, we introduce +TokenMotion (TMNet), which employs a transformer-based model to enhance VCOD by +extracting motion-guided features using a learnable token selection. Evaluated +on the challenging MoCA-Mask dataset, TMNet achieves state-of-the-art +performance in VCOD. It outperforms the existing state-of-the-art method by a +12.8% improvement in weighted F-measure, an 8.4% enhancement in S-measure, and +a 10.7% boost in mean IoU. The results demonstrate the benefits of utilizing +motion-guided features via learnable token selection within a transformer-based +framework to tackle the intricate task of VCOD. + +
+
+ comment: Revising Needed +
+
+
+
+
+ + ♻ ☆ Subject-Based Domain Adaptation for Facial Expression Recognition + + +
+ Adapting a deep learning (DL) model to a specific target individual is a
+challenging task in facial expression recognition (FER) that may be achieved
+using unsupervised domain adaptation (UDA) methods. Although several UDA
+methods have been proposed to adapt deep FER models across source and target
+data sets, multiple subject-specific source domains are needed to accurately
+represent the intra- and inter-person variability in subject-based adaptation.
+In this paper, we consider the setting where domains correspond to
+individuals, not entire datasets. Unlike UDA, multi-source domain adaptation
+(MSDA) methods can leverage multiple source datasets to improve the accuracy
+and robustness of the target model. However, previous methods for MSDA adapt
+image classification models across datasets and do not scale well to a larger
+number of source domains. In this paper, a new MSDA method is introduced for
+subject-based domain adaptation in FER. It efficiently leverages information
+from multiple source subjects (labeled source domain data) to adapt a deep FER
+model to a single target individual (unlabeled target domain data). During
+adaptation, our Subject-based MSDA first computes a between-source discrepancy
+loss to mitigate the domain shift among data from several source subjects.
+Then, a new strategy is employed to generate augmented confident pseudo-labels
+for the target subject, allowing a reduction in the domain shift between
+source and target subjects. Experiments on the challenging BioVid heat and
+pain dataset (Part A) with 87 subjects and the UNBC-McMaster shoulder pain
+dataset with 25 subjects show that our Subject-based MSDA can outperform
+state-of-the-art methods yet scale well to multiple subject-based source
+domains.
+
+
+
+
+
+ + ♻ ☆ Robust Single Rotation Averaging Revisited + + +
+ In this work, we propose a novel method for robust single rotation averaging +that can efficiently handle an extremely large fraction of outliers. Our +approach is to minimize the total truncated least unsquared deviations (TLUD) +cost of geodesic distances. The proposed algorithm consists of three steps: +First, we consider each input rotation as a potential initial solution and +choose the one that yields the least sum of truncated chordal deviations. Next, +we obtain the inlier set using the initial solution and compute its chordal +$L_2$-mean. Finally, starting from this estimate, we iteratively compute the +geodesic $L_1$-mean of the inliers using the Weiszfeld algorithm on $SO(3)$. An +extensive evaluation shows that our method is robust against up to 99% outliers +given a sufficient number of accurate inliers, outperforming the current state +of the art. + +
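The final step of the pipeline, the geodesic $L_1$-mean via the Weiszfeld algorithm on $SO(3)$, can be sketched as below (our hedged illustration, not the authors' code; the TLUD initialization and inlier-selection steps are omitted, and `scipy` is assumed available).

```python
import numpy as np
from scipy.spatial.transform import Rotation as R

def geodesic_l1_mean(rotations, iters=50, eps=1e-9):
    """Weiszfeld iterations for the geodesic L1-mean of rotations on SO(3)."""
    S = rotations[0]  # crude init; the paper uses the chordal L2-mean of inliers
    for _ in range(iters):
        # Log-map every rotation into the tangent space at the current estimate S.
        vs = np.array([(S.inv() * Ri).as_rotvec() for Ri in rotations])
        ds = np.maximum(np.linalg.norm(vs, axis=1), eps)  # geodesic distances to S
        step = (vs / ds[:, None]).sum(axis=0) / (1.0 / ds).sum()
        S = S * R.from_rotvec(step)                       # move S toward the L1-mean
    return S

rots = [R.random(random_state=i) for i in range(10)]
print(geodesic_l1_mean(rots).as_rotvec())
```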
+
+ comment: Added the url to the code + (https://github.com/sunghoon031/SingleRotationAveraging_TLUD) +
+
+
+
+
+ + ♻ ☆ CurveCloudNet: Processing Point Clouds with 1D Structure + + +
+ Modern depth sensors such as LiDAR operate by sweeping laser-beams across the +scene, resulting in a point cloud with notable 1D curve-like structures. In +this work, we introduce a new point cloud processing scheme and backbone, +called CurveCloudNet, which takes advantage of the curve-like structure +inherent to these sensors. While existing backbones discard the rich 1D +traversal patterns and rely on generic 3D operations, CurveCloudNet +parameterizes the point cloud as a collection of polylines (dubbed a "curve +cloud"), establishing a local surface-aware ordering on the points. By +reasoning along curves, CurveCloudNet captures lightweight curve-aware priors +to efficiently and accurately reason in several diverse 3D environments. We +evaluate CurveCloudNet on multiple synthetic and real datasets that exhibit +distinct 3D size and structure. We demonstrate that CurveCloudNet outperforms +both point-based and sparse-voxel backbones in various segmentation settings, +notably scaling to large scenes better than point-based alternatives while +exhibiting improved single-object performance over sparse-voxel alternatives. +In all, CurveCloudNet is an efficient and accurate backbone that can handle a +larger variety of 3D environments than past works. + +
+
+
+
+
+ + ♻ ☆ Model-agnostic Body Part Relevance Assessment for Pedestrian Detection + + +
+ Model-agnostic explanation methods for deep learning models are flexible
+regarding usability and availability. However, because they can only manipulate
+inputs to observe changes in outputs, they suffer from weak performance when
+used with complex model architectures. For models with large inputs, as in
+object detection for instance, sampling-based methods like KernelSHAP are
+inefficient due to the many computation-heavy forward passes through the model.
+In this work, we present a framework for using sampling-based explanation
+models in a computer vision context via body part relevance assessment for
+pedestrian detection. Furthermore, we introduce a novel sampling-based method
+similar to KernelSHAP that is more robust at smaller sampling sizes and, thus,
+more efficient for explainability analyses on large-scale datasets.
+
+
+
+
+
+ + ♻ ☆ Comprehensive Exploration of Synthetic Data Generation: A Survey + + +
+ Recent years have witnessed a surge in the popularity of Machine Learning +(ML), applied across diverse domains. However, progress is impeded by the +scarcity of training data due to expensive acquisition and privacy legislation. +Synthetic data emerges as a solution, but the abundance of released models and +limited overview literature pose challenges for decision-making. This work +surveys 417 Synthetic Data Generation (SDG) models over the last decade, +providing a comprehensive overview of model types, functionality, and +improvements. Common attributes are identified, leading to a classification and +trend analysis. The findings reveal increased model performance and complexity, +with neural network-based approaches prevailing, except for privacy-preserving +data generation. Computer vision dominates, with GANs as primary generative +models, while diffusion models, transformers, and RNNs compete. Implications +from our performance evaluation highlight the scarcity of common metrics and +datasets, making comparisons challenging. Additionally, the neglect of training +and computational costs in literature necessitates attention in future +research. This work serves as a guide for SDG model selection and identifies +crucial areas for future exploration. + +
+
+ comment: Fixed bug in Figure 44 +
+
+
+
+
+ + ♻ ☆ DIRECT: Deep Active Learning under Imbalance and Label Noise + + +
+ Class imbalance is a prevalent issue in real-world machine learning
+applications, often leading to poor performance in rare and minority classes.
+With an abundance of wild unlabeled data, active learning is perhaps the most
+effective technique in solving the problem at its root -- collecting a more
+balanced and informative set of labeled examples during annotation. Label noise
+is another common issue in data annotation jobs, which is especially
+challenging for active learning methods. In this work, we conduct the first
+study of active learning under both class imbalance and label noise. We propose
+a novel algorithm that robustly identifies the class separation threshold and
+annotates the most uncertain examples that are closest to it. Through a novel
+reduction to one-dimensional active learning, our algorithm DIRECT is able to
+leverage the classic active learning literature to address issues such as batch
+labeling and tolerance towards label noise. We present extensive experiments on
+imbalanced datasets with and without label noise. Our results demonstrate that
+DIRECT can save more than 60% of the annotation budget compared to
+state-of-the-art active learning algorithms and more than 80% of the annotation
+budget compared to random sampling.
+
+
+
+
+
+ + ♻ ☆ Coverage Axis++: Efficient Inner Point Selection for 3D Shape + Skeletonization + + +
+ We introduce Coverage Axis++, a novel and efficient approach to 3D shape +skeletonization. The current state-of-the-art approaches for this task often +rely on the watertightness of the input or suffer from substantial +computational costs, thereby limiting their practicality. To address this +challenge, Coverage Axis++ proposes a heuristic algorithm to select skeletal +points, offering a high-accuracy approximation of the Medial Axis Transform +(MAT) while significantly mitigating computational intensity for various shape +representations. We introduce a simple yet effective strategy that considers +both shape coverage and uniformity to derive skeletal points. The selection +procedure enforces consistency with the shape structure while favoring the +dominant medial balls, which thus introduces a compact underlying shape +representation in terms of MAT. As a result, Coverage Axis++ allows for +skeletonization for various shape representations (e.g., water-tight meshes, +triangle soups, point clouds), specification of the number of skeletal points, +few hyperparameters, and highly efficient computation with improved +reconstruction accuracy. Extensive experiments across a wide range of 3D shapes +validate the efficiency and effectiveness of Coverage Axis++. The code will be +publicly available once the paper is published. + +
+
+ comment: This paper needs major revisions in layout/content +
+
+
+
+
+ + ♻ ☆ Where are we in the search for an Artificial Visual Cortex for Embodied + Intelligence? + + +
+ We present the largest and most comprehensive empirical study of pre-trained
+visual representations (PVRs) or visual 'foundation models' for Embodied AI.
+First, we curate CortexBench, consisting of 17 different tasks spanning
+locomotion, navigation, dexterous, and mobile manipulation. Next, we
+systematically evaluate existing PVRs and find that none are universally
+dominant. To study the effect of pre-training data size and diversity, we
+combine over 4,000 hours of egocentric videos from 7 different sources (over
+4.3M images) and ImageNet to train different-sized vision transformers using
+Masked Auto-Encoding (MAE) on slices of this data. Contrary to inferences from
+prior work, we find that scaling dataset size and diversity does not improve
+performance universally (but does so on average). Our largest model, named
+VC-1, outperforms all prior PVRs on average but does not universally dominate
+either. Next, we show that task- or domain-specific adaptation of VC-1 leads to
+substantial gains, with VC-1 (adapted) achieving performance competitive with
+or superior to the best known results on all of the benchmarks in CortexBench.
+Finally, we present real-world hardware experiments, in which VC-1 and VC-1
+(adapted) outperform the strongest pre-existing PVR. Overall, this paper
+presents no new techniques but a rigorous systematic evaluation, a broad set of
+findings about PVRs (that in some cases, refute those made in narrow domains in
+prior work), and open-sourced code and models (that required over 10,000
+GPU-hours to train) for the benefit of the research community.
+
+
+ comment: Project website: https://eai-vc.github.io +
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ A Personalized Framework for Consumer and Producer Group Fairness + Optimization in Recommender Systems + + +
+ In recent years, there has been an increasing recognition that when machine
+learning (ML) algorithms are used to automate decisions, they may mistreat
+individuals or groups, with legal, ethical, or economic implications.
+Recommender systems are prominent examples of such ML systems that aid users in
+making decisions. The majority of past research on RS fairness treats user and
+item fairness concerns independently, ignoring the fact that recommender
+systems function in a two-sided marketplace. In this paper, we propose
+CP-FairRank, an optimization-based re-ranking algorithm that seamlessly
+integrates fairness constraints from both the consumer and producer side in a
+joint objective framework. The framework is generalizable and may take into
+account varied fairness settings based on group segmentation, recommendation
+model selection, and domain, which is one of its key characteristics. For
+instance, we demonstrate that the system may jointly increase consumer and
+producer fairness when (un)protected consumer groups are defined on the basis
+of their activity level and main-streamness, while producer groups are defined
+according to their popularity level. For empirical validation, through
+large-scale experiments on eight datasets and four mainstream collaborative
+filtering (CF) recommendation models, we demonstrate that our proposed strategy
+is able to improve both consumer and producer fairness with little or no loss
+in overall recommendation quality, demonstrating the role algorithms may play
+in avoiding data biases.
+
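To make the two-sided idea concrete, here is a deliberately simplified, hypothetical greedy re-ranker (not CP-FairRank itself, which solves a constrained optimization): it trades item relevance against exposure for long-tail producers. The item data, bonus weight, and half-the-list target are illustrative assumptions.

```python
# Toy two-sided re-ranking: boost long-tail (producer-side) items until they
# hold a target share of the top-k list, otherwise rank purely by relevance.
def rerank(candidates, k, tail_bonus=0.1):
    """candidates: list of (item_id, relevance, is_long_tail) tuples."""
    chosen, tail_count = [], 0
    pool = list(candidates)
    for _ in range(k):
        def score(c):
            bonus = tail_bonus if (c[2] and tail_count < k // 2) else 0.0
            return c[1] + bonus
        best = max(pool, key=score)
        pool.remove(best)
        chosen.append(best[0])
        tail_count += int(best[2])
    return chosen

items = [("a", .9, False), ("b", .85, False), ("c", .8, True),
         ("d", .7, True), ("e", .6, False)]
print(rerank(items, k=3))  # long-tail item "c" is promoted into the list
```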
+
+ comment: TORS. arXiv admin note: substantial text overlap with + arXiv:2204.08085 +
+
+
+
+
+ + ☆ From PARIS to LE-PARIS: Toward Patent Response Automation with + Recommender Systems and Collaborative Large Language Models + + +
+ In patent prosecution, timely and effective responses to Office Actions (OAs)
+are crucial for acquiring patents, yet past automation and AI research have
+scarcely addressed this aspect. To address this gap, our study introduces the
+Patent Office Action Response Intelligence System (PARIS) and its advanced
+version, the Large Language Model Enhanced PARIS (LE-PARIS). These systems are
+designed to improve the efficiency with which patent attorneys collaboratively
+handle OA responses. The systems' key features include the construction of an
+OA Topics Database, development of Response Templates, and implementation of
+Recommender Systems and LLM-based Response Generation. Our validation involves
+a multi-paradigmatic analysis using the USPTO Office Action database and
+longitudinal data of attorney interactions with our systems over six years.
+Through five studies, we examine the constructiveness of OA topics (studies 1
+and 2) using topic modeling and the proposed Delphi process, the efficacy of
+our proposed hybrid recommender system tailored for OA (both LLM-based and
+non-LLM-based) (study 3), the quality of response generation (study 4), and the
+practical value of the systems in real-world scenarios via user studies (study
+5). Results demonstrate that both PARIS and LE-PARIS perform strongly on key
+metrics and positively impact attorney performance.
+
+
+ comment: 14 pages, 4 figures, submitted to a journal
+
+
+
+
+ + ☆ EASRec: Elastic Architecture Search for Efficient Long-term Sequential + Recommender Systems + + +
+ In this age where data is abundant, the ability to distill meaningful
+insights from the sea of information is essential. Our research addresses the
+computational and resource inefficiencies that current Sequential Recommender
+Systems (SRSs) suffer from, especially those employing attention-based models
+like SASRec. These systems are designed for next-item recommendations in
+various applications, from e-commerce to social networks, yet incur substantial
+computational costs and resource consumption during the inference stage. To
+tackle these issues, our research proposes a novel method that combines
+automatic pruning techniques with advanced model architectures. We also explore
+the potential of resource-constrained Neural Architecture Search (NAS), a
+technique prevalent in the realm of recommendation systems, to fine-tune models
+for reduced FLOPs, latency, and energy usage while retaining or even enhancing
+accuracy. The main contribution of our work is developing the Elastic
+Architecture Search for Efficient Long-term Sequential Recommender Systems
+(EASRec). This approach aims to find optimal compact architectures for
+attention-based SRSs, ensuring accuracy retention. EASRec introduces data-aware
+gates that leverage historical information from the input data batch to improve
+the performance of the recommendation network. Additionally, it utilizes a
+dynamic resource constraint approach, which standardizes the search process and
+results in more appropriate architectures. The effectiveness of our methodology
+is validated through exhaustive experiments on three benchmark datasets, which
+demonstrate EASRec's superiority in SRSs. Our research sets a new standard for
+future exploration into efficient and accurate recommender systems, signifying
+a substantial step forward in this swiftly advancing field.
+
+
+
+
+
+ + ☆ An Exam-based Evaluation Approach Beyond Traditional Relevance Judgments + + +
+ Current IR evaluation is based on relevance judgments, created either
+manually or automatically, with decisions outsourced to Large Language Models
+(LLMs). We offer an alternative paradigm that never relies on relevance
+judgments in any form. Instead, a text is defined as relevant if it contains
+information that enables the answering of key questions. We use this idea to
+design the EXAM Answerability Metric to evaluate information
+retrieval/generation systems for their ability to provide topically relevant
+information.
+ We envision the human judge's role as editing and defining an exam question
+bank that will test for the presence of relevant information in text. We
+support this step by generating an initial set of exam questions. In the next
+phase, an LLM-based question answering system will automatically grade system
+responses by tracking which exam questions are answerable with which system
+responses. We propose two evaluation measures, the recall-oriented EXAM Cover
+metric, and the precision-oriented EXAM Qrels metric, the latter of which can
+be implemented with trec_eval. This paradigm not only allows for the expansion
+of the exam question set post-hoc but also facilitates the ongoing evaluation
+of future information systems, whether they focus on retrieval, generation, or
+both.
+
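A minimal sketch of the recall-oriented EXAM Cover idea (our reading of the abstract, not the authors' implementation): a response is scored by the fraction of exam questions it makes answerable. The `is_answerable` grader, an LLM-based QA system in the paper, is replaced here by a hypothetical keyword stub.

```python
def exam_cover(response: str, exam_questions: list[str], is_answerable) -> float:
    """Fraction of exam questions that the system response makes answerable."""
    answered = sum(1 for q in exam_questions if is_answerable(q, response))
    return answered / len(exam_questions) if exam_questions else 0.0

# Toy stand-in for the LLM grader: a question counts as answerable if its
# final keyword appears in the response (purely illustrative).
toy_grader = lambda q, text: q.split()[-1].rstrip("?").lower() in text.lower()

questions = ["What is the capital of France?", "Which river crosses Paris?"]
print(exam_cover("Paris, on the Seine, is the capital of France.",
                 questions, toy_grader))  # 1.0
```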
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ PAP-REC: Personalized Automatic Prompt for Recommendation Language Model + + +
+ Recently emerged prompt-based Recommendation Language Models (RLM) can solve +multiple recommendation tasks uniformly. The RLMs make full use of the +inherited knowledge learned from the abundant pre-training data to solve the +downstream recommendation tasks by prompts, without introducing additional +parameters or network training. However, handcrafted prompts require +significant expertise and human effort since slightly rewriting prompts may +cause massive performance changes. In this paper, we propose PAP-REC, a +framework to generate the Personalized Automatic Prompt for RECommendation +language models to mitigate the inefficiency and ineffectiveness problems +derived from manually designed prompts. Specifically, personalized automatic +prompts allow different users to have different prompt tokens for the same +task, automatically generated using a gradient-based method. One challenge for +personalized automatic prompt generation for recommendation language models is +the extremely large search space, leading to a long convergence time. To +effectively and efficiently address the problem, we develop surrogate metrics +and leverage an alternative updating schedule for prompting recommendation +language models. Experimental results show that our PAP-REC framework manages +to generate personalized prompts, and the automatically generated prompts +outperform manually constructed prompts and also outperform various baseline +recommendation models. The source code of the work is available at +https://github.com/rutgerswiselab/PAP-REC. + +
+
+
+
+
+ + ☆ CF4J: Collaborative Filtering for Java + + +
+ Recommender Systems (RS) provide a relevant tool to mitigate the information
+overload problem. A large number of researchers have published hundreds of
+papers to improve different RS features. It is advisable to use RS frameworks
+that make it simpler for RS researchers to: a) design and implement
+recommendation methods and b) speed up the execution time of experiments. In
+this paper, we present CF4J, a Java library designed to carry out Collaborative
+Filtering based RS research experiments. CF4J has been designed by researchers,
+for researchers. It allows: a) reading RS datasets, b) full and easy access to
+data and intermediate or final results, c) extending its main functionalities,
+d) concurrently executing the implemented methods, and e) thorough evaluation
+of the implementations via quality measures. In summary, CF4J serves as a
+library specifically designed for the research trial-and-error process.
+
+
+
+
+
+
+ ☆ SPARQL Generation with Entity Pre-trained GPT for KG Question Answering
+
+
+
+ The popularity of Knowledge Graphs has grown rapidly in recent years. All
+that knowledge is available for people to query through the many online
+databases on the internet. Still, it would be a great achievement if
+non-programmer users could access whatever information they want to know. Much
+effort has gone into solving this task using natural language processing tools
+and by encouraging creativity through many challenges. Our approach assumes
+correct entity linking on the natural language questions and trains a GPT model
+to create SPARQL queries from them. We isolate which property of the task is
+the most difficult to solve in few- or zero-shot settings and propose
+pre-training on all entities (under the closed-world assumption, CWA) to
+improve performance. We obtain 62.703% accuracy of exact SPARQL matches on
+testing at 3 shots, an F1 of 0.809 on the entity linking challenge, and an F1
+of 0.009 on the question answering challenge.
+
+
+ comment: 7 pages, 1 figure, 2 tables. For the implementation, see + https://github.com/DiegoEmilio01/SPARQL-generation-with-entity-pre-trained-GPT-for-KG-Question-Answering +
+
+
+
+
+ + ☆ Approximate Nearest Neighbor Search with Window Filters + + +
+ We define and investigate the problem of $\textit{c-approximate window +search}$: approximate nearest neighbor search where each point in the dataset +has a numeric label, and the goal is to find nearest neighbors to queries +within arbitrary label ranges. Many semantic search problems, such as image and +document search with timestamp filters, or product search with cost filters, +are natural examples of this problem. We propose and theoretically analyze a +modular tree-based framework for transforming an index that solves the +traditional c-approximate nearest neighbor problem into a data structure that +solves window search. On standard nearest neighbor benchmark datasets equipped +with random label values, adversarially constructed embeddings, and image +search embeddings with real timestamps, we obtain up to a $75\times$ speedup +over existing solutions at the same level of recall. + +
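The problem semantics are easy to pin down with a brute-force reference (a hedged sketch of ours; the paper's contribution is a tree-based index that accelerates this, which is not reproduced here): exact nearest neighbor restricted to points whose label falls in the query window.

```python
import numpy as np

def window_search(points, labels, query, lo, hi):
    """Nearest neighbor to `query` among points whose label lies in [lo, hi]."""
    mask = (labels >= lo) & (labels <= hi)      # apply the label-range filter
    if not mask.any():
        return None
    idx = np.flatnonzero(mask)
    dists = np.linalg.norm(points[idx] - query, axis=1)
    return idx[np.argmin(dists)]                # index of the filtered nearest neighbor

rng = np.random.default_rng(1)
pts, labs = rng.normal(size=(1000, 8)), rng.uniform(0, 1, 1000)
print(window_search(pts, labs, rng.normal(size=8), 0.2, 0.4))
```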
+
+ comment: Code available: https://github.com/JoshEngels/RangeFilteredANN +
+
+
+
+
+ + ☆ Empirical and Experimental Perspectives on Big Data in Recommendation + Systems: A Comprehensive Survey + + +
+ This survey paper provides a comprehensive analysis of big data algorithms in
+recommendation systems, addressing the lack of depth and precision in existing
+literature. It proposes a two-pronged approach: a thorough analysis of current
+algorithms and a novel, hierarchical taxonomy for precise categorization. The
+taxonomy is based on a tri-level hierarchy, starting with the methodology
+category and narrowing down to specific techniques. Such a framework allows for
+a structured and comprehensive classification of algorithms, assisting
+researchers in understanding the interrelationships among diverse algorithms
+and techniques. Covering a wide range of algorithms, this taxonomy first
+categorizes algorithms into four main analysis types: User and Item
+Similarity-Based Methods, Hybrid and Combined Approaches, Deep Learning and
+Algorithmic Methods, and Mathematical Modeling Methods, with further
+subdivisions into sub-categories and techniques. The paper incorporates both
+empirical and experimental evaluations to differentiate between the techniques.
+The empirical evaluation ranks the techniques based on four criteria. The
+experimental assessments rank the algorithms that belong to the same category,
+sub-category, technique, and sub-technique. Also, the paper illuminates the
+future prospects of big data techniques in recommendation systems, underscoring
+potential advancements and opportunities for further research in this field.
+
+
+
+
+
+ + ♻ ☆ Using Large Language Models to Generate, Validate, and Apply User Intent + Taxonomies + + +
+ Log data can reveal valuable information about how users interact with Web +search services, what they want, and how satisfied they are. However, analyzing +user intents in log data is not easy, especially for emerging forms of Web +search such as AI-driven chat. To understand user intents from log data, we +need a way to label them with meaningful categories that capture their +diversity and dynamics. Existing methods rely on manual or machine-learned +labeling, which are either expensive or inflexible for large and dynamic +datasets. We propose a novel solution using large language models (LLMs), which +can generate rich and relevant concepts, descriptions, and examples for user +intents. However, using LLMs to generate a user intent taxonomy and apply it +for log analysis can be problematic for two main reasons: (1) such a taxonomy +is not externally validated; and (2) there may be an undesirable feedback loop. +To address this, we propose a new methodology with human experts and assessors +to verify the quality of the LLM-generated taxonomy. We also present an +end-to-end pipeline that uses an LLM with human-in-the-loop to produce, refine, +and apply labels for user intent analysis in log data. We demonstrate its +effectiveness by uncovering new insights into user intents from search and chat +logs from the Microsoft Bing commercial search engine. The proposed work's +novelty stems from the method for generating purpose-driven user intent +taxonomies with strong validation. This method not only helps remove +methodological and practical bottlenecks from intent-focused research, but also +provides a new framework for generating, validating, and applying other kinds +of taxonomies in a scalable and adaptable way with minimal human effort. + +
+
+
+
+
+ + ♻ ☆ Chemist-X: Large Language Model-empowered Agent for Reaction Condition + Recommendation in Chemical Synthesis + + +
+ Recent AI research charts a promising future for automated chemical reactions
+within the chemistry community. This study proposes Chemist-X, a transformative
+AI agent that automates the reaction condition recommendation (RCR) task in
+chemical synthesis with retrieval-augmented generation (RAG) technology. To
+emulate expert chemists' strategies when solving RCR tasks, Chemist-X utilizes
+advanced RAG schemes to interrogate online molecular databases and distill
+critical data from the latest literature database. Further, the agent leverages
+state-of-the-art computer-aided design (CAD) tools with a large language model
+(LLM) supervised programming interface. With the ability to utilize updated
+chemical knowledge and CAD tools, our agent significantly outperforms
+conventional synthesis AIs confined to the fixed knowledge within their
+training data. Chemist-X considerably reduces chemists' workload and allows
+them to focus on more fundamental and creative problems, thereby bringing
+computational techniques and chemical research closer together and making a
+remarkable leap toward harnessing AI's full capabilities in scientific
+discovery.
+
+
+
+
+
+ + ♻ ☆ Pareto-based Multi-Objective Recommender System with Forgetting Curve + + +
+ Recommender systems with cascading architecture play an increasingly
+significant role in online recommendation platforms, where the approach to
+dealing with negative feedback is a vital issue. For instance, in short video
+platforms, users tend to quickly skip candidates that they find aversive, and
+recommender systems are expected to take this explicit negative feedback and
+make adjustments to avoid such recommendations. Considering the recency effect
+in memory, we propose a forgetting model based on the Ebbinghaus Forgetting
+Curve to cope with negative feedback. In addition, we introduce a Pareto
+optimization solver to guarantee a better trade-off between recency and model
+performance. In conclusion, we propose the Pareto-based Multi-Objective
+Recommender System with forgetting curve (PMORS), which can be applied to any
+multi-objective recommendation and shows clear superiority when facing explicit
+negative feedback. We have conducted evaluations of PMORS and achieved
+favorable outcomes in short-video scenarios on both a public dataset and an
+industrial dataset. After being deployed on the online short video platform
+WeChat Channels in May 2023, PMORS has not only demonstrated promising results
+for both consistency and recency but also achieved an improvement of up to
++1.45% in GMV.
+
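As a hedged illustration of the recency idea, an Ebbinghaus-style retention weight $R = e^{-t/S}$ can down-weight candidates a user recently rejected; the functional form, the strength parameter, and `penalized_score` below are our illustrative assumptions, not PMORS internals.

```python
import math

def forgetting_weight(t_since_feedback: float, strength: float = 24.0) -> float:
    """Retention of a negative-feedback signal t hours after it occurred."""
    return math.exp(-t_since_feedback / strength)

def penalized_score(base_score: float, t_since_feedback: float) -> float:
    # Down-weight a candidate in proportion to how well the aversion is "remembered".
    return base_score * (1.0 - forgetting_weight(t_since_feedback))

for t in (1, 12, 48):
    print(f"t={t:3d}h  retention={forgetting_weight(t):.3f}  "
          f"score={penalized_score(1.0, t):.3f}")
```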
+
+
+
+
+ + ♻ ☆ Zero-shot Generative Large Language Models for Systematic Review + Screening Automation ECIR2024 + + +
+ Systematic reviews are crucial for evidence-based medicine as they
+comprehensively analyse published research findings on specific questions.
+Conducting such reviews is often resource- and time-intensive, especially in
+the screening phase, where abstracts of publications are assessed for inclusion
+in a review. This study investigates the effectiveness of using zero-shot large
+language models (LLMs) for automatic screening. We evaluate the effectiveness
+of eight different LLMs and investigate a calibration technique that uses a
+predefined recall threshold to determine whether a publication should be
+included in a systematic review. Our comprehensive evaluation using five
+standard test collections shows that instruction fine-tuning plays an important
+role in screening, that calibration renders LLMs practical for achieving a
+targeted recall, and that combining both with an ensemble of zero-shot models
+saves significant screening time compared to state-of-the-art approaches.
+
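One plausible reading of the calibration step (our sketch under stated assumptions, not the authors' code): on a labeled calibration set, find the score cutoff at which the screening ranking first reaches the target recall, then include every new abstract scoring above it.

```python
import numpy as np

def calibrate_threshold(scores, is_relevant, target_recall=0.95):
    """Pick the score cutoff whose ranking first attains the target recall."""
    order = np.argsort(-scores)                   # screen highest scores first
    rel_sorted = np.asarray(is_relevant)[order]
    recall = np.cumsum(rel_sorted) / max(rel_sorted.sum(), 1)
    cut = np.searchsorted(recall, target_recall)  # first rank reaching the target
    return scores[order][min(cut, len(scores) - 1)]

rng = np.random.default_rng(0)
cal_scores = rng.random(200)          # toy LLM relevance scores
cal_labels = rng.random(200) < 0.2    # toy ground-truth inclusion labels
thr = calibrate_threshold(cal_scores, cal_labels)
print(f"include abstracts with score >= {thr:.3f}")
```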
+
+ comment: Accepted to ECIR2024 full paper (findings) +
+
+
+
+
+ + ♻ ☆ A First Look at Information Highlighting in Stack Overflow Answers + + +
+ Context: Navigating the knowledge of Stack Overflow (SO) remains challenging.
+To make the posts vivid to users, SO allows users to write and edit posts with
+Markdown or HTML so that users can leverage various formatting styles (e.g.,
+bold, italic, and code) to highlight the important information. Nonetheless,
+there have been limited studies on the highlighted information. Objective: We
+carried out the first large-scale exploratory study on the information
+highlighted in SO answers in our recent study. To extend our previous study, we
+develop approaches to automatically recommend highlighted content with
+formatting styles using neural network architectures initially designed for the
+Named Entity Recognition task. Method: In this paper, we studied 31,169,429
+answers of Stack Overflow. For training recommendation models, we choose CNN
+and BERT models for each type of formatting (i.e., Bold, Italic, Code, and
+Heading) using the information highlighting dataset we collected from SO
+answers. Results: Our models based on CNN architecture achieve precision
+ranging from 0.71 to 0.82. The trained model for automatic code content
+highlighting achieves a recall of 0.73 and an F1 score of 0.71, outperforming
+the trained models for other formatting styles. The BERT models have even lower
+recalls and F1 scores than the CNN models. Our analysis of failure cases
+indicates that the majority of the failure cases are missed identifications
+(i.e., the model misses the content that is supposed to be highlighted) because
+the models tend to learn the frequently highlighted words while struggling to
+learn less frequent words. Conclusion: Our findings suggest that it is possible
+to develop recommendation models for highlighting information for answers with
+different formatting styles on Stack Overflow.
+
+
+ comment: This work is submitted to Information and Software Technology Journal +
+
+
+
+
+
+
+
+ + Machine Learning 151 + +
+
+
+ + ☆ Towards Optimal Feature-Shaping Methods for Out-of-Distribution + Detection ICLR 2024 + + +
+ Feature shaping refers to a family of methods that exhibit state-of-the-art +performance for out-of-distribution (OOD) detection. These approaches +manipulate the feature representation, typically from the penultimate layer of +a pre-trained deep learning model, so as to better differentiate between +in-distribution (ID) and OOD samples. However, existing feature-shaping methods +usually employ rules manually designed for specific model architectures and OOD +datasets, which consequently limit their generalization ability. To address +this gap, we first formulate an abstract optimization framework for studying +feature-shaping methods. We then propose a concrete reduction of the framework +with a simple piecewise constant shaping function and show that existing +feature-shaping methods approximate the optimal solution to the concrete +optimization problem. Further, assuming that OOD data is inaccessible, we +propose a formulation that yields a closed-form solution for the piecewise +constant shaping function, utilizing solely the ID data. Through extensive +experiments, we show that the feature-shaping function optimized by our method +improves the generalization ability of OOD detection across a large variety of +datasets and model architectures. + +
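A minimal sketch of what a piecewise-constant shaping function looks like in practice (our illustration; the bucket edges and constants below are hand-picked assumptions, whereas the paper derives them in closed form from ID data).

```python
import numpy as np

def piecewise_constant_shape(features, edges, values):
    """Map each penultimate-layer activation to the constant of its bucket."""
    buckets = np.digitize(features, edges)   # bucket index per activation
    return np.asarray(values)[buckets]

feats = np.random.default_rng(0).normal(1.0, 0.5, size=512)
edges = [0.0, 0.5, 1.5]                      # 4 buckets: (-inf,0), [0,0.5), [0.5,1.5), [1.5,inf)
values = [0.0, 0.2, 1.0, 1.5]                # constant output per bucket (illustrative)
shaped = piecewise_constant_shape(feats, edges, values)
print(shaped[:8])                            # shaped features feed the OOD score
```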
+
+ comment: ICLR 2024. Project page: https://github.com/Qinyu-Allen-Zhao/OptFSOOD +
+
+
+
+
+ + ☆ Early Time Classification with Accumulated Accuracy Gap Control + + +
+ Early time classification algorithms aim to label a stream of features
+without processing the full input stream, while maintaining accuracy comparable
+to that achieved by applying the classifier to the entire input. In this paper,
+we introduce a statistical framework that can be applied to any sequential
+classifier, formulating a calibrated stopping rule. This data-driven rule
+attains finite-sample, distribution-free control of the accuracy gap between
+full and early-time classification. We start by presenting a novel method that
+builds on the Learn-then-Test calibration framework to control this gap
+marginally, on average over i.i.d. instances. As this algorithm tends to yield
+an excessively high accuracy gap for early halt times, our main contribution is
+the proposal of a framework that controls a stronger notion of error, where the
+accuracy gap is controlled conditionally on the accumulated halt times.
+Numerical experiments demonstrate the effectiveness, applicability, and
+usefulness of our method. We show that our proposed early stopping mechanism
+reduces the number of timesteps used for classification by up to 94% while
+achieving rigorous accuracy gap control.
+
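The halting mechanic itself is simple to sketch (a hedged illustration of ours; the paper's substance is choosing the threshold with Learn-then-Test so the accumulated accuracy gap is rigorously controlled, which this fixed threshold does not do).

```python
import numpy as np

def early_halt(confidences, lam=0.9):
    """Return (halt_time, used_fraction) for one stream of per-step confidences."""
    above = np.flatnonzero(np.asarray(confidences) >= lam)
    t = int(above[0]) if above.size else len(confidences) - 1
    return t, (t + 1) / len(confidences)

conf_stream = np.linspace(0.5, 0.99, 40)   # toy confidence trajectory
t, frac = early_halt(conf_stream)
print(f"halted at step {t}, using {frac:.0%} of the stream")
```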
+
+
+
+
+ + ☆ SymbolicAI: A framework for logic-based approaches combining generative + models and solvers + + +
+ We introduce SymbolicAI, a versatile and modular framework employing a +logic-based approach to concept learning and flow management in generative +processes. SymbolicAI enables the seamless integration of generative models +with a diverse range of solvers by treating large language models (LLMs) as +semantic parsers that execute tasks based on both natural and formal language +instructions, thus bridging the gap between symbolic reasoning and generative +AI. We leverage probabilistic programming principles to tackle complex tasks, +and utilize differentiable and classical programming paradigms with their +respective strengths. The framework introduces a set of polymorphic, +compositional, and self-referential operations for data stream manipulation, +aligning LLM outputs with user objectives. As a result, we can transition +between the capabilities of various foundation models endowed with zero- and +few-shot learning capabilities and specialized, fine-tuned models or solvers +proficient in addressing specific problems. In turn, the framework facilitates +the creation and evaluation of explainable computational graphs. We conclude by +introducing a quality measure and its empirical score for evaluating these +computational graphs, and propose a benchmark that compares various +state-of-the-art LLMs across a set of complex workflows. We refer to the +empirical score as the "Vector Embedding for Relational Trajectory Evaluation +through Cross-similarity", or VERTEX score for short. The framework codebase +and benchmark are linked below. + +
+
+ comment: 38 pages, 12 figures, external resources: framework is available at + https://github.com/ExtensityAI/symbolicai and benchmark at + https://github.com/ExtensityAI/benchmark +
+
+
+
+
+ + ☆ LTAU-FF: Loss Trajectory Analysis for Uncertainty in Atomistic Force + Fields + + +
+ Model ensembles are simple and effective tools for estimating the prediction +uncertainty of deep learning atomistic force fields. Despite this, widespread +adoption of ensemble-based uncertainty quantification (UQ) techniques is +limited by the high computational costs incurred by ensembles during both +training and inference. In this work we leverage the cumulative distribution +functions (CDFs) of per-sample errors obtained over the course of training to +efficiently represent the model ensemble, and couple them with a distance-based +similarity search in the model latent space. Using these tools, we develop a +simple UQ metric (which we call LTAU) that leverages the strengths of +ensemble-based techniques without requiring the evaluation of multiple models +during either training or inference. As an initial test, we apply our method +towards estimating the epistemic uncertainty in atomistic force fields +(LTAU-FF) and demonstrate that it can be easily calibrated to accurately +predict test errors on multiple datasets from the literature. We then +illustrate the utility of LTAU-FF in two practical applications: 1) tuning the +training-validation gap for an example dataset, and 2) predicting errors in +relaxation trajectories on the OC20 IS2RS task. Though in this work we focus on +the use of LTAU with deep learning atomistic force fields, we emphasize that it +can be readily applied to any regression task, or any ensemble-generation +technique, to provide a reliable and easy-to-implement UQ metric. + +
+
+
+
+
+ + ☆ Data Augmentation Scheme for Raman Spectra with Highly Correlated + Annotations + + +
+ In biotechnology, Raman spectroscopy is rapidly gaining popularity as a
+process analytical technology (PAT) that measures cell densities and substrate
+and product concentrations. As it records vibrational modes of molecules, it
+provides this information non-invasively in a single spectrum. Typically,
+partial least squares (PLS) is the model of choice to infer information about
+variables of interest from the spectra. However, biological processes are known
+for their complexity, and convolutional neural networks (CNNs) present a
+powerful alternative. They can handle non-Gaussian noise and account for beam
+misalignment, pixel malfunctions, or the presence of additional substances.
+However, they require a lot of data during model training, and they pick up
+non-linear dependencies in the process variables. In this work, we exploit the
+additive nature of spectra in order to generate additional data points from a
+given dataset that have statistically independent labels, so that a network
+trained on such data exhibits low correlations between the model predictions.
+We show that training a CNN on these generated data points improves the
+performance on datasets where the annotations do not bear the same correlation
+as the dataset that was used for model training. This data augmentation
+technique enables us to reuse spectra as training data for new contexts that
+exhibit different correlations. The additional data allows for building a
+better and more robust model. This is of interest in scenarios where large
+amounts of historical data are available but are currently not used for model
+training. We demonstrate the capabilities of the proposed method using
+synthetic spectra of Ralstonia eutropha batch cultivations to monitor
+substrate, biomass and polyhydroxyalkanoate (PHA) biopolymer concentrations
+during the experiments.
+
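A hedged sketch of the additive idea (our illustration, not the authors' pipeline): since spectra add, random combinations of measured spectra yield synthetic spectra whose concentration labels combine with the same weights, which can be used to break unwanted label correlations. Array shapes and the mixing scheme are assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
spectra = rng.random((100, 1024))   # measured spectra (n_samples x n_wavenumbers)
labels = rng.random((100, 3))       # e.g. biomass, substrate, PHA concentrations

def augment(spectra, labels, n_new, rng):
    """Mix random pairs of spectra; labels mix with the same weights."""
    i, j = rng.integers(0, len(spectra), size=(2, n_new))
    w = rng.random((n_new, 1))                     # independent mixing weights
    new_x = w * spectra[i] + (1 - w) * spectra[j]  # additivity of spectra
    new_y = w * labels[i] + (1 - w) * labels[j]
    return new_x, new_y

aug_x, aug_y = augment(spectra, labels, 500, rng)
print(aug_x.shape, aug_y.shape)  # (500, 1024) (500, 3)
```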
+
+
+
+
+ + ☆ Score-based Causal Representation Learning: Linear and General + Transformations AISTATS 2024 + + +
+ This paper addresses intervention-based causal representation learning (CRL) +under a general nonparametric latent causal model and an unknown transformation +that maps the latent variables to the observed variables. Linear and general +transformations are investigated. The paper addresses both the +\emph{identifiability} and \emph{achievability} aspects. Identifiability refers +to determining algorithm-agnostic conditions that ensure recovering the true +latent causal variables and the latent causal graph underlying them. +Achievability refers to the algorithmic aspects and addresses designing +algorithms that achieve identifiability guarantees. By drawing novel +connections between \emph{score functions} (i.e., the gradients of the +logarithm of density functions) and CRL, this paper designs a \emph{score-based +class of algorithms} that ensures both identifiability and achievability. +First, the paper focuses on \emph{linear} transformations and shows that one +stochastic hard intervention per node suffices to guarantee identifiability. It +also provides partial identifiability guarantees for soft interventions, +including identifiability up to ancestors for general causal models and perfect +latent graph recovery for sufficiently non-linear causal models. Secondly, it +focuses on \emph{general} transformations and shows that two stochastic hard +interventions per node suffice for identifiability. Notably, one does +\emph{not} need to know which pair of interventional environments have the same +node intervened. + +
+
+ comment: Linear transformations: stronger results for hard and soft + interventions than our previous paper Score-based Causal Representation + Learning with Interventions (https://arxiv.org/abs/2301.08230). General + transformations: results also appear in our paper General Identifiability and + Achievability for Causal Representation Learning (arXiv:2310.15450) accepted + to AISTATS 2024 (oral) +
+
+
+
+
+ + ☆ X-CBA: Explainability Aided CatBoosted Anomal-E for Intrusion Detection + System + + +
+ The effectiveness of Intrusion Detection Systems (IDS) is critical in an era
+where cyber threats are becoming increasingly complex. Machine learning (ML)
+and deep learning (DL) models provide an efficient and accurate solution for
+identifying attacks and anomalies in computer networks. However, using ML and
+DL models in IDS has led to a trust deficit due to their non-transparent
+decision-making. This transparency gap in IDS research is significant,
+affecting confidence and accountability. To address this, the paper introduces
+a novel Explainable IDS approach, called X-CBA, that leverages the structural
+advantages of Graph Neural Networks (GNNs) to effectively process network
+traffic data, while also adapting a new Explainable AI (XAI) methodology.
+Unlike most GNN-based IDS that depend on labeled network traffic and node
+features, thereby overlooking critical packet-level information, our approach
+leverages a broader range of traffic data through network flows, including edge
+attributes, to improve detection capabilities and adapt to novel threats.
+Through empirical testing, we establish that our approach not only achieves
+high accuracy of 99.47% in threat detection but also advances the field by
+providing clear, actionable explanations of its analytical outcomes. This
+research also aims to bridge the current gap and facilitate the broader
+integration of ML/DL technologies in cybersecurity defenses by offering a local
+and global explainability solution that is both precise and interpretable.
+
+
+
+
+
+ + ☆ ALISON: Fast and Effective Stylometric Authorship Obfuscation AAAI + + +
+ Authorship Attribution (AA) and Authorship Obfuscation (AO) are two competing
+tasks of increasing importance in privacy research. Modern AA leverages an
+author's consistent writing style to match a text to its author using an AA
+classifier. AO is the corresponding adversarial task, aiming to modify a text
+in such a way that its semantics are preserved, yet an AA model cannot
+correctly infer its authorship. To address privacy concerns raised by
+state-of-the-art (SOTA) AA methods, new AO methods have been proposed but
+remain largely impractical to use due to their prohibitively slow training and
+obfuscation speed, often taking hours. To address this challenge, we propose a
+practical AO method, ALISON, that (1) dramatically reduces training/obfuscation
+time, demonstrating more than 10x faster obfuscation than SOTA AO methods, (2)
+achieves better obfuscation success through attacking three transformer-based
+AA methods on two benchmark datasets, typically performing 15% better than
+competing methods, (3) does not require direct signals from a target AA
+classifier during obfuscation, and (4) utilizes unique stylometric features,
+allowing sound model interpretation for explainable obfuscation. We also
+demonstrate that ALISON can effectively prevent four SOTA AA methods from
+accurately determining the authorship of ChatGPT-generated texts, all while
+minimally changing the original text semantics. To ensure the reproducibility
+of our findings, our code and data are available at:
+https://github.com/EricX003/ALISON.
+
+
+ comment: 10 pages, 6 figures, 4 tables. To be published in the Proceedings of + the 38th Annual AAAI Conference on Artificial Intelligence (AAAI-24) +
+
+
+
+
+ + ☆ A YANG-aided Unified Strategy for Black Hole Detection for Backbone + Networks + + +
+ Despite the crucial importance of addressing Black Hole failures in Internet
+backbone networks, effective detection strategies for these networks are
+lacking. This is largely because previous research has been centered on Mobile
+Ad-hoc Networks (MANETs), which operate under entirely different dynamics,
+protocols, and topologies, making their findings not directly transferable to
+backbone networks. Furthermore, detecting Black Hole failures in backbone
+networks is particularly challenging. It requires a comprehensive range of
+network data due to the wide variety of conditions that need to be considered,
+making data collection and analysis far from straightforward. Addressing this
+gap, our study introduces a novel approach for Black Hole detection in backbone
+networks using specialized Yet Another Next Generation (YANG) data models with
+Black Hole-sensitive Metric Matrix (BHMM) analysis. This paper details our
+method of selecting and analyzing four YANG models relevant to Black Hole
+detection in ISP networks, focusing on routing protocols and ISP-specific
+configurations. Our BHMM approach derived from these models demonstrates a 10%
+improvement in detection accuracy and a 13% increase in packet delivery rate,
+highlighting the efficiency of our approach. Additionally, we evaluate the
+machine learning approach leveraged with BHMM analysis in two different network
+settings: a commercial ISP network and a scientific research-only network
+topology. This evaluation also demonstrates the practical applicability of our
+method, yielding significantly improved prediction outcomes in both
+environments.
+
+
+
+
+
+ + ☆ Resolution invariant deep operator network for PDEs with complex + geometries + + +
+ Neural operators (NOs) are discretization-invariant deep learning methods
+with functional outputs that can approximate any continuous operator. NOs have
+demonstrated superiority over other deep learning methods in solving partial
+differential equations (PDEs). However, the spatial domain of an NO's input
+function needs to be identical to that of its output, which limits its
+applicability. For instance, the widely used Fourier neural operator (FNO)
+fails to approximate the operator that maps the boundary condition to the PDE
+solution. To address this issue, we propose a novel framework called the
+resolution-invariant deep operator (RDO) that decouples the spatial domains of
+the input and output. RDO is motivated by the deep operator network (DeepONet)
+but, unlike DeepONet, does not require retraining the network when the
+input/output is changed. RDO takes functional input and its output is also
+functional, so it retains the resolution-invariance property of NOs. It can
+also resolve PDEs with complex geometries, where NOs fail. Various numerical
+experiments demonstrate the advantage of our method over DeepONet and FNO.
+
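For readers unfamiliar with the DeepONet structure RDO builds on, a minimal branch/trunk sketch follows (our illustration with assumed layer sizes; it shows only the baseline design, not RDO's decoupling of input and output domains). Because the trunk net takes arbitrary query coordinates, the output can be evaluated at any resolution. PyTorch is assumed available.

```python
import torch
import torch.nn as nn

class TinyDeepONet(nn.Module):
    def __init__(self, n_sensors=32, width=64, p=32):
        super().__init__()
        # Branch net encodes the input function sampled at fixed sensor points.
        self.branch = nn.Sequential(nn.Linear(n_sensors, width), nn.Tanh(),
                                    nn.Linear(width, p))
        # Trunk net encodes an arbitrary output query coordinate.
        self.trunk = nn.Sequential(nn.Linear(1, width), nn.Tanh(),
                                   nn.Linear(width, p))

    def forward(self, u_sensors, y):
        # u_sensors: (batch, n_sensors) function values; y: (batch, 1) query points.
        return (self.branch(u_sensors) * self.trunk(y)).sum(-1, keepdim=True)

model = TinyDeepONet()
out = model(torch.randn(8, 32), torch.rand(8, 1))
print(out.shape)  # torch.Size([8, 1])
```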
+
+
+
+
+ + ☆ SLIM: Skill Learning with Multiple Critics ICRA 2024 + + +
+ Self-supervised skill learning aims to acquire useful behaviors that leverage
+the underlying dynamics of the environment. Latent variable models, based on
+mutual information maximization, have been particularly successful in this task
+but still struggle in the context of robotic manipulation. Since manipulation
+requires affecting a possibly large set of degrees of freedom composing the
+environment, mutual information maximization alone fails to produce useful
+manipulation behaviors. To address this limitation, we introduce SLIM, a
+multi-critic learning approach for skill discovery with a particular focus on
+robotic manipulation. Our main insight is that utilizing multiple critics in an
+actor-critic framework to gracefully combine multiple reward functions leads to
+a significant improvement in latent-variable skill discovery for robotic
+manipulation while overcoming possible interference among rewards, which
+hinders convergence to useful skills. Furthermore, in the context of tabletop
+manipulation, we demonstrate the applicability of our novel skill discovery
+approach to acquire safe and efficient motor primitives in a hierarchical
+reinforcement learning fashion and leverage them through planning, surpassing
+the state-of-the-art approaches for skill discovery by a large margin.
+
+
+ comment: IEEE ICRA 2024 +
+
+
+
+
+ + ☆ Leveraging Approximate Model-based Shielding for Probabilistic Safety + Guarantees in Continuous Environments AAMAS 2024 + + +
+ Shielding is a popular technique for achieving safe reinforcement learning +(RL). However, classical shielding approaches come with quite restrictive +assumptions making them difficult to deploy in complex environments, +particularly those with continuous state or action spaces. In this paper we +extend the more versatile approximate model-based shielding (AMBS) framework to +the continuous setting. In particular we use Safety Gym as our test-bed, +allowing for a more direct comparison of AMBS with popular constrained RL +algorithms. We also provide strong probabilistic safety guarantees for the +continuous setting. In addition, we propose two novel penalty techniques that +directly modify the policy gradient, which empirically provide more stable +convergence in our experiments. + +
+
+ comment: Accepted as an Extended Abstract at AAMAS 2024 +
+
+
+
+
+ + ☆ An Analysis of the Variance of Diffusion-based Speech Enhancement + + +
+ Diffusion models proved to be powerful models for generative speech +enhancement. In recent SGMSE+ approaches, training involves a stochastic +differential equation for the diffusion process, adding both Gaussian and +environmental noise to the clean speech signal gradually. The speech +enhancement performance varies depending on the choice of the stochastic +differential equation that controls the evolution of the mean and the variance +along the diffusion processes when adding environmental and Gaussian noise. In +this work, we highlight that the scale of the variance is a dominant parameter +for speech enhancement performance and show that it controls the tradeoff +between noise attenuation and speech distortions. More concretely, we show that +a larger variance increases the noise attenuation and allows for reducing the +computational footprint, as fewer function evaluations for generating the +estimate are required. + +
+
+ comment: 5 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ Position Paper: Bayesian Deep Learning in the Age of Large-Scale AI + + +
+ In the current landscape of deep learning research, there is a predominant +emphasis on achieving high predictive accuracy in supervised tasks involving +large image and language datasets. However, a broader perspective reveals a +multitude of overlooked metrics, tasks, and data types, such as uncertainty, +active and continual learning, and scientific data, that demand attention. +Bayesian deep learning (BDL) constitutes a promising avenue, offering +advantages across these diverse settings. This paper posits that BDL can +elevate the capabilities of deep learning. It revisits the strengths of BDL, +acknowledges existing challenges, and highlights some exciting research avenues +aimed at addressing these obstacles. Looking ahead, the discussion focuses on +possible ways to combine large-scale foundation models with BDL to unlock their +full potential. + +
+
+
+
+
+ + ☆ Distilling Conditional Diffusion Models for Offline Reinforcement + Learning through Trajectory Stitching + + +
+ Deep generative models have recently emerged as an effective approach to
+offline reinforcement learning. However, their large model size poses
+challenges in computation. We address this issue by proposing a knowledge
+distillation method based on data augmentation. In particular, high-return
+trajectories are generated from a conditional diffusion model, and they are
+blended with the original trajectories through a novel stitching algorithm that
+leverages a new reward generator. Applying the resulting dataset to behavioral
+cloning, the learned shallow policy, whose size is much smaller, outperforms or
+nearly matches deep generative planners on several D4RL benchmarks.
+
+
+
+
+
+ + ☆ Signal Quality Auditing for Time-series Data + + +
+ Signal quality assessment (SQA) is required for monitoring the reliability of +data acquisition systems, especially in AI-driven Predictive Maintenance (PMx) +application contexts. SQA is vital for addressing "silent failures" of data +acquisition hardware and software, which when unnoticed, misinform the users of +data, creating the risk for incorrect decisions with unintended or even +catastrophic consequences. We have developed an open-source software +implementation of signal quality indices (SQIs) for the analysis of time-series +data. We codify a range of SQIs, demonstrate them using established benchmark +data, and show that they can be effective for signal quality assessment. We +also study alternative approaches to denoising time-series data in an attempt +to improve the quality of the already degraded signal, and evaluate them +empirically on relevant real-world data. To our knowledge, our software toolkit +is the first to provide an open source implementation of a broad range of +signal quality assessment and improvement techniques validated on publicly +available benchmark data for ease of reproducibility. The generality of our +framework can be easily extended to assessing reliability of arbitrary +time-series measurements in complex systems, especially when morphological +patterns of the waveform shapes and signal periodicity are of key interest in +downstream analyses. + +
+
+
+
+
+ + ☆ Formal-LLM: Integrating Formal Language and Natural Language for + Controllable LLM-based Agents + + +
+ Recent advancements in Large Language Models (LLMs) enable AI agents to
+automatically generate and execute multi-step plans to solve complex tasks.
+However, since the content generation process of LLMs is hard to control,
+current LLM-based agents frequently generate invalid or non-executable plans,
+which jeopardizes the performance of the generated plans and erodes users'
+trust in LLM-based agents. In response, this paper proposes a novel
+``Formal-LLM'' framework for LLM-based agents by integrating the expressiveness
+of natural language and the precision of formal language. Specifically, the
+framework allows human users to express their requirements or constraints for
+the planning process as an automaton. A stack-based LLM plan generation process
+is then conducted under the supervision of the automaton to ensure that the
+generated plan satisfies the constraints, making the planning process
+controllable. We conduct experiments on both benchmark tasks and practical
+real-life tasks, and our framework achieves an over 50% overall performance
+increase, which validates the feasibility and effectiveness of employing
+Formal-LLM to guide the plan generation of agents, preventing the agents from
+generating invalid and unsuccessful plans. Further, more controllable LLM-based
+agents can facilitate the broader utilization of LLMs in application scenarios
+where high validity of planning is essential. The work is open-sourced at
+https://github.com/agiresearch/Formal-LLM.
+
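A toy sketch of automaton-supervised planning in the spirit of the abstract (the states, actions, and `llm_choose` stub are hypothetical; the paper additionally uses a stack, i.e., pushdown-automaton supervision, which this finite-state version omits): at each step the model may only pick an action the automaton currently permits, so any completed plan is valid by construction.

```python
# Finite-state toy: state -> {allowed action: next state}.
TRANSITIONS = {
    "start": {"retrieve": "have_docs"},
    "have_docs": {"summarize": "have_summary", "retrieve": "have_docs"},
    "have_summary": {"answer": "done"},
}
ACCEPTING = {"done"}

def llm_choose(state, allowed):
    # Stand-in for the LLM's constrained decoding: a fixed preference order.
    for action in ("answer", "summarize", "retrieve"):
        if action in allowed:
            return action
    raise RuntimeError("no allowed action in state " + state)

def constrained_plan(max_steps=10):
    state, plan = "start", []
    for _ in range(max_steps):
        if state in ACCEPTING:
            break
        allowed = TRANSITIONS[state]       # constraint imposed by the automaton
        action = llm_choose(state, allowed)
        plan.append(action)
        state = allowed[action]
    return plan

print(constrained_plan())  # ['retrieve', 'summarize', 'answer']
```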
+
+ comment: 21 pages, 6 figures; working in process, suggestions are welcome +
+
+
+
+
+ + ☆ LLMs learn governing principles of dynamical systems, revealing an + in-context neural scaling law + + +
+ Pretrained large language models (LLMs) are surprisingly effective at +performing zero-shot tasks, including time-series forecasting. However, +understanding the mechanisms behind such capabilities remains highly +challenging due to the complexity of the models. In this paper, we study LLMs' +ability to extrapolate the behavior of dynamical systems whose evolution is +governed by principles of physical interest. Our results show that LLaMA 2, a +language model trained primarily on texts, achieves accurate predictions of +dynamical system time series without fine-tuning or prompt engineering. +Moreover, the accuracy of the learned physical rules increases with the length +of the input context window, revealing an in-context version of neural scaling +law. Along the way, we present a flexible and efficient algorithm for +extracting probability density functions of multi-digit numbers directly from +LLMs. + +
+
+
+
+
+ + ☆ ReAGent: Towards A Model-agnostic Feature Attribution Method for + Generative Language Models + + +
+ Feature attribution methods (FAs), such as gradients and attention, are
+widely employed approaches to derive the importance of all input features to
+the model predictions. Existing work in natural language processing has mostly
+focused on developing and testing FAs for encoder-only language models (LMs)
+in classification tasks. However, it is unknown whether these FAs remain
+faithful for decoder-only models on text generation, given the inherent
+differences between the model architectures and task settings. Moreover,
+previous work has demonstrated that there is no `one-wins-all' FA across
+models and tasks. This makes the selection of an FA computationally expensive
+for large LMs, since input importance derivation often requires multiple
+forward and backward passes, including gradient computations that might be
+prohibitive even with access to large compute. To address these issues, we
+present a model-agnostic FA for generative LMs called Recursive Attribution
+Generator (ReAGent). Our method updates the token importance distribution in a
+recursive manner. For each update, we compute the difference in the
+probability distribution over the vocabulary for predicting the next token
+between using the original input and using a modified version where a part of
+the input is replaced with RoBERTa predictions. Our intuition is that
+replacing an important token in the context should result in a larger change
+in the model's confidence in predicting the token than replacing an
+unimportant token. Our method can be universally applied to any generative LM
+without accessing internal model weights or additional training and
+fine-tuning, as most other FAs require. We extensively compare the
+faithfulness of ReAGent with seven popular FAs across six decoder-only LMs of
+various sizes. The results show that our method consistently provides more
+faithful token importance distributions.
+
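+ A simplified sketch of the replace-and-measure intuition follows. The
+`next_token_prob` and `replace_tokens` functions are hypothetical stand-ins
+for an LM forward pass and RoBERTa mask-filling, and the Monte-Carlo
+accumulation below only approximates ReAGent's recursive update of the
+importance distribution.
+
+    import random
+
+    def next_token_prob(context_tokens, target):
+        # Stand-in for one LM forward pass returning P(target | context).
+        return random.random()
+
+    def replace_tokens(tokens, idxs):
+        # Stand-in for RoBERTa infilling of the positions in `idxs`.
+        return [t if i not in idxs else "<filler>" for i, t in enumerate(tokens)]
+
+    def token_importance(tokens, target, n_rounds=200, frac=0.3, seed=0):
+        """Tokens whose replacement causes a larger drop in P(target | context)
+        accumulate higher importance scores."""
+        random.seed(seed)
+        base = next_token_prob(tokens, target)
+        scores = [0.0] * len(tokens)
+        for _ in range(n_rounds):
+            idxs = set(random.sample(range(len(tokens)),
+                                     max(1, int(frac * len(tokens)))))
+            drop = base - next_token_prob(replace_tokens(tokens, idxs), target)
+            for i in idxs:
+                scores[i] += drop / n_rounds
+        return scores
+
+    print(token_importance("The capital of France is".split(), "Paris"))
+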
+
+
+
+
+ + ☆ Distinguishing the Indistinguishable: Human Expertise in Algorithmic + Prediction + + +
+ We introduce a novel framework for incorporating human expertise into +algorithmic predictions. Our approach focuses on the use of human judgment to +distinguish inputs which `look the same' to any feasible predictive algorithm. +We argue that this framing clarifies the problem of human/AI collaboration in +prediction tasks, as experts often have access to information -- particularly +subjective information -- which is not encoded in the algorithm's training +data. We use this insight to develop a set of principled algorithms for +selectively incorporating human feedback only when it improves the performance +of any feasible predictor. We find empirically that although algorithms often +outperform their human counterparts on average, human judgment can +significantly improve algorithmic predictions on specific instances (which can +be identified ex-ante). In an X-ray classification task, we find that this +subset constitutes nearly 30% of the patient population. Our approach provides +a natural way of uncovering this heterogeneity and thus enabling effective +human-AI collaboration. + +
+
+
+
+
+ + ☆ Graph-Mamba: Towards Long-Range Graph Sequence Modeling with Selective + State Spaces + + +
+ Attention mechanisms have been widely used to capture long-range dependencies
+among nodes in Graph Transformers. Bottlenecked by the quadratic computational
+cost, attention mechanisms fail to scale in large graphs. Recent improvements
+in computational efficiency are mainly achieved by attention sparsification
+with random or heuristic-based graph subsampling, which falls short in
+data-dependent context reasoning. State space models (SSMs), such as Mamba,
+have gained prominence for their effectiveness and efficiency in modeling
+long-range dependencies in sequential data. However, adapting SSMs to
+non-sequential graph data presents a notable challenge. In this work, we
+introduce Graph-Mamba, the first attempt to enhance long-range context modeling
+in graph networks by integrating a Mamba block with an input-dependent node
+selection mechanism. Specifically, we formulate graph-centric node
+prioritization and permutation strategies to enhance context-aware reasoning,
+leading to a substantial improvement in predictive performance. Extensive
+experiments on ten benchmark datasets demonstrate that Graph-Mamba outperforms
+state-of-the-art methods in long-range graph prediction tasks, with a fraction
+of the computational cost in both FLOPs and GPU memory consumption. The code
+and models are publicly available at https://github.com/bowang-lab/Graph-Mamba.
+
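+ As a toy illustration of the two ingredients named above (learned node
+prioritization, then an input-dependent recurrence over the resulting node
+sequence), here is a minimal sketch; the gated linear recurrence is a crude
+stand-in for a real Mamba block, and all module shapes are assumptions.
+
+    import torch
+    import torch.nn as nn
+
+    class TinyGraphSSM(nn.Module):
+        """Score nodes, sort them into a sequence, then run a gated
+        input-dependent linear recurrence over the sequence in O(N)."""
+        def __init__(self, d):
+            super().__init__()
+            self.score = nn.Linear(d, 1)   # learned node prioritization
+            self.gate = nn.Linear(d, d)    # input-dependent decay a_t
+            self.inp = nn.Linear(d, d)     # input projection
+
+        def forward(self, x):              # x: (num_nodes, d) node features
+            order = self.score(x).squeeze(-1).argsort(descending=True)
+            x = x[order]                   # important nodes first
+            h, outs = torch.zeros(x.size(1)), []
+            for t in range(x.size(0)):     # selective scan (sequential here)
+                a = torch.sigmoid(self.gate(x[t]))
+                h = a * h + (1 - a) * self.inp(x[t])
+                outs.append(h)
+            return torch.stack(outs), order
+
+    out, order = TinyGraphSSM(8)(torch.randn(5, 8))  # 5 nodes, 8-dim features
+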
+
+
+
+
+ + ☆ Learning and Calibrating Heterogeneous Bounded Rational Market Behaviour + with Multi-Agent Reinforcement Learning AAMAS 2024 + + +
+ Agent-based models (ABMs) have shown promise for modelling various real world +phenomena incompatible with traditional equilibrium analysis. However, a +critical concern is the manual definition of behavioural rules in ABMs. Recent +developments in multi-agent reinforcement learning (MARL) offer a way to +address this issue from an optimisation perspective, where agents strive to +maximise their utility, eliminating the need for manual rule specification. +This learning-focused approach aligns with established economic and financial +models through the use of rational utility-maximising agents. However, this +representation departs from the fundamental motivation for ABMs: that realistic +dynamics emerging from bounded rationality and agent heterogeneity can be +modelled. To resolve this apparent disparity between the two approaches, we +propose a novel technique for representing heterogeneous processing-constrained +agents within a MARL framework. The proposed approach treats agents as +constrained optimisers with varying degrees of strategic skills, permitting +departure from strict utility maximisation. Behaviour is learnt through +repeated simulations with policy gradients to adjust action likelihoods. To +allow efficient computation, we use parameterised shared policy learning with +distributions of agent skill levels. Shared policy learning avoids the need for +agents to learn individual policies yet still enables a spectrum of bounded +rational behaviours. We validate our model's effectiveness using real-world +data on a range of canonical $n$-agent settings, demonstrating significantly +improved predictive capability. + +
+
+ comment: Accepted as a full paper at AAMAS 2024 +
+
+
+
+
+ + ☆ CroissantLLM: A Truly Bilingual French-English Language Model + + +
+ We introduce CroissantLLM, a 1.3B language model pretrained on a set of 3T +English and French tokens, to bring to the research and industrial community a +high-performance, fully open-sourced bilingual model that runs swiftly on +consumer-grade local hardware. To that end, we pioneer the approach of training +an intrinsically bilingual model with a 1:1 English-to-French pretraining data +ratio, a custom tokenizer, and bilingual finetuning datasets. We release the +training dataset, notably containing a French split with manually curated, +high-quality, and varied data sources. To assess performance outside of +English, we craft a novel benchmark, FrenchBench, consisting of an array of +classification and generation tasks, covering various orthogonal aspects of +model performance in the French Language. Additionally, rooted in transparency +and to foster further Large Language Model research, we release codebases, and +dozens of checkpoints across various model sizes, training data distributions, +and training steps, as well as fine-tuned Chat models, and strong translation +models. We evaluate our model through the FMTI framework, and validate 81 % of +the transparency criteria, far beyond the scores of even most open initiatives. +This work enriches the NLP landscape, breaking away from previous +English-centric work in order to strengthen our understanding of +multilinguality in language models. + +
+
+
+
+
+ + ☆ Dense Reward for Free in Reinforcement Learning from Human Feedback + + +
+ Reinforcement Learning from Human Feedback (RLHF) has been credited as the +key advance that has allowed Large Language Models (LLMs) to effectively follow +instructions and produce useful assistance. Classically, this involves +generating completions from the LLM in response to a query before using a +separate reward model to assign a score to the full completion. As an +auto-regressive process, the LLM has to take many "actions" (selecting +individual tokens) and only receives a single, sparse reward at the end of an +episode, a setup that is known to be difficult to optimise in traditional +reinforcement learning. In this work we leverage the fact that the reward model +contains more information than just its scalar output, in particular, it +calculates an attention map over tokens as part of the transformer +architecture. We use these attention weights to redistribute the reward along +the whole completion, effectively densifying the signal and highlighting the +most important tokens, all without incurring extra computational cost or +requiring any additional modelling. We demonstrate that, theoretically, this +approach is equivalent to potential-based reward shaping, ensuring that the +optimal policy remains unchanged. Empirically, we show that it stabilises +training, accelerates the rate of learning, and, in practical cases, may lead +to better local optima. + +
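+ The redistribution step itself is simple; a sketch is given below, assuming
+the per-token attention weights of the reward model are available. The
+per-token rewards sum to the original scalar, which is in the spirit of the
+potential-based-shaping equivalence claimed above, though this sketch does not
+reproduce the paper's exact construction.
+
+    import numpy as np
+
+    def densify_reward(final_reward, attn_weights):
+        """Spread a single episode-end reward over completion tokens in
+        proportion to the reward model's attention on each token."""
+        w = np.asarray(attn_weights, dtype=float)
+        return final_reward * w / w.sum()
+
+    # e.g. attention the reward model paid to each of 5 completion tokens
+    per_token = densify_reward(0.8, [0.1, 0.4, 0.1, 0.3, 0.1])
+    assert np.isclose(per_token.sum(), 0.8)   # total return is preserved
+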
+
+
+
+
+ + ☆ Hybrid Quantum Vision Transformers for Event Classification in High + Energy Physics + + +
+ Models based on vision transformer architectures are considered
+state-of-the-art when it comes to image classification tasks. However, they
+require extensive computational resources both for training and deployment. The
+problem is exacerbated as the amount and complexity of the data increases.
+Quantum-based vision transformer models could potentially alleviate this issue
+by reducing the training and operating time while maintaining the same
+predictive power. Although current quantum computers are not yet able to
+perform high-dimensional tasks, they do offer one of the most efficient
+solutions for the future. In this work, we construct several variations of a
+quantum hybrid vision transformer for a classification problem in high energy
+physics (distinguishing photons and electrons in the electromagnetic
+calorimeter). We test them against classical vision transformer architectures.
+Our findings indicate that the hybrid models can achieve comparable performance
+to their classical analogues with a similar number of parameters.
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Mesh motion in fluid-structure interaction with deep operator networks + + +
+ A mesh motion model based on deep operator networks is presented. The model +is trained on and evaluated against a biharmonic mesh motion model on a +fluid-structure interaction benchmark problem and further evaluated in a +setting where biharmonic mesh motion fails. The performance of the proposed +mesh motion model is comparable to the biharmonic mesh motion on the test +problems. + +
+
+ comment: 9 pages, 5 figures, submitted to proceedings of ENUMATH 2023 +
+
+
+
+
+ + ☆ AnimateLCM: Accelerating the Animation of Personalized Diffusion Models + and Adapters with Decoupled Consistency Learning + + +
+ Video diffusion models have been gaining increasing attention for their
+ability to produce videos that are both coherent and of high fidelity.
+However, the iterative denoising process makes them computationally intensive
+and time-consuming, thus limiting their applications. Inspired by the
+Consistency Model (CM) that distills pretrained image diffusion models to
+accelerate the sampling with minimal steps, and its successful extension, the
+Latent Consistency Model (LCM), on conditional image generation, we propose
+AnimateLCM, allowing for high-fidelity video generation within minimal steps.
+Instead of directly conducting consistency learning on the raw video dataset,
+we propose a decoupled consistency learning strategy that decouples the
+distillation of image generation priors and motion generation priors, which
+improves training efficiency and enhances visual generation quality.
+Additionally, to enable the combination of plug-and-play adapters from the
+Stable Diffusion community for various functions (e.g., ControlNet for
+controllable generation), we propose an efficient strategy to adapt existing
+adapters to our distilled text-conditioned video consistency model or train
+adapters from scratch without harming the sampling speed. We validate the
+proposed strategy in image-conditioned video generation and layout-conditioned
+video generation, all achieving top-performing results. Experimental results
+validate the effectiveness of our proposed method. Code and weights will be
+made public. More details are available at https://github.com/G-U-N/AnimateLCM.
+
+
+ comment: Project Page: https://animatelcm.github.io/ +
+
+
+
+
+ + ☆ Control-Theoretic Techniques for Online Adaptation of Deep Neural + Networks in Dynamical Systems + + +
+ Deep neural networks (DNNs), trained with gradient-based optimization and +backpropagation, are currently the primary tool in modern artificial +intelligence, machine learning, and data science. In many applications, DNNs +are trained offline, through supervised learning or reinforcement learning, and +deployed online for inference. However, training DNNs with standard +backpropagation and gradient-based optimization gives no intrinsic performance +guarantees or bounds on the DNN, which is essential for applications such as +controls. Additionally, many offline-training and online-inference problems, +such as sim2real transfer of reinforcement learning policies, experience domain +shift from the training distribution to the real-world distribution. To address +these stability and transfer learning issues, we propose using techniques from +control theory to update DNN parameters online. We formulate the +fully-connected feedforward DNN as a continuous-time dynamical system, and we +propose novel last-layer update laws that guarantee desirable error convergence +under various conditions on the time derivative of the DNN input vector. We +further show that training the DNN under spectral normalization controls the +upper bound of the error trajectories of the online DNN predictions, which is +desirable when numerically differentiated quantities or noisy state +measurements are input to the DNN. The proposed online DNN adaptation laws are +validated in simulation to learn the dynamics of the Van der Pol system under +domain shift, where parameters are varied in inference from the training +dataset. The simulations demonstrate the effectiveness of using +control-theoretic techniques to derive performance improvements and guarantees +in DNN-based learning systems. + +
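+ For intuition, below is a sketch of one classical adaptive-control last-layer
+update law of the form W_dot = gamma * phi(x) e^T, driven by the online
+prediction error; this is a textbook-style rule under assumed notation, not
+necessarily the exact laws proposed in the paper.
+
+    import numpy as np
+
+    def adapt_last_layer(W, phi, y_true, gamma=0.5, dt=0.01):
+        """One Euler step of W_dot = gamma * phi e^T, where phi is the frozen
+        penultimate-layer feature vector and e = y_true - W^T phi."""
+        e = y_true - W.T @ phi
+        return W + dt * gamma * np.outer(phi, e), e
+
+    rng = np.random.default_rng(0)
+    W = rng.normal(size=(4, 1))          # last-layer weights (features x outputs)
+    for _ in range(2000):                # online data stream
+        x = rng.normal(size=4)
+        phi = np.tanh(x)                 # stand-in for penultimate features
+        y = np.array([phi.sum()])        # unknown target map to be tracked
+        W, e = adapt_last_layer(W, phi, y)
+    print(abs(e))                        # error shrinks as W adapts online
+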
+
+ comment: Preprint version +
+
+
+
+
+ + ☆ EuroPED-NN: Uncertainty aware surrogate model + + +
+ This work generates uncertainty-aware surrogate models of the EuroPED plasma
+pedestal model via the Bayesian neural network with noise contrastive prior
+(BNN-NCP) technique, using data from the JET-ILW pedestal database and
+subsequent model evaluations; together, these constitute EuroPED-NN. The
+BNN-NCP technique proves to be a good fit for uncertainty-aware surrogate
+models: it matches the outputs of a regular neural network, provides
+prediction confidence as uncertainties, and highlights out-of-distribution
+(OOD) regions using the surrogate model uncertainties. This provides critical
+insights into model robustness and reliability. EuroPED-NN has been physically
+validated, first, by analyzing the electron density
+$n_e\!\left(\psi_{\text{pol}}=0.94\right)$ with respect to increasing plasma
+current, $I_p$, and second, by validating the $\Delta-\beta_{p,ped}$ relation
+associated with the EuroPED model, affirming the robustness of the underlying
+physics learned by the surrogate model.
+
+
+
+
+
+ + ☆ Building Expressive and Tractable Probabilistic Generative Models: A + Review + + +
+ We present a comprehensive survey of the advancements and techniques in the
+field of tractable probabilistic generative modeling, primarily focusing on
+Probabilistic Circuits (PCs). We provide a unified perspective on the inherent
+trade-offs between expressivity and tractability, highlighting the design
+principles and algorithmic extensions that have enabled building expressive
+and efficient PCs, and provide a taxonomy of the field. We also discuss recent
+efforts to build deep and hybrid PCs by fusing notions from deep neural
+models, and outline the challenges and open questions that can guide future
+research in this evolving field.
+
+
+
+
+
+ + ☆ Unlearnable Algorithms for In-context Learning + + +
+ Machine unlearning is a desirable operation as models get increasingly +deployed on data with unknown provenance. However, achieving exact unlearning +-- obtaining a model that matches the model distribution when the data to be +forgotten was never used -- is challenging or inefficient, often requiring +significant retraining. In this paper, we focus on efficient unlearning methods +for the task adaptation phase of a pretrained large language model (LLM). We +observe that an LLM's ability to do in-context learning for task adaptation +allows for efficient exact unlearning of task adaptation training data. We +provide an algorithm for selecting few-shot training examples to prepend to the +prompt given to an LLM (for task adaptation), ERASE, whose unlearning operation +cost is independent of model and dataset size, meaning it scales to large +models and datasets. We additionally compare our approach to fine-tuning +approaches and discuss the trade-offs between the two approaches. This leads us +to propose a new holistic measure of unlearning cost which accounts for varying +inference costs, and conclude that in-context learning can often be more +favourable than fine-tuning for deployments involving unlearning requests. + +
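+ The key observation is that when task adaptation lives entirely in the
+prompt, exact unlearning reduces to re-running example selection without the
+deleted datum. The sketch below makes the simplifying assumption that examples
+are chosen by a deterministic per-example score (the actual ERASE criterion
+may differ); its cost is independent of model size, as claimed above.
+
+    def select_examples(pool, k, score):
+        """Pick k few-shot examples by a per-example score; deterministic."""
+        return sorted(pool, key=score, reverse=True)[:k]
+
+    def unlearn(pool, forget_item, k, score):
+        """Exact unlearning: the new prompt is exactly what selection would
+        have produced had `forget_item` never existed. No weights change, so
+        the cost is O(|pool| log |pool|), independent of model size."""
+        remaining = [x for x in pool if x != forget_item]
+        return select_examples(remaining, k, score)
+
+    pool = ["example a", "example bb", "example ccc", "example dddd"]
+    prompt = select_examples(pool, 2, score=len)       # toy scoring function
+    prompt_after = unlearn(pool, "example dddd", 2, score=len)
+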
+
+
+
+
+ + ☆ Benefits of Transformer: In-Context Learning in Linear Regression Tasks + with Unstructured Data + + +
+ In practice, it is observed that transformer-based models can learn concepts +in context in the inference stage. While existing literature, e.g., +\citet{zhang2023trained,huang2023context}, provide theoretical explanations on +this in-context learning ability, they assume the input $x_i$ and the output +$y_i$ for each sample are embedded in the same token (i.e., structured data). +However, in reality, they are presented in two tokens (i.e., unstructured data +\cite{wibisono2023role}). In this case, this paper conducts experiments in +linear regression tasks to study the benefits of the architecture of +transformers and provides some corresponding theoretical intuitions to explain +why the transformer can learn from unstructured data. We study the exact +components in a transformer that facilitate the in-context learning. In +particular, we observe that (1) a transformer with two layers of softmax +(self-)attentions with look-ahead attention mask can learn from the prompt if +$y_i$ is in the token next to $x_i$ for each example; (2) positional encoding +can further improve the performance; and (3) multi-head attention with a high +input embedding dimension has a better prediction performance than single-head +attention. + +
+
+
+
+
+ + ☆ MobilityDL: A Review of Deep Learning From Trajectory Data + + +
+ Trajectory data combines the complexities of time series, spatial data, and +(sometimes irrational) movement behavior. As data availability and computing +power have increased, so has the popularity of deep learning from trajectory +data. This review paper provides the first comprehensive overview of deep +learning approaches for trajectory data. We have identified eight specific +mobility use cases which we analyze with regards to the deep learning models +and the training data used. Besides a comprehensive quantitative review of the +literature since 2018, the main contribution of our work is the data-centric +analysis of recent work in this field, placing it along the mobility data +continuum which ranges from detailed dense trajectories of individual movers +(quasi-continuous tracking data), to sparse trajectories (such as check-in +data), and aggregated trajectories (crowd information). + +
+
+ comment: Submitted to Geoinformatica +
+
+
+
+
+ + ☆ Dropout-Based Rashomon Set Exploration for Efficient Predictive + Multiplicity Estimation ICLR 2024 + + +
+ Predictive multiplicity refers to the phenomenon in which classification +tasks may admit multiple competing models that achieve almost-equally-optimal +performance, yet generate conflicting outputs for individual samples. This +presents significant concerns, as it can potentially result in systemic +exclusion, inexplicable discrimination, and unfairness in practical +applications. Measuring and mitigating predictive multiplicity, however, is +computationally challenging due to the need to explore all such +almost-equally-optimal models, known as the Rashomon set, in potentially huge +hypothesis spaces. To address this challenge, we propose a novel framework that +utilizes dropout techniques for exploring models in the Rashomon set. We +provide rigorous theoretical derivations to connect the dropout parameters to +properties of the Rashomon set, and empirically evaluate our framework through +extensive experimentation. Numerical results show that our technique +consistently outperforms baselines in terms of the effectiveness of predictive +multiplicity metric estimation, with runtime speedup up to $20\times \sim +5000\times$. With efficient Rashomon set exploration and metric estimation, +mitigation of predictive multiplicity is then achieved through dropout ensemble +and model selection. + +
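+ The basic mechanic is easy to prototype: keep dropout active at inference
+time to sample near-optimal model variants, then score each input by how often
+the sampled models disagree. The sketch below uses a toy network and a simple
+disagreement rate; the paper's multiplicity metrics and the theoretical link
+between the dropout parameters and the Rashomon set are more refined.
+
+    import torch
+    import torch.nn as nn
+
+    model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(),
+                          nn.Dropout(p=0.1), nn.Linear(32, 2))
+
+    def multiplicity_scores(model, x, n_samples=50):
+        """Per-example rate of disagreement with the majority prediction
+        across dropout-sampled models (0 = all agree)."""
+        model.train()                      # keep dropout active at inference
+        with torch.no_grad():
+            preds = torch.stack([model(x).argmax(-1) for _ in range(n_samples)])
+        majority = preds.mode(dim=0).values
+        return (preds != majority).float().mean(dim=0)
+
+    x = torch.randn(8, 10)
+    print(multiplicity_scores(model, x))   # one score per input example
+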
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Automatic Segmentation of the Spinal Cord Nerve Rootlets + + +
+ Precise identification of spinal nerve rootlets is relevant to delineate
+spinal levels for the study of functional activity in the spinal cord. The goal
+of this study was to develop an automatic method for the semantic segmentation
+of spinal nerve rootlets from T2-weighted magnetic resonance imaging (MRI)
+scans. Images from two open-access MRI datasets were used to train a 3D
+multi-class convolutional neural network using an active learning approach to
+segment C2-C8 dorsal nerve rootlets. Each output class corresponds to a spinal
+level. The method was tested on 3T T2-weighted images from datasets unseen
+during training to assess inter-site, inter-session, and inter-resolution
+variability. The test Dice score was 0.67 ± 0.16 (mean ± standard deviation
+across rootlet levels), suggesting good performance. The method also
+demonstrated low inter-vendor and inter-site variability (coefficient of
+variation <= 1.41%), as well as low inter-session variability (coefficient of
+variation <= 1.30%), indicating stable predictions across different MRI
+vendors, sites, and sessions. The proposed methodology is open-source and
+readily available in the Spinal Cord Toolbox (SCT) v6.2 and higher.
+
+
+
+
+
+ + ☆ Neural Style Transfer with Twin-Delayed DDPG for Shared Control of + Robotic Manipulators + + +
+ Neural Style Transfer (NST) refers to a class of algorithms able to +manipulate an element, most often images, to adopt the appearance or style of +another one. Each element is defined as a combination of Content and Style: the +Content can be conceptually defined as the what and the Style as the how of +said element. In this context, we propose a custom NST framework for +transferring a set of styles to the motion of a robotic manipulator, e.g., the +same robotic task can be carried out in an angry, happy, calm, or sad way. An +autoencoder architecture extracts and defines the Content and the Style of the +target robot motions. A Twin Delayed Deep Deterministic Policy Gradient (TD3) +network generates the robot control policy using the loss defined by the +autoencoder. The proposed Neural Policy Style Transfer TD3 (NPST3) alters the +robot motion by introducing the trained style. Such an approach can be +implemented either offline, for carrying out autonomous robot motions in +dynamic environments, or online, for adapting at runtime the style of a +teleoperated robot. The considered styles can be learned online from human +demonstrations. We carried out an evaluation with human subjects enrolling 73 +volunteers, asking them to recognize the style behind some representative +robotic motions. Results show a good recognition rate, proving that it is +possible to convey different styles to a robot using this approach. + +
+
+
+
+
+ + ☆ Explaining Text Classifiers with Counterfactual Representations + + +
+ One well-motivated explanation method for classifiers leverages
+counterfactuals, which are hypothetical events identical to real observations
+in all aspects except for one categorical feature. Constructing such
+counterfactuals poses specific challenges for text, however, as some attribute
+values may not necessarily align with plausible real-world events. In this
+paper we propose a simple method for generating counterfactuals by intervening
+in the space of text representations, which bypasses this limitation. We argue
+that our interventions are minimally disruptive and that they are
+theoretically sound, as they align with counterfactuals as defined in Pearl's
+causal inference framework. To validate our method, we first conduct
+experiments on a synthetic dataset of counterfactuals, allowing for a direct
+comparison between classifier predictions based on ground truth
+counterfactuals (obtained through explicit text interventions) and our
+counterfactuals, derived through interventions in the representation space.
+Second, we study a real-world scenario where our counterfactuals can be
+leveraged both for explaining a classifier and for bias mitigation.
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Non-Exchangeable Conformal Language Generation with Nearest Neighbors + + +
+ Quantifying uncertainty in automatically generated text is important for +letting humans check potential hallucinations and making systems more reliable. +Conformal prediction is an attractive framework to provide predictions imbued +with statistical guarantees, however, its application to text generation is +challenging since any i.i.d. assumptions are not realistic. In this paper, we +bridge this gap by leveraging recent results on non-exchangeable conformal +prediction, which still ensures bounds on coverage. The result, +non-exchangeable conformal nucleus sampling, is a novel extension of the +conformal prediction framework to generation based on nearest neighbors. Our +method can be used post-hoc for an arbitrary model without extra training and +supplies token-level, calibrated prediction sets equipped with statistical +guarantees. Experiments in machine translation and language modeling show +encouraging results in generation quality. By also producing tighter prediction +sets with good coverage, we thus give a more theoretically principled way to +perform sampling with conformal guarantees. + +
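+ For concreteness, here is a sketch of the weighted-quantile step that
+non-exchangeable conformal prediction (in the style of Barber et al.) builds
+on, applied to token-level sets. The nonconformity score (-log p), the weight
+scheme, and the clamping at the largest calibration score are illustrative
+assumptions, not the paper's exact recipe.
+
+    import numpy as np
+
+    def weighted_quantile_threshold(scores, weights, alpha=0.1):
+        """Smallest calibration score q whose normalized cumulative weight
+        reaches 1 - alpha; the +1 accounts for the test point's own weight."""
+        order = np.argsort(scores)
+        s, w = np.asarray(scores, float)[order], np.asarray(weights, float)[order]
+        cdf = np.cumsum(w) / (np.sum(w) + 1.0)
+        idx = np.searchsorted(cdf, 1 - alpha)
+        return s[min(idx, len(s) - 1)]       # clamp: conservative fallback
+
+    def conformal_token_set(token_probs, calib_scores, calib_weights, alpha=0.1):
+        """Keep all tokens whose nonconformity (-log p) clears the threshold."""
+        q = weighted_quantile_threshold(calib_scores, calib_weights, alpha)
+        return {t: p for t, p in token_probs.items() if -np.log(p) <= q}
+
+    calib = np.random.default_rng(0).uniform(0.5, 5.0, size=100)
+    wts = np.exp(-np.linspace(0.0, 3.0, 100))  # nearer neighbors weigh more
+    print(conformal_token_set({"the": 0.5, "cat": 0.3, "zyx": 1e-4}, calib, wts))
+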
+
+
+
+
+ + ☆ Combining the Strengths of Dutch Survey and Register Data in a Data + Challenge to Predict Fertility (PreFer) + + +
+ The social sciences have produced an impressive body of research on +determinants of fertility outcomes, or whether and when people have children. +However, the strength of these determinants and underlying theories are rarely +evaluated on their predictive ability on new data. This prevents us from +systematically comparing studies, hindering the evaluation and accumulation of +knowledge. In this paper, we present two datasets which can be used to study +the predictability of fertility outcomes in the Netherlands. One dataset is +based on the LISS panel, a longitudinal survey which includes thousands of +variables on a wide range of topics, including individual preferences and +values. The other is based on the Dutch register data which lacks attitudinal +data but includes detailed information about the life courses of millions of +Dutch residents. We provide information about the datasets and the samples, and +describe the fertility outcome of interest. We also introduce the fertility +prediction data challenge PreFer which is based on these datasets and will +start in Spring 2024. We outline the ways in which measuring the predictability +of fertility outcomes using these datasets and combining their strengths in the +data challenge can advance our understanding of fertility behaviour and +computational social science. We further provide details for participants on +how to take part in the data challenge. + +
+
+
+
+
+ + ☆ PeaTMOSS: A Dataset and Initial Analysis of Pre-Trained Models in + Open-Source Software + + +
+ The development and training of deep learning models have become increasingly +costly and complex. Consequently, software engineers are adopting pre-trained +models (PTMs) for their downstream applications. The dynamics of the PTM supply +chain remain largely unexplored, signaling a clear need for structured datasets +that document not only the metadata but also the subsequent applications of +these models. Without such data, the MSR community cannot comprehensively +understand the impact of PTM adoption and reuse. This paper presents the +PeaTMOSS dataset, which comprises metadata for 281,638 PTMs and detailed +snapshots for all PTMs with over 50 monthly downloads (14,296 PTMs), along with +28,575 open-source software repositories from GitHub that utilize these models. +Additionally, the dataset includes 44,337 mappings from 15,129 downstream +GitHub repositories to the 2,530 PTMs they use. To enhance the dataset's +comprehensiveness, we developed prompts for a large language model to +automatically extract model metadata, including the model's training datasets, +parameters, and evaluation metrics. Our analysis of this dataset provides the +first summary statistics for the PTM supply chain, showing the trend of PTM +development and common shortcomings of PTM package documentation. Our example +application reveals inconsistencies in software licenses across PTMs and their +dependent projects. PeaTMOSS lays the foundation for future research, offering +rich opportunities to investigate the PTM supply chain. We outline mining +opportunities on PTMs, their downstream usage, and cross-cutting questions. + +
+
+ comment: Accepted at MSR'24 +
+
+
+
+
+ + ☆ Approximating Optimal Morphing Attacks using Template Inversion + + +
+ Recent works have demonstrated the feasibility of inverting face recognition
+systems, enabling the recovery of convincing face images using only their
+embeddings. We leverage such template inversion models to develop a novel type
+of deep morphing attack based on inverting a theoretical optimal morph
+embedding, which is obtained as an average of the face embeddings of source
+images. We experiment with two variants of this approach: the first one
+exploits a fully self-contained embedding-to-image inversion model, while the
+second leverages the synthesis network of a pretrained StyleGAN network for
+increased morph realism. We generate morphing attacks from several source
+datasets and study the effectiveness of those attacks against several face
+recognition networks. We showcase that our method can compete with and
+regularly beat the previous state of the art for deep-learning based morph
+generation in terms of effectiveness, both in white-box and black-box attack
+scenarios, and is additionally much faster to run. We hope this might
+facilitate the development of large-scale deep morph datasets for training
+detection models.
+
+
+ comment: Published at the IEEE International Joint Conference on Biometrics + (IJCB) 2023 +
+
+
+
+
+ + ☆ Real Evaluations Tractability using Continuous Goal-Directed Actions in + Smart City Applications + + +
+ One of the most important challenges of Smart City Applications is to adapt
+the system to interact with non-expert users. Robot imitation frameworks aim
+to simplify robot programming and reduce programming time by allowing users to
+program directly through demonstrations. In classical frameworks, actions are
+modeled using joint or Cartesian space trajectories. Other features, such as
+visual ones, are not always well represented with these pure geometrical
+approaches. Continuous Goal-Directed Actions (CGDA) is an alternative to these
+methods, as it encodes actions as changes of any feature that can be extracted
+from the environment. As a consequence of this, the robot joint trajectories
+for execution must be fully computed to comply with this feature-agnostic
+encoding. This is achieved using Evolutionary Algorithms (EA), which usually
+require too many evaluations to perform this evolution step in the actual
+robot. Current strategies involve performing evaluations in a simulation,
+transferring the final joint trajectory to the actual robot. Smart City
+applications involve working in highly dynamic and complex environments, where
+having a precise model is not always achievable. Our goal is to study the
+tractability of performing these evaluations directly in a real-world
+scenario. Two different approaches to reduce the number of evaluations using
+EA are proposed and compared. In the first approach, Particle Swarm
+Optimization (PSO)-based methods have been studied and compared within CGDA:
+naive PSO, Fitness Inheritance PSO (FI-PSO), and Adaptive Fuzzy Fitness
+Granulation with PSO (AFFG-PSO). The second approach studies the introduction
+of geometrical and velocity constraints within CGDA. The effects of both
+approaches were analyzed and compared in the wax and paint actions, two
+commonly studied CGDA use cases. Results show a substantial reduction in the
+number of evaluations.
+
+
+
+
+
+ + ☆ Neural Policy Style Transfer + + +
+ Style Transfer has been proposed in a number of fields: fine arts, natural
+language processing, and fixed trajectories. We scale this concept up to
+control policies within a Deep Reinforcement Learning infrastructure. Each
+network is trained to maximize the expected reward, which typically encodes
+the goal of an action, and can be described as the content. The expressive
+power of deep neural networks enables encoding a secondary task, which can be
+described as the style. The Neural Policy Style Transfer (NPST) algorithm is
+proposed to transfer the style of one policy to another, while maintaining the
+content of the latter. Different policies are defined via Deep Q-Network
+architectures. These models are trained using demonstrations through Inverse
+Reinforcement Learning. Two different sets of user demonstrations are
+performed, one for content and the other for style. Different styles are
+encoded as defined by user demonstrations. The generated policy is the result
+of feeding a content policy and a style policy to the NPST algorithm.
+Experiments are performed in a catch-ball game inspired by the classical Atari
+games used in Deep Reinforcement Learning, and in a real-world painting
+scenario with a full-sized humanoid robot, based on previous works of the
+authors. The implementation of three different Q-Network architectures
+(Shallow, Deep and Deep Recurrent Q-Network) to encode the policies within the
+NPST framework is proposed, and the results obtained in the experiments with
+each of these architectures are compared.
+
+
+
+
+
+ + ☆ Deep Robot Sketching: An application of Deep Q-Learning Networks for + human-like sketching + + +
+ The recent success of Reinforcement Learning algorithms in complex
+environments has inspired many recent theoretical approaches to cognitive
+science. Artistic environments are studied within the cognitive science
+community as rich, natural, multi-sensory, multi-cultural environments. In
+this work, we propose the introduction of Reinforcement Learning for improving
+the control of artistic robot applications. Deep Q-learning Neural Networks
+(DQN) are among the most successful algorithms for the implementation of
+Reinforcement Learning in robotics. DQN methods generate complex control
+policies for the execution of complex robot applications in a wide set of
+environments. Current art painting robot applications use simple control laws
+that limit the adaptability of the frameworks to a set of simple environments.
+In this work, the introduction of DQN within an art painting robot application
+is proposed. The goal is to study how the introduction of a complex control
+policy impacts the performance of a basic art painting robot application. The
+main expected contribution of this work is to serve as a first baseline for
+future works introducing DQN methods for complex art painting robot
+frameworks. Experiments consist of real-world executions of human-drawn
+sketches using the DQN-generated policy and TEO, the humanoid robot. Results
+are compared in terms of similarity and obtained reward with respect to the
+reference inputs.
+
+
+
+
+
+ + ☆ Modeling Freight Mode Choice Using Machine Learning Classifiers: A + Comparative Study Using the Commodity Flow Survey (CFS) Data + + +
+ This study explores the usefulness of machine learning classifiers for +modeling freight mode choice. We investigate eight commonly used machine +learning classifiers, namely Naive Bayes, Support Vector Machine, Artificial +Neural Network, K-Nearest Neighbors, Classification and Regression Tree, Random +Forest, Boosting and Bagging, along with the classical Multinomial Logit model. +US 2012 Commodity Flow Survey data are used as the primary data source; we +augment it with spatial attributes from secondary data sources. The performance +of the classifiers is compared based on prediction accuracy results. The +current research also examines the role of sample size and training-testing +data split ratios on the predictive ability of the various approaches. In +addition, the importance of variables is estimated to determine how the +variables influence freight mode choice. The results show that the tree-based +ensemble classifiers perform the best. Specifically, Random Forest produces the +most accurate predictions, closely followed by Boosting and Bagging. With +regard to variable importance, shipment characteristics, such as shipment +distance, industry classification of the shipper and shipment size, are the +most significant factors for freight mode choice decisions. + +
+
+
+
+
+ + ☆ Improving the accuracy of freight mode choice models: A case study using + the 2017 CFS PUF data set and ensemble learning techniques + + +
+ The US Census Bureau has collected two rounds of experimental data from the
+Commodity Flow Survey, providing shipment-level characteristics of nationwide
+commodity movements, published in 2012 (i.e., Public Use Microdata) and in 2017
+(i.e., Public Use File). With this information, data-driven methods have become
+increasingly valuable for understanding detailed patterns in freight logistics.
+In this study, we used the 2017 Commodity Flow Survey Public Use File data set
+to explore building a high-performance freight mode choice model, considering
+three main improvements: (1) constructing local models for each separate
+commodity/industry category; (2) extracting useful geographical features,
+particularly the derived distance of each freight mode between
+origin/destination zones; and (3) applying additional ensemble learning methods
+such as stacking or voting to combine results from local and unified models for
+improved performance. The proposed method achieved over 92% accuracy without
+incorporating external information, a more than 19% increase compared to
+directly fitting Random Forest models on 10,000 samples. Furthermore, SHAP
+(SHapley Additive exPlanations) values were computed to explain the outputs and
+major patterns obtained from the proposed model. The model framework could
+enhance the performance and interpretability of existing freight mode choice
+models.
+
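+ As an illustration of improvement (3), the sketch below wires two base
+learners into a stacked ensemble with scikit-learn. The data, the choice of
+base models, and the meta-learner are placeholders; in the study, the "local"
+models are fit per commodity/industry category rather than on a shared
+synthetic dataset as here.
+
+    from sklearn.datasets import make_classification
+    from sklearn.ensemble import RandomForestClassifier, StackingClassifier
+    from sklearn.linear_model import LogisticRegression
+
+    X, y = make_classification(n_samples=500, n_features=10, n_classes=3,
+                               n_informative=6, random_state=0)
+
+    # Base learners (e.g. a unified model plus a category-specific model),
+    # combined by a logistic-regression meta-learner via cross-validation.
+    stack = StackingClassifier(
+        estimators=[("unified", RandomForestClassifier(random_state=0)),
+                    ("local", RandomForestClassifier(max_depth=5,
+                                                     random_state=1))],
+        final_estimator=LogisticRegression(max_iter=1000),
+        cv=5)
+    stack.fit(X, y)
+    print(stack.score(X, y))
+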
+
+
+
+
+ + ☆ Coherent Feed Forward Quantum Neural Network + + +
+ Quantum machine learning, focusing on quantum neural networks (QNNs), remains
+a vastly uncharted field of study. Current QNN models primarily employ
+variational circuits on an ansatz or a quantum feature map, often requiring
+multiple entanglement layers. This methodology not only increases the
+computational cost of the circuit beyond what is practical on near-term quantum
+devices but also misleadingly labels these models as neural networks, given
+their divergence from the structure of a typical feed-forward neural network
+(FFNN). Moreover, the circuit depth and qubit needs of these models scale
+poorly with the number of data features, resulting in an efficiency challenge
+for real-world machine-learning tasks. We introduce a bona fide QNN model,
+which seamlessly aligns with the versatility of a traditional FFNN in terms of
+its adaptable intermediate layers and nodes, and which is free of intermediate
+measurements, such that our entire model is coherent. This model stands out
+for its reduced circuit depth and number of requisite C-NOT gates,
+outperforming prevailing QNN models. Furthermore, the qubit count in our model
+remains unaffected by the data's feature quantity. We test our proposed model
+on various benchmarking datasets such as the diagnostic breast cancer
+(Wisconsin) and credit card fraud detection datasets. We compare the outcomes
+of our model with the existing QNN methods to showcase the advantageous
+efficacy of our approach, even with a reduced requirement on quantum
+resources. Our model paves the way for the application of quantum neural
+networks to relevant real-world machine learning problems.
+
+
+ comment: 11 pages, 7 figures. Comments welcome! +
+
+
+
+
+ + ☆ Spectrally Transformed Kernel Regression ICLR 2024 + + +
+ Unlabeled data is a key component of modern machine learning. In general, the +role of unlabeled data is to impose a form of smoothness, usually from the +similarity information encoded in a base kernel, such as the +$\epsilon$-neighbor kernel or the adjacency matrix of a graph. This work +revisits the classical idea of spectrally transformed kernel regression (STKR), +and provides a new class of general and scalable STKR estimators able to +leverage unlabeled data. Intuitively, via spectral transformation, STKR +exploits the data distribution for which unlabeled data can provide additional +information. First, we show that STKR is a principled and general approach, by +characterizing a universal type of "target smoothness", and proving that any +sufficiently smooth function can be learned by STKR. Second, we provide +scalable STKR implementations for the inductive setting and a general +transformation function, while prior work is mostly limited to the transductive +setting. Third, we derive statistical guarantees for two scenarios: STKR with a +known polynomial transformation, and STKR with kernel PCA when the +transformation is unknown. Overall, we believe that this work helps deepen our +understanding of how to work with unlabeled data, and its generality makes it +easier to inspire new methods. + +
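+ For reference, the classical construction behind STKR: given a base kernel
+with spectral decomposition, a transformation $s(\cdot)$ reshapes the
+spectrum, and regression is performed with the transformed kernel. The
+notation below is the generic textbook form, not lifted from the paper:
+
+    k(x, x') = \sum_{i} \lambda_i \, \psi_i(x) \, \psi_i(x'), \qquad
+    k_s(x, x') = \sum_{i} s(\lambda_i) \, \psi_i(x) \, \psi_i(x'),
+
+    \hat{f} = \arg\min_{f \in \mathcal{H}_{k_s}}
+        \frac{1}{n} \sum_{j=1}^{n} \big( f(x_j) - y_j \big)^2
+        + \beta \, \| f \|_{\mathcal{H}_{k_s}}^2 .
+
+ Unlabeled data enters through the eigenfunctions $\psi_i$ (estimated from the
+data distribution), while $s(\lambda_i)$ encodes the target smoothness.
+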
+
+ comment: ICLR 2024 spotlight. 36 pages +
+
+
+
+
+ + ☆ Random Forest-Based Prediction of Stroke Outcome + + +
+ We investigate the clinical, biochemical, and neuroimaging factors associated
+with the outcome of stroke patients in order to generate a predictive model,
+using machine learning techniques, for the prediction of mortality and
+morbidity 3 months after admission. The dataset consisted of prospectively
+registered patients with ischemic stroke (IS) and non-traumatic intracerebral
+hemorrhage (ICH) admitted to the Stroke Unit of a European tertiary hospital.
+We identified the main variables for the Random Forest (RF) machine learning
+algorithm, generating a predictive model that can estimate patient mortality
+and morbidity. In conclusion, the RF machine learning algorithm can be used
+effectively in stroke patients for long-term outcome prediction of mortality
+and morbidity.
+
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Vision-LLMs Can Fool Themselves with Self-Generated Typographic Attacks + + +
+ Recently, significant progress has been made on Large Vision-Language Models
+(LVLMs), a new class of VL models that make use of large pre-trained language
+models. Yet, their vulnerability to typographic attacks, which involve
+superimposing misleading text onto an image, remains unstudied. Furthermore,
+prior typographic attacks rely on sampling a random misleading class from a
+predefined set of classes; the randomly chosen class, however, might not yield
+the most effective attack. To address these issues, we first introduce a novel
+benchmark uniquely designed to test LVLMs' vulnerability to typographic
+attacks. Furthermore, we introduce a new and more effective typographic
+attack: Self-Generated typographic attacks. Given an image, our method makes
+use of the strong language capabilities of models like GPT-4V by simply
+prompting them to recommend a typographic attack. Using our novel benchmark,
+we uncover that typographic attacks represent a significant threat against
+LVLMs. Furthermore, we uncover that typographic attacks recommended by GPT-4V
+using our new method are not only more effective against GPT-4V itself
+compared to prior work attacks, but also against a host of less capable yet
+popular open-source models like LLaVA, InstructBLIP, and MiniGPT4.
+
+
+
+
+
+ + ☆ Bayesian Causal Inference with Gaussian Process Networks + + +
+ Causal discovery and inference from observational data is an essential +problem in statistics posing both modeling and computational challenges. These +are typically addressed by imposing strict assumptions on the joint +distribution such as linearity. We consider the problem of the Bayesian +estimation of the effects of hypothetical interventions in the Gaussian Process +Network (GPN) model, a flexible causal framework which allows describing the +causal relationships nonparametrically. We detail how to perform causal +inference on GPNs by simulating the effect of an intervention across the whole +network and propagating the effect of the intervention on downstream variables. +We further derive a simpler computational approximation by estimating the +intervention distribution as a function of local variables only, modeling the +conditional distributions via additive Gaussian processes. We extend both +frameworks beyond the case of a known causal graph, incorporating uncertainty +about the causal structure via Markov chain Monte Carlo methods. Simulation +studies show that our approach is able to identify the effects of hypothetical +interventions with non-Gaussian, non-linear observational data and accurately +reflect the posterior uncertainty of the causal estimates. Finally we compare +the results of our GPN-based causal inference approach to existing methods on a +dataset of $A.~thaliana$ gene expressions. + +
+
+
+
+
+ + ☆ Deep Clustering Using the Soft Silhouette Score: Towards Compact and + Well-Separated Clusters + + +
+ Unsupervised learning has gained prominence in the big data era, offering a
+means to extract valuable insights from unlabeled datasets. Deep clustering has
+emerged as an important unsupervised category, aiming to exploit the non-linear
+mapping capabilities of neural networks in order to enhance clustering
+performance. The majority of the deep clustering literature focuses on
+minimizing intra-cluster variability in some embedded space while keeping the
+learned representation consistent with the original high-dimensional dataset.
+In this work, we propose soft silhouette, a probabilistic formulation of the
+silhouette coefficient. Soft silhouette rewards compact and distinctly
+separated clustering solutions like the conventional silhouette coefficient.
+When optimized within a deep clustering framework, soft silhouette guides the
+learned representations towards forming compact and well-separated clusters.
+In addition, we introduce an autoencoder-based deep learning architecture that
+is suitable for optimizing the soft silhouette objective function. The
+proposed deep clustering method has been tested and compared with several
+well-studied deep clustering methods on various benchmark datasets, yielding
+very satisfactory clustering results.
+
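+ To fix ideas, below is a sketch of one plausible probabilistic relaxation of
+the silhouette coefficient, replacing hard assignments with membership
+probabilities; the paper's exact formulation may differ, so treat this purely
+as an illustration of the concept.
+
+    import numpy as np
+
+    def soft_silhouette(D, P, eps=1e-12):
+        """D: (n, n) pairwise distances; P: (n, K) soft memberships (rows sum
+        to 1). d[i, k] is the membership-weighted mean distance from point i
+        to cluster k; per-cluster silhouette terms are averaged under P."""
+        d = (D @ P) / (P.sum(axis=0, keepdims=True) + eps)   # (n, K)
+        n, K = P.shape
+        s = np.zeros(n)
+        for i in range(n):
+            for k in range(K):
+                b = np.min(np.delete(d[i], k))   # nearest competing cluster
+                a = d[i, k]
+                s[i] += P[i, k] * (b - a) / max(a, b, eps)
+        return s.mean()                          # in [-1, 1]; higher is better
+
+    rng = np.random.default_rng(0)
+    X = np.vstack([rng.normal(0, .3, (20, 2)), rng.normal(3, .3, (20, 2))])
+    D = np.linalg.norm(X[:, None] - X[None], axis=-1)
+    P = np.repeat(np.eye(2), 20, axis=0)         # hard memberships as a check
+    print(soft_silhouette(D, P))                 # close to 1 for separated blobs
+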
+
+
+
+
+ + ☆ Are Synthetic Time-series Data Really not as Good as Real Data? + + +
+ Time-series data presents limitations stemming from data quality issues,
+bias and vulnerabilities, and generalization problems. Integrating universal
+data synthesis methods holds promise in improving generalization. However,
+current methods cannot guarantee that the generator's output covers all unseen
+real data. In this paper, we introduce InfoBoost -- a highly versatile
+cross-domain data synthesizing framework with time series representation
+learning capability. We have developed a method based on synthetic data that
+enables model training without the need for real data, surpassing the
+performance of models trained with real data. Additionally, we have trained a
+universal feature extractor based on our synthetic data that is applicable to
+all time-series data. Our approach overcomes interference from multiple
+rhythmic signal sources, noise interference, and long-period features that
+exceed the sampling window capabilities. Through experiments, our
+non-deep-learning synthetic data enables models to achieve superior
+reconstruction performance and universal explicit representation extraction
+without the need for real data.
+
+
+
+
+
+ + ☆ Uncertainty-Aware Partial-Label Learning + + +
+ In real-world applications, one often encounters ambiguously labeled data, +where different annotators assign conflicting class labels. Partial-label +learning allows training classifiers in this weakly supervised setting. While +state-of-the-art methods already feature good predictive performance, they +often suffer from miscalibrated uncertainty estimates. However, having +well-calibrated uncertainty estimates is important, especially in +safety-critical domains like medicine and autonomous driving. In this article, +we propose a novel nearest-neighbor-based partial-label-learning algorithm that +leverages Dempster-Shafer theory. Extensive experiments on artificial and +real-world datasets show that the proposed method provides a well-calibrated +uncertainty estimate and achieves competitive prediction performance. +Additionally, we prove that our algorithm is risk-consistent. + +
+
+
+
+
+ + ☆ Tropical Decision Boundaries for Neural Networks Are Robust Against + Adversarial Attacks + + +
+ We introduce a simple, easy to implement, and computationally efficient +tropical convolutional neural network architecture that is robust against +adversarial attacks. We exploit the tropical nature of piece-wise linear neural +networks by embedding the data in the tropical projective torus in a single +hidden layer which can be added to any model. We study the geometry of its +decision boundary theoretically and show its robustness against adversarial +attacks on image datasets using computational experiments. + +
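+ As a concrete reference point, the sketch below implements one common form of
+tropical (max-plus) layer over the tropical projective torus; the paper's
+exact architecture and embedding may differ.
+
+    import numpy as np
+
+    def tropical_layer(x, W):
+        """Max-plus 'matrix product': out_j = max_i (x_i + W[i, j]). Addition
+        replaces multiplication and max replaces addition, so the layer is
+        piecewise linear with tropical-polynomial decision boundaries."""
+        return np.max(x[:, None] + W, axis=0)
+
+    def project_torus(x):
+        """Canonical representative on the tropical projective torus
+        (coordinates modulo adding a constant to all of them)."""
+        return x - np.max(x)
+
+    x = project_torus(np.array([0.2, 1.5, -0.7]))
+    W = np.zeros((3, 4))                   # 3 inputs -> 4 tropical units
+    print(tropical_layer(x, W))            # hidden-layer activations
+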
+
+
+
+
+ + ☆ Secure Supervised Learning-Based Smart Home Authentication Framework + + +
+ Smart homes can facilitate home services for their users thanks to the
+systematic advances in the Internet of Things (IoT) and information and
+communication technologies (ICT) in recent decades. The home services offered
+by smart devices provide users a maximized level of comfort, improving their
+quality of life. As the user and smart devices communicate through an insecure
+channel, however, the smart home environment is prone to security and privacy
+problems. A secure authentication protocol needs to be established between the
+smart devices and the user to make device authentication feasible in smart
+home environments. Most existing smart home authentication protocols fail to
+facilitate secure mutual authentication and increase the possibility of
+session key disclosure, impersonation, and stolen smart device attacks. In
+this paper, a Secure Supervised Learning-based Smart Home Authentication
+Framework (SSL-SHAF) is proposed as a reliable mutual authentication scheme
+that can be contextually imposed for better security. The formal analysis of
+the proposed SSL-SHAF confirmed better resistance against session key
+disclosure, impersonation, and stolen smart device attacks. The results of
+SSL-SHAF confirmed minimized computational costs and improved security
+compared to the baseline protocols considered for investigation.
+
+
+
+
+
+ + ☆ A Single Graph Convolution Is All You Need: Efficient Grayscale Image + Classification + + +
+ Image classifiers often rely on convolutional neural networks (CNNs), which
+are inherently heavier-weight than multilayer perceptrons (MLPs); this can be
+problematic in real-time applications. Additionally, many image classification
+models work on both RGB and grayscale datasets. Classifiers that operate solely
+on grayscale images are much less common. Grayscale image classification has
+diverse applications, including but not limited to medical image
+classification and synthetic aperture radar (SAR) automatic target recognition
+(ATR). Thus, we present a novel grayscale (single channel) image
+classification approach using a vectorized view of images. We exploit the
+lightweightness of MLPs by viewing each image as a vector and reducing our
+problem setting to the grayscale image classification setting. We find that
+using a single graph convolutional layer batch-wise increases accuracy and
+reduces variance in the performance of our model. Moreover, we develop a
+customized accelerator on FPGA for the proposed model with several
+optimizations to improve its performance. Our experimental results on
+benchmark grayscale image datasets demonstrate the effectiveness of the
+proposed model, achieving vastly lower latency (up to 16$\times$ less) and
+competitive or leading performance compared to other state-of-the-art image
+classification models on various domain-specific grayscale image
+classification datasets.
+
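+ A sketch of the idea as described above: view each grayscale image as a flat
+vector, build a similarity graph over the samples within a batch, and apply a
+single graph convolution before an MLP head. The cosine-similarity kNN graph
+construction here is an assumption, not necessarily the paper's.
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class SingleGCNGrayscale(nn.Module):
+        def __init__(self, in_dim=28 * 28, hidden=128, n_classes=10, k=4):
+            super().__init__()
+            self.k = k
+            self.gc = nn.Linear(in_dim, hidden)   # the one graph conv's weights
+            self.head = nn.Linear(hidden, n_classes)
+
+        def forward(self, x):                     # x: (batch, in_dim) flat images
+            xn = F.normalize(x, dim=1)
+            sim = xn @ xn.T                       # cosine similarity in the batch
+            topk = sim.topk(self.k, dim=1).indices
+            A = torch.zeros_like(sim).scatter_(1, topk, 1.0)
+            A = (A + A.T).clamp(max=1) + torch.eye(x.size(0))  # symm. + self-loops
+            A = A / A.sum(dim=1, keepdim=True)    # row-normalized adjacency
+            h = F.relu(self.gc(A @ x))            # the single graph convolution
+            return self.head(h)
+
+    logits = SingleGCNGrayscale()(torch.randn(32, 28 * 28))  # a batch of 32
+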
+
+ comment: 6 pages of content, 1 page of references +
+
+
+
+
+ + ☆ Quantum-Assisted Hilbert-Space Gaussian Process Regression + + +
+ Gaussian processes are probabilistic models that are commonly used as +functional priors in machine learning. Due to their probabilistic nature, they +can be used to capture the prior information on the statistics of noise, +smoothness of the functions, and training data uncertainty. However, their +computational complexity quickly becomes intractable as the size of the data +set grows. We propose a Hilbert space approximation-based quantum algorithm for +Gaussian process regression to overcome this limitation. Our method consists of +a combination of classical basis function expansion with quantum computing +techniques of quantum principal component analysis, conditional rotations, and +Hadamard and Swap tests. The quantum principal component analysis is used to +estimate the eigenvalues while the conditional rotations and the Hadamard and +Swap tests are employed to evaluate the posterior mean and variance of the +Gaussian process. Our method provides polynomial computational complexity +reduction over the classical method. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ A Manifold Representation of the Key in Vision Transformers + + +
+ Vision Transformers implement multi-head self-attention (MSA) via stacking +multiple attention blocks. The query, key, and value are often intertwined and +generated within those blocks via a single, shared linear transformation. This +paper explores the concept of disentangling the key from the query and value, +and adopting a manifold representation for the key. Our experiments reveal that +decoupling and endowing the key with a manifold structure can enhance the model +performance. Specifically, ViT-B exhibits a 0.87% increase in top-1 accuracy, +while Swin-T sees a boost of 0.52% in top-1 accuracy on the ImageNet-1K +dataset, with eight charts in the manifold key. Our approach also yields +positive results in object detection and instance segmentation tasks on the +COCO dataset. Through detailed ablation studies, we establish that these +performance gains are not merely due to the simplicity of adding more +parameters and computations. Future research may investigate strategies for +cutting the budget of such representations and aim for further performance +improvements based on our findings. + +
+
+
+
+
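One way to picture "decoupling the key and giving it a multi-chart representation" is a set of parallel key projections mixed per token. The sketch below is our illustrative reading of that idea, not the paper's implementation; the router, chart count, and dimensions are assumptions.

```python
# Illustrative sketch of decoupling the key from the shared QKV projection
# and giving it a multi-chart ("manifold") representation: several parallel
# key projections are mixed per token. Hypothetical, not the paper's code.
import torch
import torch.nn as nn

class ManifoldKeyAttention(nn.Module):
    def __init__(self, dim=384, heads=6, charts=8):
        super().__init__()
        self.qv = nn.Linear(dim, 2 * dim)        # query/value stay shared
        self.key_charts = nn.ModuleList(
            [nn.Linear(dim, dim) for _ in range(charts)])
        self.router = nn.Linear(dim, charts)     # per-token chart weights
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x):                        # x: (B, N, dim)
        q, v = self.qv(x).chunk(2, dim=-1)
        w = self.router(x).softmax(dim=-1)       # (B, N, charts)
        ks = torch.stack([proj(x) for proj in self.key_charts], dim=-1)
        k = (ks * w.unsqueeze(-2)).sum(-1)       # mix charts per token
        out, _ = self.attn(q, k, v)
        return out

print(ManifoldKeyAttention()(torch.randn(2, 16, 384)).shape)
```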
+ + ☆ Preconditioning for Physics-Informed Neural Networks + + +
+ Physics-informed neural networks (PINNs) have shown promise in solving
+various partial differential equations (PDEs). However, training pathologies
+have negatively affected the convergence and prediction accuracy of PINNs,
+which further limits their practical applications. In this paper, we propose
+using the condition number as a metric to diagnose and mitigate the
+pathologies in PINNs. Inspired by classical numerical analysis, where the
+condition number measures sensitivity and stability, we highlight its pivotal
+role in the training dynamics of PINNs. We prove theorems revealing how the
+condition number relates to both the error control and the convergence of
+PINNs. Subsequently, we present an algorithm that leverages preconditioning to
+improve the condition number. Evaluations on 18 PDE problems showcase the
+superior performance of our method. Significantly, in 7 of these problems, our
+method reduces errors by an order of magnitude. These empirical findings
+verify the critical role of the condition number in the training of PINNs.
+
&#13;
+
+
+
+
+ + ☆ Understanding the Expressive Power and Mechanisms of Transformer for + Sequence Modeling + + +
+ We conduct a systematic study of the approximation properties of the
+Transformer for sequence modeling with long, sparse and complicated memory. We
+investigate the mechanisms through which different components of the
+Transformer, such as the dot-product self-attention, positional encoding and
+feed-forward layer, affect its expressive power, and we study their combined
+effects by establishing explicit approximation rates. Our study reveals the
+roles of critical parameters in the Transformer, such as the number of layers
+and the number of attention heads, and these insights also provide natural
+suggestions for alternative architectures.
+
&#13;
+
+ comment: 65 pages +
+
+
+
+
+ + ☆ EE-Tuning: An Economical yet Scalable Solution for Tuning Early-Exit + Large Language Models + + +
+ This work introduces EE-Tuning, a lightweight and economical solution to
+training/tuning early-exit large language models (LLMs). In contrast to the
+common approach of full-parameter pre-training, EE-Tuning augments any
+pre-trained (and possibly fine-tuned) standard LLM with additional early-exit
+layers that are tuned in a parameter-efficient manner, which requires
+significantly fewer computational resources and less training data. Our
+implementation of EE-Tuning achieves outstanding training efficiency via
+extensive performance optimizations, as well as scalability due to its full
+compatibility with 3D parallelism. Results of systematic experiments validate
+the efficacy of EE-Tuning, confirming that effective early-exit LLM inference
+can be achieved with a limited training budget. In the hope of making
+early-exit LLMs accessible to the community, we release the source code of our
+implementation of EE-Tuning at https://github.com/pan-x-c/EE-LLM.
+
&#13;
+
+
+
+
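The core recipe, attaching small early-exit heads to a frozen pre-trained backbone and tuning only the heads, can be sketched as follows. The exit positions, head design, and the toy backbone are our assumptions, not the EE-LLM implementation.

```python
# Minimal sketch of the EE-Tuning idea: freeze a pre-trained LM backbone and
# attach small early-exit heads at intermediate layers, training only the
# heads. Layer indices and the stand-in backbone are illustrative assumptions.
import torch
import torch.nn as nn

class EarlyExitWrapper(nn.Module):
    def __init__(self, backbone_layers, hidden, vocab, exit_at=(8, 16)):
        super().__init__()
        self.layers = backbone_layers
        for p in self.layers.parameters():
            p.requires_grad = False              # backbone stays frozen
        self.exit_at = set(exit_at)
        self.exit_heads = nn.ModuleDict(
            {str(i): nn.Linear(hidden, vocab) for i in exit_at})

    def forward(self, h):
        exit_logits = {}
        for i, layer in enumerate(self.layers):
            h = layer(h)
            if i in self.exit_at:                # tunable early-exit head
                exit_logits[i] = self.exit_heads[str(i)](h)
        return h, exit_logits

layers = nn.ModuleList([nn.Linear(64, 64) for _ in range(24)])
model = EarlyExitWrapper(layers, hidden=64, vocab=100)
_, exits = model(torch.randn(2, 10, 64))
print(sorted(exits))  # [8, 16]
```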
+ + ☆ Developing A Multi-Agent and Self-Adaptive Framework with Deep + Reinforcement Learning for Dynamic Portfolio Risk Management + + +
+ In recent years, deep learning and reinforcement learning (RL) approaches
+have been adopted as reactive agents that quickly learn and respond with new
+investment strategies for portfolio management in highly turbulent financial
+market environments. In many cases, due to the very complex correlations among
+various financial sectors and the fluctuating trends in different financial
+markets, such an agent can be biased toward maximising the total returns of
+the newly formulated investment portfolio while neglecting its potential risks
+under turbulent market conditions in global or regional sectors. Accordingly,
+we propose MASA, a multi-agent and self-adaptive framework in which a
+sophisticated multi-agent RL approach is adopted through two cooperating and
+reactive agents to carefully and dynamically balance the trade-off between
+overall portfolio returns and their potential risks. In addition, a flexible
+and proactive market-observer agent is integrated into the MASA framework to
+provide estimates of market trends as valuable feedback, allowing the
+multi-agent RL approach to quickly adapt to ever-changing market conditions.
+The empirical results clearly reveal the potential strengths of the proposed
+MASA framework against many well-known RL-based approaches on the challenging
+data sets of the CSI 300, Dow Jones Industrial Average and S&P 500 indexes
+over the past 10 years. More importantly, the MASA framework sheds light on
+many possible directions for future investigation.
+
&#13;
+
+ comment: Accepted by The 23rd International Conference on Autonomous Agents + and Multi-Agent Systems +
+
+
+
+
+ + ☆ Equivalence of the Empirical Risk Minimization to Regularization on the + Family of f-Divergences + + +
+ The solution to empirical risk minimization with $f$-divergence
+regularization (ERM-$f$DR) is presented under mild conditions on $f$. Under
+such conditions, the optimal measure is shown to be unique. Examples of the
+solution for particular choices of the function $f$ are presented. Previously
+known solutions to common regularization choices are obtained by leveraging
+the flexibility of the family of $f$-divergences. These include the unique
+solutions to empirical risk minimization with relative entropy regularization
+(Type-I and Type-II). The analysis of the solution unveils the following
+properties of $f$-divergences when used in the ERM-$f$DR problem: $(i)$
+$f$-divergence regularization forces the support of the solution to coincide
+with the support of the reference measure, which introduces a strong inductive
+bias that dominates the evidence provided by the training data; and $(ii)$ any
+$f$-divergence regularization is equivalent to a different $f$-divergence
+regularization with an appropriate transformation of the empirical risk
+function.
+
&#13;
+
+ comment: Submitted to the IEEE Symposium in Information Theory 2024. arXiv + admin note: text overlap with arXiv:2306.07123 +
+
+
+
+
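For readers who want the objective in front of them, the standard form of the ERM-$f$DR problem reads as follows (notation assumed from the usual formulation, with reference measure $Q$ and regularization factor $\lambda$):

```latex
% Standard form of ERM with f-divergence regularization (notation assumed):
% reference measure Q, empirical risk \mathsf{L}, regularization factor \lambda > 0.
\min_{P}\; \mathbb{E}_{P}\!\left[\mathsf{L}(\theta)\right]
  \;+\; \lambda\, D_{f}\!\left(P \,\|\, Q\right),
\qquad
D_{f}(P\|Q) \;=\; \int f\!\left(\frac{\mathrm{d}P}{\mathrm{d}Q}\right)\mathrm{d}Q .
```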
+ + ☆ CPT: Competence-progressive Training Strategy for Few-shot Node + Classification + + +
+ Graph Neural Networks (GNNs) have made significant advancements in node
+classification, but their success relies on sufficient labeled nodes per class
+in the training data. Real-world graph data often exhibit a long-tail
+distribution with sparse labels, emphasizing the importance of GNNs' ability
+to perform few-shot node classification, i.e., categorizing nodes with limited
+labeled data. Traditional episodic meta-learning approaches have shown promise
+in this domain, but they face an inherent limitation: random and uniform task
+assignment, which ignores task difficulty levels, may lead the model to
+converge to suboptimal solutions and confront the meta-learner with complex
+tasks too soon, hindering proper learning. Ideally, the meta-learner should
+start with simple concepts and advance to more complex ones, much like human
+learning. We therefore introduce CPT, a novel two-stage curriculum learning
+method that aligns task difficulty with the meta-learner's progressive
+competence, enhancing overall performance. Specifically, CPT's initial stage
+focuses on simpler tasks, fostering foundational skills for engaging with
+complex tasks later. Importantly, the second stage dynamically adjusts task
+difficulty based on the meta-learner's growing competence, aiming for optimal
+knowledge acquisition. Extensive experiments on popular node classification
+datasets demonstrate significant improvements of our strategy over existing
+methods.
+
&#13;
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2206.11972 by + other authors +
+
+
+
+
+ + ☆ A Survey of Data-Efficient Graph Learning + + +
+ Graph-structured data, prevalent in domains ranging from social networks to
+biochemical analysis, serve as the foundation for diverse real-world systems.
+While graph neural networks demonstrate proficiency in modeling this type of
+data, their success often relies on significant amounts of labeled data,
+posing a challenge in practical scenarios with limited annotation resources.
+To tackle this problem, tremendous efforts have been devoted to enhancing
+graph machine learning performance under low-resource settings by exploring
+various approaches to minimal supervision. In this paper, we introduce the
+novel concept of Data-Efficient Graph Learning (DEGL) as a research frontier,
+and present the first survey that summarizes the current progress of DEGL. We
+begin by highlighting the challenges inherent in models that require large
+amounts of labeled data, paving the way for our exploration of DEGL. Next, we
+systematically review recent advances on this topic from several key aspects,
+including self-supervised graph learning, semi-supervised graph learning, and
+few-shot graph learning. Finally, we outline promising directions for future
+research, contributing to the evolution of graph machine learning.
+
&#13;
+
+
+
+
+ + ☆ A practical existence theorem for reduced order models based on + convolutional autoencoders + + +
+ In recent years, deep learning has gained increasing popularity in the fields +of Partial Differential Equations (PDEs) and Reduced Order Modeling (ROM), +providing domain practitioners with new powerful data-driven techniques such as +Physics-Informed Neural Networks (PINNs), Neural Operators, Deep Operator +Networks (DeepONets) and Deep-Learning based ROMs (DL-ROMs). In this context, +deep autoencoders based on Convolutional Neural Networks (CNNs) have proven +extremely effective, outperforming established techniques, such as the reduced +basis method, when dealing with complex nonlinear problems. However, despite +the empirical success of CNN-based autoencoders, there are only a few +theoretical results supporting these architectures, usually stated in the form +of universal approximation theorems. In particular, although the existing +literature provides users with guidelines for designing convolutional +autoencoders, the subsequent challenge of learning the latent features has been +barely investigated. Furthermore, many practical questions remain unanswered, +e.g., the number of snapshots needed for convergence or the neural network +training strategy. In this work, using recent techniques from sparse +high-dimensional function approximation, we fill some of these gaps by +providing a new practical existence theorem for CNN-based autoencoders when the +parameter-to-solution map is holomorphic. This regularity assumption arises in +many relevant classes of parametric PDEs, such as the parametric diffusion +equation, for which we discuss an explicit application of our general theory. + +
+
+
+
+
+ + ☆ Merging Multi-Task Models via Weight-Ensembling Mixture of Experts + + +
+ Merging various task-specific Transformer-based models trained on different
+tasks into a single unified model yields one model that can execute all the
+tasks concurrently. Previous methods, exemplified by task arithmetic, have
+been proven to be both effective and scalable. Existing methods have primarily
+focused on seeking a static optimal solution within the original model
+parameter space. A notable challenge is mitigating the interference between
+parameters of different models, which can substantially deteriorate
+performance. In this paper, we propose to merge most of the parameters while
+upscaling the MLP of the Transformer layers to a weight-ensembling mixture of
+experts (MoE) module, which can dynamically integrate shared and task-specific
+knowledge based on the input, thereby providing a more flexible solution that
+can adapt to the specific needs of each instance. Our key insight is that by
+identifying and separating shared knowledge and task-specific knowledge, and
+then dynamically integrating them, we can mitigate the parameter interference
+problem to a great extent. We conduct conventional multi-task model merging
+experiments and evaluate the generalization and robustness of our method. The
+results demonstrate the effectiveness of our method and provide a
+comprehensive understanding of it. The code is available at
+https://anonymous.4open.science/r/weight-ensembling_MoE-67C9/
+
&#13;
+
+
+
+
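One way to read "upscaling the MLP to a weight-ensembling MoE" is a router-weighted mixture of per-task weight deltas ("task vectors") on top of shared base weights. The single-layer sketch below is hypothetical; the shapes and the router are our assumptions, not the paper's code.

```python
# Sketch of a weight-ensembling MoE merge for one MLP layer: shared base
# weights plus a router-weighted mixture of per-task weight deltas.
# Shapes and the router design are assumptions for illustration.
import torch
import torch.nn as nn

class WeightEnsemblingMLP(nn.Module):
    def __init__(self, base_w, task_deltas):
        super().__init__()
        self.base_w = nn.Parameter(base_w)                  # merged/shared part
        self.deltas = nn.Parameter(torch.stack(task_deltas))  # (T, out, in)
        self.router = nn.Linear(base_w.shape[1], len(task_deltas))

    def forward(self, x):                                   # x: (B, in)
        gate = self.router(x).softmax(dim=-1)               # (B, T)
        w = self.base_w + torch.einsum("bt,toi->boi", gate, self.deltas)
        return torch.einsum("boi,bi->bo", w, x)             # per-input weights

base = torch.randn(32, 16)
deltas = [torch.randn(32, 16) * 0.01 for _ in range(3)]
print(WeightEnsemblingMLP(base, deltas)(torch.randn(4, 16)).shape)
```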
+ + ☆ From PARIS to LE-PARIS: Toward Patent Response Automation with + Recommender Systems and Collaborative Large Language Models + + +
+ In patent prosecution, timely and effective responses to Office Actions (OAs) +are crucial for acquiring patents, yet past automation and AI research have +scarcely addressed this aspect. To address this gap, our study introduces the +Patent Office Action Response Intelligence System (PARIS) and its advanced +version, the Large Language Model Enhanced PARIS (LE-PARIS). These systems are +designed to expedite the efficiency of patent attorneys in collaboratively +handling OA responses. The systems' key features include the construction of an +OA Topics Database, development of Response Templates, and implementation of +Recommender Systems and LLM-based Response Generation. Our validation involves +a multi-paradigmatic analysis using the USPTO Office Action database and +longitudinal data of attorney interactions with our systems over six years. +Through five studies, we examine the constructiveness of OA topics (studies 1 +and 2) using topic modeling and the proposed Delphi process, the efficacy of +our proposed hybrid recommender system tailored for OA (both LLM-based and +non-LLM-based) (study 3), the quality of response generation (study 4), and the +practical value of the systems in real-world scenarios via user studies (study +5). Results demonstrate that both PARIS and LE-PARIS significantly meet key +metrics and positively impact attorney performance. + +
+
+ comment: 14 pages, 4 figures, submitted to a journal
+
&#13;
+
+
+
+
+ + ☆ Short: Benchmarking transferable adversarial attacks NDSS 2024 + + +
+ The robustness of deep learning models against adversarial attacks remains a +pivotal concern. This study presents, for the first time, an exhaustive review +of the transferability aspect of adversarial attacks. It systematically +categorizes and critically evaluates various methodologies developed to augment +the transferability of adversarial attacks. This study encompasses a spectrum +of techniques, including Generative Structure, Semantic Similarity, Gradient +Editing, Target Modification, and Ensemble Approach. Concurrently, this paper +introduces a benchmark framework \textit{TAA-Bench}, integrating ten leading +methodologies for adversarial attack transferability, thereby providing a +standardized and systematic platform for comparative analysis across diverse +model architectures. Through comprehensive scrutiny, we delineate the efficacy +and constraints of each method, shedding light on their underlying operational +principles and practical utility. This review endeavors to be a quintessential +resource for both scholars and practitioners in the field, charting the complex +terrain of adversarial transferability and setting a foundation for future +explorations in this vital sector. The associated codebase is accessible at: +https://github.com/KxPlaug/TAA-Bench + +
+
+ comment: Accepted by NDSS 2024 Workshop +
+
+
+
+
+ + ☆ Multi-scale Traffic Pattern Bank for Cross-city Few-shot Traffic + Forecasting + + +
+ Traffic forecasting is crucial for intelligent transportation systems (ITS),
+aiding in efficient resource allocation and effective traffic control.
+However, its effectiveness often relies heavily on abundant traffic data,
+while many cities lack sufficient data due to limited device support, posing a
+significant challenge for traffic forecasting. Recognizing this challenge, we
+have made a noteworthy observation: traffic patterns exhibit similarities
+across diverse cities. Building on this key insight, we propose a solution for
+the cross-city few-shot traffic forecasting problem called Multi-scale Traffic
+Pattern Bank (MTPB). First, MTPB initiates its learning process by leveraging
+data-rich source cities, effectively acquiring comprehensive traffic knowledge
+through a spatial-temporal-aware pre-training process. Subsequently, the
+framework employs advanced clustering techniques to systematically generate a
+multi-scale traffic pattern bank derived from the learned knowledge. Next, the
+traffic data of the data-scarce target city can query the traffic pattern
+bank, facilitating the aggregation of meta-knowledge. This meta-knowledge, in
+turn, assumes a pivotal role as a robust guide in subsequent processes
+involving graph reconstruction and forecasting. Empirical assessments
+conducted on real-world traffic datasets affirm the superior performance of
+MTPB, surpassing existing methods across various categories and exhibiting
+numerous attributes conducive to the advancement of cross-city few-shot
+forecasting methodologies. The code is available at
+https://github.com/zhyliu00/MTPB.
+
&#13;
+
+ comment: Under review. Text overlap with arXiv:2308.09727 +
+
+
+
+
+ + ☆ Efficient Exploration for LLMs + + +
+ We present evidence of substantial benefit from efficient exploration in +gathering human feedback to improve large language models. In our experiments, +an agent sequentially generates queries while fitting a reward model to the +feedback received. Our best-performing agent generates queries using double +Thompson sampling, with uncertainty represented by an epistemic neural network. +Our results demonstrate that efficient exploration enables high levels of +performance with far fewer queries. Further, both uncertainty estimation and +the choice of exploration scheme play critical roles. + +
+
+
+
+
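A toy rendering of query selection with double Thompson sampling over an ensemble ("epistemic") reward model follows; all dimensions and the linear reward head are illustrative assumptions rather than the paper's setup.

```python
# Sketch of double Thompson sampling for picking a response pair to show a
# human rater: draw two independent samples from an (epistemic) ensemble
# reward model and pick each sample's favourite response. Hypothetical setup.
import numpy as np

rng = np.random.default_rng(0)
n_candidates, ensemble_size, dim = 16, 10, 8
features = rng.normal(size=(n_candidates, dim))      # response features
ensemble = rng.normal(size=(ensemble_size, dim))     # reward-model heads

def thompson_pick(exclude=None):
    head = ensemble[rng.integers(ensemble_size)]     # posterior sample
    scores = features @ head
    if exclude is not None:
        scores[exclude] = -np.inf
    return int(np.argmax(scores))

first = thompson_pick()
second = thompson_pick(exclude=first)                # second, distinct draw
print("query the rater with pair:", (first, second))
```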
+ + ☆ Loss Function Considering Dead Zone for Neural Networks + + +
+ Revealing the inverse dynamics of manipulators is important for improving
+the control performance of model-based control. Neural networks (NNs) are a
+promising technique for representing complicated inverse dynamics, but they
+require a large amount of motion data. However, motion data collected in the
+dead zones of actuators is not suitable for training models, which decreases
+the amount of useful training data. In this study, based on the fact that a
+manipulator joint does not move in a dead zone regardless of the input torque,
+we propose a new loss function that considers only the errors of joints that
+are not in dead zones. The proposed method increases the amount of motion data
+available for training and the accuracy of the inverse dynamics computation.
+Experiments on actual equipment using a three-degree-of-freedom (DOF)
+manipulator showed higher accuracy than conventional methods. We also
+confirmed and discussed the behavior of the model trained with the proposed
+method in dead zones.
+
&#13;
+
+ comment: 6 pages, 6 figures, Accepted at AMC2024 +
+
+
+
+
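The key idea, scoring only the joints outside their dead zones, amounts to a masked regression loss. A minimal sketch, where the dead-zone test is a placeholder torque threshold (an assumption, since the paper's exact criterion is not given here):

```python
# Sketch of the dead-zone-aware loss idea: compute the regression error only
# over joints that are outside their actuator dead zones. The dead-zone test
# is a placeholder threshold on torque (an assumption for illustration).
import torch

def dead_zone_loss(pred_torque, true_torque, dead_zone=0.05):
    # Mask out samples/joints whose torque falls in the dead zone,
    # where the joint does not move and the data is uninformative.
    active = true_torque.abs() > dead_zone
    err = (pred_torque - true_torque) ** 2
    return (err * active).sum() / active.sum().clamp(min=1)

pred = torch.randn(64, 3)           # batch of 3-DOF torque predictions
true = torch.randn(64, 3)
print(dead_zone_loss(pred, true))
```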
+ + ☆ Cumulative Distribution Function based General Temporal Point Processes + + +
+ Temporal Point Processes (TPPs) hold a pivotal role in modeling event +sequences across diverse domains, including social networking and e-commerce, +and have significantly contributed to the advancement of recommendation systems +and information retrieval strategies. Through the analysis of events such as +user interactions and transactions, TPPs offer valuable insights into +behavioral patterns, facilitating the prediction of future trends. However, +accurately forecasting future events remains a formidable challenge due to the +intricate nature of these patterns. The integration of Neural Networks with +TPPs has ushered in the development of advanced deep TPP models. While these +models excel at processing complex and nonlinear temporal data, they encounter +limitations in modeling intensity functions, grapple with computational +complexities in integral computations, and struggle to capture long-range +temporal dependencies effectively. In this study, we introduce the CuFun model, +representing a novel approach to TPPs that revolves around the Cumulative +Distribution Function (CDF). CuFun stands out by uniquely employing a monotonic +neural network for CDF representation, utilizing past events as a scaling +factor. This innovation significantly bolsters the model's adaptability and +precision across a wide range of data scenarios. Our approach addresses several +critical issues inherent in traditional TPP modeling: it simplifies +log-likelihood calculations, extends applicability beyond predefined density +function forms, and adeptly captures long-range temporal patterns. Our +contributions encompass the introduction of a pioneering CDF-based TPP model, +the development of a methodology for incorporating past event information into +future event prediction, and empirical validation of CuFun's effectiveness +through extensive experimentation on synthetic and real-world datasets. + +
+
+
+
+
+ + ☆ Image2Points:A 3D Point-based Context Clusters GAN for High-Quality PET + Image Reconstruction ICASSP 2024 + + +
+ To obtain high-quality Positron emission tomography (PET) images while +minimizing radiation exposure, numerous methods have been proposed to +reconstruct standard-dose PET (SPET) images from the corresponding low-dose PET +(LPET) images. However, these methods heavily rely on voxel-based +representations, which fall short of adequately accounting for the precise +structure and fine-grained context, leading to compromised reconstruction. In +this paper, we propose a 3D point-based context clusters GAN, namely PCC-GAN, +to reconstruct high-quality SPET images from LPET. Specifically, inspired by +the geometric representation power of points, we resort to a point-based +representation to enhance the explicit expression of the image structure, thus +facilitating the reconstruction with finer details. Moreover, a context +clustering strategy is applied to explore the contextual relationships among +points, which mitigates the ambiguities of small structures in the +reconstructed images. Experiments on both clinical and phantom datasets +demonstrate that our PCC-GAN outperforms the state-of-the-art reconstruction +methods qualitatively and quantitatively. Code is available at +https://github.com/gluucose/PCCGAN. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ Adaptive Primal-Dual Method for Safe Reinforcement Learning + + +
+ Primal-dual methods have a natural application in Safe Reinforcement
+Learning (SRL), posed as a constrained policy optimization problem. In
+practice, however, applying primal-dual methods to SRL is challenging, due to
+the inter-dependency of the learning rate (LR) and Lagrangian multipliers
+(dual variables) each time an embedded unconstrained RL problem is solved. In
+this paper, we propose, analyze and evaluate adaptive primal-dual (APD)
+methods for SRL, where two adaptive LRs are adjusted to the Lagrangian
+multipliers so as to optimize the policy in each iteration. We theoretically
+establish the convergence, optimality and feasibility of the APD algorithm.
+Finally, we conduct a numerical evaluation of the practical APD algorithm in
+four well-known environments in Bullet-Safety-Gym, employing two
+state-of-the-art SRL algorithms: PPO-Lagrangian and DDPG-Lagrangian. All
+experiments show that the practical APD algorithm outperforms, or achieves
+performance comparable to, the constant-LR baselines while attaining more
+stable training. Additionally, we substantiate the robustness of the two
+adaptive LRs with empirical evidence.
+
&#13;
+
+
+
+
+ + ☆ Machine Unlearning for Image-to-Image Generative Models ICLR 2024 + + +
+ Machine unlearning has emerged as a new paradigm to deliberately forget data
+samples from a given model in order to adhere to stringent regulations.
+However, existing machine unlearning methods have been primarily focused on
+classification models, leaving the landscape of unlearning for generative
+models relatively unexplored. This paper serves as a bridge, addressing the
+gap by providing a unifying framework of machine unlearning for image-to-image
+generative models. Within this framework, we propose a
+computationally-efficient algorithm, underpinned by rigorous theoretical
+analysis, that demonstrates negligible performance degradation on the retain
+samples, while effectively removing the information from the forget samples.
+Empirical studies on two large-scale datasets, ImageNet-1K and Places-365,
+further show that our algorithm does not rely on the availability of the
+retain samples, which further complies with data retention policies. To the
+best of our knowledge, this work is the first systematic, theoretical, and
+empirical exploration of machine unlearning specifically tailored for
+image-to-image generative models. Our code is available at
+https://github.com/jpmorganchase/l2l-generator-unlearning.
+
&#13;
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ ODICE: Revealing the Mystery of Distribution Correction Estimation via + Orthogonal-gradient Update ICLR 2024 + + +
+ In this study, we investigate the DIstribution Correction Estimation (DICE) +methods, an important line of work in offline reinforcement learning (RL) and +imitation learning (IL). DICE-based methods impose state-action-level behavior +constraint, which is an ideal choice for offline learning. However, they +typically perform much worse than current state-of-the-art (SOTA) methods that +solely use action-level behavior constraint. After revisiting DICE-based +methods, we find there exist two gradient terms when learning the value +function using true-gradient update: forward gradient (taken on the current +state) and backward gradient (taken on the next state). Using forward gradient +bears a large similarity to many offline RL methods, and thus can be regarded +as applying action-level constraint. However, directly adding the backward +gradient may degenerate or cancel out its effect if these two gradients have +conflicting directions. To resolve this issue, we propose a simple yet +effective modification that projects the backward gradient onto the normal +plane of the forward gradient, resulting in an orthogonal-gradient update, a +new learning rule for DICE-based methods. We conduct thorough theoretical +analyses and find that the projected backward gradient brings state-level +behavior regularization, which reveals the mystery of DICE-based methods: the +value learning objective does try to impose state-action-level constraint, but +needs to be used in a corrected way. Through toy examples and extensive +experiments on complex offline RL and IL tasks, we demonstrate that DICE-based +methods using orthogonal-gradient updates (O-DICE) achieve SOTA performance and +great robustness. + +
+
+ comment: Spotlight @ ICLR 2024, first two authors contribute equally +
+
+
+
+
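The orthogonal-gradient update described above is, at its core, a one-line projection. A sketch on flattened gradient vectors; the combination coefficient eta is an assumed hyperparameter, not a value from the paper.

```python
# Sketch of the orthogonal-gradient update: project the backward gradient
# onto the normal plane of the forward gradient before combining them.
# Operates on flattened gradient vectors; eta is an assumed coefficient.
import torch

def orthogonal_gradient_update(g_forward, g_backward, eta=1.0):
    # Remove the component of g_backward that conflicts with g_forward.
    coef = (g_backward @ g_forward) / g_forward.dot(g_forward).clamp(min=1e-12)
    g_perp = g_backward - coef * g_forward
    return g_forward + eta * g_perp

g_f = torch.tensor([1.0, 0.0])
g_b = torch.tensor([-1.0, 1.0])       # partially conflicts with g_f
g = orthogonal_gradient_update(g_f, g_b)
print(g, torch.dot(g - g_f, g_f))     # projected part is orthogonal to g_f
```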
+ + ☆ Diverse Explanations from Data-driven and Domain-driven Perspectives for + Machine Learning Models + + +
+ Explanations of machine learning models are important, especially in
+scientific areas such as chemistry, biology, and physics, where they guide
+future laboratory experiments and inform resource requirements. These
+explanations can be derived from well-trained machine learning models
+(data-driven perspective) or from specific domain knowledge (domain-driven
+perspective). However, inconsistencies arise between these perspectives due to
+accurate yet misleading machine learning models and to the various
+stakeholders with specific needs, wants, or aims. This paper calls attention
+to these inconsistencies and suggests a way to find an accurate model with
+expected explanations that reinforce physical laws and meet stakeholders'
+requirements from a set of equally good models, known as a Rashomon set. Our
+goal is to foster a comprehensive understanding of these inconsistencies and
+ultimately contribute to the integration of eXplainable Artificial
+Intelligence (XAI) into scientific domains.
+
&#13;
+
+
+
+
+ + ☆ Survey of Privacy Threats and Countermeasures in Federated Learning + + +
+ Federated learning is widely considered a privacy-aware learning method
+because no training data is exchanged directly between clients. Nevertheless,
+privacy threats in federated learning exist, and privacy countermeasures have
+been studied. However, we note that the privacy threats common to, and unique
+among, the typical types of federated learning have not been categorized and
+described in a comprehensive and specific way. In this paper, we describe
+privacy threats and countermeasures for the typical types of federated
+learning: horizontal federated learning, vertical federated learning, and
+transfer federated learning.
+
&#13;
+
+ comment: Scheduled for renewal by March 2024 +
+
+
+
+
+ + ☆ Comparing Spectral Bias and Robustness For Two-Layer Neural Networks: + SGD vs Adaptive Random Fourier Features + + +
+ We present experimental results highlighting two key differences resulting +from the choice of training algorithm for two-layer neural networks. The +spectral bias of neural networks is well known, while the spectral bias +dependence on the choice of training algorithm is less studied. Our experiments +demonstrate that an adaptive random Fourier features algorithm (ARFF) can yield +a spectral bias closer to zero compared to the stochastic gradient descent +optimizer (SGD). Additionally, we train two identically structured classifiers, +employing SGD and ARFF, to the same accuracy levels and empirically assess +their robustness against adversarial noise attacks. + +
+
+ comment: 6 Pages, 4 Figures; Accepted in the International Conference on + Scientific Computing and Machine Learning +
+
+
+
+
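For intuition, here is a compact random Fourier features fit with one ARFF-flavoured resampling of the frequencies. The actual ARFF algorithm uses an adaptive Metropolis-style update of the frequencies; this caricature, with its amplitude-weighted resampling step, only gestures at that mechanism.

```python
# Random Fourier features for a two-layer network; ARFF-style training
# adaptively resamples the frequencies, caricatured here by one resampling
# step weighted by the amplitudes (a simplification of the real algorithm).
import numpy as np

rng = np.random.default_rng(0)
n, d, K = 200, 2, 64
X = rng.normal(size=(n, d))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=n)

def fit_amplitudes(omega, lam=1e-3):
    Z = np.exp(1j * X @ omega.T)                   # (n, K) Fourier features
    A = Z.conj().T @ Z + lam * np.eye(K)
    return np.linalg.solve(A, Z.conj().T @ y)

omega = rng.normal(size=(K, d))                    # initial frequencies
beta = fit_amplitudes(omega)
p = np.abs(beta) / np.abs(beta).sum()              # amplitude weights
omega = omega[rng.choice(K, size=K, p=p)]          # ARFF-like resample
omega += 0.1 * rng.normal(size=omega.shape)        # small random walk
beta = fit_amplitudes(omega)
print(np.mean(np.abs(np.exp(1j * X @ omega.T) @ beta - y) ** 2))
```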
+ + ☆ PirateNets: Physics-informed Deep Learning with Residual Adaptive + Networks + + +
+ While physics-informed neural networks (PINNs) have become a popular deep
+learning framework for tackling forward and inverse problems governed by
+partial differential equations (PDEs), their performance is known to degrade
+when larger and deeper neural network architectures are employed. Our study
+identifies the root of this counter-intuitive behavior in the use of
+multilayer perceptron (MLP) architectures with unsuitable initialization
+schemes, which results in poor trainability of the network derivatives and
+ultimately leads to unstable minimization of the PDE residual loss. To address
+this, we introduce Physics-informed Residual Adaptive Networks (PirateNets), a
+novel architecture designed to facilitate stable and efficient training of
+deep PINN models. PirateNets leverage a novel adaptive residual connection,
+which allows the networks to be initialized as shallow networks that
+progressively deepen during training. We also show that the proposed
+initialization scheme allows us to encode appropriate inductive biases
+corresponding to a given PDE system into the network architecture. We provide
+comprehensive empirical evidence showing that PirateNets are easier to
+optimize and can gain accuracy from considerably increased depth, ultimately
+achieving state-of-the-art results across various benchmarks. All code and
+data accompanying this manuscript will be made publicly available at
+\url{https://github.com/PredictiveIntelligenceLab/jaxpi}.
+
&#13;
+
+ comment: 29 Pages, 15 Figures, 8 Tables +
+
+
+
+
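The adaptive residual connection can be sketched in a few lines: each block is gated by a trainable coefficient initialized at zero, so the network behaves as an identity (effectively shallow) map at initialization and deepens as the gates grow. The block body and gating form below are our reading, not the jaxpi implementation.

```python
# Sketch of an adaptive residual connection: each block is gated by a
# trainable alpha initialized at zero, so the network starts as an identity
# map and progressively "deepens" as the alphas are trained.
import torch
import torch.nn as nn

class AdaptiveResidualBlock(nn.Module):
    def __init__(self, width=128):
        super().__init__()
        self.body = nn.Sequential(
            nn.Linear(width, width), nn.Tanh(), nn.Linear(width, width))
        self.alpha = nn.Parameter(torch.zeros(1))   # identity at init

    def forward(self, x):
        return self.alpha * self.body(x) + (1 - self.alpha) * x

net = nn.Sequential(*[AdaptiveResidualBlock() for _ in range(8)])
x = torch.randn(4, 128)
print(torch.allclose(net(x), x))  # True at initialization
```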
+ + ☆ A Consistent Lebesgue Measure for Multi-label Learning + + +
+ Multi-label loss functions are usually non-differentiable, requiring +surrogate loss functions for gradient-based optimisation. The consistency of +surrogate loss functions is not proven and is exacerbated by the conflicting +nature of multi-label loss functions. To directly learn from multiple related, +yet potentially conflicting multi-label loss functions, we propose a Consistent +Lebesgue Measure-based Multi-label Learner (CLML) and prove that CLML can +achieve theoretical consistency under a Bayes risk framework. Empirical +evidence supports our theory by demonstrating that: (1) CLML can consistently +achieve state-of-the-art results; (2) the primary performance factor is the +Lebesgue measure design, as CLML optimises a simpler feedforward model without +additional label graph, perturbation-based conditioning, or semantic +embeddings; and (3) an analysis of the results not only distinguishes CLML's +effectiveness but also highlights inconsistencies between the surrogate and the +desired loss functions. + +
+
+
+
+
+ + ☆ Analog-digital Scheduling for Federated Learning: A + Communication-Efficient Approach + + +
+ Over-the-air (OTA) computation has recently emerged as a +communication-efficient Federated Learning (FL) paradigm to train machine +learning models over wireless networks. However, its performance is limited by +the device with the worst SNR, resulting in fast yet noisy updates. On the +other hand, allocating orthogonal resource blocks (RB) to individual devices +via digital channels mitigates the noise problem, at the cost of increased +communication latency. In this paper, we address this discrepancy and present +ADFL, a novel Analog-Digital FL scheme: in each round, the parameter server +(PS) schedules each device to either upload its gradient via the analog OTA +scheme or transmit its quantized gradient over an orthogonal RB using the +``digital" scheme. Focusing on a single FL round, we cast the optimal +scheduling problem as the minimization of the mean squared error (MSE) on the +estimated global gradient at the PS, subject to a delay constraint, yielding +the optimal device scheduling configuration and quantization bits for the +digital devices. Our simulation results show that ADFL, by scheduling most of +the devices in the OTA scheme while also occasionally employing the digital +scheme for a few devices, consistently outperforms OTA-only and digital-only +schemes, in both i.i.d. and non-i.i.d. settings. + +
+
+
+
+
+ + ☆ Online Distribution Learning with Local Private Constraints + + +
+ We study the problem of online conditional distribution estimation with
+\emph{unbounded} label sets under local differential privacy. Let $\mathcal{F}$
+be a distribution-valued function class with an unbounded label set. We aim to
+estimate an \emph{unknown} function $f\in \mathcal{F}$ in an online fashion so
+that at time $t$, when the context $\boldsymbol{x}_t$ is provided, we can
+generate an estimate of $f(\boldsymbol{x}_t)$ under KL-divergence, knowing only
+a privatized version of the true labels sampled from $f(\boldsymbol{x}_t)$.
+The ultimate objective is to minimize the cumulative KL-risk over a finite
+horizon $T$. We show that under $(\epsilon,0)$-local differential privacy of
+the privatized labels, the KL-risk grows as
+$\tilde{\Theta}(\frac{1}{\epsilon}\sqrt{KT})$ up to poly-logarithmic factors,
+where $K=|\mathcal{F}|$. This is in stark contrast to the
+$\tilde{\Theta}(\sqrt{T\log K})$ bound demonstrated by Wu et al. (2023a) for
+bounded label sets. As a byproduct, our results recover a nearly tight upper
+bound for the hypothesis selection problem of Gopi et al. (2020), established
+only for the batch setting.
+
&#13;
+
+
+
+
+ + ☆ Control in Stochastic Environment with Delays: A Model-based + Reinforcement Learning Approach + + +
+ In this paper we introduce a new reinforcement learning method for control
+problems in environments with delayed feedback. Specifically, our method
+employs stochastic planning, in contrast to previous methods that used
+deterministic planning. This allows us to embed risk preference in the policy
+optimization problem. We show that this formulation can recover the optimal
+policy for problems with deterministic transitions. We contrast our policy
+with two prior methods from the literature. We apply the methodology to simple
+tasks to understand its features, and then compare the performance of the
+methods in controlling multiple Atari games.
+
&#13;
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Seismic Traveltime Tomography with Label-free Learning + + +
+ Deep learning techniques have been used to build velocity models (VMs) for
+seismic traveltime tomography and have shown encouraging performance in recent
+years. However, they need labeled samples (i.e., pairs of input and label) to
+train the deep neural network (NN) with end-to-end learning, and the true
+labels for field-data inversion are usually missing or very expensive. Some
+traditional tomographic methods can be implemented quickly, but their
+effectiveness is often limited by prior assumptions. To avoid generating
+labeled samples, we propose a novel method integrating deep learning and
+dictionary learning to enhance low-resolution VMs obtained with the
+traditional tomographic least-squares method (LSQR). We first design a
+shallow, simple NN to reduce computational cost, and then propose a two-step
+strategy to enhance the low-resolution VMs: (1) Warming up: an initial
+dictionary is trained from the LSQR estimate via a dictionary learning method;
+(2) Dictionary optimization: the initial dictionary obtained in the warming-up
+step is optimized by the NN and then used to reconstruct high-resolution VMs
+from the reference slowness and the LSQR estimate. Furthermore, we design a
+loss function that minimizes the traveltime misfit so that NN training remains
+label-free, with the optimized dictionary obtained after each epoch of NN
+training. We demonstrate the effectiveness of the proposed method through
+numerical tests.
+
&#13;
+
+ comment: 15 pages, 19 figures. Submitted to IEEE Transactions on Geoscience + and Remote Sensing +
+
+
+
+
+ + ☆ An Accurate and Low-Parameter Machine Learning Architecture for Next + Location Prediction + + +
+ Next location prediction is the task of predicting a user's next location.
+Its applications include resource allocation, quality of service, energy
+efficiency, and traffic management. This paper proposes an energy-efficient,
+small, low-parameter machine learning (ML) architecture for accurate next
+location prediction, deployable on modest base stations and edge devices. To
+accomplish this, we ran a hundred hyperparameter experiments on the full human
+mobility patterns of an entire city to determine an ML architecture that
+reaches a plateau of accuracy with the fewest model parameters. We
+successfully reduced the number of model parameters within published ML
+architectures from 202 million down to 2 million, shrinking the total size of
+the model parameters from 791 MB down to 8 MB. Additionally, this decreased
+the training time by a factor of four and the graphics processing unit (GPU)
+memory needed for training by a factor of twenty, while overall accuracy
+increased from 80.16% to 82.54%. This improvement allows modest base stations
+and edge devices, which do not have large amounts of memory or storage, to
+deploy and utilize the proposed ML architecture for next location prediction.
+
&#13;
+
+ comment: 7-page conference paper. The paper was accepted and presented in
+ person at the 2023 IEEE Future Networks World Forum in Baltimore, Maryland,
+ USA
+
&#13;
+
+
+
+
+ + ☆ Self-supervised learning of video representations from a child's + perspective + + +
+ Children learn powerful internal models of the world around them from a few
+years of egocentric visual experience. Can such internal models be learned from
+a child's visual experience with highly generic learning algorithms, or do they
+require strong inductive biases? Recent advances in collecting large-scale,
+longitudinal, developmentally realistic video datasets and generic
+self-supervised learning (SSL) algorithms are allowing us to begin to tackle
+this nature vs. nurture question. However, existing work typically focuses on
+image-based SSL algorithms and visual capabilities that can be learned from
+static images (e.g., object recognition), thus ignoring temporal aspects of the
+world. To close this gap, here we train self-supervised video models on
+longitudinal, egocentric headcam recordings collected from a child over a
+two-year period in their early development (6-31 months). The resulting models
+are highly effective at facilitating the learning of action concepts from a
+small number of labeled examples; they have favorable data size scaling
+properties; and they display emergent video interpolation capabilities. Video
+models also learn more robust object representations than image-based models
+trained with the exact same data. These results suggest that important temporal
+aspects of a child's internal model of the world may be learnable from their
+visual experience using highly generic learning algorithms and without strong
+inductive biases.
+
&#13;
+
+ comment: 7 pages, 6 figures; code & models available from + https://github.com/eminorhan/video-models +
+
+
+
+
+ + ☆ Attention-based Dynamic Multilayer Graph Neural Networks for Loan + Default Prediction + + +
+ Whereas traditional credit scoring tends to employ only individual borrower- +or loan-level predictors, it has been acknowledged for some time that +connections between borrowers may result in default risk propagating over a +network. In this paper, we present a model for credit risk assessment +leveraging a dynamic multilayer network built from a Graph Neural Network and a +Recurrent Neural Network, each layer reflecting a different source of network +connection. We test our methodology in a behavioural credit scoring context +using a dataset provided by U.S. mortgage financier Freddie Mac, in which +different types of connections arise from the geographical location of the +borrower and their choice of mortgage provider. The proposed model considers +both types of connections and the evolution of these connections over time. We +enhance the model by using a custom attention mechanism that weights the +different time snapshots according to their importance. After testing multiple +configurations, a model with GAT, LSTM, and the attention mechanism provides +the best results. Empirical results demonstrate that, when it comes to +predicting probability of default for the borrowers, our proposed model brings +both better results and novel insights for the analysis of the importance of +connections and timestamps, compared to traditional methods. + +
+
+
+
+
+ + ☆ PAP-REC: Personalized Automatic Prompt for Recommendation Language Model + + +
+ Recently emerged prompt-based Recommendation Language Models (RLM) can solve +multiple recommendation tasks uniformly. The RLMs make full use of the +inherited knowledge learned from the abundant pre-training data to solve the +downstream recommendation tasks by prompts, without introducing additional +parameters or network training. However, handcrafted prompts require +significant expertise and human effort since slightly rewriting prompts may +cause massive performance changes. In this paper, we propose PAP-REC, a +framework to generate the Personalized Automatic Prompt for RECommendation +language models to mitigate the inefficiency and ineffectiveness problems +derived from manually designed prompts. Specifically, personalized automatic +prompts allow different users to have different prompt tokens for the same +task, automatically generated using a gradient-based method. One challenge for +personalized automatic prompt generation for recommendation language models is +the extremely large search space, leading to a long convergence time. To +effectively and efficiently address the problem, we develop surrogate metrics +and leverage an alternative updating schedule for prompting recommendation +language models. Experimental results show that our PAP-REC framework manages +to generate personalized prompts, and the automatically generated prompts +outperform manually constructed prompts and also outperform various baseline +recommendation models. The source code of the work is available at +https://github.com/rutgerswiselab/PAP-REC. + +
+
+
+
+
+ + ☆ Understanding Neural Network Systems for Image Analysis using Vector + Spaces and Inverse Maps + + +
+ There is strong interest in developing mathematical methods that can be used +to understand complex neural networks used in image analysis. In this paper, we +introduce techniques from Linear Algebra to model neural network layers as maps +between signal spaces. First, we demonstrate how signal spaces can be used to +visualize weight spaces and convolutional layer kernels. We also demonstrate +how residual vector spaces can be used to further visualize information lost at +each layer. Second, we introduce the concept of invertible networks and an +algorithm for computing input images that yield specific outputs. We +demonstrate our approach on two invertible networks and ResNet18. + +
+
+
+
+
+ + ☆ Score-based Causal Representation Learning: Linear and General + Transformations AISTATS 2024 + + +
+ This paper addresses intervention-based causal representation learning (CRL) +under a general nonparametric latent causal model and an unknown transformation +that maps the latent variables to the observed variables. Linear and general +transformations are investigated. The paper addresses both the +\emph{identifiability} and \emph{achievability} aspects. Identifiability refers +to determining algorithm-agnostic conditions that ensure recovering the true +latent causal variables and the latent causal graph underlying them. +Achievability refers to the algorithmic aspects and addresses designing +algorithms that achieve identifiability guarantees. By drawing novel +connections between \emph{score functions} (i.e., the gradients of the +logarithm of density functions) and CRL, this paper designs a \emph{score-based +class of algorithms} that ensures both identifiability and achievability. +First, the paper focuses on \emph{linear} transformations and shows that one +stochastic hard intervention per node suffices to guarantee identifiability. It +also provides partial identifiability guarantees for soft interventions, +including identifiability up to ancestors for general causal models and perfect +latent graph recovery for sufficiently non-linear causal models. Secondly, it +focuses on \emph{general} transformations and shows that two stochastic hard +interventions per node suffice for identifiability. Notably, one does +\emph{not} need to know which pair of interventional environments have the same +node intervened. + +
+
+ comment: Linear transformations: stronger results for hard and soft + interventions than our previous paper Score-based Causal Representation + Learning with Interventions (https://arxiv.org/abs/2301.08230). General + transformations: results also appear in our paper General Identifiability and + Achievability for Causal Representation Learning (arXiv:2310.15450) accepted + to AISTATS 2024 (oral) +
+
+
+
+
+ + ♻ ☆ Geometry-Aware Normalizing Wasserstein Flows for Optimal Causal + Inference + + +
+ This paper presents a groundbreaking approach to causal inference by +integrating continuous normalizing flows (CNFs) with parametric submodels, +enhancing their geometric sensitivity and improving upon traditional Targeted +Maximum Likelihood Estimation (TMLE). Our method employs CNFs to refine TMLE, +optimizing the Cram\'er-Rao bound and transitioning from a predefined +distribution $p_0$ to a data-driven distribution $p_1$. We innovate further by +embedding Wasserstein gradient flows within Fokker-Planck equations, thus +imposing geometric structures that boost the robustness of CNFs, particularly +in optimal transport theory. + Our approach addresses the disparity between sample and population +distributions, a critical factor in parameter estimation bias. We leverage +optimal transport and Wasserstein gradient flows to develop causal inference +methodologies with minimal variance in finite-sample settings, outperforming +traditional methods like TMLE and AIPW. This novel framework, centered on +Wasserstein gradient flows, minimizes variance in efficient influence functions +under distribution $p_t$. Preliminary experiments showcase our method's +superiority, yielding lower mean-squared errors compared to standard flows, +thereby demonstrating the potential of geometry-aware normalizing Wasserstein +flows in advancing statistical modeling and inference. + +
+
+
+
+
+ + ♻ ☆ RLHF and IIA: Perverse Incentives + + +
+ Existing algorithms for reinforcement learning from human feedback (RLHF) can +incentivize responses at odds with preferences because they are based on models +that assume independence of irrelevant alternatives (IIA). The perverse +incentives induced by IIA hinder innovations on query formats and learning +algorithms. + +
+
+
+
+
+ + ♻ ☆ The Power of Populations in Decentralized Bandits + + +
+ We study a cooperative multi-agent bandit setting in the distributed GOSSIP +model: in every round, each of $n$ agents chooses an action from a common set, +observes the action's corresponding reward, and subsequently exchanges +information with a single randomly chosen neighbor, which informs its policy in +the next round. We introduce and analyze several families of +fully-decentralized local algorithms in this setting under the constraint that +each agent has only constant memory. We highlight a connection between the +global evolution of such decentralized algorithms and a new class of "zero-sum" +multiplicative weights update methods, and we develop a general framework for +analyzing the population-level regret of these natural protocols. Using this +framework, we derive sublinear regret bounds for both stationary and +adversarial reward settings. Moreover, we show that these simple local +algorithms can approximately optimize convex functions over the simplex, +assuming that the reward distributions are generated from a stochastic gradient +oracle. + +
+
+
+
+
+ + ♻ ☆ Mitigating System Bias in Resource Constrained Asynchronous Federated + Learning Systems + + +
+ Federated learning (FL) systems face performance challenges in dealing with
+heterogeneous devices and non-identically distributed data across clients. We
+propose a dynamic global model aggregation method within Asynchronous
+Federated Learning (AFL) deployments to address these issues. Our aggregation
+method scores and adjusts the weighting of client model updates based on their
+upload frequency to accommodate differences in device capabilities.
+Additionally, we immediately provide an updated global model to clients after
+they upload their local models, to reduce idle time and improve training
+efficiency. We evaluate our approach within an AFL deployment consisting of 10
+simulated clients with heterogeneous compute constraints and non-IID data. The
+simulation results, using the FashionMNIST dataset, demonstrate over 10% and
+19% improvements in global model accuracy compared to the state-of-the-art
+methods PAPAYA and FedAsync, respectively. Our dynamic aggregation method
+allows reliable global model training despite limited client resources and
+statistical data heterogeneity. This improves robustness and scalability for
+real-world FL deployments.
+
&#13;
+
+ comment: 6 pages, 5 figures. This work has been accepted by PerCom PerconAI + workshop 2024 +
+
+
+
+
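The frequency-aware aggregation idea can be sketched as inverse-frequency weighting of client updates, so that fast, frequently-uploading devices do not dominate the global model. The exact scoring rule below is our assumption for illustration, not the paper's formula.

```python
# Sketch of frequency-aware aggregation: down-weight updates from clients
# that upload very often (fast devices) so slow clients still shape the
# global model. The inverse-frequency scoring rule is an assumption.
import numpy as np

def aggregate(global_w, client_updates, upload_counts):
    # Inverse-frequency scores, normalized to sum to one.
    scores = 1.0 / (1.0 + np.asarray(upload_counts, dtype=float))
    scores /= scores.sum()
    delta = sum(s * (w - global_w) for s, w in zip(scores, client_updates))
    return global_w + delta

g = np.zeros(4)
updates = [g + 1.0, g + 2.0, g - 1.0]
print(aggregate(g, updates, upload_counts=[10, 2, 1]))
```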
+ + ♻ ☆ Engineering A Large Language Model From Scratch + + +
+ The proliferation of deep learning in natural language processing (NLP) has +led to the development and release of innovative technologies capable of +understanding and generating human language with remarkable proficiency. +Atinuke, a Transformer-based neural network, optimises performance across +various language tasks by utilising a unique configuration. The architecture +interweaves layers for processing sequential data with attention mechanisms to +draw meaningful affinities between inputs and outputs. Due to the configuration +of its topology and hyperparameter tuning, it can emulate human-like language +by extracting features and learning complex mappings. Atinuke is modular, +extensible, and integrates seamlessly with existing machine learning pipelines. +Advanced matrix operations like softmax, embeddings, and multi-head attention +enable nuanced handling of textual, acoustic, and visual signals. By unifying +modern deep learning techniques with software design principles and +mathematical theory, the system achieves state-of-the-art results on natural +language tasks whilst remaining interpretable and robust. + +
+
+
+
+
+ + ♻ ☆ Conformal Prediction Sets Improve Human Decision Making + + +
+ In response to everyday queries, humans explicitly signal uncertainty and +offer alternative answers when they are unsure. Machine learning models that +output calibrated prediction sets through conformal prediction mimic this human +behaviour; larger sets signal greater uncertainty while providing alternatives. +In this work, we study the usefulness of conformal prediction sets as an aid +for human decision making by conducting a pre-registered randomized controlled +trial with conformal prediction sets provided to human subjects. With +statistical significance, we find that when humans are given conformal +prediction sets their accuracy on tasks improves compared to fixed-size +prediction sets with the same coverage guarantee. The results show that +quantifying model uncertainty with conformal prediction is helpful for +human-in-the-loop decision making and human-AI teams. + +
+
+ comment: Code available at + https://github.com/layer6ai-labs/hitl-conformal-prediction +
+
+
+
+
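As background, split conformal prediction builds the calibrated prediction sets mentioned here from a held-out calibration set. A minimal sketch with synthetic softmax outputs standing in for a trained classifier:

```python
# Split conformal prediction for classification: calibrate a score threshold
# so the resulting prediction sets cover the true label with probability
# >= 1 - alpha. Softmax scores here are synthetic stand-ins for a model.
import numpy as np

rng = np.random.default_rng(0)
n_cal, n_classes, alpha = 500, 10, 0.1
probs = rng.dirichlet(np.ones(n_classes), size=n_cal)    # calibration softmax
labels = rng.integers(n_classes, size=n_cal)

# Nonconformity score: 1 - probability assigned to the true label.
scores = 1.0 - probs[np.arange(n_cal), labels]
q = np.quantile(scores, np.ceil((n_cal + 1) * (1 - alpha)) / n_cal,
                method="higher")

def prediction_set(p):                                   # p: softmax vector
    return np.where(1.0 - p <= q)[0]                     # labels kept in set

print(prediction_set(rng.dirichlet(np.ones(n_classes))))
```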
+ + ♻ ☆ HyperMask: Adaptive Hypernetwork-based Masks for Continual Learning + + +
+ Artificial neural networks suffer from catastrophic forgetting when they are
+sequentially trained on multiple tasks. Many continual learning (CL)
+strategies try to overcome this problem. One of the most effective is the
+hypernetwork-based approach, in which a hypernetwork generates the weights of
+a target model based on the task's identity. The approach's main limitation is
+that, in practice, the hypernetwork can produce completely different
+architectures for subsequent tasks. To address this problem, we use the
+lottery ticket hypothesis, which postulates the existence of sparse
+subnetworks, named winning tickets, that preserve the performance of the whole
+network. In this paper, we propose a method called HyperMask, which trains a
+single network for all CL tasks. The hypernetwork produces semi-binary masks
+to obtain target subnetworks dedicated to consecutive tasks. Moreover, due to
+the lottery ticket hypothesis, we can use a single network with weighted
+subnets. Depending on the task, the importance of some weights may be
+dynamically enhanced while others are weakened. HyperMask achieves competitive
+results on several CL datasets and, in some scenarios, surpasses
+state-of-the-art scores, both with derived and unknown task identities.
+
&#13;
+
+
+
+
+ + ♻ ☆ Enhancing Energy-Awareness in Deep Learning through Fine-Grained Energy + Measurement + + +
+ With the increasing usage, scale, and complexity of Deep Learning (DL)
+models, their rapidly growing energy consumption has become a critical
+concern. Promoting green development and energy awareness at different
+granularities is urgently needed to limit the carbon emissions of DL systems.
+However, the lack of standard and repeatable tools to accurately measure and
+optimize energy consumption at a fine granularity (e.g., at the method level)
+hinders progress in this area. This paper introduces FECoM (Fine-grained
+Energy Consumption Meter), a framework for fine-grained DL energy consumption
+measurement. FECoM enables researchers and developers to profile DL APIs from
+an energy perspective. FECoM addresses the challenges of measuring energy
+consumption at a fine-grained level by using static instrumentation and
+considering various factors, including computational load and temperature
+stability. We assess FECoM's capability to measure fine-grained energy
+consumption for one of the most popular open-source DL frameworks, namely
+TensorFlow. Using FECoM, we also investigate the impact of parameter size and
+execution time on energy consumption, enriching our understanding of
+TensorFlow APIs' energy profiles. Furthermore, we elaborate on the
+considerations, issues, and challenges that one needs to consider while
+designing and implementing a fine-grained energy consumption measurement tool.
+This work will facilitate further advances in DL energy measurement and the
+development of energy-aware practices for DL systems.
+
&#13;
+
+
+
+
+ + ♻ ☆ Leveraging Open Information Extraction for More Robust Domain Transfer + of Event Trigger Detection EACL 2024 + + +
+ Event detection is a crucial information extraction task in many domains, +such as Wikipedia or news. The task typically relies on trigger detection (TD) +-- identifying token spans in the text that evoke specific events. While the +notion of triggers should ideally be universal across domains, domain transfer +for TD from high- to low-resource domains results in significant performance +drops. We address the problem of negative transfer in TD by coupling triggers +between domains using subject-object relations obtained from a rule-based open +information extraction (OIE) system. We demonstrate that OIE relations injected +through multi-task training can act as mediators between triggers in different +domains, enhancing zero- and few-shot TD domain transfer and reducing +performance drops, in particular when transferring from a high-resource source +domain (Wikipedia) to a low(er)-resource target domain (news). Additionally, we +combine this improved transfer with masked language modeling on the target +domain, observing further TD transfer gains. Finally, we demonstrate that the +gains are robust to the choice of the OIE system. + +
+
+ comment: Accepted at EACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ Probability-Generating Function Kernels for Spherical Data + + +
+ Probability-generating function (PGF) kernels, a class of kernels supported on
+the unit hypersphere, are introduced for the purposes of spherical data
+analysis. PGF kernels generalize RBF kernels in the context of spherical data.
+The properties of PGF kernels are studied, and a semi-parametric learning
+algorithm is introduced to enable the use of PGF kernels with spherical data.
+
+
+
+
+ + ♻ ☆ Generative quantum machine learning via denoising diffusion + probabilistic models + + +
+ Deep generative models are a key enabling technology for computer vision, text
+generation, and large language models. Denoising diffusion probabilistic models
+(DDPMs) have recently gained much attention due to their ability to generate
+diverse and high-quality samples in many computer vision tasks, as well as to
+incorporate flexible model architectures and a relatively simple training
+scheme. Quantum generative models, empowered by entanglement and superposition,
+have brought new insight to learning classical and quantum data. Inspired by
+the classical counterpart, we propose the \emph{quantum denoising diffusion
+probabilistic model} (QuDDPM) to enable efficiently trainable generative
+learning of quantum data. QuDDPM adopts sufficient layers of circuits to
+guarantee expressivity, while introducing multiple intermediate training tasks
+as interpolation between the target distribution and noise to avoid barren
+plateaus and guarantee efficient training. We provide bounds on the learning
+error and demonstrate QuDDPM's capability in learning correlated quantum noise
+models, quantum many-body phases, and the topological structure of quantum
+data. The results provide a paradigm for versatile and efficient quantum
+generative learning.
+
+ comment: 5+10 pages, 16 figures. PRL accepted version. Code available at: + https://github.com/francis-hsu/quantgenmdl +
+
+
+
+
+ + ♻ ☆ Emergent Dominance Hierarchies in Reinforcement Learning Agents + + +
+ Modern Reinforcement Learning (RL) algorithms are able to outperform humans +in a wide variety of tasks. Multi-agent reinforcement learning (MARL) settings +present additional challenges, and successful cooperation in mixed-motive +groups of agents depends on a delicate balancing act between individual and +group objectives. Social conventions and norms, often inspired by human +institutions, are used as tools for striking this balance. + In this paper, we examine a fundamental, well-studied social convention that +underlies cooperation in both animal and human societies: dominance +hierarchies. + We adapt the ethological theory of dominance hierarchies to artificial +agents, borrowing the established terminology and definitions with as few +amendments as possible. We demonstrate that populations of RL agents, operating +without explicit programming or intrinsic rewards, can invent, learn, enforce, +and transmit a dominance hierarchy to new populations. The dominance +hierarchies that emerge have a similar structure to those studied in chickens, +mice, fish, and other species. + +
+
+
+
+
+ + ♻ ☆ Machine learning for sports betting: should model selection be based on + accuracy or calibration? + + +
+ Sports betting's recent federal legalisation in the USA coincides with the +golden age of machine learning. If bettors can leverage data to reliably +predict the probability of an outcome, they can recognise when the bookmaker's +odds are in their favour. As sports betting is a multi-billion dollar industry +in the USA alone, identifying such opportunities could be extremely lucrative. +Many researchers have applied machine learning to the sports outcome prediction +problem, generally using accuracy to evaluate the performance of predictive +models. We hypothesise that for the sports betting problem, model calibration +is more important than accuracy. To test this hypothesis, we train models on +NBA data over several seasons and run betting experiments on a single season, +using published odds. We show that using calibration, rather than accuracy, as +the basis for model selection leads to greater returns, on average (return on +investment of $+34.69\%$ versus $-35.17\%$) and in the best case ($+36.93\%$ +versus $+5.56\%$). These findings suggest that for sports betting (or any +probabilistic decision-making problem), calibration is a more important metric +than accuracy. Sports bettors who wish to increase profits should therefore +select their predictive model based on calibration, rather than accuracy. + +
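+ The selection criterion is easy to state in code (a sketch assuming binary
+win/loss outcomes and decimal odds, with the Brier score standing in for
+calibration; names are illustrative):
+
+ import numpy as np
+
+ def brier_score(p_win, won):
+     # Lower is better: squared gap between predicted probability and outcome.
+     return np.mean((p_win - won) ** 2)
+
+ def accuracy(p_win, won):
+     return np.mean((p_win > 0.5) == won)
+
+ # Select by calibration rather than accuracy:
+ # best = min(models, key=lambda m: brier_score(m.predict_proba(X)[:, 1], y))
+ # A bet is then attractive only when p_win * decimal_odds > 1 (positive EV).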
+
+ comment: 15 pages, 5 Figures. Paper submitted to Elsevier's Machine Learning + with Applications +
+
+
+
+
+ + ♻ ☆ On the Second-Order Convergence of Biased Policy Gradient Algorithms + + +
+ Since the objective functions of reinforcement learning problems are +typically highly nonconvex, it is desirable that policy gradient, the most +popular algorithm, escapes saddle points and arrives at second-order stationary +points. Existing results only consider vanilla policy gradient algorithms with +unbiased gradient estimators, but practical implementations under the +infinite-horizon discounted reward setting are biased due to finite-horizon +sampling. Moreover, actor-critic methods, whose second-order convergence has +not yet been established, are also biased due to the critic approximation of +the value function. We provide a novel second-order analysis of biased policy +gradient methods, including the vanilla gradient estimator computed from +Monte-Carlo sampling of trajectories as well as the double-loop actor-critic +algorithm, where in the inner loop the critic improves the approximation of the +value function via TD(0) learning. Separately, we also establish the +convergence of TD(0) on Markov chains irrespective of initial state +distribution. + +
+
+
+
+
+ + ♻ ☆ Fast Cerebral Blood Flow Analysis via Extreme Learning Machine + + +
+ We introduce a rapid and precise analytical approach for analyzing cerebral +blood flow (CBF) using Diffuse Correlation Spectroscopy (DCS) with the +application of the Extreme Learning Machine (ELM). Our evaluation of ELM and +existing algorithms involves a comprehensive set of metrics. We assess these +algorithms using synthetic datasets for both semi-infinite and multi-layer +models. The results demonstrate that ELM consistently achieves higher fidelity +across various noise levels and optical parameters, showcasing robust +generalization ability and outperforming iterative fitting algorithms. Through +a comparison with a computationally efficient neural network, ELM attains +comparable accuracy with reduced training and inference times. Notably, the +absence of a back-propagation process in ELM during training results in +significantly faster training speeds compared to existing neural network +approaches. This proposed strategy holds promise for edge computing +applications with online training capabilities. + +
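+ The speed advantage stems from ELM's closed-form training: fixed random
+hidden weights plus a single least-squares solve, with no back-propagation. A
+generic sketch (not the authors' DCS-specific code):
+
+ import numpy as np
+
+ def train_elm(X, Y, n_hidden=512, reg=1e-6, seed=0):
+     rng = np.random.default_rng(seed)
+     W = rng.normal(size=(X.shape[1], n_hidden))  # fixed random input weights
+     b = rng.normal(size=n_hidden)
+     H = np.tanh(X @ W + b)                       # hidden-layer activations
+     # Ridge-regularised least squares for the output weights (no backprop).
+     beta = np.linalg.solve(H.T @ H + reg * np.eye(n_hidden), H.T @ Y)
+     return W, b, beta
+
+ def elm_predict(X, W, b, beta):
+     return np.tanh(X @ W + b) @ beta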
+
+ comment: Not ready for submission. Needs further correction
+
+
+
+
+ + ♻ ☆ Enhancing Blood Flow Assessment in Diffuse Correlation Spectroscopy: A + Transfer Learning Approach with Noise Robustness Analysis + + +
+ Diffuse correlation spectroscopy (DCS) is an emerging noninvasive technique
+that measures tissue blood flow by using near-infrared coherent point-source
+illumination to detect spectral changes. While machine learning has
+demonstrated significant potential for measuring the blood flow index (BFi), an
+open question concerning the success of this approach pertains to its
+robustness in scenarios involving deviations between datasets with varying
+signal-to-noise ratios (SNRs) originating from diverse clinical applications
+and various setups. This study proposes a transfer learning approach that aims
+to assess the influence of SNRs on the generalization ability of learned
+features and to demonstrate the robustness of transfer learning. A synthetic
+dataset with varying levels of added noise is utilized to simulate different
+SNRs. The proposed network takes a 1x64 autocorrelation curve as input and
+generates BFi and the correlation parameter beta. The proposed model
+demonstrates excellent performance across different SNRs, exhibiting enhanced
+fitting accuracy, particularly for low-SNR datasets, when compared with other
+fitting methods. This highlights its potential for clinical diagnosis and
+treatment across various scenarios and clinical setups.
+
+ comment: Not ready for submission. Needs further changes
+
+
+
+
+ + ♻ ☆ DP-SGD with weight clipping + + +
+ Recently, due to the popularity of deep neural networks and other methods
+whose training typically relies on the optimization of an objective function,
+and due to concerns for data privacy, there is a lot of interest in
+differentially private gradient descent methods. To achieve differential
+privacy guarantees with a minimum amount of noise, it is important to be able
+to bound precisely the sensitivity of the information which the participants
+will observe. In this study, we present a novel approach that mitigates the
+bias arising from traditional gradient clipping. By leveraging a public upper
+bound on the Lipschitz value of the current model and its current location
+within the search domain, we can achieve refined noise level adjustments. We
+present a new algorithm with improved differential privacy guarantees and a
+systematic empirical evaluation, showing that our new approach outperforms
+existing approaches in practice as well.
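+ For contrast, a sketch of the standard per-sample gradient-clipping DP-SGD
+step that the paper improves upon; in the proposed variant, the sensitivity
+bound would instead be derived from a public Lipschitz bound on the
+weight-clipped model (all names here are illustrative):
+
+ import numpy as np
+
+ def dp_sgd_step(params, per_sample_grads, lr, sensitivity, noise_mult,
+                 rng=np.random.default_rng(0)):
+     # Bound each sample's contribution to the update...
+     clipped = [g * min(1.0, sensitivity / (np.linalg.norm(g) + 1e-12))
+                for g in per_sample_grads]
+     avg = np.mean(clipped, axis=0)
+     # ...then add Gaussian noise calibrated to that sensitivity.
+     noise = rng.normal(scale=noise_mult * sensitivity / len(clipped),
+                        size=avg.shape)
+     return params - lr * (avg + noise)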
+
+
+
+
+ + ♻ ☆ Online Graph Topology Learning from Matrix-valued Time Series + + +
+ This paper is concerned with the statistical analysis of matrix-valued time
+series. These are data collected over a network of sensors (typically a set of
+spatial locations) along time, where a vector of features is observed per time
+instant per sensor. Thus each sensor is characterized by a vectorial time
+series. We would like to identify the dependency structure among these sensors
+and represent it by a graph. When there is only one feature per sensor, vector
+auto-regressive (VAR) models have been widely adopted to infer the structure of
+Granger causality. The resulting graph is referred to as a causal graph. Our
+first contribution is thus to extend VAR models to matrix-variate models to
+serve the purpose of graph learning. Secondly, we propose two online
+procedures, for the low- and high-dimensional settings respectively, which can
+quickly update the estimates of coefficients when new samples arrive. In
+particular, in the high-dimensional regime, a novel Lasso-type estimator is
+introduced and we develop homotopy algorithms for its online learning. We also
+provide an adaptive tuning procedure for the regularization parameter. Lastly,
+applying AR models to data usually requires detrending the raw data, a step
+that is not possible in the online context. We therefore augment the proposed
+AR models by incorporating the trend as an extra parameter, and then adapt the
+online algorithms to the augmented data models, which allows us to
+simultaneously learn the graph and the trend from streaming samples. In this
+work, we consider primarily the periodic trend. Numerical experiments using
+both synthetic and real data are performed, and their results support the
+effectiveness of the proposed methods.
+
+
+
+
+ + ♻ ☆ Breaking the Communication-Privacy-Accuracy Tradeoff with + $f$-Differential Privacy + + +
+ We consider a federated data analytics problem in which a server coordinates +the collaborative data analysis of multiple users with privacy concerns and +limited communication capability. The commonly adopted compression schemes +introduce information loss into local data while improving communication +efficiency, and it remains an open problem whether such discrete-valued +mechanisms provide any privacy protection. In this paper, we study the local +differential privacy guarantees of discrete-valued mechanisms with finite +output space through the lens of $f$-differential privacy (DP). More +specifically, we advance the existing literature by deriving tight $f$-DP +guarantees for a variety of discrete-valued mechanisms, including the binomial +noise and the binomial mechanisms that are proposed for privacy preservation, +and the sign-based methods that are proposed for data compression, in +closed-form expressions. We further investigate the amplification in privacy by +sparsification and propose a ternary stochastic compressor. By leveraging +compression for privacy amplification, we improve the existing methods by +removing the dependency of accuracy (in terms of mean square error) on +communication cost in the popular use case of distributed mean estimation, +therefore breaking the three-way tradeoff between privacy, communication, and +accuracy. Finally, we discuss the Byzantine resilience of the proposed +mechanism and its application in federated learning. + +
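+ For intuition, a generic unbiased ternary stochastic quantizer of the kind
+analysed above (the paper's compressor additionally tunes the output
+probabilities to meet an $f$-DP guarantee; this sketch is illustrative only):
+
+ import numpy as np
+
+ def ternary_compress(x, A, rng=np.random.default_rng(0)):
+     # Each coordinate is transmitted as one of {-A, 0, +A}. When |x| <= A,
+     # E[output] = x, so the compressor is unbiased.
+     keep = rng.random(x.shape) < np.abs(x) / A
+     return np.where(keep, A * np.sign(x), 0.0)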
+
+
+
+
+ + ♻ ☆ Multi-Relational Hyperbolic Word Embeddings from Natural Language + Definitions EACL 2024 + + +
+ Natural language definitions possess a recursive, self-explanatory semantic
+structure that can support representation learning methods able to preserve
+explicit conceptual relations and constraints in the latent space. This paper
+presents a multi-relational model that explicitly leverages such a structure to
+derive word embeddings from definitions. By automatically extracting the
+relations linking defined and defining terms from dictionaries, we demonstrate
+how the problem of learning word embeddings can be formalised via a
+translational framework in Hyperbolic space and used as a proxy to capture the
+global semantic structure of definitions. An extensive empirical analysis
+demonstrates that the framework can help impose the desired structural
+constraints while preserving the semantic mapping required for controllable and
+interpretable traversal. Moreover, the experiments reveal the superiority of
+the Hyperbolic word embeddings over their Euclidean counterparts and
+demonstrate that the multi-relational approach can obtain competitive results
+when compared to state-of-the-art neural models, with the advantage of being
+intrinsically more efficient and interpretable.
+
+ comment: Accepted at the 18th Conference of the European Chapter of the + Association for Computational Linguistics (EACL 2024) +
+
+
+
+
+ + ♻ ☆ Discovering interpretable elastoplasticity models via the neural + polynomial method enabled symbolic regressions + + +
+ Conventional neural network elastoplasticity models are often perceived as
+lacking interpretability. This paper introduces a two-step machine learning
+approach that returns mathematical models interpretable by human experts. In
+particular, we introduce a surrogate model where yield surfaces are expressed
+in terms of a set of single-variable feature mappings obtained from supervised
+learning. A post-processing step is then used to re-interpret the set of
+single-variable neural network mapping functions into mathematical form through
+symbolic regression. This divide-and-conquer approach provides several
+important advantages. First, it enables us to overcome the scaling issue of
+symbolic regression algorithms. From a practical perspective, it enhances the
+portability of learned models for partial differential equation solvers written
+in different programming languages. Finally, it enables us to have a concrete
+understanding of the attributes of the materials, such as the convexity and
+symmetries of models, through automated derivations and reasoning. Numerical
+examples are provided, along with an open-source code to enable third-party
+validation.
+
+
+
+
+ + ♻ ☆ Physics-constrained convolutional neural networks for inverse problems + in spatiotemporal partial differential equations + + +
+ We propose a physics-constrained convolutional neural network (PC-CNN) to
+solve two types of inverse problems in partial differential equations (PDEs),
+which are nonlinear and vary both in space and time. In the first inverse
+problem, we are given data that is offset by spatially varying systematic error
+(i.e., the bias, also known as the epistemic uncertainty). The task is to
+uncover from the biased data the true state, which is the solution of the PDE.
+In the second inverse problem, we are given sparse information on the solution
+of a PDE. The task is to reconstruct the solution in space at high resolution.
+First, we present the PC-CNN, which constrains the PDE with a simple
+time-windowing scheme to handle sequential data. Second, we analyse the
+performance of the PC-CNN for uncovering solutions from biased data. We analyse
+both linear and nonlinear convection-diffusion equations, and the Navier-Stokes
+equations, which govern the spatiotemporally chaotic dynamics of turbulent
+flows. We find that the PC-CNN correctly recovers the true solution for a
+variety of biases, which are parameterised as non-convex functions. Third, we
+analyse the performance of the PC-CNN for reconstructing solutions from sparse
+information for the turbulent flow. We reconstruct the spatiotemporal chaotic
+solution on a high-resolution grid from only 2\% of the information contained
+in it. For both tasks, we further analyse the Navier-Stokes solutions. We find
+that the inferred solutions have a physical spectral energy content, whereas
+traditional methods, such as interpolation, do not. This work opens
+opportunities for solving inverse problems with partial differential equations.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2306.04600, + arXiv:2306.10990 +
+
+
+
+
+ + ♻ ☆ Privacy Preserving Adaptive Experiment Design + + +
+ Adaptive experiments are widely adopted to estimate the conditional average
+treatment effect (CATE) in clinical trials and many other scenarios. While the
+primary goal of the experiment is to maximize estimation accuracy, the
+imperative of social welfare makes it crucial to also provide treatments with
+superior outcomes to patients, which is measured by regret in the contextual
+bandit framework. These two objectives often lead to contrasting optimal
+allocation mechanisms. Furthermore, privacy concerns arise in clinical
+scenarios containing sensitive data such as patients' health records.
+Therefore, it is essential for the treatment allocation mechanism to
+incorporate robust privacy protection measures. In this paper, we investigate
+the tradeoff between loss of social welfare and statistical power in contextual
+bandit experiments. We propose matched upper and lower bounds for the
+multi-objective optimization problem, and then adopt the concept of Pareto
+optimality to mathematically characterize the optimality condition.
+Furthermore, we propose differentially private algorithms that still match the
+lower bound, showing that privacy is "almost free". Additionally, we derive the
+asymptotic normality of the estimator, which is essential for statistical
+inference and hypothesis testing.
+
+ comment: Update an algorithm and the title of our paper +
+
+
+
+
+ + ♻ ☆ Towards Cross-Table Masked Pretraining for Web Data Mining WWW 2024 + + +
+ Tabular data pervades the landscape of the World Wide Web, playing a
+foundational role in the digital architecture that underpins online
+information. Given the recent influence of large-scale pretrained models like
+ChatGPT and SAM across various domains, exploring the application of
+pretraining techniques for mining tabular data on the web has emerged as a
+highly promising research direction. Indeed, there have been some recent works
+around this topic, but most (if not all) of them are limited in scope to a
+fixed-schema/single table. Due to the scale of the dataset and the parameter
+size of the prior models, we believe that we have not reached the ''BERT
+moment'' for the ubiquitous tabular data. Development along this line
+significantly lags behind counterpart research domains such as natural language
+processing. In this work, we first identify the crucial challenges behind
+tabular data pretraining, particularly overcoming the cross-table hurdle. As a
+pioneering endeavor, this work mainly (i) contributes a high-quality real-world
+tabular dataset, (ii) proposes an innovative, generic, and efficient
+cross-table pretraining framework, dubbed CM2, whose core comprises a
+semantic-aware tabular neural network that uniformly encodes heterogeneous
+tables without much restriction, and (iii) introduces a novel pretraining
+objective -- prompt Masked Table Modeling (pMTM) -- inspired by NLP but
+intricately tailored to scalable pretraining on tables. Our extensive
+experiments demonstrate CM2's state-of-the-art performance and validate that
+cross-table pretraining can enhance various downstream tasks.
+
+ comment: Accepted to WWW 2024 +
+
+
+
+
+ + ♻ ☆ Relationship between Batch Size and Number of Steps Needed for Nonconvex + Optimization of Stochastic Gradient Descent using Armijo Line Search + + +
+ While stochastic gradient descent (SGD) can use various learning rates, such
+as constant or diminishing rates, previous numerical results have shown that
+SGD performs better than other deep learning optimizers when it uses learning
+rates given by line search methods. In this paper, we perform a convergence
+analysis of SGD with a learning rate given by an Armijo line search for
+nonconvex optimization, indicating that the upper bound of the expectation of
+the squared norm of the full gradient becomes small when the number of steps
+and the batch size are large. Next, we show that, for SGD with the
+Armijo-line-search learning rate, the number of steps needed for nonconvex
+optimization is a monotone decreasing convex function of the batch size; that
+is, the number of steps needed for nonconvex optimization decreases as the
+batch size increases. Furthermore, we show that the stochastic first-order
+oracle (SFO) complexity, which is the stochastic gradient computation cost, is
+a convex function of the batch size; that is, there exists a critical batch
+size that minimizes the SFO complexity. Finally, we provide numerical results
+that support our theoretical results. The numerical results indicate that the
+number of steps needed for training deep neural networks decreases as the
+batch size increases and that there exist critical batch sizes that can be
+estimated from the theoretical results.
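+ A sketch of the Armijo-line-search step the analysis concerns (generic
+backtracking on the minibatch loss; loss_fn and grad_fn are placeholder
+callables and w is a flat parameter vector):
+
+ import numpy as np
+
+ def armijo_sgd_step(w, loss_fn, grad_fn, batch, lr0=1.0, c=1e-4, rho=0.5):
+     g = grad_fn(w, batch)
+     f0, lr = loss_fn(w, batch), lr0
+     # Shrink the step until the Armijo sufficient-decrease condition holds.
+     while loss_fn(w - lr * g, batch) > f0 - c * lr * np.dot(g, g):
+         lr *= rho
+     return w - lr * g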
+
+
+
+
+ + ♻ ☆ Interpretable Concept Bottlenecks to Align Reinforcement Learning Agents + + +
+ Goal misalignment, reward sparsity and difficult credit assignment are only a +few of the many issues that make it difficult for deep reinforcement learning +(RL) agents to learn optimal policies. Unfortunately, the black-box nature of +deep neural networks impedes the inclusion of domain experts for inspecting the +model and revising suboptimal policies. To this end, we introduce *Successive +Concept Bottleneck Agents* (SCoBots), that integrate consecutive concept +bottleneck (CB) layers. In contrast to current CB models, SCoBots do not just +represent concepts as properties of individual objects, but also as relations +between objects which is crucial for many RL tasks. Our experimental results +provide evidence of SCoBots' competitive performances, but also of their +potential for domain experts to understand and regularize their behavior. Among +other things, SCoBots enabled us to identify a previously unknown misalignment +problem in the iconic video game, Pong, and resolve it. Overall, SCoBots thus +result in more human-aligned RL agents. Our code is available at +https://github.com/k4ntz/SCoBots . + +
+
+ comment: 20 pages, 8 of main text, 8 of appendix, 3 main figures +
+
+
+
+
+ + ♻ ☆ ACT: Empowering Decision Transformer with Dynamic Programming via + Advantage Conditioning AAAI 2024 + + +
+ Decision Transformer (DT), which employs expressive sequence modeling +techniques to perform action generation, has emerged as a promising approach to +offline policy optimization. However, DT generates actions conditioned on a +desired future return, which is known to bear some weaknesses such as the +susceptibility to environmental stochasticity. To overcome DT's weaknesses, we +propose to empower DT with dynamic programming. Our method comprises three +steps. First, we employ in-sample value iteration to obtain approximated value +functions, which involves dynamic programming over the MDP structure. Second, +we evaluate action quality in context with estimated advantages. We introduce +two types of advantage estimators, IAE and GAE, which are suitable for +different tasks. Third, we train an Advantage-Conditioned Transformer (ACT) to +generate actions conditioned on the estimated advantages. Finally, during +testing, ACT generates actions conditioned on a desired advantage. Our +evaluation results validate that, by leveraging the power of dynamic +programming, ACT demonstrates effective trajectory stitching and robust action +generation in spite of the environmental stochasticity, outperforming baseline +methods across various benchmarks. Additionally, we conduct an in-depth +analysis of ACT's various design choices through ablation studies. Our code is +available at https://github.com/LAMDA-RL/ACT. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Adaptive Compression-Aware Split Learning and Inference for Enhanced + Network Efficiency + + +
+ The growing number of AI-driven applications in mobile devices has led to
+solutions that integrate deep learning models with the available edge-cloud
+resources. Due to multiple benefits such as reduced on-device energy
+consumption, improved latency, improved network usage, and certain privacy
+improvements, split learning, where deep learning models are split away from
+the mobile device and computed in a distributed manner, has become an
+extensively explored topic. Incorporating compression-aware methods, where
+learning adapts to the compression level of the communicated data, has made
+split learning even more advantageous. This method could even offer a viable
+alternative to traditional methods such as federated learning. In this work, we
+develop an adaptive compression-aware split learning method ('deprune') to
+improve and train deep learning models so that they are much more
+network-efficient, making them ideal for deployment on weaker devices with the
+help of edge-cloud resources. We also extend this method ('prune') to train
+deep learning models very quickly through a transfer learning approach, which
+trades off a little accuracy for much more network-efficient inference
+abilities. We show that the 'deprune' method can reduce network usage by 4x
+when compared with a split-learning approach that does not use our method,
+without loss of accuracy, while also improving accuracy over compression-aware
+split learning by 4 percent. Lastly, we show that the 'prune' method can reduce
+the training time for certain models by up to 6x without affecting accuracy
+when compared against a compression-aware split-learning approach.
+
+
+
+
+ + ♻ ☆ Acceleration of stochastic gradient descent with momentum by averaging: + finite-sample rates and asymptotic normality + + +
+ Stochastic gradient descent with momentum (SGDM) has been widely used in many +machine learning and statistical applications. Despite the observed empirical +benefits of SGDM over traditional SGD, the theoretical understanding of the +role of momentum for different learning rates in the optimization process +remains widely open. We analyze the finite-sample convergence rate of SGDM +under the strongly convex settings and show that, with a large batch size, the +mini-batch SGDM converges faster than the mini-batch SGD to a neighborhood of +the optimal value. Additionally, our findings, supported by theoretical +analysis and numerical experiments, indicate that SGDM permits broader choices +of learning rates. Furthermore, we analyze the Polyak-averaging version of the +SGDM estimator, establish its asymptotic normality, and justify its asymptotic +equivalence to the averaged SGD. The asymptotic distribution of the averaged +SGDM enables uncertainty quantification of the algorithm output and statistical +inference of the model parameters. + +
+
+
+
+
+ + ♻ ☆ Learning from Graphs with Heterophily: Progress and Future + + +
+ Graphs are structured data that model complex relations between real-world
+entities. Heterophilous graphs, where linked nodes are prone to have different
+labels or dissimilar features, have recently attracted significant attention
+and found many applications. Meanwhile, increasing efforts have been made to
+advance learning from heterophilous graphs. Although there exist surveys on the
+relevant topic, they focus on heterophilous GNNs, which are only sub-topics of
+heterophilous graph learning. In this survey, we comprehensively overview
+existing works on learning from graphs with heterophily. First, we collect over
+180 publications and introduce the development of this field. Then, we
+systematically categorize existing methods based on a hierarchical taxonomy
+including learning strategies, model architectures and practical applications.
+Finally, we discuss the primary challenges of existing studies and highlight
+promising avenues for future research. More publication details and
+corresponding open-source codes can be accessed and will be continuously
+updated at our repository:
+https://github.com/gongchenghua/Awesome-Survey-Graphs-with-Heterophily.
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Piecewise Normalizing Flows + + +
+ Normalizing flows are an established approach for modelling complex
+probability densities through invertible transformations from a base
+distribution. However, the accuracy with which the target distribution can be
+captured by the normalizing flow is strongly influenced by the topology of the
+base distribution. A mismatch between the topology of the target and the base
+can result in poor performance, as is typically the case for multi-modal
+problems. A number of different works have attempted to modify the topology of
+the base distribution to better match the target, either through the use of
+Gaussian Mixture Models (Izmailov et al., 2020; Ardizzone et al., 2020;
+Hagemann & Neumayer, 2021) or learned accept/reject sampling (Stimper et al.,
+2022). We introduce piecewise normalizing flows, which divide the target
+distribution into clusters, with topologies that better match the standard
+normal base distribution, and train a series of flows to model complex
+multi-modal targets. We demonstrate the performance of the piecewise flows
+using some standard benchmarks and compare the accuracy of the flows to the
+approach taken in Stimper et al. (2022) for modelling multi-modal
+distributions. We find that our approach consistently outperforms the approach
+in Stimper et al. (2022), with higher emulation accuracy on the standard
+benchmarks.
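+ The construction reduces to cluster-then-fit (a sketch; make_flow and
+.sample stand in for any normalizing-flow trainer and sampler, not a specific
+library's API):
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+
+ def fit_piecewise_flows(samples, n_clusters, make_flow):
+     # Split the multi-modal target into clusters whose topology better
+     # matches the standard normal base distribution of each flow.
+     km = KMeans(n_clusters=n_clusters, n_init=10).fit(samples)
+     flows, weights = [], []
+     for k in range(n_clusters):
+         part = samples[km.labels_ == k]
+         flows.append(make_flow(part))            # one flow per cluster
+         weights.append(len(part) / len(samples))
+     return flows, np.array(weights)
+
+ def sample_piecewise(flows, weights, n, rng=np.random.default_rng(0)):
+     counts = rng.multinomial(n, weights)         # mixture over the flows
+     return np.concatenate([f.sample(c) for f, c in zip(flows, counts)])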
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ EE-LLM: Large-Scale Training and Inference of Early-Exit Large Language + Models with 3D Parallelism + + +
+ We present EE-LLM, a framework for large-scale training and inference of +early-exit large language models (LLMs). While recent works have shown +preliminary evidence for the efficacy of early exiting in accelerating LLM +inference, EE-LLM makes a foundational step towards scaling up early-exit LLMs +by supporting their training and inference with massive 3D parallelism. Built +upon Megatron-LM, EE-LLM implements a variety of algorithmic innovations and +performance optimizations tailored to early exiting, including a lightweight +method that facilitates backpropagation for the early-exit training objective +with pipeline parallelism, techniques of leveraging idle resources in the +original pipeline schedule for computation related to early-exit layers, and +two approaches of early-exit inference that are compatible with KV caching for +autoregressive generation. Our analytical and empirical study shows that EE-LLM +achieves great training efficiency with negligible computational overhead +compared to standard LLM training, as well as outstanding inference speedup +without compromising output quality. To facilitate further research and +adoption, we release EE-LLM at https://github.com/pan-x-c/EE-LLM. + +
+
+ comment: arXiv v2 update: extended related works and formal analysis of + training efficiency. We will continuously update the codebase and arXiv + version +
+
+
+
+
+ + ♻ ☆ Hierarchical Continual Reinforcement Learning via Large Language Model + + +
+ The ability to learn continuously in dynamic environments is a crucial
+requirement for reinforcement learning (RL) agents applied in the real world.
+Despite progress in continual reinforcement learning (CRL), existing methods
+often suffer from insufficient knowledge transfer, particularly when the tasks
+are diverse. To address this challenge, we propose a new framework,
+Hierarchical Continual reinforcement learning via large language model
+(Hi-Core), designed to facilitate the transfer of high-level knowledge. Hi-Core
+orchestrates a two-layer structure: high-level policy formulation by a large
+language model (LLM), which generates a sequence of goals, and low-level policy
+learning that closely aligns with goal-oriented RL practices, producing the
+agent's actions in response to the goals set forth. The framework employs
+feedback to iteratively adjust and verify high-level policies, storing them
+along with low-level policies within a skill library. When encountering a new
+task, Hi-Core retrieves relevant experience from this library to aid learning.
+Through experiments on Minigrid, Hi-Core has demonstrated its effectiveness in
+handling diverse CRL tasks, outperforming popular baselines.
+
+
+
+
+ + ♻ ☆ Small Language Models Improve Giants by Rewriting Their Outputs EACL 2024 + + +
+ Despite the impressive performance of large language models (LLMs), they +often lag behind specialized models in various tasks. LLMs only use a fraction +of the existing training data for in-context learning, while task-specific +models harness the full dataset for fine-tuning. In this work, we tackle the +problem of leveraging training data to improve the performance of LLMs without +fine-tuning. Our approach directly targets LLM predictions without requiring +access to their weights. We create a pool of candidates from the LLM through +few-shot prompting and we employ a compact model, the LM-corrector (LMCor), +specifically trained to merge these candidates to produce an enhanced output. +Our experiments on four natural language generation tasks demonstrate that even +a small LMCor model (250M) substantially improves the few-shot performance of +LLMs (62B), matching and even outperforming standard fine-tuning. Furthermore, +we illustrate the robustness of LMCor against different prompts, thereby +minimizing the need for extensive prompt engineering. Finally, we show that +LMCor can be seamlessly integrated with different LLMs at inference, serving as +a plug-and-play module to improve their performance. + +
+
+ comment: Accepted at EACL 2024 +
+
+
+
+
+ + ♻ ☆ A Theoretical Analysis of Noise Geometry in Stochastic Gradient Descent + + +
+ In this paper, we provide a theoretical study of noise geometry for minibatch +stochastic gradient descent (SGD), a phenomenon where noise aligns favorably +with the geometry of local landscape. We propose two metrics, derived from +analyzing how noise influences the loss and subspace projection dynamics, to +quantify the alignment strength. We show that for (over-parameterized) linear +models and two-layer nonlinear networks, when measured by these metrics, the +alignment can be provably guaranteed under conditions independent of the degree +of over-parameterization. To showcase the utility of our noise geometry +characterizations, we present a refined analysis of the mechanism by which SGD +escapes from sharp minima. We reveal that unlike gradient descent (GD), which +escapes along the sharpest directions, SGD tends to escape from flatter +directions and cyclical learning rates can exploit this SGD characteristic to +navigate more effectively towards flatter regions. Lastly, extensive +experiments are provided to support our theoretical findings. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ Diffusion Model Conditioning on Gaussian Mixture Model and Negative + Gaussian Mixture Gradient + + +
+ Diffusion models (DMs) are a type of generative model that has had a huge
+impact on image synthesis and beyond. They achieve state-of-the-art generation
+results in various generative tasks. A great diversity of conditioning inputs,
+such as text or bounding boxes, are accessible to control the generation. In
+this work, we propose a conditioning mechanism utilizing Gaussian mixture
+models (GMMs) as feature conditioning to guide the denoising process. Based on
+set theory, we provide a comprehensive theoretical analysis showing that the
+conditional latent distributions based on features and on classes differ
+significantly, so that conditioning the latent distribution on features
+produces fewer defective generations than conditioning on classes. Two
+diffusion models conditioned on the Gaussian mixture model are trained
+separately for comparison. Experiments support our findings. A novel gradient
+function called the negative Gaussian mixture gradient (NGMG) is proposed and
+applied in diffusion model training with an additional classifier, improving
+training stability. We also theoretically prove that NGMG shares the same
+benefit as the Earth Mover's distance (Wasserstein) as a more sensible cost
+function when learning distributions supported by low-dimensional manifolds.
+
+
+
+
+ + ♻ ☆ FedIN: Federated Intermediate Layers Learning for Model Heterogeneity + + +
+ Federated learning (FL) facilitates edge devices to cooperatively train a +global shared model while maintaining the training data locally and privately. +However, a common assumption in FL requires the participating edge devices to +have similar computation resources and train on an identical global model +architecture. In this study, we propose an FL method called Federated +Intermediate Layers Learning (FedIN), supporting heterogeneous models without +relying on any public dataset. Instead, FedIN leverages the inherent knowledge +embedded in client model features to facilitate knowledge exchange. The +training models in FedIN are partitioned into three distinct components: an +extractor, intermediate layers, and a classifier. We capture client features by +extracting the outputs of the extractor and the inputs of the classifier. To +harness the knowledge from client features, we propose IN training for aligning +the intermediate layers based on features obtained from other clients. IN +training only needs minimal memory and communication overhead by utilizing a +single batch of client features. Additionally, we formulate and address a +convex optimization problem to mitigate the challenge of gradient divergence +caused by conflicts between IN training and local training. The experiment +results demonstrate the superior performance of FedIN in heterogeneous model +environments compared to state-of-the-art algorithms. Furthermore, our ablation +study demonstrates the effectiveness of IN training and the proposed solution +for alleviating gradient divergence. + +
+
+
+
+
+ + ♻ ☆ Implicit Manifold Gaussian Process Regression + + +
+ Gaussian process regression is widely used because of its ability to provide
+well-calibrated uncertainty estimates and handle small or sparse datasets.
+However, it struggles with high-dimensional data. One possible way to scale
+this technique to higher dimensions is to leverage the implicit low-dimensional
+manifold upon which the data actually lies, as postulated by the manifold
+hypothesis. Prior work, however, ordinarily requires the manifold structure to
+be explicitly provided, i.e., given by a mesh or known to be one of the
+well-known manifolds, such as the sphere. In contrast, in this paper we propose
+a Gaussian process regression technique capable of inferring implicit structure
+directly from data (labeled and unlabeled) in a fully differentiable way. For
+the resulting model, we discuss its convergence to the Mat\'ern Gaussian
+process on the assumed manifold. Our technique scales up to hundreds of
+thousands of data points, and may improve the predictive performance and
+calibration of the standard Gaussian process regression in high-dimensional
+settings.
+
+
+
+
+ + ♻ ☆ Efficacy of MRI data harmonization in the age of machine learning. A + multicenter study across 36 datasets + + +
+ Pooling publicly-available MRI data from multiple sites makes it possible to
+assemble extensive groups of subjects, increase statistical power, and promote
+data reuse with machine learning techniques. The harmonization of multicenter
+data is necessary to reduce the confounding effect associated with
+non-biological sources of variability in the data. However, when applied to the
+entire dataset before machine learning, harmonization leads to data leakage,
+because information outside the training set may affect model building and
+potentially falsely overestimate performance. We propose 1) a measure of the
+efficacy of data harmonization; 2) a harmonizer transformer, i.e., an
+implementation of the ComBat harmonization that allows its encapsulation among
+the preprocessing steps of a machine learning pipeline, avoiding data leakage.
+We tested these tools using brain T1-weighted MRI data from 1740 healthy
+subjects acquired at 36 sites. After harmonization, the site effect was removed
+or reduced, and we showed the data leakage effect in predicting individual age
+from MRI data, highlighting that introducing the harmonizer transformer into a
+machine learning pipeline avoids data leakage.
+
+
+
+
+ + ♻ ☆ Generative machine learning methods for multivariate ensemble + post-processing + + +
+ Ensemble weather forecasts based on multiple runs of numerical weather +prediction models typically show systematic errors and require post-processing +to obtain reliable forecasts. Accurately modeling multivariate dependencies is +crucial in many practical applications, and various approaches to multivariate +post-processing have been proposed where ensemble predictions are first +post-processed separately in each margin and multivariate dependencies are then +restored via copulas. These two-step methods share common key limitations, in +particular the difficulty to include additional predictors in modeling the +dependencies. We propose a novel multivariate post-processing method based on +generative machine learning to address these challenges. In this new class of +nonparametric data-driven distributional regression models, samples from the +multivariate forecast distribution are directly obtained as output of a +generative neural network. The generative model is trained by optimizing a +proper scoring rule which measures the discrepancy between the generated and +observed data, conditional on exogenous input variables. Our method does not +require parametric assumptions on univariate distributions or multivariate +dependencies and allows for incorporating arbitrary predictors. In two case +studies on multivariate temperature and wind speed forecasting at weather +stations over Germany, our generative model shows significant improvements over +state-of-the-art methods and particularly improves the representation of +spatial dependencies. + +
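+ A common proper scoring rule for training and evaluating such sample-based
+multivariate forecasts is the energy score; a minimal evaluation sketch
+(illustrative, not the authors' training code):
+
+ import numpy as np
+
+ def energy_score(samples, obs):
+     # samples: (m, d) draws from the forecast; obs: (d,) observation.
+     term1 = np.mean(np.linalg.norm(samples - obs, axis=1))
+     diffs = samples[:, None, :] - samples[None, :, :]
+     term2 = np.mean(np.linalg.norm(diffs, axis=-1))
+     return term1 - 0.5 * term2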
+
+
+
+
+ + ♻ ☆ Corruption-Robust Lipschitz Contextual Search ALT 2024 + + +
+ I study the problem of learning a Lipschitz function with corrupted binary
+signals. The learner tries to learn an $L$-Lipschitz function $f: [0,1]^d
+\rightarrow [0, L]$ that the adversary chooses. There is a total of $T$ rounds.
+In each round $t$, the adversary selects a context vector $x_t$ in the input
+space, and the learner makes a guess at the true function value $f(x_t)$ and
+receives a binary signal indicating whether the guess is high or low. In a
+total of $C$ rounds, the signal may be corrupted, though the value of $C$ is
+\emph{unknown} to the learner. The learner's goal is to incur a small
+cumulative loss. This work introduces the new algorithmic technique of
+\emph{agnostic checking} as well as new analysis techniques. I design
+algorithms with the following guarantees: for the symmetric loss, the learner
+achieves regret $L\cdot O(C\log T)$ with $d = 1$ and $L\cdot O_d(C\log T +
+T^{(d-1)/d})$ with $d > 1$; for the pricing loss, the learner achieves regret
+$L\cdot \widetilde{O} (T^{d/(d+1)} + C\cdot T^{1/(d+1)})$.
+
+ comment: Accepted at ALT 2024 +
+
+
+
+
+ + ♻ ☆ Fine-Tune Language Models as Multi-Modal Differential Equation Solvers + + +
+ In the growing domain of scientific machine learning, in-context operator
+learning has shown notable potential in building foundation models, as in this
+framework the model is trained to learn operators and solve differential
+equations using prompted data, during the inference stage without weight
+updates. However, the current model's overdependence on function data
+overlooks the invaluable human insight into the operator. To address this, we
+present a transformation of in-context operator learning into a multi-modal
+paradigm. In particular, we take inspiration from the recent success of large
+language models, and propose using "captions" to integrate human knowledge
+about the operator, expressed through natural language descriptions and
+equations. Also, we introduce a novel approach to train a language-model-like
+architecture, or directly fine-tune existing language models, for in-context
+operator learning. We beat the baseline on single-modal learning tasks, and
+also demonstrate the effectiveness of multi-modal learning in enhancing
+performance and reducing function data requirements. The proposed method not
+only significantly enhances the development of the in-context operator learning
+paradigm, but also creates a new path for the application of language models.
+
+
+
+
+ + ♻ ☆ The curse of overparametrization in adversarial training: Precise + analysis of robust generalization for random features regression + + +
+ Successful deep learning models often involve training neural network +architectures that contain more parameters than the number of training samples. +Such overparametrized models have been extensively studied in recent years, and +the virtues of overparametrization have been established from both the +statistical perspective, via the double-descent phenomenon, and the +computational perspective via the structural properties of the optimization +landscape. + Despite the remarkable success of deep learning architectures in the +overparametrized regime, it is also well known that these models are highly +vulnerable to small adversarial perturbations in their inputs. Even when +adversarially trained, their performance on perturbed inputs (robust +generalization) is considerably worse than their best attainable performance on +benign inputs (standard generalization). It is thus imperative to understand +how overparametrization fundamentally affects robustness. + In this paper, we will provide a precise characterization of the role of +overparametrization on robustness by focusing on random features regression +models (two-layer neural networks with random first layer weights). We consider +a regime where the sample size, the input dimension and the number of +parameters grow in proportion to each other, and derive an asymptotically exact +formula for the robust generalization error when the model is adversarially +trained. Our developed theory reveals the nontrivial effect of +overparametrization on robustness and indicates that for adversarially trained +random features models, high overparametrization can hurt robust +generalization. + +
+
+ comment: 86 pages (main file: 25 pages and supplementary: 61 pages). To appear + in the Annals of Statistics +
+
+
+
+
+ + ♻ ☆ A Multi-Grained Symmetric Differential Equation Model for Learning + Protein-Ligand Binding Dynamics + + +
+ In drug discovery, molecular dynamics (MD) simulation for protein-ligand +binding provides a powerful tool for predicting binding affinities, estimating +transport properties, and exploring pocket sites. There has been a long history +of improving the efficiency of MD simulations through better numerical methods +and, more recently, by utilizing machine learning (ML) methods. Yet, challenges +remain, such as accurate modeling of extended-timescale simulations. To address +this issue, we propose NeuralMD, the first ML surrogate that can facilitate +numerical MD and provide accurate simulations in protein-ligand binding. We +propose a principled approach that incorporates a novel physics-informed +multi-grained group symmetric framework. Specifically, we propose (1) a +BindingNet model that satisfies group symmetry using vector frames and captures +the multi-level protein-ligand interactions, and (2) an augmented neural +differential equation solver that learns the trajectory under Newtonian +mechanics. For the experiment, we design ten single-trajectory and three +multi-trajectory binding simulation tasks. We show the efficiency and +effectiveness of NeuralMD, with a 2000$\times$ speedup over standard numerical +MD simulation and outperforming all other ML approaches by up to 80% under the +stability metric. We further qualitatively show that NeuralMD reaches more +stable binding predictions compared to other machine learning methods. + +
+
+
+
+
+ + ♻ ☆ On Accelerating Diffusion-based Molecular Conformation Generation in + SE(3)-invariant Space + + +
+ Diffusion-based generative models in SE(3)-invariant space have demonstrated
+promising performance in molecular conformation generation, but typically
+require solving stochastic differential equations (SDEs) with thousands of
+update steps. To date, it remains unclear how to effectively accelerate this
+procedure explicitly in SE(3)-invariant space, which greatly hinders its wide
+application in the real world. In this paper, we systematically study the
+diffusion mechanism in SE(3)-invariant space through the lens of the
+approximation errors induced by existing methods. Thereby, we develop more
+precise approximations in SE(3)-invariant space in the context of projected
+differential equations. Theoretical analysis is further provided, as well as
+empirical evidence relating hyper-parameters to such errors. Altogether, we
+propose a novel acceleration scheme for generating molecular conformations in
+SE(3)-invariant space. Experimentally, our scheme can generate high-quality
+conformations with a 50x--100x speedup compared to existing methods.
+
+ comment: We are currently developing a new manuscript that significantly + expands upon and integrates the research presented here. The forthcoming + paper includes broader analyses and more comprehensive findings, rendering + the current version obsolete. We believe this decision will contribute to a + clearer and more consolidated presentation of our research findings +
+
+
+
+
+ + ♻ ☆ SELF: Self-Evolution with Language Feedback + + +
+ Large Language Models (LLMs) have demonstrated remarkable versatility across +various domains. To further advance LLMs, we propose 'SELF' (Self-Evolution +with Language Feedback), a novel approach that enables LLMs to self-improve +through self-reflection, akin to human learning processes. SELF initiates with +a meta-skill learning process that equips the LLMs with capabilities for +self-feedback and self-refinement. Subsequently, the model undergoes an +iterative process of self-evolution. In each iteration, it utilizes an +unlabeled dataset of instructions to generate initial responses. These +responses are enhanced through self-feedback and self-refinement. The model is +then fine-tuned using this enhanced data. The model undergoes progressive +improvement through this iterative self-evolution process. Moreover, the SELF +framework enables the model to apply self-refinement during inference, which +further improves response quality. Our experiments in mathematics and general +tasks demonstrate that SELF can enhance the capabilities of LLMs without human +intervention. The SELF framework indicates a promising direction for the +autonomous evolution of LLMs, transitioning them from passive information +receivers to active participants in their development. + +
+
+ comment: 20 pages, 4 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Fair Machine Learning in Healthcare: A Review + + +
+ The digitization of healthcare data coupled with advances in computational
+capabilities has propelled the adoption of machine learning (ML) in healthcare.
+However, these methods can perpetuate or even exacerbate existing disparities,
+leading to fairness concerns such as the unequal distribution of resources and
+diagnostic inaccuracies among different demographic groups. Addressing these
+fairness problems is paramount to prevent further entrenchment of social
+injustices. In this survey, we analyze the intersection of fairness in machine
+learning and healthcare disparities. We adopt a framework based on the
+principles of distributive justice to categorize fairness concerns into two
+distinct classes: equal allocation and equal performance. We provide a critical
+review of the associated fairness metrics from a machine learning standpoint
+and examine biases and mitigation strategies across the stages of the ML
+lifecycle, discussing the relationship between biases and their
+countermeasures. The paper concludes with a discussion of the pressing
+challenges that remain unaddressed in ensuring fairness in healthcare ML, and
+proposes several new research directions that hold promise for developing
+ethical and equitable ML applications in healthcare.
+
+
+
+
+ + ♻ ☆ Synthetic Skull CT Generation with Generative Adversarial Networks to + Train Deep Learning Models for Clinical Transcranial Ultrasound + + +
+ Deep learning offers potential for various healthcare applications, yet +requires extensive datasets of curated medical images where data privacy, cost, +and distribution mismatch across various acquisition centers could become major +problems. To overcome these challenges, we propose a generative adversarial +network (SkullGAN) to create large datasets of synthetic skull CT slices, +geared towards training models for transcranial ultrasound. With wide ranging +applications in treatment of essential tremor, Parkinson's, and Alzheimer's +disease, transcranial ultrasound clinical pipelines can be significantly +optimized via integration of deep learning. The main roadblock is the lack of +sufficient skull CT slices for the purposes of training, which SkullGAN aims to +address. Actual CT slices of 38 healthy subjects were used for training. The +generated synthetic skull images were then evaluated based on skull density +ratio, mean thickness, and mean intensity. Their fidelity was further analyzed +using t-distributed stochastic neighbor embedding (t-SNE), Fr\'echet inception +distance (FID) score, and visual Turing test (VTT) taken by four staff clinical +radiologists. SkullGAN-generated images demonstrated similar quantitative +radiological features to real skulls. t-SNE failed to separate real and +synthetic samples from one another, and the FID score was 49. Expert +radiologists achieved a 60\% mean accuracy on the VTT. SkullGAN makes it +possible for researchers to generate large numbers of synthetic skull CT +segments, necessary for training neural networks for medical applications +involving the human skull, such as transcranial focused ultrasound, mitigating +challenges with access, privacy, capital, time, and the need for domain +expertise. + +
+
+ comment: The first two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Minimum Width of Leaky-ReLU Neural Networks for Uniform Universal + Approximation + + +
+ The study of universal approximation properties (UAP) for neural networks +(NN) has a long history. When the network width is unlimited, only a single +hidden layer is sufficient for UAP. In contrast, when the depth is unlimited, +the width for UAP needs to be at least the critical width +$w^*_{\min}=\max(d_x,d_y)$, where $d_x$ and $d_y$ are the dimensions of the +input and output, respectively. Recently, \cite{cai2022achieve} showed that a +leaky-ReLU NN with this critical width can achieve UAP for $L^p$ functions on a +compact domain ${K}$, \emph{i.e.,} the UAP for $L^p({K},\mathbb{R}^{d_y})$. +This paper examines a uniform UAP for the function class +$C({K},\mathbb{R}^{d_y})$ and gives the exact minimum width of the leaky-ReLU +NN as $w_{\min}=\max(d_x,d_y)+\Delta (d_x, d_y)$, where $\Delta (d_x, d_y)$ is +the number of additional dimensions needed for approximating continuous functions with +diffeomorphisms via embedding. To obtain this result, we propose a novel +lift-flow-discretization approach that shows that the uniform UAP has a deep +connection with topological theory. + +
+
+ comment: Include errata of the previous versions +
+
+
+
+
+ + ♻ ☆ Exploring Simple, High Quality Out-of-Distribution Detection with L2 + Normalization + + +
+ We demonstrate that L2 normalization over feature space can produce capable +performance for Out-of-Distribution (OoD) detection for some models and +datasets. Although it does not demonstrate outright state-of-the-art +performance, this method is notable for its extreme simplicity: it requires +only two additional lines of code, and does not need specialized loss functions, +image augmentations, outlier exposure or extra parameter tuning. We also +observe that training may be more efficient for some datasets and +architectures. Notably, only 60 epochs with ResNet18 on CIFAR10 (or 100 epochs +with ResNet50) can produce performance within two percentage points (AUROC) of +several state-of-the-art methods for some near and far OoD datasets. We provide +theoretical and empirical support for this method, and demonstrate viability +across five architectures and three In-Distribution (ID) datasets. + +
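To make the "two additional lines" concrete, here is a hedged sketch of feature-space L2 normalization paired with a simple cosine-to-class-mean OoD score; the exact placement of the normalization and the scoring rule used in the paper may differ.

```python
import numpy as np

def l2_normalize(feats, eps=1e-8):
    # The "extra lines": project features onto the unit hypersphere.
    return feats / (np.linalg.norm(feats, axis=1, keepdims=True) + eps)

def ood_scores(train_feats, train_labels, test_feats):
    train_feats = l2_normalize(train_feats)
    test_feats = l2_normalize(test_feats)
    means = l2_normalize(np.stack(
        [train_feats[train_labels == c].mean(0)
         for c in np.unique(train_labels)]))
    # Lower max cosine similarity to any class mean => more OoD.
    return 1.0 - (test_feats @ means.T).max(axis=1)

rng = np.random.default_rng(0)
id_feats = rng.normal(1.0, 0.1, (100, 16))      # toy in-distribution features
labels = rng.integers(0, 4, 100)
ood_feats = rng.normal(0.0, 1.0, (10, 16))      # toy out-of-distribution features
print("ID :", ood_scores(id_feats, labels, id_feats[:10]).mean())
print("OoD:", ood_scores(id_feats, labels, ood_feats).mean())   # higher
```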
+
+
+
+
+ + ♻ ☆ Langevin Unlearning: A New Perspective of Noisy Gradient Descent for + Machine Unlearning + + +
+ Machine unlearning has raised significant interest with the adoption of laws +ensuring the ``right to be forgotten''. Researchers have provided a +probabilistic notion of approximate unlearning under a definition similar to +Differential Privacy (DP), where privacy is defined as statistical +indistinguishability to retraining from scratch. We propose Langevin +unlearning, an unlearning framework based on noisy gradient descent with +privacy guarantees for approximate unlearning problems. Langevin unlearning +unifies the DP learning process and the privacy-certified unlearning process +with many algorithmic benefits. These include approximate certified unlearning +for non-convex problems, complexity savings compared to retraining, and +sequential and batch unlearning for multiple unlearning requests. We verify the +practicality of Langevin unlearning by studying its privacy-utility-complexity +trade-off via experiments on benchmark datasets, and also demonstrate its +superiority against gradient-descent-plus-output-perturbation based approximate +unlearning. + +
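As a rough illustration of the noisy-gradient-descent backbone: after records are deleted, training continues on the remaining data for a few noisy steps instead of retraining from scratch. This is only a sketch on a least-squares toy problem; the step size, noise scale, and the certified-privacy accounting are the paper's contribution and are not reproduced here.

```python
import numpy as np

def noisy_gd(w, X, y, steps, lr=0.1, sigma=0.05, rng=None):
    # Langevin-style update: gradient step plus isotropic Gaussian noise.
    rng = rng or np.random.default_rng(0)
    for _ in range(steps):
        grad = X.T @ (X @ w - y) / len(y)          # least-squares gradient
        w = w - lr * grad + sigma * np.sqrt(2 * lr) * rng.normal(size=w.shape)
    return w

rng = np.random.default_rng(1)
X, y = rng.normal(size=(200, 5)), rng.normal(size=200)
w = noisy_gd(np.zeros(5), X, y, steps=100, rng=rng)    # "learning" phase
X_keep, y_keep = X[:150], y[:150]                      # delete 50 records
w = noisy_gd(w, X_keep, y_keep, steps=20, rng=rng)     # "unlearning" phase
```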
+
+
+
+
+ + ♻ ☆ FORESEE: Prediction with Expansion-Compression Unscented Transform for + Online Policy Optimization + + +
+ Propagating state distributions through a generic, uncertain nonlinear +dynamical model is known to be intractable and usually begets numerical or +analytical approximations. We introduce a method for state prediction, called +the Expansion-Compression Unscented Transform, and use it to solve a class of +online policy optimization problems. Our proposed algorithm propagates a finite +number of sigma points through a state-dependent distribution, which dictates +an increase in the number of sigma points at each time step to represent the +resulting distribution; this is what we call the expansion operation. To keep +the algorithm scalable, we augment the expansion operation with a compression +operation based on moment matching, thereby keeping the number of sigma points +constant across predictions over multiple time steps. Its performance is +empirically shown to be comparable to Monte Carlo but at a much lower +computational cost. Under state and control input constraints, the state +prediction is subsequently used in tandem with a proposed variant of +constrained gradient-descent for online update of policy parameters in a +receding horizon fashion. The framework is implemented as a differentiable +computational graph for policy training. We showcase our framework for a +quadrotor stabilization task as part of a benchmark comparison in +safe-control-gym and for optimizing the parameters of a Control Barrier +Function based controller in a leader-follower problem. + +
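The compression operation can be illustrated in isolation: an expanded cloud of propagated sigma points is moment-matched back to a fixed-size symmetric set, which is the mechanism that keeps the point count constant across time steps. The weights and constants below are simplified assumptions, not the paper's exact scheme.

```python
import numpy as np

def compress(points, kappa=0.0):
    """Moment-match an (N, d) point cloud back to 2d+1 sigma points."""
    d = points.shape[1]
    mean = points.mean(axis=0)
    cov = np.cov(points, rowvar=False)
    # Columns of the Cholesky factor give symmetric offsets around the mean.
    L = np.linalg.cholesky((d + kappa) * cov + 1e-9 * np.eye(d))
    sigma = [mean]
    for i in range(d):
        sigma.append(mean + L[:, i])
        sigma.append(mean - L[:, i])
    return np.array(sigma)

cloud = np.random.default_rng(2).normal(size=(500, 3))   # "expanded" points
sp = compress(cloud)
print(sp.shape)                                 # (7, 3): constant-size set
print(np.allclose(sp.mean(0), cloud.mean(0)))   # mean is preserved exactly
```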
+
+
+
+
+ + ♻ ☆ Fair Sampling in Diffusion Models through Switching Mechanism AAAI 2024 + + +
+ Diffusion models have shown their effectiveness in generation tasks by +well-approximating the underlying probability distribution. However, diffusion +models are known to amplify inherent biases in the training +data, raising fairness concerns. While the sampling process of diffusion models can +be controlled by conditional guidance, previous works have attempted to find +empirical guidance to achieve quantitative fairness. To address this +limitation, we propose a fairness-aware sampling method called the +\textit{attribute switching} mechanism for diffusion models. Without additional +training, the proposed sampling can obfuscate sensitive attributes in generated +data without relying on classifiers. We mathematically prove and experimentally +demonstrate the effectiveness of the proposed method on two key aspects: (i) +the generation of fair data and (ii) the preservation of the utility of the +generated data. + +
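Schematically, the sampling change amounts to a one-line switch inside the reverse-diffusion loop: condition on one attribute value until a switch step, then continue with the other. In the sketch below, `denoise_step` is a hypothetical single reverse step and the switch point `tau` is an illustrative tunable.

```python
import numpy as np

def sample_with_switch(denoise_step, x_T, attr_a, attr_b, T=1000, tau=600):
    x = x_T
    for t in range(T, 0, -1):
        attr = attr_a if t > tau else attr_b   # switch attribute at t = tau
        x = denoise_step(x, t, attr)
    return x

# Dummy denoiser so the sketch runs end to end (not a real diffusion model).
def denoise_step(x, t, attr):
    return 0.999 * x + 0.001 * attr

x = sample_with_switch(denoise_step, np.ones(4), attr_a=0.0, attr_b=1.0)
print(x)
```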
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Generalization of LiNGAM that allows confounding + + +
+ LiNGAM determines the variable order from cause to effect using additive +noise models, but it faces challenges with confounding. Previous methods +maintained LiNGAM's fundamental structure while trying to identify and address +variables affected by confounding. As a result, these methods required +significant computational resources regardless of the presence of confounding, +and they did not ensure the detection of all confounding types. In contrast, +this paper enhances LiNGAM by introducing LiNGAM-MMI, a method that quantifies +the magnitude of confounding using KL divergence and arranges the variables to +minimize its impact. This method efficiently achieves a globally optimal +variable order through the shortest path problem formulation. LiNGAM-MMI +processes data as efficiently as traditional LiNGAM in scenarios without +confounding while effectively addressing confounding situations. Our +experimental results suggest that LiNGAM-MMI more accurately determines the +correct variable order, both in the presence and absence of confounding. + +
+
+
+
+
+ + ♻ ☆ Tackling Interference Induced by Data Training Loops in A/B Tests: A + Weighted Training Approach + + +
+ In modern recommendation systems, the standard pipeline involves training +machine learning models on historical data to predict user behaviors and +improve recommendations continuously. However, these data training loops can +introduce interference in A/B tests, where data generated by control and +treatment algorithms, potentially with different distributions, are combined. +To address these challenges, we introduce a novel approach called weighted +training. This approach entails training a model to predict the probability of +each data point appearing in either the treatment or control data and +subsequently applying weighted losses during model training. We demonstrate +that this approach achieves the least variance among all estimators without +causing shifts in the training distributions. Through simulation studies, we +demonstrate the lower bias and variance of our approach compared to other +methods. + +
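A hedged sketch of the weighting idea: fit a classifier predicting whether each point came from the treatment or control arm, then reweight each point's loss by the inverse of that probability so neither arm's distribution dominates. It uses scikit-learn for the arm-membership model; the paper's exact weighting and loss may differ.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(3)
X = rng.normal(size=(1000, 4))
arm = rng.integers(0, 2, 1000)        # 0 = control, 1 = treatment

prop = LogisticRegression().fit(X, arm)           # P(treatment | x)
p_treat = prop.predict_proba(X)[:, 1]
weights = np.where(arm == 1,
                   1.0 / np.clip(p_treat, 1e-3, None),
                   1.0 / np.clip(1.0 - p_treat, 1e-3, None))
# The weights then enter the downstream model's loss, e.g.:
# downstream_model.fit(X, y, sample_weight=weights)
print(weights[:5])
```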
+
+
+
+
+ + ♻ ☆ Commonsense for Zero-Shot Natural Language Video Localization AAAI 2024 + + +
+ Zero-shot Natural Language-Video Localization (NLVL) methods have exhibited +promising results in training NLVL models exclusively with raw video data by +dynamically generating video segments and pseudo-query annotations. However, +existing pseudo-queries often lack grounding in the source video, resulting in +unstructured and disjointed content. In this paper, we investigate the +effectiveness of commonsense reasoning in zero-shot NLVL. Specifically, we +present CORONET, a zero-shot NLVL framework that leverages commonsense to +bridge the gap between videos and generated pseudo-queries via a commonsense +enhancement module. CORONET employs Graph Convolution Networks (GCN) to encode +commonsense information extracted from a knowledge graph, conditioned on the +video, and cross-attention mechanisms to enhance the encoded video and +pseudo-query representations prior to localization. Through empirical +evaluations on two benchmark datasets, we demonstrate that CORONET surpasses +both zero-shot and weakly supervised baselines, achieving improvements up to +32.13% across various recall thresholds and up to 6.33% in mIoU. These results +underscore the significance of leveraging commonsense reasoning for zero-shot +NLVL. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ A First Look at Information Highlighting in Stack Overflow Answers + + +
+ Context: Navigating the knowledge of Stack Overflow (SO) remains challenging. +To make the posts vivid to users, SO allows users to write and edit posts with +Markdown or HTML so that users can leverage various formatting styles (e.g., +bold, italic, and code) to highlight the important information. Nonetheless, +there have been limited studies on the highlighted information. Objective: We +carried out the first large-scale exploratory study on the information +highlighted in SO answers in our recent study. To extend our previous study, we +develop approaches to automatically recommend highlighted content with +formatting styles using neural network architectures initially designed for the +Named Entity Recognition task. Method: In this paper, we studied 31,169,429 +answers of Stack Overflow. For training recommendation models, we choose CNN +and BERT models for each type of formatting (i.e., Bold, Italic, Code, and +Heading) using the information highlighting dataset we collected from SO +answers. Results: Our models based on CNN architecture achieve precision +ranging from 0.71 to 0.82. The trained model for automatic code content +highlighting achieves a recall of 0.73 and an F1 score of 0.71, outperforming +the trained models for other formatting styles. The BERT models have even lower +recalls and F1 scores than the CNN models. Our analysis of failure cases +indicates that the majority of the failure cases are missing identification +(i.e., the model misses the content that is supposed to be highlighted) because +the models tend to learn the frequently highlighted words while struggling to +learn less frequent ones. Conclusion: Our findings suggest that it is possible +to develop recommendation models for highlighting information for answers with +different formatting styles on Stack Overflow. + +
+
+ comment: This work is submitted to Information and Software Technology Journal +
+
+
+
+
+ + ♻ ☆ GD doesn't make the cut: Three ways that non-differentiability affects + neural network training + + +
+ This paper investigates the distinctions between gradient methods applied to +non-differentiable functions (NGDMs) and classical gradient descents (GDs) +designed for differentiable functions. First, we demonstrate significant +differences in the convergence properties of NGDMs compared to GDs, challenging +the applicability of the extensive neural network convergence literature based +on $L$-smoothness to non-smooth neural networks. Next, we demonstrate the +paradoxical nature of NGDM solutions for $L_{1}$-regularized problems, showing +that increasing the regularization penalty leads to an increase in the $L_{1}$ +norm of optimal solutions in NGDMs. Consequently, we show that widely adopted +$L_{1}$ penalization-based techniques for network pruning do not yield expected +results. Finally, we explore the Edge of Stability phenomenon, indicating its +inapplicability even to Lipschitz continuous convex differentiable functions, +leaving its relevance to non-convex non-differentiable neural networks +inconclusive. Our analysis exposes misguided interpretations of NGDMs in widely +referenced papers and texts due to an overreliance on strong smoothness +assumptions, emphasizing the necessity for a nuanced understanding of +foundational assumptions in the analysis of these systems. + +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Gain of Grain: A Film Grain Handling Toolchain for VVC-based Open + Implementations + + +
+ Film grain is a distinctive visual characteristic cherished by filmmakers and +cinephiles for its ability to evoke nostalgia and artistic aesthetics. However, +faithful preservation of film grain during encoding poses unique challenges. +Film grain introduces random noise, complicating traditional compression +techniques. Consequently, specialized algorithms and encoding strategies have +emerged, aiming to strike a harmonious equilibrium. This paper delves into the +nuanced realm of film grain handling in Versatile Video Coding (VVC) encoding. +We explore the delicate balance between retaining the cinematic charm of film +grain and achieving efficient compression. Moreover, we discuss the importance +of perceptual quality assessment and adaptive encoding techniques in preserving +film grain fidelity. Additionally, we delve into the impact of film grain +handling on bitrate control and compression efficiency using VVenC, an open and +optimized VVC encoder. Understanding the role of film grain and its nuanced +treatment within encoders becomes increasingly pivotal for delivering +high-quality, grain-inclusive content in the digital age. + +
+
+ comment: 2024 Mile High Video (MHV) +
+
+
+
+
+ + ♻ ☆ Generalized Video Anomaly Event Detection: Systematic Taxonomy and + Comparison of Deep Models + + +
+ Video Anomaly Detection (VAD) serves as a pivotal technology in the +intelligent surveillance systems, enabling the temporal or spatial +identification of anomalous events within videos. While existing reviews +predominantly concentrate on conventional unsupervised methods, they often +overlook the emergence of weakly-supervised and fully-unsupervised approaches. +To address this gap, this survey extends the conventional scope of VAD beyond +unsupervised methods, encompassing a broader spectrum termed Generalized Video +Anomaly Event Detection (GVAED). By skillfully incorporating recent +advancements rooted in diverse assumptions and learning frameworks, this survey +introduces an intuitive taxonomy that seamlessly navigates through +unsupervised, weakly-supervised, supervised and fully-unsupervised VAD +methodologies, elucidating the distinctions and interconnections within these +research trajectories. In addition, this survey facilitates prospective +researchers by assembling a compilation of research resources, including public +datasets, available codebases, programming tools, and pertinent literature. +Furthermore, this survey quantitatively assesses model performance, delves into +research challenges and directions, and outlines potential avenues for future +exploration. + +
+
+ comment: Accepted by ACM Computing Surveys. For more information, please see + our project page: https://github.com/fudanyliu/GVAED +
+
+
+
+
+ + ♻ ☆ CL2CM: Improving Cross-Lingual Cross-Modal Retrieval via Cross-Lingual + Knowledge Transfer AAAI2024 + + +
+ Cross-lingual cross-modal retrieval has garnered increasing attention +recently, which aims to achieve the alignment between vision and target +language (V-T) without using any annotated V-T data pairs. Current methods +employ machine translation (MT) to construct pseudo-parallel data pairs, which +are then used to learn a multi-lingual and multi-modal embedding space that +aligns visual and target-language representations. However, the large +heterogeneous gap between vision and text, along with the noise present in +target language translations, poses significant challenges in effectively +aligning their representations. To address these challenges, we propose a +general framework, Cross-Lingual to Cross-Modal (CL2CM), which improves the +alignment between vision and target language using cross-lingual transfer. This +approach allows us to fully leverage the merits of multi-lingual pre-trained +models (e.g., mBERT) and the benefits of the same modality structure, i.e., +smaller gap, to provide reliable and comprehensive semantic correspondence +(knowledge) for the cross-modal network. We evaluate our proposed approach on +two multilingual image-text datasets, Multi30K and MSCOCO, and one video-text +dataset, VATEX. The results clearly demonstrate the effectiveness of our +proposed method and its high potential for large-scale retrieval. + +
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 81 + +
+
+
+ + ☆ Do Language Models Exhibit the Same Cognitive Biases in Problem Solving + as Human Learners? + + +
+ There is increasing interest in employing large language models (LLMs) as +cognitive models. For such purposes, it is central to understand which +cognitive properties are well-modeled by LLMs, and which are not. In this work, +we study the biases of LLMs in relation to those known in children when solving +arithmetic word problems. Surveying the learning science literature, we posit +that the problem-solving process can be split into three distinct steps: text +comprehension, solution planning and solution execution. We construct tests for +each one in order to understand which parts of this process can be faithfully +modeled by current state-of-the-art LLMs. We generate a novel set of word +problems for each of these tests, using a neuro-symbolic method that enables +fine-grained control over the problem features. We find evidence that LLMs, +with and without instruction-tuning, exhibit human-like biases in both the +text-comprehension and the solution-planning steps of the solving process, but +not during the final step which relies on the problem's arithmetic expressions +(solution execution). + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval + + +
+ Retrieval-augmented language models can better adapt to changes in world +state and incorporate long-tail knowledge. However, most existing methods +retrieve only short contiguous chunks from a retrieval corpus, limiting +holistic understanding of the overall document context. We introduce the novel +approach of recursively embedding, clustering, and summarizing chunks of text, +constructing a tree with differing levels of summarization from the bottom up. +At inference time, our RAPTOR model retrieves from this tree, integrating +information across lengthy documents at different levels of abstraction. +Controlled experiments show that retrieval with recursive summaries offers +significant improvements over traditional retrieval-augmented LMs on several +tasks. On question-answering tasks that involve complex, multi-step reasoning, +we show state-of-the-art results; for example, by coupling RAPTOR retrieval +with the use of GPT-4, we can improve the best performance on the QuALITY +benchmark by 20% in absolute accuracy. + +
+
+
+
+
+ + ☆ LongAlign: A Recipe for Long Context Alignment of Large Language Models + + +
+ Extending large language models to effectively handle long contexts requires +instruction fine-tuning on input sequences of similar length. To address this, +we present LongAlign -- a recipe of the instruction data, training, and +evaluation for long context alignment. First, we construct a long +instruction-following dataset using Self-Instruct. To ensure the data +diversity, it covers a broad range of tasks from various long context sources. +Second, we adopt the packing and sorted batching strategies to speed up +supervised fine-tuning on data with varied length distributions. Additionally, +we develop a loss weighting method to balance the contribution to the loss +across different sequences during packing training. Third, we introduce the +LongBench-Chat benchmark for evaluating instruction-following capabilities on +queries of 10k-100k in length. Experiments show that LongAlign outperforms +existing recipes for LLMs in long context tasks by up to 30\%, while also +maintaining their proficiency in handling short, generic tasks. The code, data, +and long-aligned models are open-sourced at https://github.com/THUDM/LongAlign. + +
+
+
+
+
+ + ☆ Multipath parsing in the brain + + +
+ Humans understand sentences word-by-word, in the order that they hear them. +This incrementality entails resolving temporary ambiguities about syntactic +relationships. We investigate how humans process these syntactic ambiguities by +correlating predictions from incremental generative dependency parsers with +timecourse data from people undergoing functional neuroimaging while listening +to an audiobook. In particular, we compare competing hypotheses regarding the +number of developing syntactic analyses in play during word-by-word +comprehension: one vs more than one. This comparison involves evaluating +syntactic surprisal from a state-of-the-art dependency parser with LLM-adapted +encodings against an existing fMRI dataset. In both English and Chinese data, +we find evidence for multipath parsing. Brain regions associated with this +multipath effect include bilateral superior temporal gyrus. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ SpeechComposer: Unifying Multiple Speech Tasks with Prompt Composition + + +
+ Recent advancements in language models have significantly enhanced +performance in multiple speech-related tasks. Existing speech language models +typically utilize task-dependent prompt tokens to unify various speech tasks in +a single model. However, this design omits the intrinsic connections between +different speech tasks, which can potentially boost the performance of each +task. In this work, we propose a novel decoder-only speech language model, +SpeechComposer, that can unify common speech tasks by composing a fixed set of +prompt tokens. Built upon four primary tasks -- speech synthesis, speech +recognition, speech language modeling, and text language modeling -- +SpeechComposer can easily extend to more speech tasks via compositions of +well-designed prompt tokens, like voice conversion and speech enhancement. The +unification of prompt tokens also makes it possible for knowledge sharing among +different speech tasks in a more structured manner. Experimental results +demonstrate that our proposed SpeechComposer can improve the performance of +both primary tasks and composite tasks, showing the effectiveness of the shared +prompt tokens. Remarkably, the unified decoder-only model achieves a comparable +and even better performance than the baselines which are expert models designed +for single tasks. + +
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ☆ Enhancing End-to-End Multi-Task Dialogue Systems: A Study on Intrinsic + Motivation Reinforcement Learning Algorithms for Improved Training and + Adaptability + + +
+ End-to-end multi-task dialogue systems are usually designed with separate +modules for the dialogue pipeline. Among these, the policy module is essential +for deciding what to do in response to user input. This policy is trained by +reinforcement learning algorithms by taking advantage of an environment in +which an agent receives feedback in the form of a reward signal. The current +dialogue systems, however, only provide meagre and simplistic rewards. +Investigating intrinsic motivation reinforcement learning algorithms is the +goal of this study. Through this, the agent can quickly accelerate training and +improve its capacity to judge the quality of its actions by teaching it an +internal incentive system. In particular, we adapt techniques for random +network distillation and curiosity-driven reinforcement learning to measure the +frequency of state visits and encourage exploration by using semantic +similarity between utterances. Experimental results on MultiWOZ, a +heterogeneous dataset, show that intrinsic motivation-based dialogue systems +outperform policies that depend on extrinsic incentives. By adopting random +network distillation, for example, which is trained using semantic similarity +between user-system dialogues, an astounding average success rate of 73% is +achieved. This is a significant improvement over the baseline Proximal Policy +Optimization (PPO), which has an average success rate of 60%. In addition, +performance indicators such as booking rates and completion rates show a 10% +rise over the baseline. Furthermore, these intrinsic incentive models help +improve the resilience of the system's policy as the number of domains grows. +This implies that they could be useful in scaling up to settings that cover a +wider range of domains. + +
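The random-network-distillation bonus mentioned above can be sketched compactly: the prediction error of a trainable network against a fixed, randomly initialized target network is large for novel states and shrinks as states are revisited, providing an exploration bonus on top of the sparse task reward. The architecture and constants below are illustrative, not the paper's configuration.

```python
import numpy as np

class RND:
    def __init__(self, dim, out=32, lr=0.01, seed=4):
        rng = np.random.default_rng(seed)
        self.W_target = rng.normal(size=(out, dim))  # fixed random target net
        self.W_pred = np.zeros((out, dim))           # trainable predictor
        self.lr = lr

    def reward(self, state):
        target = np.tanh(self.W_target @ state)
        pred = np.tanh(self.W_pred @ state)
        err = pred - target
        # One SGD step on the predictor; the (pre-update) error is the bonus.
        self.W_pred -= self.lr * np.outer(err * (1 - pred**2), state)
        return float((err**2).mean())

rnd = RND(dim=16)
s = np.random.default_rng(5).normal(size=16)
print([round(rnd.reward(s), 4) for _ in range(3)])  # shrinks on revisits
```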
+
+ comment: 6 pages, 1 figure, 18th IEEE International Conference on Semantic + Computing +
+
+
+
+
+ + ☆ Paramanu: A Family of Novel Efficient Indic Generative Foundation + Language Models + + +
+ We present Gyan AI Paramanu ("atom"), a family of novel language models for +Indian languages. It is a collection of auto-regressive monolingual, bilingual, +and multilingual Indic language models pretrained from scratch on a single GPU +for 10 Indian languages (Assamese, Bangla, Hindi, Konkani, Maithili, Marathi, +Odia, Sanskrit, Tamil, Telugu) across 5 scripts (Bangla, Devanagari, Odia, +Tamil, Telugu) of varying sizes ranging from 13.29M to 367.5M. The models are +pretrained with a context size of 1024 on a single GPU. The models are very +efficient, small, fast, and powerful. We have also developed an efficient and +advanced Indic tokenizer that can even tokenize unseen languages. In order to +avoid the "curse of multi-linguality" in our multilingual mParamanu model, we +pretrained on comparable corpora by typological grouping using the same script. +We performed human evaluation of our pretrained models for open-ended text +generation on grammar, coherence, creativity, and factuality metrics for +Bangla, Hindi, and Sanskrit. Our Bangla, Hindi, and Sanskrit models +outperformed GPT-3.5-Turbo (ChatGPT), Bloom 7B, LLaMa-2 7B, OPT 6.7B, GPT-J 6B, +GPTNeo 1.3B, GPT2-XL large language models (LLMs) by a large margin despite +being 20 to 66 times smaller than standard 7B LLMs. To run +inference on our pretrained models, a CPU is enough; no GPU is needed. We +also instruction-tuned our pretrained Bangla, Hindi, Marathi, Tamil, and Telugu +models on 23k instructions in the respective languages. Our pretrained and +instruction-tuned models, which are the first of their kind and the most powerful +and efficient small generative language models yet developed for Indic languages, +together with the various results, lead to the conclusion that high-quality generative +language models are possible without large amounts of compute power and enormous +numbers of parameters. We plan to release our models at https://www.bharatgpts.com. + +
+
+
+
+
+ + ☆ Supporting Anticipatory Governance using LLMs: Evaluating and Aligning + Large Language Models with the News Media to Anticipate the Negative Impacts + of AI + + +
+ Anticipating the negative impacts of emerging AI technologies is a challenge, +especially in the early stages of development. An understudied approach to such +anticipation is the use of LLMs to enhance and guide this process. Despite +advancements in LLMs and evaluation metrics to account for biases in generated +text, it is unclear how well these models perform in anticipatory tasks. +Specifically, the use of LLMs to anticipate AI impacts raises questions about +the quality and range of categories of negative impacts these models are +capable of generating. In this paper we leverage news media, a diverse data +source that is rich with normative assessments of emerging technologies, to +formulate a taxonomy of impacts that acts as a baseline for comparison. By +computationally analyzing thousands of news articles published by hundreds of +online news domains around the world, we develop a taxonomy consisting of ten +categories of AI impacts. We then evaluate both instruction-based (GPT-4 and +Mistral-7B-Instruct) and fine-tuned completion models (Mistral-7B and GPT-3) +using a sample from this baseline. We find that the generated impacts using +Mistral-7B, fine-tuned on impacts from the news media, tend to be qualitatively +on par with impacts generated using a larger scale model such as GPT-4. +Moreover, we find that these LLMs generate impacts that largely reflect the +taxonomy of negative impacts identified in the news media; however, the impacts +produced by instruction-based models showed gaps in certain +categories of impacts in comparison to fine-tuned models. This research +highlights a potential bias in state-of-the-art LLMs when used for anticipating +impacts and demonstrates the advantages of aligning smaller LLMs with a diverse +range of impacts, such as those reflected in the news media, to better reflect +such impacts during anticipatory exercises. + +
+
+ comment: 14 pages + research ethics and social impact statement, references, + and appendix. Under conference review +
+
+
+
+
+ + ☆ Prompt-Driven LLM Safeguarding via Directed Representation Optimization + + +
+ Prepending model inputs with safety prompts is a common practice of +safeguarding large language models (LLMs) from complying with queries that +contain harmful intents. However, the working mechanisms of safety prompts have +not yet been fully understood, which hinders the potential for automatically +optimizing them for improved LLM safety. Motivated by this problem, we +investigate the impact of safety prompts from the perspective of model +representations. We find that in models' representation space, harmful and +harmless queries can be largely distinguished, but this is not noticeably +enhanced by safety prompts. Instead, the queries' representations are moved by +different safety prompts in similar directions, where models become more prone +to refusal (i.e., refusing to provide assistance) even when the queries are +harmless. Inspired by these findings, we propose a method called DRO (Directed +Representation Optimization) for automatic safety prompt optimization. DRO +treats safety prompts as continuous, trainable embeddings and learns to move +the representations of harmful/harmless queries along/opposite the direction in +which the model's refusal probability increases. We demonstrate that DRO +remarkably improves the safeguarding performance of human-crafted safety +prompts and outperforms strong baselines, as evaluated on out-of-domain +benchmarks, without compromising the general model capability. + +
+
+
+
+
+ + ☆ Desiderata for the Context Use of Question Answering Systems EACL 2024 + + +
+ Prior work has uncovered a set of common problems in state-of-the-art +context-based question answering (QA) systems: a lack of attention to the +context when the latter conflicts with a model's parametric knowledge, little +robustness to noise, and a lack of consistency with their answers. However, +most prior work focuses on one or two of those problems in isolation, which makes +it difficult to see trends across them. We aim to close this gap by first +outlining a set of -- previously discussed as well as novel -- desiderata for +QA models. We then survey relevant analysis and methods papers to provide an +overview of the state of the field. The second part of our work presents +experiments where we evaluate 15 QA systems on 5 datasets according to all +desiderata at once. We find many novel trends, including (1) systems that are +less susceptible to noise are not necessarily more consistent with their +answers when given irrelevant context; (2) most systems that are more +susceptible to noise are more likely to correctly answer according to a context +that conflicts with their parametric knowledge; and (3) the combination of +conflicting knowledge and noise can reduce system performance by up to 96%. As +such, our desiderata help increase our understanding of how these models work +and reveal potential avenues for improvements. + +
+
+ comment: Accepted to EACL 2024 +
+
+
+
+
+ + ☆ Entity Linking in the Job Market Domain EACL 2024 + + +
+ In Natural Language Processing, entity linking (EL) has centered around +Wikipedia, yet it remains underexplored in the job market domain. +Disambiguating skill mentions can help us get insight into the current labor +market demands. In this work, we are the first to explore EL in this domain, +specifically targeting the linkage of occupational skills to the ESCO taxonomy +(le Vrang et al., 2014). Previous efforts linked coarse-grained (full) +sentences to a corresponding ESCO skill. In this work, we link more +fine-grained span-level mentions of skills. We tune two high-performing neural +EL models, a bi-encoder (Wu et al., 2020) and an autoregressive model (Cao et +al., 2021), on a synthetically generated mention--skill pair dataset and +evaluate them on a human-annotated skill-linking benchmark. Our findings reveal +that both models are capable of linking implicit mentions of skills to their +correct taxonomy counterparts. Empirically, BLINK outperforms GENRE in strict +evaluation, but GENRE performs better in loose evaluation (accuracy@$k$). + +
+
+ comment: Accepted at EACL 2024 Findings +
+
+
+
+
+ + ☆ GUMsley: Evaluating Entity Salience in Summarization for 12 English + Genres EACL 2024 + + +
+ As NLP models become increasingly capable of understanding documents in terms +of coherent entities rather than strings, obtaining the most salient entities +for each document is not only an important end task in itself but also vital +for Information Retrieval (IR) and other downstream applications such as +controllable summarization. In this paper, we present and evaluate GUMsley, the +first entity salience dataset covering all named and non-named salient entities +for 12 genres of English text, aligned with entity types, Wikification links +and full coreference resolution annotations. We promote a strict definition of +salience using human summaries and demonstrate high inter-annotator agreement +for salience based on whether a source entity is mentioned in the summary. Our +evaluation shows poor performance by pre-trained SOTA summarization models and +zero-shot LLM prompting in capturing salient entities in generated summaries. +We also show that predicting or providing salient entities to several model +architectures enhances performance and helps derive higher-quality summaries by +alleviating the entity hallucination problem in existing abstractive +summarization. + +
+
+ comment: Camera-ready for EACL 2024 +
+
+
+
+
+ + ☆ [Lions: 1] and [Tigers: 2] and [Bears: 3], Oh My! Literary Coreference + Annotation with LLMs + + +
+ Coreference annotation and resolution is a vital component of computational +literary studies. However, it has previously been difficult to build high +quality systems for fiction. Coreference requires complicated structured +outputs, and literary text involves subtle inferences and highly varied +language. New language-model-based seq2seq systems present the opportunity to +solve both these problems by learning to directly generate a copy of an input +sentence with markdown-like annotations. We create, evaluate, and release +several trained models for coreference, as well as a workflow for training new +models. + +
+
+ comment: Accepted to LaTeCH-CLfL 2024 +
+
+
+
+
+ + ☆ LOCOST: State-Space Models for Long Document Abstractive Summarization EACL 2024 + + +
+ State-space models are a low-complexity alternative to transformers for +encoding long sequences and capturing long-term dependencies. We propose +LOCOST: an encoder-decoder architecture based on state-space models for +conditional text generation with long context inputs. With a computational +complexity of $O(L \log L)$, this architecture can handle significantly longer +sequences than state-of-the-art models that are based on sparse attention +patterns. We evaluate our model on a series of long document abstractive +summarization tasks. The model reaches 93-96% of the performance of +the top-performing sparse transformers of the same size while +saving up to 50% memory during training and up to 87% during inference. +Additionally, LOCOST effectively handles input texts exceeding 600K tokens at +inference time, setting new state-of-the-art results on full-book summarization +and opening new perspectives for long input processing. + +
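The $O(L \log L)$ complexity comes from the fact that a (diagonal) state-space layer unrolls into a long convolution whose kernel can be applied with FFTs. A toy sketch of that mechanism, with random placeholder parameters rather than trained LOCOST weights:

```python
import numpy as np

def ssm_kernel(A, B, C, L):
    """K[l] = sum_i C_i * A_i**l * B_i for a diagonal state matrix A."""
    powers = A[None, :] ** np.arange(L)[:, None]   # (L, state_dim)
    return powers @ (B * C)                         # (L,)

def fft_conv(u, K):
    # Causal convolution in O(L log L) via the FFT.
    n = len(u) + len(K) - 1
    return np.fft.irfft(np.fft.rfft(u, n) * np.fft.rfft(K, n), n)[: len(u)]

rng = np.random.default_rng(5)
L, d = 1024, 8
A = rng.uniform(0.8, 0.999, d)            # stable decay rates (placeholders)
B, C = rng.normal(size=d), rng.normal(size=d)
u = rng.normal(size=L)                    # input sequence
y = fft_conv(u, ssm_kernel(A, B, C, L))   # O(L log L) sequence mixing
print(y.shape)                            # (1024,)
```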
+
+ comment: 9 pages, 5 figures, 7 tables, EACL 2024 conference +
+
+
+
+
+ + ☆ SNNLP: Energy-Efficient Natural Language Processing Using Spiking Neural + Networks + + +
+ As spiking neural networks receive more attention, we look toward +applications of this computing paradigm in fields other than computer vision +and signal processing. One major field, underexplored in the neuromorphic +setting, is Natural Language Processing (NLP), where most state-of-the-art +solutions still heavily rely on resource-consuming and power-hungry traditional +deep learning architectures. Therefore, it is compelling to design NLP models +for neuromorphic architectures due to their low energy requirements, with the +additional benefit of a more human-brain-like operating model for processing +information. However, one of the biggest issues with bringing NLP to the +neuromorphic setting is in properly encoding text into a spike train so that it +can be seamlessly handled by both current and future SNN architectures. In this +paper, we compare various methods of encoding text as spikes and assess each +method's performance in an associated SNN on a downstream NLP task, namely, +sentiment analysis. Furthermore, we go on to propose a new method of encoding +text as spikes that outperforms a widely-used rate-coding technique, Poisson +rate-coding, by around 13\% on our benchmark NLP tasks. Subsequently, we +demonstrate the energy efficiency of SNNs implemented in hardware for the +sentiment analysis task compared to traditional deep neural networks, observing +an energy efficiency increase of more than 32x during inference and 60x during +training while incurring the expected energy-performance tradeoff. + +
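For reference, the Poisson rate-coding baseline that the proposed method outperforms can be sketched as follows (using the common Bernoulli-per-timestep approximation), with a random vector standing in for real token features.

```python
import numpy as np

def poisson_encode(values, time_steps=100, max_rate=0.9, rng=None):
    """Map feature values to firing rates, then draw spikes per time step."""
    rng = rng or np.random.default_rng(6)
    v = (values - values.min()) / (np.ptp(values) + 1e-8)  # normalize to [0, 1]
    rates = max_rate * v                                    # spike prob. per step
    return (rng.random((time_steps, len(values))) < rates).astype(np.uint8)

embedding = np.random.default_rng(7).normal(size=32)   # a token's features
spike_train = poisson_encode(embedding)                # (100, 32) 0/1 array
print(spike_train.mean(axis=0)[:5])  # empirical rates track feature values
```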
+
+
+
+
+ + ☆ Revisiting speech segmentation and lexicon learning with better features + + +
+ We revisit a self-supervised method that segments unlabelled speech into +word-like segments. We start from the two-stage duration-penalised dynamic +programming method that performs zero-resource segmentation without learning an +explicit lexicon. In the first acoustic unit discovery stage, we replace +contrastive predictive coding features with HuBERT. After word segmentation in +the second stage, we get an acoustic word embedding for each segment by +averaging HuBERT features. These embeddings are clustered using K-means to get +a lexicon. The result is good full-coverage segmentation with a lexicon that +achieves state-of-the-art performance on the ZeroSpeech benchmarks. + +
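The lexicon-building step described above reduces to averaging and clustering. A sketch with random vectors standing in for HuBERT features and with segment boundaries assumed given (in the method they come from the duration-penalised segmentation):

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(8)
frames = rng.normal(size=(500, 768))        # stand-in HuBERT frame features
boundaries = [(0, 40), (40, 95), (95, 160), (160, 230), (230, 500)]

# One acoustic word embedding per segment: the mean of its frames.
segment_embs = np.stack([frames[a:b].mean(axis=0) for a, b in boundaries])

# Cluster segment embeddings into "word types" to form the lexicon.
lexicon = KMeans(n_clusters=3, n_init=10, random_state=0).fit(segment_embs)
print(lexicon.labels_)                      # each segment's word type
```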
+
+ comment: 2 pages +
+
+
+
+
+ + ☆ Employing Label Models on ChatGPT Answers Improves Legal Text Entailment + Performance + + +
+ The objective of legal text entailment is to ascertain whether the assertions +in a legal query logically follow from the information provided in one or +multiple legal articles. ChatGPT, a large language model, is robust in many +natural language processing tasks, including legal text entailment: when we set +the temperature = 0 (the ChatGPT answers are deterministic) and prompt the +model, it achieves 70.64% accuracy on the COLIEE 2022 dataset, which outperforms +the previous SOTA of 67.89%. On the other hand, if the temperature is larger +than zero, ChatGPT answers are not deterministic, leading to inconsistent +answers and fluctuating results. We propose to leverage label models (a +fundamental component of weak supervision techniques) to integrate the +provisional answers by ChatGPT into consolidated labels. In this way, we treat +ChatGPT's provisional answers as noisy predictions that can be consolidated by +label models. The experimental results demonstrate that this approach can +attain an accuracy of 76.15%, marking a significant improvement of 8.26% over +the prior state-of-the-art benchmark. Additionally, we perform an analysis of +the instances where ChatGPT produces incorrect answers, then we classify the +errors, offering insights that could guide potential enhancements for future +research endeavors. + +
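In the simplest case, consolidating the provisional answers reduces to a vote over repeated samples. The paper uses proper weak-supervision label models, for which the majority vote below is only a minimal stand-in:

```python
from collections import Counter

def consolidate(provisional_answers):
    """provisional_answers: 'Y'/'N' labels from repeated sampling at T > 0."""
    return Counter(provisional_answers).most_common(1)[0][0]

runs = ["Y", "N", "Y", "Y", "N"]   # answers across temperature > 0 runs
print(consolidate(runs))            # -> 'Y'
```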
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ I Think, Therefore I am: Awareness in Large Language Models + + +
+ Do large language models (LLMs) exhibit any forms of awareness similar to +humans? In this paper, we introduce the concept of awareness to LLMs, arguing +that awareness is an essential aspect of trustworthiness for LLMs to enhance +their interaction with humans while ensuring ethical responses. We define +awareness in LLMs as the ability to perceive and understand themselves as AI +models and to exhibit social intelligence. We identify four key dimensions of +awareness: capability, mission, emotion, and perspective. To assess LLMs on +these dimensions, we introduce a specialized dataset, AwareLLM dataset. Our +findings reveal that LLMs demonstrate a decent degree of awareness, though they +still lack substantial capability awareness. + +
+
+
+
+
+ + ☆ Probing Language Models' Gesture Understanding for Enhanced Human-AI + Interaction + + +
+ The rise of Large Language Models (LLMs) has affected various disciplines, +reaching well beyond mere text generation. Moving past their textual nature, this +project proposal aims to investigate the interaction between LLMs and +non-verbal communication, specifically focusing on gestures. The proposal sets +out a plan to examine the proficiency of LLMs in deciphering both explicit and +implicit non-verbal cues within textual prompts and their ability to associate +these gestures with various contextual factors. The research proposes to test +established psycholinguistic study designs to construct a comprehensive dataset +that pairs textual prompts with detailed gesture descriptions, encompassing +diverse regional variations, and semantic labels. To assess LLMs' comprehension +of gestures, experiments are planned, evaluating their ability to simulate +human behaviour in order to replicate psycholinguistic experiments. These +experiments consider cultural dimensions and measure the agreement between +LLM-identified gestures and the dataset, shedding light on the models' +contextual interpretation of non-verbal cues (e.g. gestures). + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Global-Liar: Factuality of LLMs over Time and Geographic Regions + + +
+ The increasing reliance on AI-driven solutions, particularly Large Language +Models (LLMs) like the GPT series, for information retrieval highlights the +critical need for their factuality and fairness, especially amidst the rampant +spread of misinformation and disinformation online. Our study evaluates the +factual accuracy, stability, and biases in widely adopted GPT models, including +GPT-3.5 and GPT-4, contributing to the reliability and integrity of AI-mediated +information dissemination. + We introduce 'Global-Liar,' a dataset uniquely balanced in terms of +geographic and temporal representation, facilitating a more nuanced evaluation +of LLM biases. Our analysis reveals that newer iterations of GPT models do not +always equate to improved performance. Notably, the GPT-4 version from March +demonstrates higher factual accuracy than its subsequent June release. +Furthermore, a concerning bias is observed, privileging statements from the +Global North over the Global South, thus potentially exacerbating existing +informational inequities. Regions such as Africa and the Middle East are at a +disadvantage, with much lower factual accuracy. The performance fluctuations +over time suggest that model updates may not consistently benefit all regions +equally. + Our study also offers insights into the impact of various LLM configuration +settings, such as binary decision forcing, model re-runs and temperature, on a +model's factuality. Models constrained to binary (true/false) choices exhibit +reduced factuality compared to those allowing an 'unclear' option. Single +inference at a low temperature setting matches the reliability of majority +voting across various configurations. The insights gained highlight the need +for culturally diverse and geographically inclusive model training and +evaluation. This approach is key to achieving global equity in technology, +distributing AI benefits fairly worldwide. + +
+
+ comment: 24 pages, 12 figures, 9 tables +
+
+
+
+
+ + ☆ Neural Machine Translation for Malayalam Paraphrase Generation + + +
+ This study explores four methods of generating paraphrases in Malayalam, +utilizing resources available for English paraphrasing and pre-trained Neural +Machine Translation (NMT) models. We evaluate the resulting paraphrases using +both automated metrics, such as BLEU, METEOR, and cosine similarity, as well as +human annotation. Our findings suggest that automated evaluation measures may +not be fully appropriate for Malayalam, as they do not consistently align with +human judgment. This discrepancy underscores the need for more nuanced +paraphrase evaluation approaches especially for highly agglutinative languages. + +
+
+
+
+
+ + ☆ A Survey of Pre-trained Language Models for Processing Scientific Text + + +
+ The number of Language Models (LMs) dedicated to processing scientific text +is on the rise. Keeping pace with the rapid growth of scientific LMs (SciLMs) +has become a daunting task for researchers. To date, no comprehensive surveys +on SciLMs have been undertaken, leaving this issue unaddressed. Given the +constant stream of new SciLMs, the state of the art and how the models +compare to one another remain largely unknown. This work fills that gap and +provides a comprehensive review of SciLMs, including an extensive analysis of +their effectiveness across different domains, tasks and datasets, and a +discussion on the challenges that lie ahead. + +
+
+ comment: Resources are available at https://github.com/Alab-NII/Awesome-SciLM +
+
+
+
+
+ + ☆ SWEA: Changing Factual Knowledge in Large Language Models via Subject + Word Embedding Altering + + +
+ Model editing has recently gained widespread attention. Current model editing +methods primarily involve modifying model parameters or adding additional +modules to the existing model. However, the former causes irreversible damage +to LLMs, while the latter incurs additional inference overhead, and its fuzzy vector +matching is not always reliable. To address these issues, we propose an +expandable Subject Word Embedding Altering (SWEA) framework, which modifies the +representation of subjects and achieves the goal of editing knowledge during the +inference stage. SWEA uses precise key matching outside the model and performs +reliable subject word embedding altering, thus protecting the original weights +of the model without increasing inference overhead. We then propose an optimize-+then-suppress fusion method, which first optimizes the embedding vector for +the editing target and then suppresses the Knowledge Embedding Dimension (KED) +to obtain the final fused embedding. We thus propose the SWEAOS method for editing +factual knowledge in LLMs. We demonstrate the state-of-the-art performance of +SWEAOS on the COUNTERFACT and zsRE datasets. To further validate the reasoning +ability of SWEAOS in editing knowledge, we evaluate it on the more complex +RIPPLEEDITS benchmark. The results on two subdatasets demonstrate that our +SWEAOS possesses state-of-the-art reasoning ability. + +
+
+ comment: Work in progress; Our code will be released +
+
+
+
+
+ + ☆ CauESC: A Causal Aware Model for Emotional Support Conversation + + +
+ Emotional Support Conversation aims at reducing the seeker's emotional +distress through supportive response. Existing approaches have two limitations: +(1) They ignore the emotion causes of the distress, which is important for +fine-grained emotion understanding; (2) They focus on the seeker's own mental +state rather than the emotional dynamics during interaction between speakers. +To address these issues, we propose a novel framework CauESC, which firstly +recognizes the emotion causes of the distress, as well as the emotion effects +triggered by the causes, and then understands each strategy of verbal grooming +independently and integrates them skillfully. Experimental results on the +benchmark dataset demonstrate the effectiveness of our approach and show the +benefits of emotion understanding from cause to effect and +independent-integrated strategy modeling. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Enhancing Large Language Model with Decomposed Reasoning for Emotion + Cause Pair Extraction + + +
+ Emotion-Cause Pair Extraction (ECPE) involves extracting clause pairs +representing emotions and their causes in a document. Existing methods tend to +overfit spurious correlations, such as positional bias in existing benchmark +datasets, rather than capturing semantic features. Inspired by recent work, we +explore leveraging a large language model (LLM) to address the ECPE task without +additional training. Despite strong capabilities, LLMs suffer from +uncontrollable outputs, resulting in mediocre performance. To address this, we +introduce chain-of-thought reasoning to mimic the human cognitive process and propose the +Decomposed Emotion-Cause Chain (DECC) framework. Combining inducing inference +and logical pruning, DECC guides LLMs to tackle the ECPE task. We further enhance +the framework by incorporating in-context learning. Experimental results +demonstrate the strength of DECC compared to state-of-the-art supervised +fine-tuning methods. Finally, we analyze the effectiveness of each component +and the robustness of the method in various scenarios, including different LLM +bases, rebalanced datasets, and multi-pair extraction. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ WSC+: Enhancing The Winograd Schema Challenge Using Tree-of-Experts EACL 2024 + + +
+ The Winograd Schema Challenge (WSC) serves as a prominent benchmark for +evaluating machine understanding. While Large Language Models (LLMs) excel at +answering WSC questions, their ability to generate such questions remains less +explored. In this work, we propose Tree-of-Experts (ToE), a novel prompting +method which enhances the generation of WSC instances (50% valid cases vs. 10% +in recent methods). Using this approach, we introduce WSC+, a novel dataset +comprising 3,026 LLM-generated sentences. Notably, we extend the WSC framework +by incorporating new 'ambiguous' and 'offensive' categories, providing a deeper +insight into model overconfidence and bias. Our analysis reveals nuances in +generation-evaluation consistency, suggesting that LLMs may not always +outperform in evaluating their own generated questions when compared to those +crafted by other models. On WSC+, GPT-4, the top-performing LLM, achieves an +accuracy of 68.7%, significantly below the human benchmark of 95.1%. + +
+
+ comment: Accepted for publication in main proceedings of EACL 2024 conference, + 22 pages, 16 figures +
+
+
+
+
+ + ☆ Mitigating the Problem of Strong Priors in LMs with Context + Extrapolation + + +
+ Language models (LMs) have become important tools in a variety of +applications, from data processing to the creation of instruction-following +assistants. But despite their advantages, LMs have certain idiosyncratic +limitations such as the problem of `strong priors', where a model learns to +output typical continuations in response to certain, usually local, portions of +the input regardless of any earlier instructions. For example, prompt injection +attacks can induce models to ignore explicit directives. In some cases, larger +models have been shown to be more susceptible to these problems than similar +smaller models, an example of the phenomenon of `inverse scaling'. We develop a +new technique for mitigating the problem of strong priors: we take the original +set of instructions, produce a weakened version of the original prompt that is +even more susceptible to the strong priors problem, and then extrapolate the +continuation away from the weakened prompt. This lets us infer how the model +would continue a hypothetical strengthened set of instructions. Our technique +conceptualises LMs as mixture models which combine a family of data generation +processes, reinforcing the desired elements of the mixture. Our approach works +at inference time, removing any need for retraining. We apply it to eleven +models including GPT-2, GPT-3, Llama 2, and Mistral on four tasks, and find +improvements in 41/44. Across all 44 combinations the median increase in +proportion of tasks completed is 40%. + +
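The extrapolation step itself is simple logit arithmetic: move from the weakened prompt's next-token distribution through the original's and beyond, approximating a hypothetical strengthened prompt. The coefficient below is illustrative, not the paper's tuned value.

```python
import numpy as np

def extrapolated_logits(logits_original, logits_weakened, alpha=0.5):
    # Push the prediction further in the direction original - weakened.
    return logits_original + alpha * (logits_original - logits_weakened)

orig = np.array([2.0, 1.0, 0.5])   # logits under the real instructions
weak = np.array([0.5, 1.5, 1.0])   # logits under the weakened prompt
print(extrapolated_logits(orig, weak))  # moves away from the weak prompt
```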
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Deductive Beam Search: Decoding Deducible Rationale for Chain-of-Thought + Reasoning + + +
+ Recent advancements have significantly augmented the reasoning capabilities +of Large Language Models (LLMs) through various methodologies, especially +chain-of-thought (CoT) reasoning. However, previous methods fail to address +reasoning errors in intermediate steps, leading to accumulated errors. In this +paper, we propose Deductive Beam Search (DBS), which seamlessly integrates CoT +and deductive reasoning with step-wise beam search for LLMs. Our approach +deploys a verifier, verifying that a reasoning step is deducible from its +premises, thus alleviating error accumulation. Furthermore, we introduce a +scalable and labor-free data construction method to amplify our model's +verification capabilities. Extensive experiments demonstrate that our approach +significantly enhances the base performance of LLMs of various scales (7B, 13B, +70B, and ChatGPT) across 8 reasoning datasets from 3 diverse reasoning genres, +including arithmetic, commonsense, and symbolic. Moreover, our analysis demonstrates +DBS's capability of detecting diverse and subtle reasoning errors and its +robustness across model scales. + +
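A hedged sketch of the verifier-guided, step-wise beam search: at each step, propose candidate next reasoning steps, score each with the verifier, and keep the top-k chains. `propose` and `verify` are hypothetical stand-ins for the LLM step generator and the trained deducibility verifier.

```python
import heapq

def deductive_beam_search(question, propose, verify, steps=3, beam=2):
    chains = [(0.0, [])]                    # (neg. cumulative score, steps)
    for _ in range(steps):
        candidates = []
        for score, chain in chains:
            for nxt in propose(question, chain):
                s = score - verify(question, chain, nxt)  # higher = deducible
                candidates.append((s, chain + [nxt]))
        chains = heapq.nsmallest(beam, candidates, key=lambda c: c[0])
    return chains[0][1]

# Dummy stand-ins so the sketch runs end to end.
propose = lambda q, chain: [f"step{len(chain)}a", f"step{len(chain)}b"]
verify = lambda q, chain, step: 1.0 if step.endswith("a") else 0.3
print(deductive_beam_search("2+2?", propose, verify))
```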
+
+
+
+
+ + ☆ Contextual Feature Extraction Hierarchies Converge in Large Language + Models and the Brain + + +
+ Recent advancements in artificial intelligence have sparked interest in the +parallels between large language models (LLMs) and human neural processing, +particularly in language comprehension. While prior research has established +similarities in the representation of LLMs and the brain, the underlying +computational principles that cause this convergence, especially in the context +of evolving LLMs, remain elusive. Here, we examined a diverse selection of +high-performance LLMs with similar parameter sizes to investigate the factors +contributing to their alignment with the brain's language processing +mechanisms. We find that as LLMs achieve higher performance on benchmark tasks, +they not only become more brain-like as measured by higher performance when +predicting neural responses from LLM embeddings, but also their hierarchical +feature extraction pathways map more closely onto the brain's while using fewer +layers to do the same encoding. We also compare the feature extraction pathways +of the LLMs to each other and identify new ways in which high-performing models +have converged toward similar hierarchical processing mechanisms. Finally, we +show the importance of contextual information in improving model performance +and brain similarity. Our findings reveal the converging aspects of language +processing in the brain and LLMs and offer new directions for developing models +that align more closely with human cognitive processing. + +
+
+ comment: 19 pages, 5 figures and 4 supplementary figures +
+
+
+
+
+ + ☆ Document Structure in Long Document Transformers EACL 2024 + + +
+ Long documents often exhibit structure with hierarchically organized elements +of different functions, such as section headers and paragraphs. Despite the +omnipresence of document structure, its role in natural language processing +(NLP) remains opaque. Do long-document Transformer models acquire an internal +representation of document structure during pre-training? How can structural +information be communicated to a model after pre-training, and how does it +influence downstream performance? To answer these questions, we develop a novel +suite of probing tasks to assess structure-awareness of long-document +Transformers, propose general-purpose structure infusion methods, and evaluate +the effects of structure infusion on QASPER and Evidence Inference, two +challenging long-document NLP tasks. Results on LED and LongT5 suggest that +they acquire implicit understanding of document structure during pre-training, +which can be further enhanced by structure infusion, leading to improved +end-task performance. To foster research on the role of document structure in +NLP modeling, we make our data and code publicly available. + +
+
+ comment: Accepted at EACL 2024. Code and data: + http://github.com/UKPLab/eacl2024-doc-structure +
+
+
+
+
+ + ☆ Navigating the OverKill in Large Language Models + + +
+ Large language models are meticulously aligned to be both helpful and
+harmless. However, recent research points to a potential for overkill,
+whereby models may refuse to answer benign queries. In this paper, we
+investigate the factors behind overkill by exploring how models handle and
+determine the safety of queries. Our findings reveal shortcuts within
+models that lead to over-attention on harmful-sounding words like 'kill';
+prompts emphasizing safety further exacerbate the overkill. Based on these
+insights, we introduce Self-Contrastive Decoding (Self-CD), a training-free
+and model-agnostic strategy to alleviate this phenomenon. We first expose
+this over-attention by amplifying the difference in the model's output
+distributions when responding to system prompts that either include or omit
+an emphasis on safety. Then we determine the final next-token predictions by
+downplaying the over-attention via contrastive decoding. Empirical results
+indicate that our method achieves an average 20\% reduction in refusal rate
+while having almost no impact on safety.
+
+
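+ A rough sketch of what this contrast could look like in logit space for a
+HuggingFace-style causal LM; the contrast weight beta and the combination
+rule are assumptions, not the paper's exact formulation:
+
+import torch
+
+def self_contrastive_logits(model, ids_plain, ids_safety, beta=0.5):
+    # ids_plain: prompt without the safety emphasis; ids_safety: with it.
+    with torch.no_grad():
+        l_plain = model(ids_plain).logits[:, -1, :]
+        l_safety = model(ids_safety).logits[:, -1, :]
+    # Downplay the component that the safety emphasis amplifies:
+    return l_plain - beta * (l_safety - l_plain)
+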
+
+
+
+
+ + ☆ What Do Self-Supervised Speech and Speaker Models Learn? New Findings + From a Cross Model Layer-Wise Analysis ICASSP 2024 + + +
+ Self-supervised learning (SSL) has attracted increased attention for
+learning meaningful speech representations. Speech SSL models, such as
+WavLM, employ masked prediction training to encode general-purpose
+representations. In contrast, speaker SSL models, exemplified by DINO-based
+models, adopt utterance-level training objectives primarily for speaker
+representation. Understanding how these models represent information is
+essential for refining model efficiency and effectiveness. Unlike the
+various analyses of speech SSL, there has been limited investigation into
+what information speaker SSL captures and how its representation differs
+from speech SSL or other fully-supervised speaker models. This paper
+addresses these fundamental questions. We explore the capacity to capture
+various speech properties by applying SUPERB evaluation probing tasks to
+speech and speaker SSL models. We also examine which layers are
+predominantly utilized for each task to identify differences in how speech
+is represented. Furthermore, we conduct direct comparisons to measure the
+similarities between layers within and across models. Our analysis reveals
+that 1) the capacity to represent content information is somewhat unrelated
+to enhanced speaker representation, 2) specific layers of speech SSL models
+are partly specialized in capturing linguistic information, and 3) speaker
+SSL models tend to disregard linguistic information but exhibit more
+sophisticated speaker representations.
+
+
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ☆ Neighboring Perturbations of Knowledge Editing on Large Language Models + + +
+ Despite their exceptional capabilities, large language models (LLMs) are
+prone to generating unintended text due to false or outdated knowledge.
+Given the resource-intensive nature of retraining LLMs, there has been a
+notable increase in the development of knowledge editing. However, current
+approaches and evaluations rarely explore how editing perturbs neighboring
+knowledge. This paper studies whether updating LLMs with new knowledge
+perturbs the neighboring knowledge encapsulated within them. Specifically,
+we seek to figure out whether appending a new answer to the answer list of
+a factual question leads to catastrophic forgetting of the original correct
+answers in this list, as well as unintentional inclusion of incorrect
+answers. A metric of additivity is introduced and a benchmark dubbed
+Perturbation Evaluation of Appending Knowledge (PEAK) is constructed to
+evaluate the degree of perturbation to neighboring knowledge when appending
+new knowledge. In addition, a plug-and-play framework termed Appending via
+Preservation and Prevention (APP) is proposed to mitigate the neighboring
+perturbation by maintaining the integrity of the answer list. Experiments
+demonstrate the effectiveness of APP coupled with four editing methods on
+three LLMs.
+
+
+
+
+
+
+ + ☆ Assertion Detection Large Language Model In-context Learning LoRA + Fine-tuning + + +
+ In this study, we aim to address the task of assertion detection when
+extracting medical concepts from clinical notes, a key process in clinical
+natural language processing (NLP). Assertion detection in clinical NLP
+usually involves identifying assertion types for medical concepts in the
+clinical text, namely certainty (whether the medical concept is positive,
+negated, possible, or hypothetical), temporality (whether the medical
+concept relates to the present or to past history), and experiencer
+(whether the medical concept is described for the patient or a family
+member). These assertion types are essential for healthcare professionals
+to quickly and clearly understand the context of medical conditions from
+unstructured clinical texts, directly influencing the quality and outcomes
+of patient care. Although widely used, traditional methods, particularly
+rule-based NLP systems and machine learning or deep learning models, demand
+intensive manual effort to create patterns and tend to overlook less common
+assertion types, leading to an incomplete understanding of the context. To
+address this challenge, our research introduces a novel methodology that
+utilizes Large Language Models (LLMs) pre-trained on a vast array of
+medical data for assertion detection. We enhance the current method with
+advanced reasoning techniques, including Tree of Thought (ToT), Chain of
+Thought (CoT), and Self-Consistency (SC), and refine it further with
+Low-Rank Adaptation (LoRA) fine-tuning. We first evaluated the model on the
+i2b2 2010 assertion dataset, where our method achieved a micro-averaged F-1
+of 0.89, a 0.11 improvement over previous work. To further assess the
+generalizability of our approach, we extended our evaluation to a local
+dataset focused on sleep concept extraction. Our approach achieved an F-1
+of 0.74, which is 0.31 higher than the previous method.
+
+
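+ For reference, a minimal LoRA setup with the Hugging Face peft library
+might look like the sketch below; the base model and hyperparameters are
+placeholders, not the configuration used in the paper:
+
+from transformers import AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model
+
+base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
+config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05,
+                    target_modules=["q_proj", "v_proj"],
+                    task_type="CAUSAL_LM")
+model = get_peft_model(base, config)
+model.print_trainable_parameters()  # only the low-rank adapters train
+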
+
+
+
+
+ + ☆ Good at captioning, bad at counting: Benchmarking GPT-4V on Earth + observation data + + +
+ Large Vision-Language Models (VLMs) have demonstrated impressive performance +on complex tasks involving visual input with natural language instructions. +However, it remains unclear to what extent capabilities on natural images +transfer to Earth observation (EO) data, which are predominantly satellite and +aerial images less common in VLM training data. In this work, we propose a +comprehensive benchmark to gauge the progress of VLMs toward being useful tools +for EO data by assessing their abilities on scene understanding, localization +and counting, and change detection tasks. Motivated by real-world applications, +our benchmark includes scenarios like urban monitoring, disaster relief, land +use, and conservation. We discover that, although state-of-the-art VLMs like +GPT-4V possess extensive world knowledge that leads to strong performance on +open-ended tasks like location understanding and image captioning, their poor +spatial reasoning limits usefulness on object localization and counting tasks. +Our benchmark will be made publicly available at https://vleo.danielz.ch/ and +on Hugging Face at +https://huggingface.co/collections/mit-ei/vleo-benchmark-datasets-65b789b0466555489cce0d70 +for easy model evaluation. + +
+
+ comment: 62 pages; work in progress +
+
+
+
+
+ + ☆ SPECTRUM: Speaker-Enhanced Pre-Training for Long Dialogue Summarization + + +
+ Multi-turn dialogues are characterized by their extended length and the
+presence of turn-taking conversations. Traditional language models often
+overlook the distinct features of these dialogues by treating them as
+regular text. In this paper, we propose a speaker-enhanced pre-training
+method for long dialogue summarization, which leverages the inherent
+structure of multi-turn dialogues. To support our study, we curate a
+diverse dataset that includes transcripts from real-world scenarios, movie
+or TV show transcripts, and dialogues generated by a Large Language Model.
+We then perform pre-training with two objectives: speaker change detection
+and masked utterance generation. Experimental results of fine-tuned models
+demonstrate that our model achieves state-of-the-art performance on
+downstream benchmarks with long context, surpassing baseline models and
+highlighting the effectiveness of our approach. Our findings highlight the
+importance of curating pre-training datasets that exhibit diversity and
+variations in length distribution to ensure effective alignment with
+downstream datasets.
+
+
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ☆ Local and Global Contexts for Conversation + + +
+ In conversation, the context is the dialogue history, which is crucial for
+multi-turn dialogue. Learning from the relevant contexts in the dialogue
+history for grounded conversation is a challenging problem. Local context
+consists of the nearest utterances and is the most sensitive to the
+subsequent response, while global context spans the whole conversation, far
+beyond neighboring utterances. Current pretrained transformer models for
+conversation struggle to capture the correlation and connection between
+local and global contexts. We introduce a local and global conversation
+model (LGCM) for general-purpose conversation in the open domain. It is a
+local-global hierarchical transformer model that excels at accurately
+discerning and assimilating the relevant contexts necessary for generating
+responses. It employs a local encoder to grasp the local context at the
+level of individual utterances and a global encoder to understand the
+broader context at the dialogue level. The seamless fusion of these locally
+and globally contextualized encodings ensures a comprehensive understanding
+of the conversation. Experiments on popular datasets show that LGCM
+outperforms existing conversation models on automatic metrics by
+significant margins.
+
+
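+ A toy sketch of this local-global pattern in PyTorch, with assumed
+dimensions, mean-pooled utterance summaries, and additive fusion; it
+illustrates the hierarchy, not the actual LGCM architecture:
+
+import torch
+import torch.nn as nn
+
+class LocalGlobalEncoder(nn.Module):
+    def __init__(self, d=64, heads=4):
+        super().__init__()
+        self.local_enc = nn.TransformerEncoderLayer(d, heads,
+                                                    batch_first=True)
+        self.global_enc = nn.TransformerEncoderLayer(d, heads,
+                                                     batch_first=True)
+
+    def forward(self, utterances):  # (batch, n_utts, n_tokens, d)
+        b, u, t, d = utterances.shape
+        # Local: contextualise tokens within each utterance separately.
+        local = self.local_enc(utterances.view(b * u, t, d)).view(b, u, t, d)
+        summaries = local.mean(dim=2)             # one vector per utterance
+        # Global: contextualise utterance summaries across the dialogue.
+        global_ctx = self.global_enc(summaries)   # (b, n_utts, d)
+        return local + global_ctx[:, :, None, :]  # fuse local and global
+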
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Propagation and Pitfalls: Reasoning-based Assessment of Knowledge + Editing through Counterfactual Tasks + + +
+ Current approaches to knowledge editing struggle to effectively propagate
+updates to interconnected facts. In this work, we delve into the barriers
+that hinder the appropriate propagation of updated knowledge within these
+models for accurate reasoning. To support our analysis, we introduce a
+novel reasoning-based benchmark -- ReCoE (Reasoning-based Counterfactual
+Editing dataset) -- which covers six common real-world reasoning schemes.
+We conduct a thorough analysis of existing knowledge editing techniques,
+including input augmentation, finetuning, and locate-and-edit. We find that
+all model editing methods show notably low performance on this dataset,
+especially within certain reasoning schemes. Our analysis of the
+chain-of-thought generations of edited models further uncovers key reasons
+behind the inadequacy of existing knowledge editing methods from a
+reasoning standpoint, involving fact-wise editing, fact recall ability, and
+coherence in generation. We will make our benchmark publicly available.
+
+
+
+ comment: 22 pages, 14 figures, 5 tables +
+
+
+
+
+ + ☆ Scavenging Hyena: Distilling Transformers into Long Convolution Models + + +
+ The rapid evolution of Large Language Models (LLMs), epitomized by
+architectures like GPT-4, has reshaped the landscape of natural language
+processing. This paper introduces a pioneering approach to address the
+efficiency concerns associated with LLM pre-training, proposing the use of
+knowledge distillation for cross-architecture transfer. Leveraging insights
+from the efficient Hyena mechanism, our method replaces attention heads in
+transformer models with Hyena, offering a cost-effective alternative to
+traditional pre-training while confronting the challenge of processing long
+contextual information that is inherent in quadratic attention mechanisms.
+Unlike conventional compression-focused methods, our technique not only
+enhances inference speed but also surpasses pre-training in terms of both
+accuracy and efficiency. In the era of evolving LLMs, our work contributes
+to the pursuit of sustainable AI solutions, striking a balance between
+computational power and environmental impact.
+
+
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ☆ PipeNet: Question Answering with Semantic Pruning over Knowledge Graphs + + +
+ It is well acknowledged that incorporating explicit knowledge graphs (KGs)
+can benefit question answering. Existing approaches typically follow a
+grounding-reasoning pipeline in which entity nodes are first grounded for
+the query (question and candidate answers), and then a reasoning module
+reasons over the matched multi-hop subgraph for answer prediction. Although
+the pipeline largely alleviates the issue of extracting essential
+information from giant KGs, efficiency is still an open challenge when
+scaling up hops in grounding the subgraphs. In this paper, we aim to find
+semantically related entity nodes in the subgraph to improve the efficiency
+of graph reasoning with KGs. We propose a grounding-pruning-reasoning
+pipeline to prune noisy nodes, remarkably reducing the computation cost and
+memory usage while also obtaining decent subgraph representations. In
+detail, the pruning module first scores concept nodes based on the
+dependency distance between matched spans and then prunes the nodes
+according to score ranks. To facilitate the evaluation of pruned subgraphs,
+we also propose a graph attention network (GAT)-based module to reason with
+the subgraph data. Experimental results on CommonsenseQA and OpenBookQA
+demonstrate the effectiveness of our method.
+
+
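+ The pruning step can be sketched as the following simplification, where
+dep_distance is an assumed helper returning the dependency distance for a
+matched concept node (smaller meaning more relevant):
+
+def prune_subgraph(nodes, dep_distance, keep_ratio=0.5):
+    # Score concept nodes by the dependency distance between matched
+    # spans, then keep only the top-ranked fraction of nodes.
+    ranked = sorted(nodes, key=dep_distance)
+    return ranked[: max(1, int(len(ranked) * keep_ratio))]
+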
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ FEUDA: Frustratingly Easy Prompt Based Unsupervised Domain Adaptation + + +
+ A major thread of unsupervised domain adaptation (UDA) methods uses unlabeled +data from both source and target domains to learn domain-invariant +representations for adaptation. However, these methods showcase certain +limitations, encouraging the use of self-supervised learning through continued +pre-training. The necessity of continued pre-training or learning +domain-invariant representations is still unclear in the prompt-based +classification framework, where an input example is modified by a template and +then fed into a language model (LM) to generate a label string. To examine this +new paradigm of UDA in the prompt-based setup, we propose a frustratingly easy +UDA method (FEUDA) that trains an autoregressive LM on both unlabeled and +labeled examples using two different instruction-tuning tasks. Specifically, +the first task trains the LM on unlabeled texts from both domains via masked +language modeling (MLM), and the other uses supervised instruction-tuning on +source-labeled data for classification. We conduct extensive experiments on 24 +real-world domain pairs to show the effectiveness of our method over strong +domain-invariant learning methods. Our analysis sheds light on why masked +language modeling improves target-domain classification performance in +prompt-based UDA. We discover that MLM helps the model learn both semantic and +background knowledge of a domain, which are both beneficial for downstream +classification. + +
+
+
+
+
+ + ☆ Linguistically Communicating Uncertainty in Patient-Facing Risk + Prediction Models + + +
+ This paper addresses the unique challenges associated with uncertainty
+quantification in AI models when applied to patient-facing contexts within
+healthcare. Unlike traditional eXplainable Artificial Intelligence (XAI)
+methods tailored for model developers or domain experts, additional
+considerations of communicating in natural language, its presentation, and
+evaluating understandability are necessary. We identify the challenges of
+communicating model performance, confidence, reasoning, and unknown knowns
+in natural language in the context of risk prediction. We propose a design
+aimed at addressing these challenges, focusing on the specific application
+of in-vitro fertilisation outcome prediction.
+
+
+
+
+
+
+ + ☆ Exploring the limits of decoder-only models trained on public speech + recognition corpora + + +
+ The emergence of industrial-scale speech recognition (ASR) models such as
+Whisper and USM, trained on 1M hours of weakly labelled and 12M hours of
+audio-only proprietary data respectively, has led to a stronger need for
+large-scale public ASR corpora and competitive open source pipelines.
+Unlike these models, large language models are typically based on
+Transformer decoders, and it remains unclear whether decoder-only models
+trained on public data alone can deliver competitive performance. In this
+work, we investigate factors such as the choice of training datasets and
+the modeling components necessary for obtaining the best performance using
+public English ASR corpora alone. Our Decoder-Only Transformer for ASR
+(DOTA) model comprehensively outperforms the encoder-decoder open source
+replication of Whisper (OWSM) on nearly all English ASR benchmarks and
+outperforms Whisper large-v3 on 7 out of 15 test sets. We release our
+codebase and model checkpoints under a permissive license.
+
+
+
+
+
+
+ + ☆ Are Generative AI systems Capable of Supporting Information Needs of + Patients? + + +
+ Patients managing a complex illness such as cancer face a complex information +challenge where they not only must learn about their illness but also how to +manage it. Close interaction with healthcare experts (radiologists, +oncologists) can improve patient learning and thereby, their disease outcome. +However, this approach is resource intensive and takes expert time away from +other critical tasks. Given the recent advancements in Generative AI models +aimed at improving the healthcare system, our work investigates whether and how +generative visual question answering systems can responsibly support patient +information needs in the context of radiology imaging data. We conducted a +formative need-finding study in which participants discussed chest computed +tomography (CT) scans and associated radiology reports of a fictitious close +relative with a cardiothoracic radiologist. Using thematic analysis of the +conversation between participants and medical experts, we identified commonly +occurring themes across interactions, including clarifying medical terminology, +locating the problems mentioned in the report in the scanned image, +understanding disease prognosis, discussing the next diagnostic steps, and +comparing treatment options. Based on these themes, we evaluated two +state-of-the-art generative visual language models against the radiologist's +responses. Our results reveal variability in the quality of responses generated +by the models across various themes. We highlight the importance of +patient-facing generative AI systems to accommodate a diverse range of +conversational themes, catering to the real-world informational needs of +patients. + +
+
+
+
+
+ + ☆ De-identification is not always enough + + +
+ For sharing privacy-sensitive data, de-identification is commonly regarded
+as adequate for safeguarding privacy. Synthetic data is also being
+considered as a privacy-preserving alternative. Recent successes with
+numerical and tabular data generative models and the breakthroughs in large
+generative language models raise the question of whether synthetically
+generated clinical notes could be a viable alternative to real notes for
+research purposes. In this work, we (i) demonstrate that de-identification
+of real clinical notes does not protect records against a membership
+inference attack, (ii) propose a novel approach to generate synthetic
+clinical notes using current state-of-the-art large language models, (iii)
+evaluate the performance of the synthetically generated notes in a clinical
+domain task, and (iv) propose a way to mount a membership inference attack
+where the target model is trained with synthetic data. We observe that when
+synthetically generated notes closely match the performance of real data,
+they also exhibit similar privacy concerns to the real data. Whether other
+approaches to synthetically generated clinical notes could offer better
+trade-offs and become a better alternative to sensitive real notes warrants
+further investigation.
+
+
+
+
+
+
+ + ☆ Multimodal Clinical Pseudo-notes for Emergency Department Prediction + Tasks using Multiple Embedding Model for EHR (MEME) ICML + + +
+ In this work, we introduce Multiple Embedding Model for EHR (MEME), an +approach that views Electronic Health Records (EHR) as multimodal data. This +approach incorporates "pseudo-notes", textual representations of tabular EHR +concepts such as diagnoses and medications, and allows us to effectively employ +Large Language Models (LLMs) for EHR representation. This framework also adopts +a multimodal approach, embedding each EHR modality separately. We demonstrate +the effectiveness of MEME by applying it to several tasks within the Emergency +Department across multiple hospital systems. Our findings show that MEME +surpasses the performance of both single modality embedding methods and +traditional machine learning approaches. However, we also observe notable +limitations in generalizability across hospital institutions for all tested +models. + +
+
+ comment: ICML Submission. However it is under review until May +
+
+
+
+
+ + ☆ Dolma: an Open Corpus of Three Trillion Tokens for Language Model + Pretraining Research + + +
+ Language models have become a critical technology for tackling a wide
+range of natural language processing tasks, yet many details about how the
+best-performing language models were developed are not reported. In
+particular, information about their pretraining corpora is seldom
+discussed: commercial language models rarely provide any information about
+their data; even open models rarely release the datasets they are trained
+on, or an exact recipe to reproduce them. As a result, it is challenging to
+conduct certain threads of language modeling research, such as
+understanding how training data impacts model capabilities and shapes their
+limitations. To facilitate open research on language model pretraining, we
+release Dolma, a three-trillion-token English corpus, built from a diverse
+mixture of web content, scientific papers, code, public-domain books,
+social media, and encyclopedic materials. In addition, we open source our
+data curation toolkit to enable further experimentation and reproduction of
+our work. In this report, we document Dolma, including its design
+principles, details about its construction, and a summary of its contents.
+We interleave this report with analyses and experimental results from
+training language models on intermediate states of Dolma to share what we
+have learned about important data curation practices, including the role of
+content or quality filters, deduplication, and multi-source mixing. Dolma
+has been used to train OLMo, a state-of-the-art, open language model and
+framework designed to build and study the science of language modeling.
+
+
+
+ comment: Dataset available at: https://huggingface.co/datasets/allenai/dolma +
+
+
+
+
+ + ☆ Large Language Models for Mathematical Reasoning: Progresses and + Challenges EACL 2024 + + +
+ Mathematical reasoning serves as a cornerstone for assessing the fundamental +cognitive capabilities of human intelligence. In recent times, there has been a +notable surge in the development of Large Language Models (LLMs) geared towards +the automated resolution of mathematical problems. However, the landscape of +mathematical problem types is vast and varied, with LLM-oriented techniques +undergoing evaluation across diverse datasets and settings. This diversity +makes it challenging to discern the true advancements and obstacles within this +burgeoning field. This survey endeavors to address four pivotal dimensions: i) +a comprehensive exploration of the various mathematical problems and their +corresponding datasets that have been investigated; ii) an examination of the +spectrum of LLM-oriented techniques that have been proposed for mathematical +problem-solving; iii) an overview of factors and concerns affecting LLMs in +solving math; and iv) an elucidation of the persisting challenges within this +domain. To the best of our knowledge, this survey stands as one of the first +extensive examinations of the landscape of LLMs in the realm of mathematics, +providing a holistic perspective on the current state, accomplishments, and +future challenges in this rapidly evolving field. + +
+
+ comment: EACL 2024 Student Research Workshop, 8 pages +
+
+
+
+
+ + ☆ The Impact of Language Adapters in Cross-Lingual Transfer for NLU + + +
+ Modular deep learning has been proposed for the efficient adaptation of
+pre-trained models to new tasks, domains and languages. In particular,
+combining language adapters with task adapters has shown potential where no
+supervised data exists for a language. In this paper, we explore the role
+of language adapters in zero-shot cross-lingual transfer for natural
+language understanding (NLU) benchmarks. We study the effect of including a
+target-language adapter in detailed ablation studies with two multilingual
+models and three multilingual datasets. Our results show that the effect of
+target-language adapters is highly inconsistent across tasks, languages and
+models. Retaining the source-language adapter instead often leads to an
+equivalent, and sometimes to a better, performance. Removing the language
+adapter after training has only a weak negative effect, indicating that the
+language adapters do not have a strong impact on the predictions.
+
+
+
+
+
+
+ + ☆ Making a Long Story Short in Conversation Modeling EACL 2024 + + +
+ Conversation systems accommodate diverse users with unique personalities
+and distinct writing styles. Within the domain of multi-turn dialogue
+modeling, this work studies the impact of varied utterance lengths on the
+quality of subsequent responses generated by conversation models. Using
+GPT-3 as the base model, multiple dialogue datasets, and several metrics,
+we conduct a thorough exploration of this aspect of conversational models.
+Our analysis sheds light on the complex relationship between utterance
+lengths and the quality of follow-up responses generated by dialogue
+systems. Empirical findings suggest that, for certain types of
+conversations, utterance lengths can be reduced by up to 72% without any
+noticeable difference in the quality of follow-up responses.
+
+
+
+ comment: This paper was accepted by TEICAI workshop at EACL 2024 +
+
+
+
+
+ + ☆ Common Sense Reasoning for Deep Fake Detection + + +
+ State-of-the-art approaches rely on image-based features extracted via
+neural networks for binary deepfake detection. While these approaches,
+trained in a supervised manner, extract likely fake features, they may fall
+short in representing unnatural `non-physical' semantic facial attributes
+-- blurry hairlines, double eyebrows, rigid eye pupils, or unnatural skin
+shading. However, such facial attributes are generally easily perceived by
+humans via common sense reasoning. Furthermore, image-based feature
+extraction methods that provide visual explanations via saliency maps can
+be hard for humans to interpret. To address these challenges, we propose
+the use of common sense reasoning to model deepfake detection, and extend
+it to the Deepfake Detection VQA (DD-VQA) task, aiming to model human
+intuition in explaining the reason behind labeling an image as either real
+or fake. To this end, we introduce a new dataset that provides answers to
+questions about the authenticity of an image, along with corresponding
+explanations. We also propose a Vision and Language Transformer-based
+framework for the DD-VQA task, incorporating text- and image-aware feature
+alignment formulations. Finally, we evaluate our method on both the
+performance of deepfake detection and the quality of the generated
+explanations. We hope that this task inspires researchers to explore new
+avenues for enhancing language-based interpretability and cross-modality
+applications in the realm of deepfake detection.
+
+
+
+
+
+
+ + ☆ Comparing Template-based and Template-free Language Model Probing EACL 2024 + + +
+ The differences between cloze-task language model (LM) probing with 1) +expert-made templates and 2) naturally-occurring text have often been +overlooked. Here, we evaluate 16 different LMs on 10 probing English datasets +-- 4 template-based and 6 template-free -- in general and biomedical domains to +answer the following research questions: (RQ1) Do model rankings differ between +the two approaches? (RQ2) Do models' absolute scores differ between the two +approaches? (RQ3) Do the answers to RQ1 and RQ2 differ between general and +domain-specific models? Our findings are: 1) Template-free and template-based +approaches often rank models differently, except for the top domain-specific +models. 2) Scores decrease by up to 42% Acc@1 when comparing parallel +template-free and template-based prompts. 3) Perplexity is negatively +correlated with accuracy in the template-free approach, but, +counter-intuitively, they are positively correlated for template-based probing. +4) Models tend to predict the same answers frequently across prompts for +template-based probing, which is less common when employing template-free +techniques. + +
+
+ comment: Accepted to EACL 2024 +
+
+
+
+
+ + ♻ ☆ Baichuan2-Sum: Instruction Finetune Baichuan2-7B Model for Dialogue + Summarization + + +
+ Large language models (LLMs) like Llama, Baichuan and Bloom show
+remarkable ability with instruction fine-tuning in many natural language
+tasks. Nevertheless, for the dialogue summarization task, which aims to
+generate summaries for the different roles in a dialogue, most
+state-of-the-art methods are built on small models (e.g., BART and BERT).
+Existing methods add task-specific optimizations to small models, such as
+adding a global-local centrality score. In this paper, we propose an
+instruction fine-tuned model, Baichuan2-Sum, for role-oriented dialogue
+summarization. By setting different instructions for different roles, the
+model can learn from the dialogue interactions and output the expected
+summaries. Furthermore, we apply the NEFTune technique, which adds suitable
+noise during training, to improve the results. The experiments demonstrate
+that the proposed model achieves new state-of-the-art results on two public
+dialogue summarization datasets: CSDS and SAMSUM. We release our model and
+related code to facilitate future studies on the dialogue summarization
+task.
+
+
+
+
+
+
+ + ♻ ☆ An Empathetic AI Coach for Self-Attachment Therapy + + +
+ In this work, we present a new dataset and a computational strategy for a +digital coach that aims to guide users in practicing the protocols of +self-attachment therapy. Our framework augments a rule-based conversational +agent with a deep-learning classifier for identifying the underlying emotion in +a user's text response, as well as a deep-learning assisted retrieval method +for producing novel, fluent and empathetic utterances. We also craft a set of +human-like personas that users can choose to interact with. Our goal is to +achieve a high level of engagement during virtual therapy sessions. We evaluate +the effectiveness of our framework in a non-clinical trial with N=16 +participants, all of whom have had at least four interactions with the agent +over the course of five days. We find that our platform is consistently rated +higher for empathy, user engagement and usefulness than the simple rule-based +framework. Finally, we provide guidelines to further improve the design and +performance of the application, in accordance with the feedback received. + +
+
+
+
+
+ + ♻ ☆ Leveraging Multi-lingual Positive Instances in Contrastive Learning to + Improve Sentence Embedding EACL 2024 + + +
+ Learning multi-lingual sentence embeddings is a fundamental task in natural +language processing. Recent trends in learning both mono-lingual and +multi-lingual sentence embeddings are mainly based on contrastive learning (CL) +among an anchor, one positive, and multiple negative instances. In this work, +we argue that leveraging multiple positives should be considered for +multi-lingual sentence embeddings because (1) positives in a diverse set of +languages can benefit cross-lingual learning, and (2) transitive similarity +across multiple positives can provide reliable structural information for +learning. In order to investigate the impact of multiple positives in CL, we +propose a novel approach, named MPCL, to effectively utilize multiple positive +instances to improve the learning of multi-lingual sentence embeddings. +Experimental results on various backbone models and downstream tasks +demonstrate that MPCL leads to better retrieval, semantic similarity, and +classification performances compared to conventional CL. We also observe that +in unseen languages, sentence embedding models trained on multiple positives +show better cross-lingual transfer performance than models trained on a single +positive instance. + +
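+ One plausible form of a multi-positive contrastive objective, sketched in
+PyTorch below: it averages an InfoNCE-style term over the positive set, as
+an illustration of the idea rather than the exact MPCL loss:
+
+import torch
+import torch.nn.functional as F
+
+def multi_positive_info_nce(anchor, positives, negatives, tau=0.05):
+    # anchor: (d,); positives: (P, d); negatives: (N, d) -- embeddings.
+    a = F.normalize(anchor, dim=-1)
+    pos = F.normalize(positives, dim=-1)
+    neg = F.normalize(negatives, dim=-1)
+    pos_sim = pos @ a / tau                      # (P,) similarities
+    neg_sim = neg @ a / tau                      # (N,) similarities
+    denom = torch.logsumexp(torch.cat([pos_sim, neg_sim]), dim=0)
+    return -(pos_sim - denom).mean()             # average over positives
+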
+
+ comment: Accepted to EACL 2024, main conference +
+
+
+
+
+ + ♻ ☆ $μ$PLAN: Summarizing using a Content Plan as Cross-Lingual Bridge EACL 2024 + + +
+ Cross-lingual summarization consists of generating a summary in one language +given an input document in a different language, allowing for the dissemination +of relevant content across speakers of other languages. The task is challenging +mainly due to the paucity of cross-lingual datasets and the compounded +difficulty of summarizing and translating. This work presents $\mu$PLAN, an +approach to cross-lingual summarization that uses an intermediate planning step +as a cross-lingual bridge. We formulate the plan as a sequence of entities +capturing the summary's content and the order in which it should be +communicated. Importantly, our plans abstract from surface form: using a +multilingual knowledge base, we align entities to their canonical designation +across languages and generate the summary conditioned on this cross-lingual +bridge and the input. Automatic and human evaluation on the XWikis dataset +(across four language pairs) demonstrates that our planning objective achieves +state-of-the-art performance in terms of informativeness and faithfulness. +Moreover, $\mu$PLAN models improve the zero-shot transfer to new cross-lingual +language pairs compared to baselines without a planning component. + +
+
+ comment: EACL 2024 +
+
+
+
+
+ + ♻ ☆ On-the-fly Denoising for Data Augmentation in Natural Language + Understanding EACL 2024 + + +
+ Data Augmentation (DA) is frequently used to automatically provide
+additional training data without extra human annotation. However, data
+augmentation may introduce noisy data that impairs training. To guarantee
+the quality of augmented data, existing methods either assume no noise
+exists in the augmented data and adopt consistency training or use simple
+heuristics such as training loss and diversity constraints to filter out
+"noisy" data. However, those filtered examples may still contain useful
+information, and dropping them completely causes a loss of supervision
+signals. In this paper, based on the assumption that the original dataset
+is cleaner than the augmented data, we propose an on-the-fly denoising
+technique for data augmentation that learns from soft augmented labels
+provided by an organic teacher model trained on the cleaner original data.
+To further prevent overfitting on noisy labels, a simple self-regularization
+module is applied to force the model prediction to be consistent across two
+distinct dropouts. Our method can be applied to general augmentation
+techniques and consistently improve the performance on both text
+classification and question-answering tasks.
+
+
+
+ comment: Findings of EACL 2024 +
+
+
+
+
+ + ♻ ☆ RCT Rejection Sampling for Causal Estimation Evaluation + + +
+ Confounding is a significant obstacle to unbiased estimation of causal +effects from observational data. For settings with high-dimensional covariates +-- such as text data, genomics, or the behavioral social sciences -- +researchers have proposed methods to adjust for confounding by adapting machine +learning methods to the goal of causal estimation. However, empirical +evaluation of these adjustment methods has been challenging and limited. In +this work, we build on a promising empirical evaluation strategy that +simplifies evaluation design and uses real data: subsampling randomized +controlled trials (RCTs) to create confounded observational datasets while +using the average causal effects from the RCTs as ground-truth. We contribute a +new sampling algorithm, which we call RCT rejection sampling, and provide +theoretical guarantees that causal identification holds in the observational +data to allow for valid comparisons to the ground-truth RCT. Using synthetic +data, we show our algorithm indeed results in low bias when oracle estimators +are evaluated on the confounded samples, which is not always the case for a +previously proposed algorithm. In addition to this identification result, we +highlight several finite data considerations for evaluation designers who plan +to use RCT rejection sampling on their own datasets. As a proof of concept, we +implement an example evaluation pipeline and walk through these finite data +considerations with a novel, real-world RCT -- which we release publicly -- +consisting of approximately 70k observations and text data as high-dimensional +covariates. Together, these contributions build towards a broader agenda of +improved empirical evaluation for causal estimation. + +
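+ The flavour of the subsampling can be sketched as below, where keep_prob
+is an assumed user-supplied acceptance function that makes treatment depend
+on covariates in the retained subsample; the paper's actual acceptance
+rule, which guarantees causal identification, is more specific:
+
+import numpy as np
+
+def rct_rejection_sample(X, T, Y, keep_prob, seed=0):
+    # Subsample an RCT so that treatment T becomes correlated with
+    # covariates X, inducing confounding in the observational subsample.
+    rng = np.random.default_rng(seed)
+    p = np.array([keep_prob(x, t) for x, t in zip(X, T)])
+    keep = rng.random(len(T)) < p
+    return X[keep], T[keep], Y[keep]
+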
+
+ comment: Code and data at https://github.com/kakeith/rct_rejection_sampling +
+
+
+
+
+ + ♻ ☆ Vanishing Gradients in Reinforcement Finetuning of Language Models ICLR 2024 + + +
+ Pretrained language models are commonly aligned with human preferences and
+downstream tasks via reinforcement finetuning (RFT), which refers to
+maximizing a (possibly learned) reward function using policy gradient
+algorithms. This work identifies a fundamental optimization obstacle in
+RFT: we prove that the expected gradient for an input vanishes when its
+reward standard deviation under the model is small, even if the expected
+reward is far from optimal. Through experiments on an RFT benchmark and
+controlled environments, as well as a theoretical analysis, we then
+demonstrate that vanishing gradients due to small reward standard deviation
+are prevalent and detrimental, leading to extremely slow reward
+maximization. Lastly, we explore ways to overcome vanishing gradients in
+RFT. We find the common practice of an initial supervised finetuning (SFT)
+phase to be the most promising candidate, which sheds light on its
+importance in an RFT pipeline. Moreover, we show that a relatively small
+number of SFT optimization steps on as few as 1% of the input samples can
+suffice, indicating that the initial SFT phase need not be expensive in
+terms of compute and data labeling efforts. Overall, our results emphasize
+that being mindful of inputs whose expected gradient vanishes, as measured
+by the reward standard deviation, is crucial for the successful execution
+of RFT.
+
+
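+ The core claim can be illustrated with a toy REINFORCE-style computation
+(a sketch, not the paper's analysis): as the reward's standard deviation
+under the model shrinks, the policy-gradient estimate shrinks with it, even
+though the mean reward stays far from its optimum:
+
+import torch
+
+torch.manual_seed(0)
+for spread in [1.0, 0.1, 0.01]:
+    logits = torch.zeros(5, requires_grad=True)
+    probs = torch.softmax(logits, dim=0)
+    actions = torch.multinomial(probs, 10_000, replacement=True)
+    # Reward depends on the action; its std scales with `spread`.
+    rewards = 0.5 + spread * (actions == 0).float()
+    loss = -(rewards * torch.log(probs[actions])).mean()
+    grad, = torch.autograd.grad(loss, logits)
+    print(f"reward std {rewards.std().item():.4f} "
+          f"-> grad norm {grad.norm().item():.4f}")
+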
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ A RAG-based Question Answering System Proposal for Understanding Islam: + MufassirQAS LLM + + +
+ Learning about and understanding religion is challenging due to the
+complexity and depth of religious doctrines and teachings. Chatbots, as
+question-answering systems, can help address these challenges. LLM chatbots
+use NLP techniques to establish connections between topics and accurately
+respond to complex questions, which makes them well suited to serve as
+question-answering chatbots for religious enlightenment. However, LLMs also
+have a tendency to generate false information, known as hallucination.
+Moreover, chatbot responses may include content that insults personal
+religious beliefs, stokes interfaith conflict, or touches on controversial
+or sensitive topics; such cases must be avoided without promoting hate
+speech or offending certain groups of people or their beliefs. This study
+uses a vector database-based Retrieval Augmented Generation (RAG) approach
+to enhance the accuracy and transparency of LLMs. Our question-answering
+system is called "MufassirQAS". We created a vector database from several
+open-access books that include Turkish context; these are Turkish
+translations of, and interpretations on, Islam. We crafted the system
+prompts with care, ensuring they provide instructions that prevent harmful,
+offensive, or disrespectful responses. We also tested MufassirQAS and
+ChatGPT with sensitive questions and obtained better performance with our
+system. The study and enhancements are still in progress; results and
+future work are presented.
+
+
+
+
+
+
+ + ♻ ☆ Efficient Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in
+important tasks such as natural language understanding, language
+generation, and complex reasoning and have the potential to make a
+substantial impact on our society. Such capabilities, however, come with
+the considerable resources they demand, highlighting the strong need to
+develop effective techniques for addressing their efficiency challenges. In
+this survey, we provide a systematic and comprehensive review of efficient
+LLMs research. We organize the literature in a taxonomy consisting of three
+main categories, covering distinct yet interconnected efficient LLMs topics
+from model-centric, data-centric, and framework-centric perspectives,
+respectively. We have also created a GitHub repository where we compile the
+papers featured in this survey at
+https://github.com/AIoT-MLSys-Lab/Efficient-LLMs-Survey, and will actively
+maintain this repository and incorporate new research as it emerges. We
+hope our survey can serve as a valuable resource to help researchers and
+practitioners gain a systematic understanding of the research developments
+in efficient LLMs and inspire them to contribute to this important and
+exciting field.
+
+
+
+ comment: Version 3: Added more latest papers +
+
+
+
+
+ + ♻ ☆ Semantic Sensitivities and Inconsistent Predictions: Measuring the + Fragility of NLI Models EACL 2024 + + +
+ Recent studies of the emergent capabilities of transformer-based Natural
+Language Understanding (NLU) models have indicated that they have an
+understanding of lexical and compositional semantics. We provide evidence
+that suggests these claims should be taken with a grain of salt: we find
+that state-of-the-art Natural Language Inference (NLI) models are sensitive
+to minor semantics-preserving surface-form variations, which lead to
+sizable inconsistent model decisions during inference. Notably, this
+behaviour differs from valid and in-depth comprehension of compositional
+semantics; however, it emerges neither when evaluating model accuracy on
+standard benchmarks nor when probing for syntactic, monotonic, and
+logically robust reasoning. We propose a novel framework to measure the
+extent of semantic sensitivity. To this end, we evaluate NLI models on
+adversarially generated examples containing minor semantics-preserving
+surface-form input noise. This is achieved using conditional text
+generation, with the explicit condition that the NLI model predicts the
+relationship between the original and adversarial inputs as a symmetric
+equivalence entailment. We systematically study the effects of the
+phenomenon across NLI models for $\textbf{in-}$ and $\textbf{out-of-}$
+domain settings. Our experiments show that semantic sensitivity causes
+performance degradations of $12.92\%$ and $23.71\%$ on average over
+$\textbf{in-}$ and $\textbf{out-of-}$ domain settings, respectively. We
+further perform ablation studies, analysing this phenomenon across models,
+datasets, and variations in inference, and show that semantic sensitivity
+can lead to major inconsistency within model predictions.
+
+
+
+ comment: EACL 2024 +
+
+
+
+
+ + ♻ ☆ Do self-supervised speech and language models extract similar + representations as human brain? + + +
+ Speech and language models trained through self-supervised learning (SSL) +demonstrate strong alignment with brain activity during speech and language +perception. However, given their distinct training modalities, it remains +unclear whether they correlate with the same neural aspects. We directly +address this question by evaluating the brain prediction performance of two +representative SSL models, Wav2Vec2.0 and GPT-2, designed for speech and +language tasks. Our findings reveal that both models accurately predict speech +responses in the auditory cortex, with a significant correlation between their +brain predictions. Notably, shared speech contextual information between +Wav2Vec2.0 and GPT-2 accounts for the majority of explained variance in brain +activity, surpassing static semantic and lower-level acoustic-phonetic +information. These results underscore the convergence of speech contextual +representations in SSL models and their alignment with the neural network +underlying speech perception, offering valuable insights into both SSL models +and the neural basis of speech and language processing. + +
+
+ comment: To appear in 2024 IEEE International Conference on Acoustics, Speech + and Signal Processing +
+
+
+
+
+ + ♻ ☆ Multilingual Text-to-Image Generation Magnifies Gender Stereotypes and + Prompt Engineering May Not Help You + + +
+ Text-to-image generation models have recently achieved astonishing results in +image quality, flexibility, and text alignment and are consequently employed in +a fast-growing number of applications. Through improvements in multilingual +abilities, a larger community now has access to this kind of technology. Yet, +as we will show, multilingual models suffer similarly from (gender) biases as +monolingual models. Furthermore, the natural expectation is that these models +will provide similar results across languages, but this is not the case and +there are important differences between languages. Thus, we propose a novel +benchmark MAGBIG intending to foster research in multilingual models without +gender bias. We investigate whether multilingual T2I models magnify gender bias +with MAGBIG. To this end, we use multilingual prompts requesting portrait +images of persons of a certain occupation or trait (using adjectives). Our +results show not only that models deviate from the normative assumption that +each gender should be equally likely to be generated, but that there are also +big differences across languages. Furthermore, we investigate prompt +engineering strategies, i.e. the use of indirect, neutral formulations, as a +possible remedy for these biases. Unfortunately, they help only to a limited +extent and result in worse text-to-image alignment. Consequently, this work +calls for more research into diverse representations across languages in image +generators. + +
+
+
+
+
+ + ♻ ☆ ViLexNorm: A Lexical Normalization Corpus for Vietnamese Social Media + Text EACL 2024 + + +
+ Lexical normalization, a fundamental task in Natural Language Processing +(NLP), involves the transformation of words into their canonical forms. This +process has been proven to benefit various downstream NLP tasks greatly. In +this work, we introduce Vietnamese Lexical Normalization (ViLexNorm), the +first-ever corpus developed for the Vietnamese lexical normalization task. The +corpus comprises over 10,000 pairs of sentences meticulously annotated by human +annotators, sourced from public comments on Vietnam's most popular social media +platforms. Various methods were used to evaluate our corpus, and the +best-performing system achieved a result of 57.74% using the Error Reduction +Rate (ERR) metric (van der Goot, 2019a) with the Leave-As-Is (LAI) baseline. +For extrinsic evaluation, employing the model trained on ViLexNorm demonstrates +the positive impact of the Vietnamese lexical normalization task on other NLP +tasks. Our corpus is publicly available exclusively for research purposes. + +
+
+ comment: Accepted at the EACL 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +processes lack transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +necessitate annotations or additional training data. The injection of the +extracted knowledge necessitates the addition of only simple neural modules. We +employ the Convex Polytopic Model (CPM) as a feature extraction tool for DST +tasks and illustrate that the acquired features correlate with the syntactic +and semantic patterns in the dialogues. This correlation facilitates a +comprehensive understanding of the linguistic features influencing the DST +model's decision-making process. We benchmark this framework on various DST +tasks and observe a notable improvement in accuracy. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ GRATH: Gradual Self-Truthifying for Large Language Models + + +
+ Truthfulness is paramount for large language models (LLMs) as they are +increasingly deployed in real-world applications. However, existing LLMs still +struggle with generating truthful content, as evidenced by their modest +performance on benchmarks like TruthfulQA. To address this issue, we propose +GRAdual self-truTHifying (GRATH), a novel post-processing method to enhance +truthfulness of LLMs. GRATH utilizes out-of-domain question prompts to generate +pairwise truthfulness training data with each pair containing a question and +its correct and incorrect answers, and then optimizes the model via direct +preference optimization (DPO) to learn from the truthfulness difference between +answer pairs. GRATH iteratively refines truthfulness data and updates the +model, leading to a gradual improvement in model truthfulness in a +self-supervised manner. Empirically, we evaluate GRATH using different 7B-LLMs +and compare with LLMs with similar or even larger sizes on benchmark datasets. +Our results show that GRATH effectively improves LLMs' truthfulness without +compromising other core capabilities. Notably, GRATH achieves state-of-the-art +performance on TruthfulQA, with MC1 accuracy of 54.71% and MC2 accuracy of +69.10%, which even surpass those on 70B-LLMs. + +
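+ For context, the standard DPO objective optimised over each (correct,
+incorrect) answer pair can be sketched as follows, with inputs being summed
+token log-probabilities under the policy and a frozen reference model (a
+generic formulation; GRATH's training specifics may differ):
+
+import torch.nn.functional as F
+
+def dpo_loss(logp_w_policy, logp_l_policy, logp_w_ref, logp_l_ref,
+             beta=0.1):
+    # w = chosen (correct) answer, l = rejected (incorrect) answer.
+    margin = beta * ((logp_w_policy - logp_w_ref)
+                     - (logp_l_policy - logp_l_ref))
+    return -F.logsigmoid(margin).mean()
+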
+
+
+
+
+ + ♻ ☆ Survey of Natural Language Processing for Education: Taxonomy, + Systematic Review, and Future Trends + + +
+ Natural Language Processing (NLP) aims to analyze text via techniques from
+computer science. It serves applications in the healthcare, commerce, and
+education domains. In particular, NLP has been applied to the education
+domain to support teaching and learning. In this survey, we review recent
+advances in NLP with a focus on solving problems related to the education
+domain. In detail, we begin by introducing the relevant background. Then,
+we present the taxonomy of NLP in the education domain. Next, we illustrate
+the task definitions, challenges, and corresponding techniques based on
+this taxonomy. After that, we showcase some off-the-shelf demonstrations in
+this domain and conclude with future directions.
+
+
+
+
+
+
+ + ♻ ☆ What Do Self-Supervised Speech Models Know About Words? + + +
+ Many self-supervised speech models (S3Ms) have been introduced over the last +few years, improving performance and data efficiency on various speech tasks. +However, these empirical successes alone do not give a complete picture of what +is learned during pre-training. Recent work has begun analyzing how S3Ms encode +certain properties, such as phonetic and speaker information, but we still lack +a proper understanding of knowledge encoded at the word level and beyond. In +this work, we use lightweight analysis methods to study segment-level +linguistic properties -- word identity, boundaries, pronunciation, syntactic +features, and semantic features -- encoded in S3Ms. We present a comparative +study of layer-wise representations from ten S3Ms and find that (i) the +frame-level representations within each word segment are not all equally +informative, and (ii) the pre-training objective and model size heavily +influence the accessibility and distribution of linguistic information across +layers. We also find that on several tasks -- word discrimination, word +segmentation, and semantic sentence similarity -- S3Ms trained with visual +grounding outperform their speech-only counterparts. Finally, our task-based +analyses demonstrate improved performance on word segmentation and acoustic +word discrimination while using simpler methods than prior work. + +
+
+ comment: Pre-MIT Press publication version +
+
+
+
+
+ + ♻ ☆ How Does Beam Search improve Span-Level Confidence Estimation in + Generative Sequence Labeling? EACL 2024 + + +
+ Sequence labeling is a core task in text understanding for IE/IR systems. +Text generation models have increasingly become the go-to solution for such +tasks (e.g., entity extraction and dialog slot filling). While most research +has focused on the labeling accuracy, a key aspect -- of vital practical +importance -- has slipped through the cracks: understanding model confidence. +More specifically, we lack a principled understanding of how to reliably gauge +the confidence of a model in its predictions for each labeled span. This paper +aims to provide some empirical insights on estimating model confidence for +generative sequence labeling. Most notably, we find that simply using the +decoder's output probabilities \textbf{is not} the best in realizing +well-calibrated confidence estimates. As verified over six public datasets of +different tasks, we show that our proposed approach -- which leverages +statistics from top-$k$ predictions by a beam search -- significantly reduces +calibration errors of the predictions of a generative sequence labeling model. + +
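+ The gist of the estimator can be sketched as aggregating posterior mass
+over the top-k beam hypotheses that contain a given labeled span (an
+assumed simplification of the statistics used in the paper):
+
+import math
+from collections import defaultdict
+
+def span_confidence(beams):
+    # beams: list of (labeled_spans, log_score) pairs from top-k search;
+    # each labeled_spans is a collection of hashable (span, label) items.
+    weights = [math.exp(score) for _, score in beams]
+    total = sum(weights)
+    conf = defaultdict(float)
+    for (spans, _), w in zip(beams, weights):
+        for span in spans:
+            conf[span] += w / total  # beam mass containing this span
+    return dict(conf)
+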
+
+ comment: UncertaiNLP 2024 (an EACL 2024 workshop: + https://uncertainlp.github.io/) +
+
+
+
+
+ + ♻ ☆ A RelEntLess Benchmark for Modelling Graded Relations between Named + Entities EACL 2024 + + +
+ Relations such as "is influenced by", "is known for" or "is a competitor of" +are inherently graded: we can rank entity pairs based on how well they satisfy +these relations, but it is hard to draw a line between those pairs that satisfy +them and those that do not. Such graded relations play a central role in many +applications, yet they are typically not covered by existing Knowledge Graphs. +In this paper, we consider the possibility of using Large Language Models +(LLMs) to fill this gap. To this end, we introduce a new benchmark, in which +entity pairs have to be ranked according to how much they satisfy a given +graded relation. The task is formulated as a few-shot ranking problem, where +models only have access to a description of the relation and five prototypical +instances. We use the proposed benchmark to evaluate state-of-the-art relation +embedding strategies as well as several recent LLMs, covering both publicly +available LLMs and closed models such as GPT-4. Overall, we find a strong +correlation between model size and performance, with smaller Language Models +struggling to outperform a naive baseline. The results of the largest Flan-T5 +and OPT models are remarkably strong, although a clear gap with human +performance remains. + +
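+ One simple way to realize such few-shot ranking with an open LM is to score
+each candidate pair by the model's average log-probability of a relation
+statement. A sketch follows; the prompt template and the use of gpt2 are
+illustrative assumptions, not the benchmark's official protocol, and BPE
+boundary effects between prompt and statement are ignored:
+
+<pre><code class="language-python">
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+@torch.no_grad()
+def statement_logprob(prompt, statement):
+    ids = tok(prompt + statement, return_tensors="pt").input_ids
+    n_prompt = tok(prompt, return_tensors="pt").input_ids.size(1)
+    logits = lm(ids).logits
+    logp = torch.log_softmax(logits[0, :-1], dim=-1)
+    rows = torch.arange(n_prompt - 1, ids.size(1) - 1)
+    # average log-prob of the statement tokens, given the few-shot prompt
+    return logp[rows, ids[0, n_prompt:]].mean().item()
+
+prompt = ("Things that are influenced by other things:\n"
+          "blues music -> rock and roll\njazz -> hip hop\n")
+print(statement_logprob(prompt, "impressionism -> cubism"))
+</code></pre>
+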
+
+ comment: EACL 2024 main conference +
+
+
+
+
+ + ♻ ☆ Improving Small Language Models' Mathematical Reasoning via + Equation-of-Thought Distillation + + +
+ This work addresses the challenge of democratizing advanced Large Language +Models (LLMs) by compressing their mathematical reasoning capabilities into +sub-billion parameter Small Language Models (SLMs) without compromising +performance. We introduce Equation-of-Thought Distillation (EoTD), a novel +technique that encapsulates the reasoning process into equation-based +representations to construct an EoTD dataset for fine-tuning SLMs. +Additionally, we propose the Ensemble Thoughts Distillation (ETD) framework to +enhance the reasoning performance of SLMs. This involves creating a reasoning +dataset with multiple thought processes, including Chain-of-Thought (CoT), +Program-of-Thought (PoT), and Equation-of-Thought (EoT), and using it for +fine-tuning. Our experimental findings demonstrate that EoTD significantly +boosts the reasoning abilities of SLMs, while ETD enables these models to +achieve state-of-the-art reasoning performance. + +
+
+
+
+
+ + ♻ ☆ Monolingual or Multilingual Instruction Tuning: Which Makes a Better + Alpaca ACL + + +
+ Foundational large language models (LLMs) can be instruction-tuned to
+perform open-domain question answering, facilitating applications like chat
+assistants. While such efforts are often carried out in a single language, we
+empirically analyze cost-efficient strategies for multilingual scenarios. Our
+study employs the Alpaca dataset and machine translations of it to form
+multilingual data, which is then used to tune LLMs through either low-rank
+adaptation or full-parameter training. Under a controlled computation budget,
+comparisons show that multilingual tuning is on par with or better than tuning
+a model for each language. Furthermore, multilingual tuning with downsampled
+data can be equally powerful and more robust. Our findings serve as a guide
+for expanding language support through instruction tuning.
+
+
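+ For reference, a minimal low-rank adaptation setup with the Hugging Face
+peft library looks roughly as follows; the base model id and hyperparameters
+are placeholders, not the paper's exact configuration:
+
+<pre><code class="language-python">
+# Only the low-rank adapter matrices are trained; the base model is frozen.
+from transformers import AutoModelForCausalLM
+from peft import LoraConfig, get_peft_model
+
+base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
+config = LoraConfig(r=8, lora_alpha=16,
+                    target_modules=["q_proj", "v_proj"],
+                    lora_dropout=0.05, task_type="CAUSAL_LM")
+model = get_peft_model(base, config)
+model.print_trainable_parameters()  # a small fraction of total parameters
+</code></pre>
+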
+
+ comment: Accepted to Findings of ACL: EACL 2024. Added human evaluation and + shortened writing +
+
+
+
+
+ + ♻ ☆ MAPLE: Mobile App Prediction Leveraging Large Language Model Embeddings + + +
+ In recent years, predicting mobile app usage has become increasingly
+important for areas like app recommendation, user behaviour analysis, and
+mobile resource management. Existing models, however, struggle with the
+heterogeneous nature of contextual data and the user cold start problem. This
+study introduces a novel prediction model, Mobile App Prediction Leveraging
+Large Language Model Embeddings (MAPLE), which employs Large Language Models
+(LLMs) and installed app similarity to overcome these challenges. MAPLE
+utilises the power of LLMs to process contextual data and discern intricate
+relationships within it effectively. Additionally, we explore the use of
+installed app similarity to address the cold start problem, facilitating the
+modelling of user preferences and habits, even for new users with limited
+historical data. In tests on two real-world datasets, MAPLE surpasses
+contemporary models in both standard and cold start scenarios, validating its
+capacity for precise app usage prediction and its resilience to the cold start
+problem. This performance stems from the model's proficiency in capturing
+complex temporal patterns and leveraging contextual information, and it can
+markedly improve personalised app usage predictions and user experiences.
+
+
+
+
+
+
+ + ♻ ☆ APPLS: Evaluating Evaluation Metrics for Plain Language Summarization + + +
+ While there has been significant development of models for Plain Language
+Summarization (PLS), evaluation remains a challenge. PLS lacks a dedicated
+assessment metric, and the suitability of text generation evaluation metrics
+is unclear due to the unique transformations involved (e.g., adding background
+explanations, removing specialized terminology). To address these concerns,
+our study presents a granular meta-evaluation testbed, APPLS, designed to
+evaluate metrics for PLS. Inspired by previous work, we define a set of
+perturbations along four criteria that a PLS metric should capture:
+informativeness, simplification, coherence, and faithfulness. An analysis of
+metrics using our testbed reveals that current metrics fail to capture
+simplification consistently. In response, we introduce POMME, a new metric
+designed to assess text simplification in PLS; the metric is calculated as the
+normalized perplexity difference between an in-domain and an out-of-domain
+language model. We demonstrate POMME's correlation with fine-grained
+variations in simplification and validate its sensitivity across 4 text
+simplification datasets. This work contributes the first meta-evaluation
+testbed for PLS and a comprehensive evaluation of existing metrics. The APPLS
+testbed and POMME are available at
+https://github.com/LinguisticAnomalies/APPLS.
+
+
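+ The perplexity-difference idea behind POMME can be sketched as follows; the
+exact normalization and the choice of language models in the paper may
+differ:
+
+<pre><code class="language-python">
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+@torch.no_grad()
+def perplexity(model, tok, text):
+    ids = tok(text, return_tensors="pt").input_ids
+    loss = model(ids, labels=ids).loss  # mean token negative log-likelihood
+    return torch.exp(loss).item()
+
+def pomme_score(text, lm_in, tok_in, lm_out, tok_out):
+    ppl_in = perplexity(lm_in, tok_in, text)     # in-domain (technical) LM
+    ppl_out = perplexity(lm_out, tok_out, text)  # out-of-domain (general) LM
+    # simplified text should look relatively more fluent to the general LM
+    return (ppl_in - ppl_out) / (ppl_in + ppl_out)
+</code></pre>
+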
+
+
+
+
+ + ♻ ☆ Gaussian Adaptive Attention is All You Need: Robust Contextual + Representations Across Multiple Modalities + + +
+ We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a
+novel probabilistic attention framework, and the Gaussian Adaptive Transformer
+(GAT), designed to enhance information aggregation across multiple modalities,
+including speech, text, and vision. GAAM integrates learnable mean and
+variance into its attention mechanism, implemented in a multi-headed
+framework, enabling it to collectively model any probability distribution for
+dynamic recalibration of feature significance. This method demonstrates
+significant improvements, especially with highly non-stationary data,
+surpassing state-of-the-art attention techniques in model performance (up to
+approximately +20% in accuracy) by identifying key elements within the feature
+space. GAAM's compatibility with dot-product-based attention models and its
+relatively low number of parameters showcase its adaptability and potential to
+boost existing attention frameworks. Empirically, GAAM exhibits superior
+adaptability and efficacy across a diverse range of tasks, including emotion
+recognition in speech, image classification, and text classification, thereby
+establishing its robustness and versatility in handling multi-modal data.
+Furthermore, we introduce the Importance Factor (IF), a new learning-based
+metric that enhances the explainability of models trained with GAAM-based
+methods. Overall, GAAM represents an advancement towards the development of
+better-performing and more explainable attention models across multiple
+modalities.
+
+
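+ A toy single-head version of the idea -- attention weights that follow a
+Gaussian over normalized positions with a learnable mean and variance -- can
+be written as below; the multi-head design and recalibration details of GAAM
+are omitted:
+
+<pre><code class="language-python">
+import torch
+import torch.nn as nn
+
+class GaussianAttention(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.mu = nn.Parameter(torch.tensor(0.5))       # learnable mean
+        self.log_var = nn.Parameter(torch.tensor(0.0))  # learnable variance
+
+    def forward(self, x):  # x: (batch, seq_len, dim)
+        seq_len = x.size(1)
+        pos = torch.linspace(0, 1, seq_len, device=x.device)
+        w = torch.exp(-(pos - self.mu) ** 2 / (2 * self.log_var.exp()))
+        w = w / w.sum()                           # normalize to a distribution
+        return (w[None, :, None] * x).sum(dim=1)  # Gaussian-weighted pooling
+
+print(GaussianAttention()(torch.randn(2, 16, 8)).shape)  # (2, 8)
+</code></pre>
+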
+
+
+
+
+ + ♻ ☆ InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining + + +
+ Pretraining auto-regressive large language models (LLMs) with retrieval
+demonstrates better perplexity and factual accuracy by leveraging external
+databases. However, the size of existing pretrained retrieval-augmented LLMs
+is still limited (e.g., Retro has 7.5B parameters), which limits the
+effectiveness of instruction tuning and zero-shot generalization. In this
+work, we introduce Retro 48B, the largest LLM pretrained with retrieval.
+Specifically, we continue to pretrain a 43B GPT model on an additional 100
+billion tokens using the Retro augmentation method, retrieving from 1.2
+trillion tokens. Notably, the obtained foundation model, Retro 48B, largely
+outperforms the counterpart GPT 43B trained on 1.2T tokens in terms of
+perplexity with only 2.58% additional GPU hours, demonstrating the significant
+scaling potential of the method. After instruction tuning on Retro,
+InstructRetro demonstrates significant improvement over the instruction-tuned
+GPT on a wide range of zero-shot tasks. Specifically, the average improvement
+of InstructRetro is 7% over its GPT counterpart across 8 short-form QA and
+reading comprehension tasks, 10% over GPT across 4 challenging long-form QA
+tasks, and 16% over GPT across 3 summarization tasks. Surprisingly, we find
+that one can ablate the encoder from the InstructRetro architecture and
+directly use its decoder backbone, while achieving comparable results. Our
+results highlight the promising direction of obtaining a better GPT decoder
+through continued pretraining with retrieval before instruction tuning. Our
+code and checkpoints are publicly available at:
+https://github.com/NVIDIA/Megatron-LM/tree/InstructRetro/tools/retro.
+
+
+
+
+
+
+ + ♻ ☆ Code Llama: Open Foundation Models for Code + + +
+ We release Code Llama, a family of large language models for code based on +Llama 2 providing state-of-the-art performance among open models, infilling +capabilities, support for large input contexts, and zero-shot instruction +following ability for programming tasks. We provide multiple flavors to cover a +wide range of applications: foundation models (Code Llama), Python +specializations (Code Llama - Python), and instruction-following models (Code +Llama - Instruct) with 7B, 13B, 34B and 70B parameters each. All models are +trained on sequences of 16k tokens and show improvements on inputs with up to +100k tokens. 7B, 13B and 70B Code Llama and Code Llama - Instruct variants +support infilling based on surrounding content. Code Llama reaches +state-of-the-art performance among open models on several code benchmarks, with +scores of up to 67% and 65% on HumanEval and MBPP, respectively. Notably, Code +Llama - Python 7B outperforms Llama 2 70B on HumanEval and MBPP, and all our +models outperform every other publicly available model on MultiPL-E. We release +Code Llama under a permissive license that allows for both research and +commercial use. + +
+
+
+
+
+ + ♻ ☆ Characterizing Large Language Models as Rationalizers of + Knowledge-intensive Tasks + + +
+ Large language models (LLMs) are proficient at generating fluent text with
+minimal task-specific supervision. Yet, their ability to provide well-grounded
+rationalizations for knowledge-intensive tasks remains under-explored. Such
+tasks, like commonsense multiple-choice questions, require rationales based on
+world knowledge to support predictions and refute alternate options. We
+consider the task of generating knowledge-guided rationalization in natural
+language by using expert-written examples in a few-shot manner. Surprisingly,
+crowd-workers preferred knowledge-grounded rationales over crowdsourced
+rationalizations, citing their factuality, sufficiency, and comprehensive
+refutations. Although LLM-generated rationales were preferable, further
+improvements in conciseness and novelty are required. In another study, we
+show how rationalization of incorrect model predictions erodes humans' trust
+in LLM-generated rationales. Motivated by these observations, we create a
+two-stage pipeline to review task predictions and eliminate potential
+incorrect decisions before rationalization, enabling trustworthy rationale
+generation.
+
+
+
+
+
+
+ + ♻ ☆ A decoder-only foundation model for time-series forecasting + + +
+ Motivated by recent advances in large language models for Natural Language +Processing (NLP), we design a time-series foundation model for forecasting +whose out-of-the-box zero-shot performance on a variety of public datasets +comes close to the accuracy of state-of-the-art supervised forecasting models +for each individual dataset. Our model is based on pretraining a +patched-decoder style attention model on a large time-series corpus, and can +work well across different forecasting history lengths, prediction lengths and +temporal granularities. + +
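+ The "patched decoder" input pipeline can be sketched as follows: slice a
+univariate series into fixed-length patches and project each patch to a model
+token; the patch length and dimensions below are illustrative, not the
+paper's configuration:
+
+<pre><code class="language-python">
+import torch
+import torch.nn as nn
+
+patch_len, d_model = 32, 256
+proj = nn.Linear(patch_len, d_model)  # one token per patch
+
+series = torch.randn(8, 512)                      # (batch, time)
+patches = series.unfold(1, patch_len, patch_len)  # (batch, n_patches, patch_len)
+tokens = proj(patches)                            # (batch, n_patches, d_model)
+# `tokens` then feed a causal decoder that predicts future patches,
+# so one forward pass covers many time steps at once.
+</code></pre>
+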
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 110 + +
+
+
+ + ☆ Motion Guidance: Diffusion-Based Image Editing with Differentiable + Motion Estimators + + +
+ Diffusion models are capable of generating impressive images conditioned on
+text descriptions, and extensions of these models allow users to edit images
+at a relatively coarse scale. However, precisely editing the layout, position,
+pose, and shape of objects in images with diffusion models remains difficult.
+To this end, we propose motion guidance, a zero-shot technique that allows a
+user to specify dense, complex motion fields that indicate where each pixel in
+an image should move. Motion guidance works by steering the diffusion sampling
+process with gradients through an off-the-shelf optical flow network.
+Specifically, we design a guidance loss that encourages the sample to have the
+desired motion, as estimated by a flow network, while also being visually
+similar to the source image. By simultaneously sampling from a diffusion model
+and guiding the sample to have low guidance loss, we can obtain a
+motion-edited image. We demonstrate that our technique works on complex
+motions and produces high quality edits of real and generated images.
+
+
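+ A schematic of such a guidance loss is sketched below; flow_net stands in
+for the off-the-shelf optical flow network, and the simple L1 terms and
+weighting are assumptions rather than the paper's exact formulation:
+
+<pre><code class="language-python">
+import torch
+
+def guidance_loss(sample, source, target_flow, flow_net, w_color=1.0):
+    pred_flow = flow_net(source, sample)               # estimated motion
+    flow_term = (pred_flow - target_flow).abs().mean() # match desired motion
+    color_term = (sample - source).abs().mean()        # stay visually similar
+    return flow_term + w_color * color_term
+
+# The gradient of this loss w.r.t. `sample` is what steers each
+# denoising step of the diffusion sampler toward the desired motion.
+</code></pre>
+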
+
+
+
+
+ + ☆ Binding Touch to Everything: Learning Unified Multimodal Tactile + Representations + + +
+ The ability to associate touch with other modalities has huge implications +for humans and computational systems. However, multimodal learning with touch +remains challenging due to the expensive data collection process and +non-standardized sensor outputs. We introduce UniTouch, a unified tactile model +for vision-based touch sensors connected to multiple modalities, including +vision, language, and sound. We achieve this by aligning our UniTouch +embeddings to pretrained image embeddings already associated with a variety of +other modalities. We further propose learnable sensor-specific tokens, allowing +the model to learn from a set of heterogeneous tactile sensors, all at the same +time. UniTouch is capable of conducting various touch sensing tasks in the +zero-shot setting, from robot grasping prediction to touch image question +answering. To the best of our knowledge, UniTouch is the first to demonstrate +such capabilities. Project page: https://cfeng16.github.io/UniTouch/ + +
+
+
+
+
+ + ☆ Improved Scene Landmark Detection for Camera Localization 3DV 2024 + + +
+ Camera localization methods based on retrieval, local feature matching, and
+3D structure-based pose estimation are accurate but require high storage, are
+slow, and are not privacy-preserving. A method based on scene landmark
+detection (SLD) was recently proposed to address these limitations. It
+involves training a convolutional neural network (CNN) to detect a few
+predetermined, salient, scene-specific 3D points or landmarks and computing
+camera pose from the associated 2D-3D correspondences. Although SLD
+outperformed existing learning-based approaches, it was notably less accurate
+than 3D structure-based methods. In this paper, we show that the accuracy gap
+was due to insufficient model capacity and noisy labels during training. To
+mitigate the capacity issue, we propose to split the landmarks into subgroups
+and train a separate network for each subgroup. To generate better training
+labels, we propose using dense reconstructions to estimate the visibility of
+scene landmarks. Finally, we present a compact architecture to improve memory
+efficiency. In terms of accuracy, our approach is on par with state-of-the-art
+structure-based methods on the INDOOR-6 dataset, but it runs significantly
+faster and uses less storage. Code and models can be found at
+https://github.com/microsoft/SceneLandmarkLocalization.
+
+
+
+ comment: To be presented at 3DV 2024 +
+
+
+
+
+ + ☆ CARFF: Conditional Auto-encoded Radiance Field for 3D Scene Forecasting + + +
+ We propose CARFF: Conditional Auto-encoded Radiance Field for 3D Scene +Forecasting, a method for predicting future 3D scenes given past observations, +such as 2D ego-centric images. Our method maps an image to a distribution over +plausible 3D latent scene configurations using a probabilistic encoder, and +predicts the evolution of the hypothesized scenes through time. Our latent +scene representation conditions a global Neural Radiance Field (NeRF) to +represent a 3D scene model, which enables explainable predictions and +straightforward downstream applications. This approach extends beyond previous +neural rendering work by considering complex scenarios of uncertainty in +environmental states and dynamics. We employ a two-stage training of +Pose-Conditional-VAE and NeRF to learn 3D representations. Additionally, we +auto-regressively predict latent scene representations as a partially +observable Markov decision process, utilizing a mixture density network. We +demonstrate the utility of our method in realistic scenarios using the CARLA +driving simulator, where CARFF can be used to enable efficient trajectory and +contingency planning in complex multi-agent autonomous driving scenarios +involving visual occlusions. + +
+
+
+
+
+ + ☆ Benchmarking Sensitivity of Continual Graph Learning for Skeleton-Based + Action Recognition + + +
+ Continual learning (CL) is the research field that aims to build machine +learning models that can accumulate knowledge continuously over different tasks +without retraining from scratch. Previous studies have shown that pre-training +graph neural networks (GNN) may lead to negative transfer (Hu et al., 2020) +after fine-tuning, a setting which is closely related to CL. Thus, we focus on +studying GNN in the continual graph learning (CGL) setting. We propose the +first continual graph learning benchmark for spatio-temporal graphs and use it +to benchmark well-known CGL methods in this novel setting. The benchmark is +based on the N-UCLA and NTU-RGB+D datasets for skeleton-based action +recognition. Beyond benchmarking for standard performance metrics, we study the +class and task-order sensitivity of CGL methods, i.e., the impact of learning +order on each class/task's performance, and the architectural sensitivity of +CGL methods with backbone GNN at various widths and depths. We reveal that +task-order robust methods can still be class-order sensitive and observe +results that contradict previous empirical observations on architectural +sensitivity in CL. + +
+
+ comment: This work is accepted at VISAPP 2024 as a short paper +
+
+
+
+
+ + ☆ DROP: Decouple Re-Identification and Human Parsing with Task-specific + Features for Occluded Person Re-identification + + +
+ The paper introduces the Decouple Re-identificatiOn and human Parsing (DROP) +method for occluded person re-identification (ReID). Unlike mainstream +approaches using global features for simultaneous multi-task learning of ReID +and human parsing, or relying on semantic information for attention guidance, +DROP argues that the inferior performance of the former is due to distinct +granularity requirements for ReID and human parsing features. ReID focuses on +instance part-level differences between pedestrian parts, while human parsing +centers on semantic spatial context, reflecting the internal structure of the +human body. To address this, DROP decouples features for ReID and human +parsing, proposing detail-preserving upsampling to combine varying resolution +feature maps. Parsing-specific features for human parsing are decoupled, and +human position information is exclusively added to the human parsing branch. In +the ReID branch, a part-aware compactness loss is introduced to enhance +instance-level part differences. Experimental results highlight the efficacy of +DROP, especially achieving a Rank-1 accuracy of 76.8% on Occluded-Duke, +surpassing two mainstream methods. The codebase is accessible at +https://github.com/shuguang-52/DROP. + +
+
+
+
+
+ + ☆ Multilinear Operator Networks + + +
+ Despite the remarkable capabilities of deep neural networks in image
+recognition, the dependence on activation functions remains a largely
+unexplored area and has yet to be eliminated. On the other hand, Polynomial
+Networks are a class of models that do not require activation functions but
+have yet to perform on par with modern architectures. In this work, we aim to
+close this gap and propose MONet, which relies solely on multilinear
+operators. The core layer of MONet, called Mu-Layer, captures multiplicative
+interactions of the elements of the input token. MONet captures high-degree
+interactions of the input elements, and we demonstrate the efficacy of our
+approach on a series of image recognition and scientific computing benchmarks.
+The proposed model outperforms prior polynomial networks and performs on par
+with modern architectures. We believe that MONet can inspire further research
+on models that use entirely multilinear operations.
+
+
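+ A toy layer in this spirit -- capturing second-degree interactions with the
+elementwise product of two linear maps and no activation function anywhere --
+might look like the following (a sketch of the general principle, not the
+exact Mu-Layer):
+
+<pre><code class="language-python">
+import torch
+import torch.nn as nn
+
+class ToyMultilinearLayer(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.a = nn.Linear(dim, dim)
+        self.b = nn.Linear(dim, dim)
+
+    def forward(self, x):
+        # Hadamard product of two linear maps yields degree-2 interactions
+        # of the input elements; the skip keeps a degree-1 term. Stacking
+        # such layers raises the polynomial degree without activations.
+        return self.a(x) * self.b(x) + x
+
+print(ToyMultilinearLayer(8)(torch.randn(2, 8)).shape)  # (2, 8)
+</code></pre>
+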
+
+ comment: International Conference on Learning Representations Poster (2024)
+
+
+
+
+
+ + ☆ Shrub of a thousand faces: an individual segmentation from satellite + images using deep learning + + +
+ Monitoring the distribution and size structure of long-living shrubs, such
+as Juniperus communis, can be used to estimate the long-term effects of
+climate change on high-mountain and high-latitude ecosystems. Historical
+very-high-resolution aerial imagery offers a retrospective tool to monitor
+shrub growth and distribution at high precision. Currently, deep learning
+models provide impressive results for detecting and delineating the contour of
+objects with defined shapes. However, adapting these models to detect natural
+objects that express complex growth patterns, such as junipers, is still a
+challenging task.
+  This research presents a novel approach that leverages remotely sensed RGB
+imagery in conjunction with Mask R-CNN-based instance segmentation models to
+individually delineate Juniperus shrubs above the treeline in Sierra Nevada
+(Spain). In this study, we propose a new data construction design that
+consists of using photo-interpreted (PI) and field work (FW) data to
+respectively develop and externally validate the model. We also propose a new
+shrub-tailored evaluation algorithm based on a new metric called Multiple
+Intersections over Ground Truth Area (MIoGTA) to assess and optimize the
+model's shrub delineation performance. Finally, we deploy the developed model
+for the first time to generate a wall-to-wall map of Juniperus individuals.
+  The experimental results demonstrate the efficiency of our dual data
+construction approach in overcoming the limitations associated with
+traditional field survey methods. They also highlight the robustness of the
+MIoGTA metric in evaluating instance segmentation models on species with
+complex growth patterns, showing more resilience against data annotation
+uncertainty. Furthermore, they show the effectiveness of employing Mask R-CNN
+with a ResNet101-C4 backbone in delineating PI and FW shrubs, achieving
+F1-scores of 87.87% and 76.86%, respectively.
+
+
+
+ comment: 39 pages, 20 figures +
+
+
+
+
+ + ☆ Enhancing Multimodal Large Language Models with Vision Detection Models: + An Empirical Study + + +
+ Despite the impressive capabilities of Multimodal Large Language Models +(MLLMs) in integrating text and image modalities, challenges remain in +accurately interpreting detailed visual elements. This paper presents an +empirical study on enhancing MLLMs with state-of-the-art (SOTA) object +detection and Optical Character Recognition models to improve fine-grained +image understanding and reduce hallucination in responses. Our research +investigates the embedding-based infusion of detection information, the impact +of such infusion on the MLLMs' original abilities, and the interchangeability +of detection models. We conduct systematic experiments with models such as +LLaVA-1.5, DINO, and PaddleOCRv2, revealing that our approach not only refines +MLLMs' performance in specific visual tasks but also maintains their original +strengths. The resulting enhanced MLLMs outperform SOTA models on 9 out of 10 +benchmarks, achieving an improvement of up to 12.99% on the normalized average +score, marking a notable advancement in multimodal understanding. We release +our codes to facilitate further exploration into the fine-grained multimodal +dialogue capabilities of MLLMs. + +
+
+
+
+
+ + ☆ MelNet: A Real-Time Deep Learning Algorithm for Object Detection + + +
+ In this study, a novel deep learning algorithm for object detection, named
+MelNet, was introduced. MelNet was trained on the KITTI dataset for object
+detection. After 300 training epochs, MelNet attained an mAP (mean average
+precision) score of 0.732. Additionally, three alternative models -- YOLOv5,
+EfficientDet, and Faster-RCNN-MobileNetv3 -- were trained on the KITTI dataset
+and compared with MelNet for object detection.
+  The outcomes underscore the efficacy of employing transfer learning in
+certain instances: pre-existing models trained on prominent datasets (e.g.,
+ImageNet, COCO, and Pascal VOC) yield superior results. Another finding is the
+viability of creating a new model tailored to a specific scenario and training
+it on a specific dataset. This investigation demonstrates that MelNet, trained
+exclusively on the KITTI dataset, surpasses EfficientDet after 150 epochs, and
+its post-training performance closely aligns with that of the other
+pre-trained models.
+
+
+
+ comment: 11 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ HyperZ$\cdot$Z$\cdot$W Operator Connects Slow-Fast Networks for Full + Context Interaction + + +
+ The self-attention mechanism utilizes large implicit weight matrices, +programmed through dot product-based activations with very few trainable +parameters, to enable long sequence modeling. In this paper, we investigate the +possibility of discarding residual learning by employing large implicit kernels +to achieve full context interaction at each layer of the network. To accomplish +it, we introduce coordinate-based implicit MLPs as a slow network to generate +hyper-kernels for another fast convolutional network. To get context-varying +weights for fast dynamic encoding, we propose a +$\mathrm{Hyper}\mathcal{Z{\cdot}Z{\cdot}W}$ operator that connects +hyper-kernels ($\mathcal{W}$) and hidden activations ($\mathcal{Z}$) through +simple elementwise multiplication, followed by convolution of $\mathcal{Z}$ +using the context-dependent $\mathcal{W}$. Based on this design, we present a +novel Terminator architecture that integrates hyper-kernels of different sizes +to produce multi-branch hidden representations for enhancing the feature +extraction capability of each layer. Additionally, a bottleneck layer is +employed to compress the concatenated channels, allowing only valuable +information to propagate to the subsequent layers. Notably, our model +incorporates several innovative components and exhibits excellent properties, +such as introducing local feedback error for updating the slow network, stable +zero-mean features, faster training convergence, and fewer model parameters. +Extensive experimental results on pixel-level 1D and 2D image classification +benchmarks demonstrate the superior performance of our architecture. + +
+
+ comment: 10 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ Source-free Domain Adaptive Object Detection in Remote Sensing Images + + +
+ Recent studies have used unsupervised domain adaptive object detection
+(UDAOD) methods to bridge the domain gap in remote sensing (RS) images.
+However, UDAOD methods typically assume that the source domain data can be
+accessed during the domain adaptation process. This setting is often
+impractical in the real world due to RS data privacy and transmission
+difficulty. To address this challenge, we propose a practical source-free
+object detection (SFOD) setting for RS images, which aims to perform target
+domain adaptation using only the source pre-trained model. We propose a new
+SFOD method for RS images consisting of two parts: perturbed domain generation
+and alignment. The proposed multilevel perturbation constructs the perturbed
+domain in a simple yet efficient form by perturbing the domain-variant
+features at the image level and feature level according to the color and style
+bias. The proposed multilevel alignment calculates feature and label
+consistency between the perturbed domain and the target domain across the
+teacher-student network, and introduces the distillation of feature prototypes
+to mitigate the noise of pseudo-labels. By requiring the detector to be
+consistent in the perturbed domain and the target domain, the detector is
+forced to focus on domain-invariant features. Extensive results on three
+synthetic-to-real and three cross-sensor experiments validate the
+effectiveness of our method, which does not require access to source-domain RS
+images. Furthermore, experiments on computer vision datasets show that our
+method can be extended to other fields as well. Our code will be available at
+https://weixliu.github.io/.
+
+
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ Controllable Dense Captioner with Multimodal Embedding Bridging + + +
+ In this paper, we propose a controllable dense captioner (ControlCap), which
+accommodates the user's intention for dense captioning by introducing
+linguistic guidance. ControlCap is defined as a multimodal embedding bridging
+architecture, which comprises a multimodal embedding generation (MEG) module
+and a bi-directional embedding bridging (BEB) module. While the MEG module
+represents objects/regions by combining embeddings of detailed information
+with context-aware ones, it also endows ControlCap with the adaptability to
+specialized controls by utilizing them as linguistic guidance. The BEB module
+aligns the linguistic guidance with visual embeddings through
+borrowing/returning features from/to the visual domain and gathering such
+features to predict text descriptions. Experiments on the Visual Genome and
+VG-COCO datasets show that ControlCap outperforms the state-of-the-art methods
+by 1.5% and 3.7% mAP, respectively. Last but not least, with the capability of
+converting region-category pairs to region-text pairs, ControlCap is able to
+act as a powerful data engine for dense captioning. Code is available at
+https://github.com/callsys/ControlCap.
+
+
+
+ comment: https://github.com/callsys/ControlCap +
+
+
+
+
+ + ☆ Hi-SAM: Marrying Segment Anything Model for Hierarchical Text + Segmentation + + +
+ The Segment Anything Model (SAM), a profound vision foundation model
+pre-trained on a large-scale dataset, breaks the boundaries of general
+segmentation and sparks various downstream applications. This paper introduces
+Hi-SAM, a unified model leveraging SAM for hierarchical text segmentation.
+Hi-SAM excels in text segmentation across four hierarchies, including stroke,
+word, text-line, and paragraph, while realizing layout analysis as well.
+Specifically, we first turn SAM into a high-quality text stroke segmentation
+(TSS) model through a parameter-efficient fine-tuning approach. We use this
+TSS model to iteratively generate text stroke labels in a semi-automatic
+manner, unifying labels across the four text hierarchies in the HierText
+dataset. Subsequently, with these complete labels, we launch the end-to-end
+trainable Hi-SAM based on the TSS architecture with a customized hierarchical
+mask decoder. During inference, Hi-SAM offers both an automatic mask
+generation (AMG) mode and a promptable segmentation mode. In the AMG mode,
+Hi-SAM first segments text stroke foreground masks, then samples foreground
+points for hierarchical text mask generation and achieves layout analysis in
+passing. In the promptable mode, Hi-SAM provides word, text-line, and
+paragraph masks with a single point click. Experimental results show the
+state-of-the-art performance of our TSS model: 84.86% fgIOU on Total-Text and
+88.96% fgIOU on TextSeg for text stroke segmentation. Moreover, compared to
+the previous specialist for joint hierarchical detection and layout analysis
+on HierText, Hi-SAM achieves significant improvements: 4.73% PQ and 5.39% F1
+on the text-line level, and 5.49% PQ and 7.39% F1 on the paragraph-level
+layout analysis, while requiring 20x fewer training epochs. The code is
+available at https://github.com/ymy-k/Hi-SAM.
+
+
+
+ comment: GitHub repository: https://github.com/ymy-k/Hi-SAM +
+
+
+
+
+
+ ☆ ReplaceAnything3D: Text-Guided 3D Scene Editing with Compositional
+ Neural Radiance Fields
+
+
+ We introduce the ReplaceAnything3D model (RAM3D), a novel text-guided 3D
+scene editing method that enables the replacement of specific objects within a
+scene. Given multi-view images of a scene, a text prompt describing the object
+to replace, and a text prompt describing the new object, our Erase-and-Replace
+approach can effectively swap objects in the scene with newly generated
+content while maintaining 3D consistency across multiple viewpoints. We
+demonstrate the versatility of ReplaceAnything3D by applying it to various
+realistic 3D scenes, showcasing results of modified foreground objects that
+are well-integrated with the rest of the scene without affecting its overall
+integrity.
+
+
+
+ comment: For our project page, see https://replaceanything3d.github.io/ +
+
+
+
+
+ + ☆ Reimagining Reality: A Comprehensive Survey of Video Inpainting + Techniques + + +
+ This paper offers a comprehensive analysis of recent advancements in video +inpainting techniques, a critical subset of computer vision and artificial +intelligence. As a process that restores or fills in missing or corrupted +portions of video sequences with plausible content, video inpainting has +evolved significantly with the advent of deep learning methodologies. Despite +the plethora of existing methods and their swift development, the landscape +remains complex, posing challenges to both novices and established researchers. +Our study deconstructs major techniques, their underpinning theories, and their +effective applications. Moreover, we conduct an exhaustive comparative study, +centering on two often-overlooked dimensions: visual quality and computational +efficiency. We adopt a human-centric approach to assess visual quality, +enlisting a panel of annotators to evaluate the output of different video +inpainting techniques. This provides a nuanced qualitative understanding that +complements traditional quantitative metrics. Concurrently, we delve into the +computational aspects, comparing inference times and memory demands across a +standardized hardware setup. This analysis underscores the balance between +quality and efficiency: a critical consideration for practical applications +where resources may be constrained. By integrating human validation and +computational resource comparison, this survey not only clarifies the present +landscape of video inpainting techniques but also charts a course for future +explorations in this vibrant and evolving field. + +
+
+
+
+
+ + ☆ PVLR: Prompt-driven Visual-Linguistic Representation Learning for + Multi-Label Image Recognition + + +
+ Multi-label image recognition is a fundamental task in computer vision. +Recently, vision-language models have made notable advancements in this area. +However, previous methods often failed to effectively leverage the rich +knowledge within language models and instead incorporated label semantics into +visual features in a unidirectional manner. In this paper, we propose a +Prompt-driven Visual-Linguistic Representation Learning (PVLR) framework to +better leverage the capabilities of the linguistic modality. In PVLR, we first +introduce a dual-prompting strategy comprising Knowledge-Aware Prompting (KAP) +and Context-Aware Prompting (CAP). KAP utilizes fixed prompts to capture the +intrinsic semantic knowledge and relationships across all labels, while CAP +employs learnable prompts to capture context-aware label semantics and +relationships. Later, we propose an Interaction and Fusion Module (IFM) to +interact and fuse the representations obtained from KAP and CAP. In contrast to +the unidirectional fusion in previous works, we introduce a Dual-Modal +Attention (DMA) that enables bidirectional interaction between textual and +visual features, yielding context-aware label representations and +semantic-related visual representations, which are subsequently used to +calculate similarities and generate final predictions for all labels. Extensive +experiments on three popular datasets including MS-COCO, Pascal VOC 2007, and +NUS-WIDE demonstrate the superiority of PVLR. + +
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ☆ AEROBLADE: Training-Free Detection of Latent Diffusion Images Using + Autoencoder Reconstruction Error + + +
+ With recent text-to-image models, anyone can generate deceptively realistic +images with arbitrary contents, fueling the growing threat of visual +disinformation. A key enabler for generating high-resolution images with low +computational cost has been the development of latent diffusion models (LDMs). +In contrast to conventional diffusion models, LDMs perform the denoising +process in the low-dimensional latent space of a pre-trained autoencoder (AE) +instead of the high-dimensional image space. Despite their relevance, the +forensic analysis of LDMs is still in its infancy. In this work we propose +AEROBLADE, a novel detection method which exploits an inherent component of +LDMs: the AE used to transform images between image and latent space. We find +that generated images can be more accurately reconstructed by the AE than real +images, allowing for a simple detection approach based on the reconstruction +error. Most importantly, our method is easy to implement and does not require +any training, yet nearly matches the performance of detectors that rely on +extensive training. We empirically demonstrate that AEROBLADE is effective +against state-of-the-art LDMs including Stable Diffusion and Midjourney. Beyond +detection, our approach allows for the qualitative analysis of images, which +can be leveraged for identifying inpainted regions. + +
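+ A minimal version of the reconstruction-error test, assuming one public
+Stable Diffusion VAE from the diffusers library (not necessarily the
+checkpoints, preprocessing, or the perceptual distance used in the paper),
+could look like this:
+
+<pre><code class="language-python">
+import torch
+from diffusers import AutoencoderKL
+
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").eval()
+
+@torch.no_grad()
+def reconstruction_error(image):  # image: (1, 3, H, W), values in [-1, 1]
+    latents = vae.encode(image).latent_dist.mode()
+    recon = vae.decode(latents).sample
+    # LDM-generated images tend to round-trip through the AE with lower
+    # error than real photographs; thresholding this score gives a
+    # simple training-free detector.
+    return (image - recon).abs().mean().item()
+</code></pre>
+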
+
+
+
+
+ + ☆ VR-based generation of photorealistic synthetic data for training + hand-object tracking models + + +
+ Supervised learning models for precise tracking of hand-object interactions
+(HOI) in 3D require large amounts of annotated data for training. Moreover, it
+is not intuitive for non-experts to label 3D ground truth (e.g., 6DoF object
+pose) on 2D images. To address these issues, we present "blender-hoisynth", an
+interactive synthetic data generator based on the Blender software.
+Blender-hoisynth can scalably generate and automatically annotate visual HOI
+training data. Other competing approaches usually generate synthetic HOI data
+completely without human input. While this may be beneficial in some
+scenarios, HOI applications inherently necessitate direct control over the
+HOIs as an expression of human intent. With blender-hoisynth, it is possible
+for users to interact with objects via virtual hands using standard Virtual
+Reality hardware. The synthetically generated data are characterized by a high
+degree of photorealism and contain visually plausible and physically realistic
+videos of hands grasping objects and moving them around in 3D. To demonstrate
+the efficacy of our data generation, we replace large parts of the training
+data in the well-known DexYCB dataset with hoisynth data and train a
+state-of-the-art HOI reconstruction model with it. We show that there is no
+significant degradation in the model performance despite the data replacement.
+
+
+
+
+
+
+ + ☆ Convolution Meets LoRA: Parameter Efficient Finetuning for Segment + Anything Model ICLR 2024 + + +
+ The Segment Anything Model (SAM) stands as a foundational framework for image +segmentation. While it exhibits remarkable zero-shot generalization in typical +scenarios, its advantage diminishes when applied to specialized domains like +medical imagery and remote sensing. To address this limitation, this paper +introduces Conv-LoRA, a simple yet effective parameter-efficient fine-tuning +approach. By integrating ultra-lightweight convolutional parameters into +Low-Rank Adaptation (LoRA), Conv-LoRA can inject image-related inductive biases +into the plain ViT encoder, further reinforcing SAM's local prior assumption. +Notably, Conv-LoRA not only preserves SAM's extensive segmentation knowledge +but also revives its capacity of learning high-level image semantics, which is +constrained by SAM's foreground-background segmentation pretraining. +Comprehensive experimentation across diverse benchmarks spanning multiple +domains underscores Conv-LoRA's superiority in adapting SAM to real-world +semantic segmentation tasks. + +
+
+ comment: Accepted at ICLR 2024 Conference +
+
+
+
+
+ + ☆ Proximity QA: Unleashing the Power of Multi-Modal Large Language Models + for Spatial Proximity Analysis + + +
+ Multi-modal large language models (MLLMs) have demonstrated remarkable
+vision-language capabilities, primarily due to the exceptional in-context
+understanding and multi-task learning strengths of large language models
+(LLMs). The advent of visual instruction tuning has further enhanced MLLMs'
+performance in vision-language understanding. However, while existing MLLMs
+adeptly recognize \textit{what} objects are in an image, they still face
+challenges in effectively discerning \textit{where} these objects are,
+particularly along the distance (scene depth) axis. To overcome this
+limitation in MLLMs, we introduce Proximity Question Answering (Proximity QA),
+a novel framework designed to enable MLLMs to infer the proximity relationship
+between objects in images. The framework operates in two phases: the first
+phase focuses on guiding the models to understand the relative depth of
+objects, and the second phase further encourages the models to infer the
+proximity relationships between objects based on their depth perceptions. We
+also propose a VQA dataset called Proximity-110K, containing additional
+instructions that incorporate depth information and the proximity
+relationships of objects. We have conducted extensive experiments to validate
+Proximity QA's superior ability in depth perception and proximity analysis,
+outperforming other state-of-the-art MLLMs. Code and dataset will be released
+at https://github.com/NorthSummer/ProximityQA.git.
+
+
+
+ comment: 15 pages, version 1
+
+
+
+
+
+ + ☆ Semantic Anything in 3D Gaussians + + +
+ 3D Gaussian Splatting has emerged as an alternative 3D representation to
+Neural Radiance Fields (NeRFs), benefiting from its high-quality rendering
+results and real-time rendering speed. Because the 3D Gaussian representation
+remains unparsed, object segmentation must first be performed within this
+domain. Subsequently, scene editing and collision detection can be performed,
+which is vital to a multitude of applications, such as virtual reality (VR),
+augmented reality (AR), game/movie production, etc. In this paper, we propose
+a novel approach to achieve object segmentation in 3D Gaussians via an
+interactive procedure without any training process or learned parameters. We
+refer to the proposed method as SA-GS, for Segment Anything in 3D Gaussians.
+Given a set of clicked points in a single input view, SA-GS can generalize SAM
+to achieve 3D consistent segmentation via the proposed multi-view mask
+generation and view-wise label assignment methods. We also propose a
+cross-view label-voting approach to assign labels from different views. In
+addition, to address the boundary roughness of segmented objects caused by the
+non-negligible spatial sizes of the 3D Gaussians located at the boundary,
+SA-GS incorporates a simple but effective Gaussian Decomposition scheme.
+Extensive experiments demonstrate that SA-GS achieves high-quality 3D
+segmentation results, which can also be easily applied to scene editing and
+collision detection tasks. Codes will be released soon.
+
+
+
+
+
+
+ + ☆ Instruction-Guided Scene Text Recognition + + +
+ Multi-modal models have shown appealing performance in visual tasks
+recently, as instruction-guided training has evoked the ability to understand
+fine-grained visual content. However, current methods cannot be trivially
+applied to scene text recognition (STR) due to the gap between natural and
+text images. In this paper, we introduce a novel paradigm that formulates STR
+as an instruction learning problem, and propose instruction-guided scene text
+recognition (IGTR) to achieve effective cross-modal learning. IGTR first
+generates rich and diverse instruction triplets of ⟨condition, question,
+answer⟩, serving as guidance for nuanced text image understanding. Then, we
+devise an architecture with a dedicated cross-modal feature fusion module and
+a multi-task answer head to effectively fuse the required instruction and
+image features for answering questions. Built upon these designs, IGTR
+facilitates accurate text recognition by comprehending character attributes.
+Experiments on English and Chinese benchmarks show that IGTR outperforms
+existing models by significant margins. Furthermore, by adjusting the
+instructions, IGTR enables various recognition schemes. These include
+zero-shot prediction, where the model is trained on instructions that do not
+explicitly target character recognition, and the recognition of rarely
+appearing and morphologically similar characters, which were previously
+challenging for existing models.
+
+
+
+
+
+
+ + ☆ Leveraging Swin Transformer for Local-to-Global Weakly Supervised + Semantic Segmentation + + +
+ In recent years, weakly supervised semantic segmentation using image-level
+labels as supervision has received significant attention in the field of
+computer vision. Most existing methods have addressed the challenges arising
+from the lack of spatial information in these labels by focusing on
+facilitating supervised learning through the generation of pseudo-labels from
+class activation maps (CAMs). Due to the localized pattern detection of
+Convolutional Neural Networks (CNNs), CAMs often emphasize only the most
+discriminative parts of an object, making it challenging to accurately
+distinguish foreground objects from each other and the background. Recent
+studies have shown that Vision Transformer (ViT) features, due to their global
+view, are more effective in capturing the scene layout than CNNs. However, the
+use of hierarchical ViTs has not been extensively explored in this field. This
+work explores the use of the Swin Transformer by proposing "SWTformer" to
+enhance the accuracy of the initial seed CAMs by bringing local and global
+views together. SWTformer-V1 generates class probabilities and CAMs using only
+the patch tokens as features. SWTformer-V2 incorporates a multi-scale feature
+fusion mechanism to extract additional information and utilizes a
+background-aware mechanism to generate more accurate localization maps with
+improved cross-object discrimination. On the PascalVOC 2012 dataset,
+SWTformer-V1 achieves localization accuracy 0.98% mAP higher than
+state-of-the-art models, and, while depending only on the classification
+network, it generates initial localization maps that are on average 0.82% mIoU
+better than those of other methods. SWTformer-V2 improves the accuracy of the
+generated seed CAMs by a further 5.32% mIoU, proving the effectiveness of the
+local-to-global view provided by the Swin transformer.
+
+
+
+ comment: 7 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Do Object Detection Localization Errors Affect Human Performance and + Trust? + + +
+ Bounding boxes are often used to communicate automatic object detection
+results to humans, aiding humans in a multitude of tasks. We investigate the
+relationship between bounding box localization errors and human task
+performance. We use observer performance studies on a visual multi-object
+counting task to measure both human trust and performance with different
+levels of bounding box accuracy. The results show that localization errors
+have no significant impact on human accuracy or trust in the system. Recall
+and precision errors impact both human performance and trust, suggesting that
+optimizing algorithms based on the F1 score is more beneficial in
+human-computer tasks. Lastly, we show that replacing bounding boxes with
+center dots in multi-object counting tasks improves performance and resilience
+to localization inaccuracy.
+
+
+
+
+
+
+ + ☆ Advances in 3D Generation: A Survey + + +
+ Generating 3D models lies at the core of computer graphics and has been the +focus of decades of research. With the emergence of advanced neural +representations and generative models, the field of 3D content generation is +developing rapidly, enabling the creation of increasingly high-quality and +diverse 3D models. The rapid growth of this field makes it difficult to stay +abreast of all recent developments. In this survey, we aim to introduce the +fundamental methodologies of 3D generation methods and establish a structured +roadmap, encompassing 3D representation, generation methods, datasets, and +corresponding applications. Specifically, we introduce the 3D representations +that serve as the backbone for 3D generation. Furthermore, we provide a +comprehensive overview of the rapidly growing literature on generation methods, +categorized by the type of algorithmic paradigms, including feedforward +generation, optimization-based generation, procedural generation, and +generative novel view synthesis. Lastly, we discuss available datasets, +applications, and open challenges. We hope this survey will help readers +explore this exciting topic and foster further advancements in the field of 3D +content generation. + +
+
+ comment: 33 pages, 12 figures +
+
+
+
+
+ + ☆ SimAda: A Simple Unified Framework for Adapting Segment Anything Model + in Underperformed Scenes + + +
+ The Segment Anything Model (SAM) has demonstrated excellent generalization
+capabilities in common vision scenarios, yet it lacks an understanding of
+specialized data. Although numerous works have focused on optimizing SAM for
+downstream tasks, these task-specific approaches usually limit its
+generalizability to other downstream tasks. In this paper, we investigate the
+impact of general vision modules on fine-tuning SAM and aim to enable it to
+generalize across all downstream tasks. We propose a simple unified framework
+called SimAda for adapting SAM in underperformed scenes. Specifically, our
+framework abstracts the general modules of different methods into basic design
+elements, and we design four variants based on a shared theoretical framework.
+SimAda is simple yet effective: it removes all dataset-specific designs and
+focuses solely on general optimization, ensuring that SimAda can be applied to
+all SAM-based and even Transformer-based models. We conduct extensive
+experiments on nine datasets covering six downstream tasks. The results
+demonstrate that SimAda significantly improves the performance of SAM on
+multiple downstream tasks and achieves state-of-the-art performance on most of
+them, without requiring task-specific designs. Code is available at:
+https://github.com/zongzi13545329/SimAda
+
+
+
+
+
+
+ + ☆ M2-RAAP: A Multi-Modal Recipe for Advancing Adaptation-based + Pre-training towards Effective and Efficient Zero-shot Video-text Retrieval + + +
+ We present a Multi-Modal Recipe for Advancing Adaptation-based Pre-training
+towards effective and efficient zero-shot video-text retrieval, dubbed
+M2-RAAP. Built upon popular image-text models like CLIP, most current
+adaptation-based video-text pre-training methods are confronted by three major
+issues, i.e., noisy data corpora, time-consuming pre-training, and limited
+performance gains. To this end, we conduct a comprehensive study covering four
+critical steps in video-text pre-training. Specifically, we investigate 1)
+data filtering and refinement, 2) video input type selection, 3) temporal
+modeling, and 4) video feature enhancement. We then summarize this empirical
+study into the M2-RAAP recipe, where our technical contributions lie in 1) a
+data filtering and text re-writing pipeline resulting in 1M high-quality
+bilingual video-text pairs, 2) the replacement of video inputs with key-frames
+to accelerate pre-training, and 3) the Auxiliary-Caption-Guided (ACG) strategy
+to enhance video features. We conduct extensive experiments by adapting three
+image-text foundation models on two refined video-text datasets from different
+languages, validating the robustness and reproducibility of M2-RAAP for
+adaptation-based pre-training. Results demonstrate that M2-RAAP yields
+superior performance with significantly reduced data (-90%) and time
+consumption (-95%), establishing a new SOTA on four English zero-shot
+retrieval datasets and two Chinese ones. We are preparing our refined
+bilingual data annotations and codebase, which will be available at
+https://github.com/alipay/Ant-Multi-Modal-Framework/tree/main/prj/M2_RAAP.
+
+
+
+
+
+
+ + ☆ RADIN: Souping on a Budget + + +
+ Model Soups, extending Stochastic Weight Averaging (SWA), combine models
+fine-tuned with different hyperparameters. Yet, their adoption is hindered by
+the computational cost of subset selection. In this paper, we propose to speed
+up model soups by approximating a soup's performance with the performance of
+its averaged ensemble logits. Theoretical insights validate the congruence
+between ensemble logits and weight-averaging soups across any mixing ratio.
+Our Resource ADjusted soups craftINg (RADIN) procedure stands out by allowing
+flexible evaluation budgets, enabling users to adapt their exploration budget
+to the available resources while increasing performance at lower budgets
+compared to the previous greedy approach (up to 4% on ImageNet).
+
+
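+ The core approximation can be sketched as ranking candidate soups by the
+accuracy of their averaged ensemble logits on a validation set, instead of
+evaluating each weight-averaged model; the snippet below assumes per-model
+logits have been precomputed:
+
+<pre><code class="language-python">
+import torch
+
+def ensemble_accuracy(logits_list, labels):
+    """logits_list: list of (n_val, n_classes) tensors, one per member."""
+    avg = torch.stack(logits_list).mean(dim=0)  # averaged ensemble logits
+    return (avg.argmax(dim=1) == labels).float().mean().item()
+
+# Subsets whose logit ensembles score well are the ones worth turning into
+# actual soups (i.e., averaging in weight space), which avoids a separate
+# forward pass over the validation set for every candidate subset.
+</code></pre>
+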
+
+
+
+
+ + ☆ Robustly overfitting latents for flexible neural image compression + + +
+ Neural image compression has made a great deal of progress. State-of-the-art
+models are based on variational autoencoders and outperform classical models.
+Neural compression models learn to encode an image into a quantized latent
+representation that can be efficiently sent to the decoder, which decodes the
+quantized latent into a reconstructed image. While these models have proven
+successful in practice, they lead to sub-optimal results due to imperfect
+optimization and limitations in the encoder and decoder capacity. Recent work
+shows how to use stochastic Gumbel annealing (SGA) to refine the latents of
+pre-trained neural image compression models. We extend this idea by
+introducing SGA+, which contains three different methods that build upon SGA.
+Further, we give a detailed analysis of our proposed methods, show how they
+improve performance, and show that they are less sensitive to hyperparameter
+choices. In addition, we show how each method can be extended from two-class
+to three-class rounding. Finally, we show how refining the latents with our
+best-performing method improves compression performance on the Tecnick dataset
+and how it can be deployed to partly move along the rate-distortion curve.
+
+
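+ A generic sketch of latent refinement for a pretrained compression model:
+treat the latents as free parameters and minimize a rate-distortion objective.
+Here the rounding step is caricatured with a straight-through estimator,
+whereas SGA/SGA+ use a stochastic annealed relaxation; decode and rate stand
+in for the model's decoder and rate model:
+
+<pre><code class="language-python">
+import torch
+
+def refine_latents(y, decode, rate, x, lam=0.01, steps=500, lr=1e-3):
+    """y: initial latents from the encoder; x: the image to compress."""
+    y = y.clone().requires_grad_(True)
+    opt = torch.optim.Adam([y], lr=lr)
+    for _ in range(steps):
+        # straight-through rounding: quantized forward, identity backward
+        y_hat = y + (torch.round(y) - y).detach()
+        distortion = ((decode(y_hat) - x) ** 2).mean()
+        loss = distortion + lam * rate(y_hat)  # rate-distortion trade-off
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    return torch.round(y).detach()  # final quantized latents to transmit
+</code></pre>
+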
+
+
+
+
+ + ☆ Double InfoGAN for Contrastive Analysis AISTATS 2024 + + +
+ Contrastive Analysis (CA) deals with the discovery of what is common and what
+is distinctive of a target domain compared to a background one. This is of
+great interest in many applications, such as medical imaging. Current
+state-of-the-art (SOTA) methods are latent variable models based on VAEs
+(CA-VAEs). However, they all either ignore important constraints or fail to
+enforce fundamental assumptions. This may lead to sub-optimal solutions where
+distinctive factors are mistaken for common ones (or vice versa). Furthermore,
+the generated images have the rather poor quality typical of VAEs, decreasing
+their interpretability and usefulness. Here, we propose Double InfoGAN, the
+first GAN-based method for CA that leverages the high-quality synthesis of
+GANs and the separation power of InfoGAN. Experimental results on four visual
+datasets, from simple synthetic examples to complex medical images, show that
+the proposed method outperforms SOTA CA-VAEs in terms of latent separation and
+image quality. Datasets and code are available online.
+
+
+ comment: Accepted at AISTATS 2024 +
+
+
+
+
+ + ☆ SNP-S3: Shared Network Pre-training and Significant Semantic + Strengthening for Various Video-Text Tasks + + +
+ We present a framework for learning cross-modal video representations by
+directly pre-training on raw data to facilitate various downstream video-text
+tasks. Our main contributions lie in the pre-training framework and proxy
+tasks. First, based on the shortcomings of two mainstream pixel-level
+pre-training architectures (limited applicability or low efficiency), we
+propose Shared Network Pre-training (SNP). By employing one shared BERT-type
+network to refine textual and cross-modal features simultaneously, SNP is
+lightweight and can support various downstream applications. Second, based on
+the intuition that people always pay attention to several "significant words"
+when understanding a sentence, we propose the Significant Semantic
+Strengthening (S3) strategy, which includes a novel masking and matching proxy
+task to promote the pre-training performance. Experiments conducted on three
+downstream video-text tasks and six datasets demonstrate that we establish a
+new state of the art in pixel-level video-text pre-training while achieving a
+satisfactory balance between pre-training efficiency and fine-tuning
+performance. The codebase is available at
+https://github.com/alipay/Ant-Multi-Modal-Framework/tree/main/prj/snps3_vtp.
+
+
+ comment: Accepted by TCSVT (IEEE Transactions on Circuits and Systems for + Video Technology) +
+
+
+
+
+ + ☆ Fine-Grained Zero-Shot Learning: Advances, Challenges, and Prospects + + +
+ Recent zero-shot learning (ZSL) approaches have integrated fine-grained
+analysis, i.e., fine-grained ZSL, to mitigate the commonly known seen/unseen
+domain bias and misaligned visual-semantics mapping problems, and have made
+profound progress. Notably, this paradigm differs from existing closed-set
+fine-grained methods and, therefore, can pose unique and nontrivial
+challenges. However, to the best of our knowledge, there remains a lack of
+systematic summaries of this topic. To enrich the literature of this domain
+and provide a sound basis for its future development, in this paper, we
+present a broad review of recent advances in fine-grained analysis for ZSL.
+Concretely, we first provide a taxonomy of existing methods and techniques
+with a thorough analysis of each category. Then, we summarize the benchmark,
+covering publicly available datasets, models, implementations, and further
+details, organized as a library. Last, we sketch out some related
+applications. In addition, we discuss vital challenges and suggest potential
+future directions.
+
+
+ comment: 11 pages, 1 figure, 4 tables +
+
+
+
+
+ + ☆ Tiered approach for rapid damage characterisation of infrastructure + enabled by remote sensing and deep learning technologies + + +
+ Critical infrastructure such as bridges is systematically targeted during
+wars and conflicts. This is because critical infrastructure is vital for
+enabling connectivity and transportation of people and goods, and hence
+underpins national and international defence planning and economic growth.
+Mass destruction of bridges, along with minimal or no accessibility to these
+assets during natural and anthropogenic disasters, prevents us from delivering
+rapid recovery. As a result, systemic resilience is drastically reduced. A
+solution to this challenge is to use technology for stand-off observations.
+Yet, no method exists to characterise damage at different scales, i.e.,
+regional, asset, and structural (component) levels, and there is little or no
+systematic correlation between assessments across these scales. We propose an
+integrated three-level tiered approach to fill this capability gap, and we
+demonstrate the methods for damage characterisation enabled by fit-for-purpose
+digital technologies. This method is then applied to and validated on a case
+study in Ukraine that includes 17 bridges. From macro to micro, we deploy
+technology at scale, from Sentinel-1 SAR images, crowdsourced information, and
+high-resolution images to deep learning for damaged infrastructure. For the
+first time, the interferometric coherence difference and semantic segmentation
+of images were deployed to improve the reliability of damage characterisations
+from the regional to the infrastructure component level, when enhanced
+assessment accuracy is required. This integrated method improves the speed of
+decision-making and thus enhances resilience. Keywords: critical
+infrastructure, damage characterisation, targeted attacks, restoration
+
+
+ comment: Main text (34 pages, 18 figures); Supplementary materials (13 pages)
+
+
+
+
+ + ☆ Leveraging Human-Machine Interactions for Computer Vision Dataset + Quality Enhancement + + +
+ Large-scale datasets for single-label multi-class classification, such as
+\emph{ImageNet-1k}, have been instrumental in advancing deep learning and
+computer vision. However, a critical and often understudied aspect is the
+comprehensive quality assessment of these datasets, especially regarding
+potential multi-label annotation errors. In this paper, we introduce a
+lightweight, user-friendly, and scalable framework that synergizes human and
+machine intelligence for efficient dataset validation and quality enhancement.
+We term this novel framework \emph{Multilabelfy}. Central to Multilabelfy is an
+adaptable web-based platform that systematically guides annotators through the
+re-evaluation process, effectively leveraging human-machine interactions to
+enhance dataset quality. By using Multilabelfy on the ImageNetV2 dataset, we
+found that approximately $47.88\%$ of the images contained at least two labels,
+underscoring the need for more rigorous assessments of such influential
+datasets. Furthermore, our analysis showed a negative correlation between the
+number of potential labels per image and model top-1 accuracy, illuminating a
+crucial factor in model evaluation and selection. Our open-source framework,
+Multilabelfy, offers a convenient, lightweight solution for dataset
+enhancement, emphasizing multi-label proportions. This study tackles major
+challenges in dataset integrity and provides key insights into model
+performance evaluation. Moreover, it underscores the advantages of integrating
+human expertise with machine capabilities to produce more robust models and
+more trustworthy data. The source code for Multilabelfy will be available at
+https://github.com/esla/Multilabelfy.
+ \keywords{Computer Vision \and Dataset Quality Enhancement \and Dataset
+Validation \and Human-Computer Interaction \and Multi-label Annotation.}
+
+
+
+
+
+ + ☆ COMET: Contrastive Mean Teacher for Online Source-Free Universal Domain + Adaptation + + +
+ In real-world applications, there is often a domain shift from training to
+test data. This observation has resulted in the development of test-time
+adaptation (TTA), which aims to adapt a pre-trained source model to the test
+data without requiring access to the source data. However, most existing works
+are limited to the closed-set assumption, i.e., there is no category shift
+between the source and target domains. We argue that in a realistic open-world
+setting, a category shift can appear in addition to a domain shift. This means
+that individual source classes may no longer appear in the target domain,
+samples of new classes may be part of the target domain, or both at the same
+time. Moreover, in many real-world scenarios the test data is not accessible
+all at once but arrives sequentially as a stream of batches demanding an
+immediate prediction, so TTA must be applied in an online manner. To the best
+of our knowledge, the combination of these aspects, i.e., online source-free
+universal domain adaptation (online SF-UniDA), has not been studied yet. In
+this paper, we introduce a Contrastive Mean Teacher (COMET) tailored to this
+novel scenario. It applies a contrastive loss to rebuild a feature space where
+the samples of known classes build distinct clusters and the samples of new
+classes separate well from them. It is complemented by an entropy loss which
+ensures that the classifier output has a small entropy for samples of known
+classes and a large entropy for samples of new classes, so that the latter are
+easily detected and rejected as unknown. To provide the losses with reliable
+pseudo-labels, they are embedded into a mean teacher (MT) framework. We
+evaluate our method across two datasets and all category shifts to set an
+initial benchmark for online SF-UniDA. COMET yields state-of-the-art
+performance and proves to be consistent and robust across a variety of
+different scenarios.
+
+
+
+
+
+ + ☆ 3D-Plotting Algorithm for Insects using YOLOv5 + + +
+ In ecological research, accurately collecting spatiotemporal position data is
+a fundamental task for understanding the behavior and ecology of insects and
+other organisms. In recent years, advancements in computer vision techniques
+have reached a stage of maturity where they can support, and in some cases
+replace, manual observation. In this study, a simple and inexpensive method
+for monitoring insects in three dimensions (3D) was developed so that their
+behavior could be observed automatically in experimental environments. The
+main achievements of this study are a 3D monitoring algorithm built from
+inexpensive cameras and other equipment, an adjustment algorithm for depth
+error, and a quantitative validation of the plotting algorithm's precision,
+none of which had been realized in previous studies. By offering detailed 3D
+visualizations of insects, the plotting algorithm aids researchers in more
+effectively comprehending how insects interact within their environments.
+
+
+
+
+
+ + ☆ Unified Physical-Digital Face Attack Detection + + +
+ Face Recognition (FR) systems can suffer from physical (e.g., print photo)
+and digital (e.g., DeepFake) attacks. However, previous related work rarely
+considers both situations at the same time, which implies deploying multiple
+models and thus a higher computational burden. This lack of an integrated
+model stems from two factors: (1) the absence of a dataset including both
+physical and digital attacks with ID consistency, meaning that the same ID
+covers the real face and all attack types; (2) given the large intra-class
+variance between these two attack types, it is difficult to learn a compact
+feature space that detects both attacks simultaneously. To address these
+issues, we collect a Unified physical-digital Attack dataset, called
+UniAttackData. The dataset covers $1,800$ participants, each with 2 physical
+and 12 digital attack types, resulting in a total of 29,706 videos. We then
+propose a Unified Attack Detection framework based on Vision-Language Models
+(VLMs), namely UniAttackDetection, which includes three main modules: the
+Teacher-Student Prompts (TSP) module, focused on acquiring unified and
+specific knowledge, respectively; the Unified Knowledge Mining (UKM) module,
+designed to capture a comprehensive feature space; and the Sample-Level Prompt
+Interaction (SLPI) module, aimed at grasping sample-level semantics. These
+three modules seamlessly form a robust unified attack detection framework.
+Extensive experiments on UniAttackData and three other datasets demonstrate
+the superiority of our approach for unified face attack detection.
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Datacube segmentation via Deep Spectral Clustering + + +
+ Extended Vision techniques are ubiquitous in physics. However, the data cubes
+stemming from such analyses often pose a challenge in their interpretation,
+due to the intrinsic difficulty in discerning the relevant information from
+the spectra composing the data cube.
+ Furthermore, the huge dimensionality of data cube spectra makes their
+statistical interpretation complex; nevertheless, this complexity contains a
+massive amount of statistical information that can be exploited in an
+unsupervised manner to outline some essential properties of the case study at
+hand, e.g.~it is possible to obtain an image segmentation via (deep) clustering
+of the data cube's spectra, performed in a suitably defined low-dimensional
+embedding space.
+ To tackle this topic, we explore the possibility of applying unsupervised
+clustering methods in encoded space, i.e., performing deep clustering on the
+spectral properties of datacube pixels. A statistical dimensional reduction is
+performed by an ad hoc trained (Variational) AutoEncoder, in charge of mapping
+spectra into lower-dimensional metric spaces, while the clustering process is
+performed by a (learnable) iterative K-Means clustering algorithm.
+ We apply this technique to two different use cases, of different physical
+origins: a set of Macro mapping X-Ray Fluorescence (MA-XRF) synthetic data on
+pictorial artworks, and a dataset of simulated astrophysical observations.
+
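+ A minimal sketch of the pipeline described above, assuming a datacube
+flattened to per-pixel spectra; scikit-learn's K-Means stands in for the
+learnable iterative K-Means of the paper, and the (variational) autoencoder is
+reduced to a bare MLP encoder for brevity:
+
+import torch
+import torch.nn as nn
+from sklearn.cluster import KMeans
+
+class SpectralEncoder(nn.Module):
+    """Toy encoder mapping spectra (n_channels) to a low-dim embedding."""
+    def __init__(self, n_channels: int, latent_dim: int = 8):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(n_channels, 64), nn.ReLU(), nn.Linear(64, latent_dim))
+
+    def forward(self, x):
+        return self.net(x)
+
+def segment_datacube(cube, encoder, n_clusters=5):
+    """cube: (H, W, n_channels) tensor of per-pixel spectra -> (H, W) labels."""
+    h, w, c = cube.shape
+    with torch.no_grad():
+        z = encoder(cube.reshape(-1, c)).numpy()  # embed every spectrum
+    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(z)
+    return labels.reshape(h, w)                   # the segmentation map
+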
+
+ comment: 20 pages, 10 figures, doi for code repository, dataset and trained + model available and reported in the paper +
+
+
+
+
+ + ☆ Image Anything: Towards Reasoning-coherent and Training-free Multi-modal + Image Generation + + +
+ The multifaceted nature of human perception and comprehension indicates that,
+when we think, our mind can naturally take in any combination of senses,
+a.k.a. modalities, and form a beautiful picture in our brain. For example,
+when we see a cattery and simultaneously perceive the cat's purring sound, our
+brain can construct a picture of a cat in the cattery. Intuitively, generative
+AI models should hold the versatility of humans and be capable of generating
+images from any combination of modalities efficiently and collaboratively.
+This paper presents ImgAny, a novel end-to-end multi-modal generative model
+that can mimic human reasoning and generate high-quality images. Our method is
+the first capable of efficiently and flexibly taking any combination of seven
+modalities, ranging from language and audio to vision modalities, including
+image, point cloud, thermal, depth, and event data. Our key idea is inspired
+by human-level cognitive processes and involves the integration and
+harmonization of multiple input modalities at both the entity and attribute
+levels without specific tuning across modalities. Accordingly, our method
+brings two novel training-free technical branches: 1) Entity Fusion Branch
+ensures the coherence between inputs and outputs. It extracts entity features
+from the multi-modal representations powered by our specially constructed
+entity knowledge graph; 2) Attribute Fusion Branch adeptly preserves and
+processes the attributes. It efficiently amalgamates distinct attributes from
+diverse input modalities via our proposed attribute knowledge graph. Lastly,
+the entity and attribute features are adaptively fused as the conditional
+inputs to the pre-trained Stable Diffusion model for image generation.
+Extensive experiments under diverse modality combinations demonstrate its
+exceptional capability for visual content creation.
+
+
+
+
+
+ + ☆ All Beings Are Equal in Open Set Recognition AAAI + + +
+ In open-set recognition (OSR), a promising strategy is exploiting
+pseudo-unknown data outside the given $K$ known classes as an additional
+$K$+$1$-th class to explicitly model potential open space. However, treating
+unknown classes without distinction puts them on an unequal footing relative
+to known classes, due to the category- and scale-agnostic nature of the
+unknowns. This not only disrupts the inherent distributions of unknown classes
+but also incurs both class-wise and instance-wise imbalances between known and
+unknown classes. Ideally, the OSR problem should model the whole class space
+as $K$+$\infty$, but enumerating all unknowns is impractical. Since the core
+of OSR is to effectively model the boundaries of known classes, focusing just
+on the unknowns nearing the boundaries of the targeted known classes seems
+sufficient. Thus, as a compromise, we convert the open classes from infinite
+to $K$ with the novel concept of a Target-Aware Universum (TAU), and propose a
+simple yet effective framework, Dual Contrastive Learning with Target-Aware
+Universum (DCTAU). In detail, guided by the targeted known classes, TAU
+automatically expands the unknown classes from the previous $1$ to $K$,
+effectively alleviating the distribution disruption and imbalance issues
+mentioned above. Then, a novel Dual Contrastive (DC) loss is designed, where
+all instances, whether known or TAU, are treated as positives to contrast with
+their respective negatives. Experimental results indicate that DCTAU sets a
+new state of the art.
+
+
+ comment: Accepted by the main track The 38th Annual AAAI Conference on + Artificial Intelligence (AAAI 2024) +
+
+
+
+
+ + ☆ Exploring the Common Appearance-Boundary Adaptation for Nighttime + Optical Flow + + +
+ We investigate a challenging task of nighttime optical flow, which suffers
+from weakened texture and amplified noise. These degradations weaken
+discriminative visual features, thus causing invalid motion feature matching.
+Typically, existing methods employ domain adaptation to transfer knowledge
+from an auxiliary domain to the nighttime domain, in either the input visual
+space or the output motion space. However, this direct adaptation is
+ineffective, since there exists a large domain gap due to the intrinsically
+heterogeneous nature of the feature representations between the auxiliary and
+nighttime domains. To overcome this issue, we explore a common-latent space as
+the intermediate bridge to reinforce the feature alignment between the
+auxiliary and nighttime domains. In this work, we exploit two auxiliary
+daytime and event domains, and propose a novel common appearance-boundary
+adaptation framework for nighttime optical flow. In appearance adaptation, we
+employ intrinsic image decomposition to embed the auxiliary daytime image and
+the nighttime image into a reflectance-aligned common space. We discover that
+the motion distributions of the two reflectance maps are very similar,
+enabling us to consistently transfer motion appearance knowledge from the
+daytime to the nighttime domain. In boundary adaptation, we theoretically
+derive the motion correlation formula between the nighttime image and
+accumulated events within a spatiotemporal gradient-aligned common space. We
+find that the correlation of the two spatiotemporal gradient maps exhibits a
+significant discrepancy, enabling us to contrastively transfer boundary
+knowledge from the event to the nighttime domain. Moreover, appearance
+adaptation and boundary adaptation are complementary to each other, since they
+jointly transfer global motion and local boundary knowledge to the nighttime
+domain.
+
+
+
+
+
+ + ☆ Spatial-and-Frequency-aware Restoration method for Images based on + Diffusion Models + + +
+ Diffusion models have recently emerged as a promising framework for Image
+Restoration (IR), owing to their ability to produce high-quality
+reconstructions and their compatibility with established methods. Existing
+methods for solving noisy inverse problems in IR consider only pixel-wise data
+fidelity. In this paper, we propose SaFaRI, a spatial-and-frequency-aware
+diffusion model for IR with Gaussian noise. Our model encourages images to
+preserve data fidelity in both the spatial and frequency domains, resulting in
+enhanced reconstruction quality. We comprehensively evaluate the performance
+of our model on a variety of noisy inverse problems, including inpainting,
+denoising, and super-resolution. Our thorough evaluation demonstrates that
+SaFaRI achieves state-of-the-art performance on both the ImageNet and FFHQ
+datasets, outperforming existing zero-shot IR methods in terms of LPIPS and
+FID metrics.
+
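+ A minimal sketch of a data-fidelity term enforced in both domains, of the
+kind the abstract describes; the linear degradation operator A and the
+frequency weighting freq_weight are assumptions, not the paper's exact
+formulation (note that an unweighted frequency term would duplicate the
+spatial one by Parseval's theorem, so a band weighting is what makes the extra
+term informative):
+
+import torch
+
+def dual_domain_fidelity(x, y, A, freq_weight):
+    """x: current estimate; y: noisy measurement; A: degradation operator."""
+    r = A(x) - y                          # spatial residual
+    spatial = torch.sum(r ** 2)
+    R = torch.fft.fft2(r)                 # same residual, frequency domain
+    frequency = torch.sum(freq_weight * torch.abs(R) ** 2)  # re-weighted bands
+    return spatial + frequency
+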
+
+
+
+
+ + ☆ Unveiling the Power of Self-supervision for Multi-view Multi-human + Association and Tracking + + +
+ Multi-view multi-human association and tracking (MvMHAT) is a new but
+important problem for multi-person scene video surveillance. It aims to track
+a group of people over time in each view, as well as to identify the same
+person across different views at the same time. This differs from previous MOT
+and multi-camera MOT tasks, which only consider over-time human tracking. As a
+consequence, videos for MvMHAT require more complex annotations while
+containing more information for self-learning. In this work, we tackle this
+problem with an end-to-end network built around self-supervised learning.
+Specifically, we propose to take advantage of the spatial-temporal
+self-consistency rationale by considering three properties: reflexivity,
+symmetry, and transitivity. Besides the reflexivity property, which naturally
+holds, we design self-supervised learning losses based on the symmetry and
+transitivity properties, for both appearance feature learning and assignment
+matrix optimization, to associate multiple humans over time and across views.
+Furthermore, to promote research on MvMHAT, we build two new large-scale
+benchmarks for the network training and testing of different algorithms.
+Extensive experiments on the proposed benchmarks verify the effectiveness of
+our method. We have released the benchmark and code to the public.
+
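+ A minimal sketch of how the symmetry and transitivity properties can be
+turned into self-supervised losses over soft assignment matrices between
+views; the Frobenius-norm form below is an assumption, not the paper's exact
+loss:
+
+import torch
+
+def symmetry_loss(S_ab, S_ba):
+    """Matching view A -> B should be the transpose of matching B -> A."""
+    return torch.norm(S_ab - S_ba.transpose(0, 1)) ** 2
+
+def transitivity_loss(S_ab, S_bc, S_ac):
+    """Chaining A -> B and B -> C should agree with matching A -> C directly."""
+    return torch.norm(S_ab @ S_bc - S_ac) ** 2
+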
+
+
+
+
+ + ☆ LaneGraph2Seq: Lane Topology Extraction with Language Model via + Vertex-Edge Encoding and Connectivity Enhancement AAAI 2024 + + +
+ Understanding road structures is crucial for autonomous driving. Intricate +road structures are often depicted using lane graphs, which include centerline +curves and connections forming a Directed Acyclic Graph (DAG). Accurate +extraction of lane graphs relies on precisely estimating vertex and edge +information within the DAG. Recent research highlights Transformer-based +language models' impressive sequence prediction abilities, making them +effective for learning graph representations when graph data are encoded as +sequences. However, existing studies focus mainly on modeling vertices +explicitly, leaving edge information simply embedded in the network. +Consequently, these approaches fall short in the task of lane graph extraction. +To address this, we introduce LaneGraph2Seq, a novel approach for lane graph +extraction. It leverages a language model with vertex-edge encoding and +connectivity enhancement. Our serialization strategy includes a vertex-centric +depth-first traversal and a concise edge-based partition sequence. +Additionally, we use classifier-free guidance combined with nucleus sampling to +improve lane connectivity. We validate our method on prominent datasets, +nuScenes and Argoverse 2, showcasing consistent and compelling results. Our +LaneGraph2Seq approach demonstrates superior performance compared to +state-of-the-art techniques in lane graph extraction. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ☆ Computation and Parameter Efficient Multi-Modal Fusion Transformer for + Cued Speech Recognition + + +
+ Cued Speech (CS) is a pure visual coding method used by hearing-impaired +people that combines lip reading with several specific hand shapes to make the +spoken language visible. Automatic CS recognition (ACSR) seeks to transcribe +visual cues of speech into text, which can help hearing-impaired people to +communicate effectively. The visual information of CS contains lip reading and +hand cueing, thus the fusion of them plays an important role in ACSR. However, +most previous fusion methods struggle to capture the global dependency present +in long sequence inputs of multi-modal CS data. As a result, these methods +generally fail to learn the effective cross-modal relationships that contribute +to the fusion. Recently, attention-based transformers have been a prevalent +idea for capturing the global dependency over the long sequence in multi-modal +fusion, but existing multi-modal fusion transformers suffer from both poor +recognition accuracy and inefficient computation for the ACSR task. To address +these problems, we develop a novel computation and parameter efficient +multi-modal fusion transformer by proposing a novel Token-Importance-Aware +Attention mechanism (TIAA), where a token utilization rate (TUR) is formulated +to select the important tokens from the multi-modal streams. More precisely, +TIAA firstly models the modality-specific fine-grained temporal dependencies +over all tokens of each modality, and then learns the efficient cross-modal +interaction for the modality-shared coarse-grained temporal dependencies over +the important tokens of different modalities. Besides, a light-weight gated +hidden projection is designed to control the feature flows of TIAA. The +resulting model, named Economical Cued Speech Fusion Transformer (EcoCued), +achieves state-of-the-art performance on all existing CS datasets, compared +with existing transformer-based fusion methods and ACSR fusion methods. + +
+
+ comment: Accepted by TASLP +
+
+
+
+
+ + ☆ Topology-Aware Latent Diffusion for 3D Shape Generation + + +
+ We introduce a new generative model that combines latent diffusion with +persistent homology to create 3D shapes with high diversity, with a special +emphasis on their topological characteristics. Our method involves representing +3D shapes as implicit fields, then employing persistent homology to extract +topological features, including Betti numbers and persistence diagrams. The +shape generation process consists of two steps. Initially, we employ a +transformer-based autoencoding module to embed the implicit representation of +each 3D shape into a set of latent vectors. Subsequently, we navigate through +the learned latent space via a diffusion model. By strategically incorporating +topological features into the diffusion process, our generative module is able +to produce a richer variety of 3D shapes with different topological structures. +Furthermore, our framework is flexible, supporting generation tasks constrained +by a variety of inputs, including sparse and partial point clouds, as well as +sketches. By modifying the persistence diagrams, we can alter the topology of +the shapes generated from these input modalities. + +
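+ One way to read "strategically incorporating topological features into the
+diffusion process" is as extra conditioning derived from Betti numbers and
+persistence-diagram summaries; the module below is an assumption about the
+mechanism, not the paper's architecture:
+
+import torch
+import torch.nn as nn
+
+class TopologyConditioner(nn.Module):
+    """Embed topological features (e.g. Betti numbers b0, b1, b2 plus
+    persistence statistics) to the conditioning width of a latent diffusion
+    model, to be added to timestep embeddings or cross-attended."""
+    def __init__(self, n_topo_features: int = 8, cond_dim: int = 256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(n_topo_features, cond_dim), nn.SiLU(),
+            nn.Linear(cond_dim, cond_dim))
+
+    def forward(self, topo_feats):        # (batch, n_topo_features)
+        return self.mlp(topo_feats)       # (batch, cond_dim)
+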
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ Good at captioning, bad at counting: Benchmarking GPT-4V on Earth + observation data + + +
+ Large Vision-Language Models (VLMs) have demonstrated impressive performance +on complex tasks involving visual input with natural language instructions. +However, it remains unclear to what extent capabilities on natural images +transfer to Earth observation (EO) data, which are predominantly satellite and +aerial images less common in VLM training data. In this work, we propose a +comprehensive benchmark to gauge the progress of VLMs toward being useful tools +for EO data by assessing their abilities on scene understanding, localization +and counting, and change detection tasks. Motivated by real-world applications, +our benchmark includes scenarios like urban monitoring, disaster relief, land +use, and conservation. We discover that, although state-of-the-art VLMs like +GPT-4V possess extensive world knowledge that leads to strong performance on +open-ended tasks like location understanding and image captioning, their poor +spatial reasoning limits usefulness on object localization and counting tasks. +Our benchmark will be made publicly available at https://vleo.danielz.ch/ and +on Hugging Face at +https://huggingface.co/collections/mit-ei/vleo-benchmark-datasets-65b789b0466555489cce0d70 +for easy model evaluation. + +
+
+ comment: 62 pages; work in progress +
+
+
+
+
+ + ☆ Head and Neck Tumor Segmentation from [18F]F-FDG PET/CT Images Based on + 3D Diffusion Model + + +
+ Head and neck (H&N) cancers are among the most prevalent types of cancer +worldwide, and [18F]F-FDG PET/CT is widely used for H&N cancer management. +Recently, the diffusion model has demonstrated remarkable performance in +various image-generation tasks. In this work, we proposed a 3D diffusion model +to accurately perform H&N tumor segmentation from 3D PET and CT volumes. The 3D +diffusion model was developed considering the 3D nature of PET and CT images +acquired. During the reverse process, the model utilized a 3D UNet structure +and took the concatenation of PET, CT, and Gaussian noise volumes as the +network input to generate the tumor mask. Experiments based on the HECKTOR +challenge dataset were conducted to evaluate the effectiveness of the proposed +diffusion model. Several state-of-the-art techniques based on U-Net and +Transformer structures were adopted as the reference methods. Benefits of +employing both PET and CT as the network input as well as further extending the +diffusion model from 2D to 3D were investigated based on various quantitative +metrics and the uncertainty maps generated. Results showed that the proposed 3D +diffusion model could generate more accurate segmentation results compared with +other methods. Compared to the diffusion model in 2D format, the proposed 3D +model yielded superior results. Our experiments also highlighted the advantage +of utilizing dual-modality PET and CT data over only single-modality data for +H&N tumor segmentation. + +
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ☆ Local Feature Matching Using Deep Learning: A Survey + + +
+ Local feature matching enjoys wide-ranging applications in the realm of +computer vision, encompassing domains such as image retrieval, 3D +reconstruction, and object recognition. However, challenges persist in +improving the accuracy and robustness of matching due to factors like viewpoint +and lighting variations. In recent years, the introduction of deep learning +models has sparked widespread exploration into local feature matching +techniques. The objective of this endeavor is to furnish a comprehensive +overview of local feature matching methods. These methods are categorized into +two key segments based on the presence of detectors. The Detector-based +category encompasses models inclusive of Detect-then-Describe, Joint Detection +and Description, Describe-then-Detect, as well as Graph Based techniques. In +contrast, the Detector-free category comprises CNN Based, Transformer Based, +and Patch Based methods. Our study extends beyond methodological analysis, +incorporating evaluations of prevalent datasets and metrics to facilitate a +quantitative comparison of state-of-the-art techniques. The paper also explores +the practical application of local feature matching in diverse domains such as +Structure from Motion, Remote Sensing Image Registration, and Medical Image +Registration, underscoring its versatility and significance across various +fields. Ultimately, we endeavor to outline the current challenges faced in this +domain and furnish future research directions, thereby serving as a reference +for researchers involved in local feature matching and its interconnected +domains. + +
+
+
+
+
+ + ☆ Agile But Safe: Learning Collision-Free High-Speed Legged Locomotion + + +
+ Legged robots navigating cluttered environments must be jointly agile for +efficient task execution and safe to avoid collisions with obstacles or humans. +Existing studies either develop conservative controllers (< 1.0 m/s) to ensure +safety, or focus on agility without considering potentially fatal collisions. +This paper introduces Agile But Safe (ABS), a learning-based control framework +that enables agile and collision-free locomotion for quadrupedal robots. ABS +involves an agile policy to execute agile motor skills amidst obstacles and a +recovery policy to prevent failures, collaboratively achieving high-speed and +collision-free navigation. The policy switch in ABS is governed by a learned +control-theoretic reach-avoid value network, which also guides the recovery +policy as an objective function, thereby safeguarding the robot in a closed +loop. The training process involves the learning of the agile policy, the +reach-avoid value network, the recovery policy, and an exteroception +representation network, all in simulation. These trained modules can be +directly deployed in the real world with onboard sensing and computation, +leading to high-speed and collision-free navigation in confined indoor and +outdoor spaces with both static and dynamic obstacles. + +
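+ A minimal sketch of the policy switch described above, with the learned
+reach-avoid value network arbitrating between the two policies; the sign
+convention (negative value = safe) and the threshold are assumptions:
+
+def select_action(obs, agile_policy, recovery_policy, ra_value_net,
+                  threshold=0.0):
+    """Run the agile policy while the reach-avoid value deems the state safe,
+    and hand control to the recovery policy otherwise."""
+    v = ra_value_net(obs)            # reach-avoid value of the current state
+    if v < threshold:                # assumed convention: v < 0 means safe
+        return agile_policy(obs)
+    return recovery_policy(obs)      # recovery also uses v as its objective
+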
+
+ comment: Project website: https://agile-but-safe.github.io/ +
+
+
+
+
+ + ☆ Is Registering Raw Tagged-MR Enough for Strain Estimation in the Era of + Deep Learning? SP + + +
+ Magnetic Resonance Imaging with tagging (tMRI) has long been utilized for +quantifying tissue motion and strain during deformation. However, a phenomenon +known as tag fading, a gradual decrease in tag visibility over time, often +complicates post-processing. The first contribution of this study is to model +tag fading by considering the interplay between $T_1$ relaxation and the +repeated application of radio frequency (RF) pulses during serial imaging +sequences. This is a factor that has been overlooked in prior research on tMRI +post-processing. Further, we have observed an emerging trend of utilizing raw +tagged MRI within a deep learning-based (DL) registration framework for motion +estimation. In this work, we evaluate and analyze the impact of commonly used +image similarity objectives in training DL registrations on raw tMRI. This is +then compared with the Harmonic Phase-based approach, a traditional approach +which is claimed to be robust to tag fading. Our findings, derived from both +simulated images and an actual phantom scan, reveal the limitations of various +similarity losses in raw tMRI and emphasize caution in registration tasks where +image intensity changes over time. + +
+
+ comment: Accepted to SPIE Medical Imaging 2024 (oral) +
+
+
+
+
+ + ☆ Task-Oriented Diffusion Model Compression + + +
+ As recent advancements in large-scale Text-to-Image (T2I) diffusion models
+have yielded remarkably high-quality image generation, diverse downstream
+Image-to-Image (I2I) applications have emerged. Despite the impressive results
+achieved by these I2I models, their practical utility is hampered by their
+large model size and the computational burden of the iterative denoising
+process. In this paper, we explore the compression potential of these I2I
+models in a task-oriented manner and introduce a novel method for reducing
+both model size and the number of timesteps. Through extensive experiments, we
+observe key insights and use our empirical knowledge to develop practical
+solutions that aim for near-optimal results with minimal exploration costs. We
+validate the effectiveness of our method by applying it to InstructPix2Pix for
+image editing and StableSR for image restoration. Our approach achieves
+satisfactory output quality with 39.2% and 56.4% reductions in model footprint
+and 81.4% and 68.7% decreases in latency for InstructPix2Pix and StableSR,
+respectively.
+
+
+
+
+
+ + ☆ Trainable Fixed-Point Quantization for Deep Learning Acceleration on + FPGAs + + +
+ Quantization is a crucial technique for deploying deep learning models on
+resource-constrained devices, such as embedded FPGAs. Prior efforts mostly
+focus on quantizing matrix multiplications, leaving other layers like
+BatchNorm or shortcuts in floating-point form, even though fixed-point
+arithmetic is more efficient on FPGAs. A common practice is to fine-tune a
+pre-trained model to fixed point for FPGA deployment, but this can degrade
+accuracy.
+ This work presents QFX, a novel trainable fixed-point quantization approach
+that automatically learns the binary-point position during model training.
+Additionally, we introduce a multiplier-free quantization strategy within QFX
+to minimize DSP usage. QFX is implemented as a PyTorch-based library that
+efficiently emulates fixed-point arithmetic, supported by FPGA HLS, in a
+differentiable manner during backpropagation. With minimal effort, models
+trained with QFX can readily be deployed through HLS, producing the same
+numerical results as their software counterparts. Our evaluation shows that,
+compared to post-training quantization, QFX can quantize element-wise layers
+to fewer bits while achieving higher accuracy on both the CIFAR-10 and
+ImageNet datasets. We further demonstrate the efficacy of multiplier-free
+quantization using a state-of-the-art binarized neural network accelerator
+designed for an embedded FPGA (AMD Xilinx Ultra96 v2). We plan to release QFX
+in open-source form.
+
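+ A minimal sketch of a trainable fixed-point quantizer in the spirit of QFX,
+learning the binary-point position with a straight-through estimator; QFX
+itself is an HLS-backed library, and the parameterization below (a continuous
+fractional-bit parameter with symmetric clipping) is an assumption:
+
+import torch
+import torch.nn as nn
+
+class TrainableFixedPoint(nn.Module):
+    """Quantize to total_bits-wide signed fixed-point with a learnable
+    binary point (number of fractional bits)."""
+    def __init__(self, total_bits: int = 8):
+        super().__init__()
+        self.total_bits = total_bits
+        self.frac_bits = nn.Parameter(torch.tensor(4.0))  # learned binary point
+
+    def forward(self, x):
+        scale = 2.0 ** self.frac_bits            # one LSB equals 2^-frac_bits
+        qmax = 2.0 ** (self.total_bits - 1) - 1  # symmetric signed range
+        r = x * scale
+        r = r + (torch.round(r) - r).detach()    # STE through rounding only
+        r = torch.clamp(r, -qmax - 1, qmax)
+        return r / scale            # gradients still reach frac_bits via scale
+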
+
+
+
+
+ + ☆ Data-Effective Learning: A Comprehensive Medical Benchmark + + +
+ Data-effective learning aims to use data in the most impactful way to train
+AI models. It involves strategies that focus on data quality rather than
+quantity, ensuring the data used for training has high informational value.
+Data-effective learning plays a profound role in accelerating AI training,
+reducing computational costs, and saving data storage, which is very important
+as the volume of medical data in recent years has grown beyond many people's
+expectations. However, due to the lack of standards and comprehensive
+benchmarks, research on medical data-effective learning remains scarce. To
+address this gap, our paper introduces a comprehensive benchmark specifically
+for evaluating data-effective learning in the medical field. This benchmark
+includes a dataset with millions of data samples from 31 medical centers
+(DataDEL), a baseline method for comparison (MedDEL), and a new evaluation
+metric (NormDEL) to objectively measure data-effective learning performance.
+Our extensive experimental results show that the baseline MedDEL can achieve
+performance comparable to the original large dataset with only 5% of the data.
+Establishing such an open data-effective learning benchmark is crucial for the
+medical AI research community, because it facilitates efficient data use,
+promotes collaborative breakthroughs, and fosters the development of
+cost-effective, scalable, and impactful healthcare solutions. The project can
+be accessed at
+https://github.com/shadow2469/Data-Effective-Learning-A-Comprehensive-Medical-Benchmark.git.
+
+
+
+
+
+ + ☆ Towards Image Semantics and Syntax Sequence Learning + + +
+ Convolutional neural networks and vision transformers have achieved
+outstanding performance in machine perception, particularly for image
+classification. Although these image classifiers excel at predicting
+image-level class labels, they may not discriminate missing or shifted parts
+within an object. As a result, they may fail to detect corrupted images that
+involve missing or disarrayed semantic information in the object composition.
+In contrast, human perception easily distinguishes such corruptions. To
+mitigate this gap, we introduce the concept of "image grammar", consisting of
+"image semantics" and "image syntax", to denote the semantics of parts or
+patches of an image and the order in which these parts are arranged to create
+a meaningful object. To learn the image grammar relative to a class of visual
+objects/scenes, we propose a weakly supervised two-stage approach. In the
+first stage, we use a deep clustering framework that relies on iterative
+clustering and feature refinement to produce part-semantic segmentation. In
+the second stage, we incorporate a recurrent bi-LSTM module to process a
+sequence of semantic segmentation patches to capture the image syntax. Our
+framework is trained to reason over patch semantics and detect faulty syntax.
+We benchmark the performance of several grammar learning models in detecting
+patch corruptions. Finally, we verify the capabilities of our framework on the
+Celeb and SUNRGBD datasets and demonstrate that it can achieve a grammar
+validation accuracy of 70 to 90% in a wide variety of semantic and syntactical
+corruption scenarios.
+
+
+ comment: 21 pages, 22 figures, 5 tables +
+
+
+
+
+ + ☆ Capacity Constraint Analysis Using Object Detection for Smart + Manufacturing + + +
+ The increasing popularity of Deep Learning (DL) based Object Detection (OD)
+methods and their real-world applications have opened new avenues in smart
+manufacturing. Traditional industries struck by capacity constraints after the
+Coronavirus Disease (COVID-19) pandemic require non-invasive methods for
+in-depth analysis of their operations to optimize and increase their revenue.
+In this study, we first developed a Convolutional Neural Network (CNN) based
+OD model to tackle this issue. This model is trained to accurately identify
+the presence of chairs and individuals on the production floor. The identified
+objects are then passed to a CNN-based tracker, which tracks them throughout
+their life cycle in the workstation. The extracted metadata is further
+processed through a novel framework for capacity constraint analysis. We
+identified that Station C was only 70.6% productive over the six-month period.
+Additionally, the time spent at each station is recorded and aggregated for
+each object. These data prove helpful in conducting annual audits and
+effectively managing labor and material over time.
+
+
+
+
+
+ + ☆ Spectral Norm of Convolutional Layers with Circular and Zero Paddings + + +
+ This paper leverages \emph{Gram iteration}, an efficient, deterministic, and
+differentiable method for computing spectral norms with an upper-bound
+guarantee. Originally designed for circular convolutional layers, the Gram
+iteration is generalized here to zero-padding convolutional layers, and we
+prove its quadratic convergence. We also provide theorems bridging the gap
+between the spectral norms of circular and zero-padding convolutions. We
+design a \emph{spectral rescaling} that can be used as a competitive
+$1$-Lipschitz layer that enhances network robustness. Demonstrated through
+experiments, our method outperforms state-of-the-art techniques in precision,
+computational cost, and scalability. The code for the experiments is available
+at https://github.com/blaisedelattre/lip4conv.
+
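+ A minimal NumPy sketch of the Gram iteration for a dense matrix (the paper's
+contribution extends this to circular and zero-padding convolutions): each
+Gram step squares all singular values, so the un-squared Frobenius norm
+converges quadratically, from above, to the spectral norm, and the log-space
+bookkeeping keeps the rescaled iterates from overflowing.
+
+import numpy as np
+
+def gram_spectral_norm_bound(w: np.ndarray, n_iter: int = 6) -> float:
+    """Upper bound on the spectral norm ||w||_2 via Gram iteration."""
+    log_c = 0.0                          # accumulated rescaling, in log space
+    for _ in range(n_iter):
+        f = np.linalg.norm(w)            # Frobenius norm of current iterate
+        w = w / f                        # rescale to avoid overflow
+        w = w @ w.T                      # Gram step: repeated squaring
+        log_c = 2.0 * (log_c + np.log(f))
+    return float(np.exp((log_c + np.log(np.linalg.norm(w))) / 2 ** n_iter))
+
+ After six iterations the singular-value gaps have been amplified by repeated
+squaring, so the largest singular value dominates and the bound is typically
+tight to several decimal places.
+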
+
+
+
+
+ + ☆ Geometry aware 3D generation from in-the-wild images in ImageNet + + +
+ Generating accurate 3D models is a challenging problem that traditionally
+requires explicit learning from 3D datasets using supervised learning.
+Although recent advances have shown promise in learning 3D models from 2D
+images, these methods often rely on well-structured datasets with multi-view
+images of each instance or camera pose information. Furthermore, these
+datasets usually contain clean backgrounds with simple shapes, making them
+expensive to acquire and hard to generalize, which limits the applicability of
+these methods. To overcome these limitations, we propose a method for
+reconstructing 3D geometry from the diverse and unstructured ImageNet dataset
+without camera pose information. We use an efficient triplane representation
+to learn 3D models from 2D images and modify the architecture of the generator
+backbone based on StyleGAN2 to adapt to the highly diverse dataset. To prevent
+mode collapse and improve the training stability on diverse data, we propose
+to use multi-view discrimination. The trained generator can produce
+class-conditional 3D models as well as renderings from arbitrary viewpoints.
+The class-conditional generation results demonstrate significant improvement
+over the current state-of-the-art method. Additionally, using PTI, we can
+efficiently reconstruct the whole 3D geometry from single-view images.
+
+
+
+
+
+ + ☆ Distance and Collision Probability Estimation from Gaussian Surface + Models + + +
+ This paper describes continuous-space methodologies to estimate the collision +probability, Euclidean distance and gradient between an ellipsoidal robot model +and an environment surface modeled as a set of Gaussian distributions. +Continuous-space collision probability estimation is critical for +uncertainty-aware motion planning. Most collision detection and avoidance +approaches assume the robot is modeled as a sphere, but ellipsoidal +representations provide tighter approximations and enable navigation in +cluttered and narrow spaces. State-of-the-art methods derive the Euclidean +distance and gradient by processing raw point clouds, which is computationally +expensive for large workspaces. Recent advances in Gaussian surface modeling +(e.g. mixture models, splatting) enable compressed and high-fidelity surface +representations. Few methods exist to estimate continuous-space occupancy from +such models. They require Gaussians to model free space and are unable to +estimate the collision probability, Euclidean distance and gradient for an +ellipsoidal robot. The proposed methods bridge this gap by extending prior work +in ellipsoid-to-ellipsoid Euclidean distance and collision probability +estimation to Gaussian surface models. A geometric blending approach is also +proposed to improve collision probability estimation. The approaches are +evaluated with numerical 2D and 3D experiments using real-world point cloud +data. + +
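+ The closed-form quantities above can be sanity-checked against a brute-force
+Monte Carlo estimate; the sketch below estimates the probability that a point
+drawn from one environment Gaussian falls inside an ellipsoidal robot body (a
+naive reference estimator, not the paper's method):
+
+import numpy as np
+
+def mc_collision_probability(p, Q, mu, Sigma, n=100_000, seed=0):
+    """P(g in E) for g ~ N(mu, Sigma), with the ellipsoid
+    E = {x : (x - p)^T Q^{-1} (x - p) <= 1} describing the robot."""
+    rng = np.random.default_rng(seed)
+    g = rng.multivariate_normal(mu, Sigma, size=n)  # surface-model samples
+    d = g - p
+    m = np.einsum("ni,ij,nj->n", d, np.linalg.inv(Q), d)  # ellipsoid metric
+    return float(np.mean(m <= 1.0))
+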
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ☆ Weakly-Supervised Detection of Bone Lesions in CT SP + + +
+ The skeletal region is one of the common sites of metastatic spread of cancer +in the breast and prostate. CT is routinely used to measure the size of lesions +in the bones. However, they can be difficult to spot due to the wide variations +in their sizes, shapes, and appearances. Precise localization of such lesions +would enable reliable tracking of interval changes (growth, shrinkage, or +unchanged status). To that end, an automated technique to detect bone lesions +is highly desirable. In this pilot work, we developed a pipeline to detect bone +lesions (lytic, blastic, and mixed) in CT volumes via a proxy segmentation +task. First, we used the bone lesions that were prospectively marked by +radiologists in a few 2D slices of CT volumes and converted them into weak 3D +segmentation masks. Then, we trained a 3D full-resolution nnUNet model using +these weak 3D annotations to segment the lesions and thereby detected them. Our +automated method detected bone lesions in CT with a precision of 96.7% and +recall of 47.3% despite the use of incomplete and partial training data. To the +best of our knowledge, we are the first to attempt the direct detection of bone +lesions in CT via a proxy segmentation task. + +
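+ The abstract does not detail how the prospectively marked 2D slices become
+weak 3D masks; a simple (assumed) realization is to replicate each 2D lesion
+mask across a few neighboring slices:
+
+import numpy as np
+
+def weak_3d_mask(shape, slice_masks, span=2):
+    """Build a weak 3D mask from sparse 2D annotations.
+
+    shape: (Z, Y, X) of the CT volume.
+    slice_masks: dict mapping slice index z -> binary (Y, X) mask.
+    span: number of neighboring slices each mask is propagated to.
+    """
+    vol = np.zeros(shape, dtype=np.uint8)
+    for z, m in slice_masks.items():
+        lo, hi = max(0, z - span), min(shape[0], z + span + 1)
+        vol[lo:hi] |= m.astype(np.uint8)  # replicate the 2D mask in depth
+    return vol
+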
+
+ comment: Accepted at SPIE 2024 +
+
+
+
+
+ + ☆ Improving Object Detection Quality in Football Through Super-Resolution + Techniques + + +
+ This study explores the potential of super-resolution techniques in enhancing +object detection accuracy in football. Given the sport's fast-paced nature and +the critical importance of precise object (e.g. ball, player) tracking for both +analysis and broadcasting, super-resolution could offer significant +improvements. We investigate how advanced image processing through +super-resolution impacts the accuracy and reliability of object detection +algorithms in processing football match footage. + Our methodology involved applying state-of-the-art super-resolution +techniques to a diverse set of football match videos from SoccerNet, followed +by object detection using Faster R-CNN. The performance of these algorithms, +both with and without super-resolution enhancement, was rigorously evaluated in +terms of detection accuracy. + The results indicate a marked improvement in object detection accuracy when +super-resolution preprocessing is applied. The improvement of object detection +through the integration of super-resolution techniques yields significant +benefits, especially for low-resolution scenarios, with a notable 12\% increase +in mean Average Precision (mAP) at an IoU (Intersection over Union) range of +0.50:0.95 for 320x240 size images when increasing the resolution fourfold using +RLFN. As the dimensions increase, the magnitude of improvement becomes more +subdued; however, a discernible improvement in the quality of detection is +consistently evident. Additionally, we discuss the implications of these +findings for real-time sports analytics, player tracking, and the overall +viewing experience. The study contributes to the growing field of sports +technology by demonstrating the practical benefits and limitations of +integrating super-resolution techniques in football analytics and broadcasting. + +
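+ A minimal sketch of the evaluation pipeline, with bicubic upsampling standing
+in for the learned RLFN super-resolution model used in the study (any SR
+network can be dropped in at the marked line) and torchvision's Faster R-CNN
+standing in for the paper's detector setup:
+
+import torch
+import torch.nn.functional as F
+from torchvision.models.detection import fasterrcnn_resnet50_fpn
+
+detector = fasterrcnn_resnet50_fpn(weights="DEFAULT").eval()
+
+@torch.no_grad()
+def detect_with_sr(frame, scale=4):
+    """frame: (3, H, W) float tensor in [0, 1], e.g. a 320x240 crop."""
+    sr = F.interpolate(frame.unsqueeze(0), scale_factor=scale,
+                       mode="bicubic", align_corners=False).clamp(0, 1)
+    # Swap the line above for a learned SR model such as RLFN.
+    return detector([sr.squeeze(0)])[0]  # dict: 'boxes', 'labels', 'scores'
+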
+
+
+
+
+ + ☆ Multimodal Neurodegenerative Disease Subtyping Explained by ChatGPT + + +
+ Alzheimer's disease (AD) is the most prevalent neurodegenerative disease; yet
+its currently available treatments are limited to stopping disease
+progression. Moreover, the effectiveness of these treatments is not guaranteed
+due to the heterogeneity of the disease. Therefore, it is essential to be able
+to identify the disease subtypes at a very early stage. Current data-driven
+approaches are able to classify the subtypes at later stages of AD or related
+disorders, but struggle when predicting at the asymptomatic or prodromal
+stage. Moreover, most existing models either lack explainability behind the
+classification or only use a single modality for the assessment, limiting the
+scope of the analysis. Thus, we propose a multimodal framework that uses
+early-stage indicators such as imaging, genetics, and clinical assessments to
+classify AD patients into subtypes at early stages. In addition, we build
+prompts and use large language models, such as ChatGPT, to interpret the
+findings of our model. In our framework, we propose a tri-modal co-attention
+mechanism (Tri-COAT) to explicitly learn the cross-modal feature associations.
+Our proposed model outperforms baseline models and provides insight into key
+cross-modal feature associations supported by known biological mechanisms.
+
+
+
+
+
+ + ☆ CMRNext: Camera to LiDAR Matching in the Wild for Localization and + Extrinsic Calibration + + +
+ LiDARs are widely used for mapping and localization in dynamic environments.
+However, their high cost limits their widespread adoption. On the other hand,
+monocular localization in LiDAR maps using inexpensive cameras is a
+cost-effective alternative for large-scale deployment. Nevertheless, most
+existing approaches struggle to generalize to new sensor setups and
+environments, requiring retraining or fine-tuning. In this paper, we present
+CMRNext, a novel approach for camera-LiDAR matching that is independent of
+sensor-specific parameters, generalizable, and usable in the wild for
+monocular localization in LiDAR maps and camera-LiDAR extrinsic calibration.
+CMRNext exploits recent advances in deep neural networks for matching
+cross-modal data and standard geometric techniques for robust pose estimation.
+We reformulate the point-pixel matching problem as an optical flow estimation
+problem and solve the Perspective-n-Point problem based on the resulting
+correspondences to find the relative pose between the camera and the LiDAR
+point cloud. We extensively evaluate CMRNext on six different robotic
+platforms, including three publicly available datasets and three in-house
+robots. Our experimental evaluations demonstrate that CMRNext outperforms
+existing approaches on both tasks and effectively generalizes to previously
+unseen environments and sensor setups in a zero-shot manner. We make the code
+and pre-trained models publicly available at http://cmrnext.cs.uni-freiburg.de.
+
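+ A minimal sketch of the pose-recovery half of the pipeline, where pixel
+positions predicted for projected LiDAR points feed a standard PnP solver;
+OpenCV's RANSAC PnP stands in for the robust pose estimation, and the
+flow-prediction step is abstracted away:
+
+import cv2
+import numpy as np
+
+def pose_from_matches(points_3d, pixels_2d, K):
+    """Camera pose w.r.t. the LiDAR map from 2D-3D correspondences.
+
+    points_3d: (N, 3) LiDAR points; pixels_2d: (N, 2) matched pixels (e.g.
+    initial projections warped by a predicted flow); K: (3, 3) intrinsics.
+    """
+    ok, rvec, tvec, inliers = cv2.solvePnPRansac(
+        points_3d.astype(np.float32), pixels_2d.astype(np.float32),
+        K.astype(np.float32), None,
+        iterationsCount=200, reprojectionError=3.0)
+    if not ok:
+        raise RuntimeError("PnP failed to find a pose")
+    R, _ = cv2.Rodrigues(rvec)        # rotation vector -> rotation matrix
+    return R, tvec                    # pose such that x_cam = R @ x_map + t
+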
+
+
+
+
+ + ☆ Real-time Traffic Object Detection for Autonomous Driving + + +
+ With recent advances in computer vision, it appears that autonomous driving +will be part of modern society sooner rather than later. However, there are +still a significant number of concerns to address. Although modern computer +vision techniques demonstrate superior performance, they tend to prioritize +accuracy over efficiency, which is a crucial aspect of real-time applications. +Large object detection models typically require higher computational power, +which is achieved by using more sophisticated onboard hardware. For autonomous +driving, these requirements translate to increased fuel costs and, ultimately, +a reduction in mileage. Further, despite their computational demands, the +existing object detectors are far from being real-time. In this research, we +assess the robustness of our previously proposed, highly efficient pedestrian +detector LSFM on well-established autonomous driving benchmarks, including +diverse weather conditions and nighttime scenes. Moreover, we extend our LSFM +model for general object detection to achieve real-time object detection in +traffic scenes. We evaluate its performance, low latency, and generalizability +on traffic object detection datasets. Furthermore, we discuss the inadequacy of +the current key performance indicator employed by object detection systems in +the context of autonomous driving and propose a more suitable alternative that +incorporates real-time requirements. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Common Sense Reasoning for Deep Fake Detection + + +
+ State-of-the-art approaches rely on image-based features extracted via neural
+networks for the binary classification task of deepfake detection. While these
+approaches, trained in a supervised manner, extract features that likely
+indicate fakes, they may fall short in representing unnatural `non-physical'
+semantic facial attributes -- blurry hairlines, double eyebrows, rigid eye
+pupils, or unnatural skin shading. However, such facial attributes are
+generally easily perceived by humans via common sense reasoning. Furthermore,
+image-based feature extraction methods that provide visual explanations via
+saliency maps can be hard for humans to interpret. To address these
+challenges, we propose the use of common sense reasoning to model deepfake
+detection, and extend it to the Deepfake Detection VQA (DD-VQA) task, with the
+aim of modeling human intuition in explaining the reason behind labeling an
+image as either real or fake. To this end, we introduce a new dataset that
+provides answers to the questions related to the authenticity of an image,
+along with its corresponding explanations. We also propose a Vision and
+Language Transformer-based framework for the DD-VQA task, incorporating text-
+and image-aware feature alignment formulations. Finally, we evaluate our
+method on both the performance of deepfake detection and the quality of the
+generated explanations. We hope that this task inspires researchers to explore
+new avenues for enhancing language-based interpretability and cross-modality
+applications in the realm of deepfake detection.
+
+
+
+
+
+ + ♻ ☆ High-Quality Image Restoration Following Human Instructions + + +
+ Image restoration is a fundamental problem that involves recovering a +high-quality clean image from its degraded observation. All-In-One image +restoration models can effectively restore images from various types and levels +of degradation using degradation-specific information as prompts to guide the +restoration model. In this work, we present the first approach that uses +human-written instructions to guide the image restoration model. Given natural +language prompts, our model can recover high-quality images from their degraded +counterparts, considering multiple degradation types. Our method, InstructIR, +achieves state-of-the-art results on several restoration tasks including image +denoising, deraining, deblurring, dehazing, and (low-light) image enhancement. +InstructIR improves +1dB over previous all-in-one restoration methods. +Moreover, our dataset and results represent a novel benchmark for new research +on text-guided image restoration and enhancement. Our code, datasets and models +are available at: https://github.com/mv-lab/InstructIR + +
+
+
+
+
+ + ♻ ☆ Separate-and-Enhance: Compositional Finetuning for Text2Image Diffusion + Models + + +
+ Despite recent significant strides achieved by diffusion-based Text-to-Image
+(T2I) models, current systems still struggle to ensure faithful compositional
+generation aligned with text prompts, particularly for multi-object
+generation. This work illuminates the fundamental reasons for such
+misalignment, pinpointing issues related to low attention activation scores
+and mask overlaps. While previous research efforts have tackled these issues
+individually, we assert that a holistic approach is paramount. Thus, we
+propose two novel objectives, the Separate loss and the Enhance loss, which
+reduce object mask overlaps and maximize attention scores, respectively. Our
+method diverges from conventional test-time-adaptation techniques, focusing on
+finetuning critical parameters, which enhances scalability and
+generalizability. Comprehensive evaluations demonstrate the superior
+performance of our model in terms of image realism, text-image alignment, and
+adaptability, notably outperforming prominent baselines. Ultimately, this
+research paves the way for T2I diffusion models with enhanced compositional
+capacities and broader applicability.
+ 

+
+
+
+
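+
+ A minimal sketch of the two objectives as described above -- penalizing
+ spatial overlap between object attention masks (Separate) and rewarding high
+ attention activations (Enhance); the paper's exact formulations may differ:
+
+ ```python
+ import torch
+
+ def separate_loss(attn_a: torch.Tensor, attn_b: torch.Tensor) -> torch.Tensor:
+     # Penalize spatial overlap between two objects' attention maps.
+     a = attn_a / (attn_a.sum() + 1e-8)
+     b = attn_b / (attn_b.sum() + 1e-8)
+     return torch.minimum(a, b).sum()
+
+ def enhance_loss(attn: torch.Tensor) -> torch.Tensor:
+     # Encourage a strong peak activation for the object's token.
+     return 1.0 - attn.max()
+
+ attn_cat, attn_dog = torch.rand(32, 32), torch.rand(32, 32)
+ loss = separate_loss(attn_cat, attn_dog) + enhance_loss(attn_cat) + enhance_loss(attn_dog)
+ ```
+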
+ + ♻ ☆ Multimodal Urban Areas of Interest Generation via Remote Sensing Imagery + and Geographical Prior + + +
+ An urban area-of-interest (AOI) is an integrated urban functional zone with
+defined boundaries. The rapid development of urban commerce has resulted in a
+demand for more precisely defined AOIs. However, existing research primarily
+concentrates on broad AOI mining for urban planning or regional economic
+analysis, failing to cater to the precise requirements of mobile Internet
+online-to-offline businesses, which necessitate accuracy down to a specific
+community, school, or hospital. In this paper, we propose an end-to-end
+multimodal deep learning algorithm for detecting AOI fence polygons using
+remote sensing images and multi-semantic reference information. We then
+evaluate its timeliness through a cascaded module that incorporates dynamic
+human mobility and logistics address information. Specifically, we begin by
+selecting a point-of-interest (POI) of a specific category and use it to
+recall corresponding remote sensing images, nearby POIs, road nodes, human
+mobility, and logistics addresses to build a multimodal detection model based
+on a transformer encoder-decoder architecture, titled AOITR. In the model, in
+addition to the remote sensing images, multi-semantic information including
+the core POI and road nodes is embedded and reorganized as the query content
+for the transformer decoder to generate the AOI polygon. Meanwhile, relatively
+dynamic distribution features of human mobility, nearby POIs, and logistics
+addresses are used for AOI reliability evaluation through a cascaded
+feedforward network. The experimental results demonstrate that our algorithm
+significantly outperforms two existing methods.
+ 

+
+ comment: 9 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Domain-Generalizable Multiple-Domain Clustering + + +
+ This work generalizes the problem of unsupervised domain generalization to
+the case in which no labeled samples are available (completely unsupervised).
+We are given unlabeled samples from multiple source domains, and we aim to
+learn a shared predictor that assigns examples to semantically related
+clusters. Evaluation is done by predicting cluster assignments in previously
+unseen domains. Towards this goal, we propose a two-stage training framework:
+(1) self-supervised pre-training for extracting domain-invariant semantic
+features; (2) multi-head cluster prediction with pseudo labels, which relies
+on both the feature space and cluster-head predictions, further leveraging a
+novel prediction-based label smoothing scheme. We demonstrate empirically that
+our model is more accurate than baselines that require fine-tuning with
+samples from the target domain or some level of supervision. Our code is
+available at
+https://github.com/AmitRozner/domain-generalizable-multiple-domain-clustering.
+ 

+
+ comment: 13 pages, 3 figures +
+
+
+
+
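+
+ The prediction-based label smoothing scheme is not detailed in the abstract;
+ one plausible reading, sketched under that assumption, blends hard
+ pseudo-labels with the model's own predicted distribution:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def prediction_smoothed_targets(logits, pseudo_labels, alpha=0.1):
+     # Smooth one-hot pseudo-labels toward the predicted distribution.
+     probs = F.softmax(logits, dim=1).detach()
+     hard = F.one_hot(pseudo_labels, num_classes=logits.size(1)).float()
+     return (1 - alpha) * hard + alpha * probs
+
+ logits = torch.randn(4, 10)
+ targets = prediction_smoothed_targets(logits, torch.tensor([1, 3, 3, 7]))
+ loss = -(targets * F.log_softmax(logits, dim=1)).sum(dim=1).mean()
+ ```
+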
+ + ♻ ☆ A Deep Learning-based Global and Segmentation-based Semantic Feature + Fusion Approach for Indoor Scene Classification + + +
+ This work proposes a novel approach that uses a semantic segmentation mask to +obtain a 2D spatial layout of the segmentation-categories across the scene, +designated by segmentation-based semantic features (SSFs). These features +represent, per segmentation-category, the pixel count, as well as the 2D +average position and respective standard deviation values. Moreover, a +two-branch network, GS2F2App, that exploits CNN-based global features extracted +from RGB images and the segmentation-based features extracted from the proposed +SSFs, is also proposed. GS2F2App was evaluated in two indoor scene benchmark +datasets: the SUN RGB-D and the NYU Depth V2, achieving state-of-the-art +results on both datasets. + +
+
+ comment: Published at Pattern Recognition Letters 2024 (DOI: + 10.1016/j.patrec.2024.01.022) +
+
+
+
+
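+
+ The segmentation-based semantic features as described (per-category pixel
+ count, 2D mean position, and standard deviations) can be computed directly
+ from a segmentation mask; a sketch, with normalization details left out:
+
+ ```python
+ import numpy as np
+
+ def ssf(mask: np.ndarray, num_classes: int) -> np.ndarray:
+     """Per category: pixel count, mean (y, x) position, std of positions."""
+     feats = np.zeros((num_classes, 5), dtype=np.float32)
+     for c in range(num_classes):
+         ys, xs = np.nonzero(mask == c)
+         if len(ys):
+             feats[c] = [len(ys), ys.mean(), xs.mean(), ys.std(), xs.std()]
+     return feats
+
+ mask = np.random.randint(0, 13, size=(480, 640))
+ print(ssf(mask, 13).shape)  # (13, 5)
+ ```
+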
+ + ♻ ☆ Combining Deep Learning and Street View Imagery to Map Smallholder Crop + Types AAAI-24 + + +
+ Accurate crop type maps are an essential source of information for monitoring +yield progress at scale, projecting global crop production, and planning +effective policies. To date, however, crop type maps remain challenging to +create in low and middle-income countries due to a lack of ground truth labels +for training machine learning models. Field surveys are the gold standard in +terms of accuracy but require an often-prohibitively large amount of time, +money, and statistical capacity. In recent years, street-level imagery, such as +Google Street View, KartaView, and Mapillary, has become available around the +world. Such imagery contains rich information about crop types grown at +particular locations and times. In this work, we develop an automated system to +generate crop type ground references using deep learning and Google Street View +imagery. The method efficiently curates a set of street view images containing +crop fields, trains a model to predict crop type by utilizing weakly-labelled +images from disparate out-of-domain sources, and combines predicted labels with +remote sensing time series to create a wall-to-wall crop type map. We show +that, in Thailand, the resulting country-wide map of rice, cassava, maize, and +sugarcane achieves an accuracy of 93%. We publicly release the first-ever crop +type map for all of Thailand 2022 at 10m-resolution with no gaps. To our +knowledge, this is the first time a 10m-resolution, multi-crop map has been +created for any smallholder country. As the availability of roadside imagery +expands, our pipeline provides a way to map crop types at scale around the +globe, especially in underserved smallholder regions. + +
+
+ comment: Accepted to AAAI-24: Special Track on AI for Social Impact +
+
+
+
+
+ + ♻ ☆ Collaborative Multi-Object Tracking with Conformal Uncertainty + Propagation + + +
+ Object detection and multiple object tracking (MOT) are essential components +of self-driving systems. Accurate detection and uncertainty quantification are +both critical for onboard modules, such as perception, prediction, and +planning, to improve the safety and robustness of autonomous vehicles. +Collaborative object detection (COD) has been proposed to improve detection +accuracy and reduce uncertainty by leveraging the viewpoints of multiple +agents. However, little attention has been paid to how to leverage the +uncertainty quantification from COD to enhance MOT performance. In this paper, +as the first attempt to address this challenge, we design an uncertainty +propagation framework called MOT-CUP. Our framework first quantifies the +uncertainty of COD through direct modeling and conformal prediction, and +propagates this uncertainty information into the motion prediction and +association steps. MOT-CUP is designed to work with different collaborative +object detectors and baseline MOT algorithms. We evaluate MOT-CUP on V2X-Sim, a +comprehensive collaborative perception dataset, and demonstrate a 2% +improvement in accuracy and a 2.67X reduction in uncertainty compared to the +baselines, e.g. SORT and ByteTrack. In scenarios characterized by high +occlusion levels, our MOT-CUP demonstrates a noteworthy $4.01\%$ improvement in +accuracy. MOT-CUP demonstrates the importance of uncertainty quantification in +both COD and MOT, and provides the first attempt to improve the accuracy and +reduce the uncertainty in MOT based on COD through uncertainty propagation. Our +code is public on https://coperception.github.io/MOT-CUP/. + +
+
+ comment: This paper has been accepted by IEEE Robotics and Automation Letters +
+
+
+
+
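+
+ A minimal sketch of split conformal prediction, the distribution-free
+ calibration tool underlying the uncertainty quantification here (MOT-CUP's
+ full propagation scheme is more involved):
+
+ ```python
+ import numpy as np
+
+ def conformal_interval(cal_residuals, pred, alpha=0.1):
+     # The (1 - alpha) quantile of calibration residuals yields a
+     # distribution-free interval around a new prediction.
+     n = len(cal_residuals)
+     q = np.quantile(cal_residuals, np.ceil((n + 1) * (1 - alpha)) / n)
+     return pred - q, pred + q
+
+ cal = np.abs(np.random.randn(500) * 2.0)  # |pred - true| box-center errors (px)
+ print(conformal_interval(cal, pred=128.0))
+ ```
+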
+ + ♻ ☆ PanAf20K: A Large Video Dataset for Wild Ape Detection and Behaviour + Recognition + + +
+ We present the PanAf20K dataset, the largest and most diverse open-access +annotated video dataset of great apes in their natural environment. It +comprises more than 7 million frames across ~20,000 camera trap videos of +chimpanzees and gorillas collected at 14 field sites in tropical Africa as part +of the Pan African Programme: The Cultured Chimpanzee. The footage is +accompanied by a rich set of annotations and benchmarks making it suitable for +training and testing a variety of challenging and ecologically important +computer vision tasks including ape detection and behaviour recognition. +Furthering AI analysis of camera trap information is critical given the +International Union for Conservation of Nature now lists all species in the +great ape family as either Endangered or Critically Endangered. We hope the +dataset can form a solid basis for engagement of the AI community to improve +performance, efficiency, and result interpretation in order to support +assessments of great ape presence, abundance, distribution, and behaviour and +thereby aid conservation efforts. + +
+
+ comment: Accepted at IJCV +
+
+
+
+
+ + ♻ ☆ Endo-4DGS: Endoscopic Monocular Scene Reconstruction with 4D Gaussian + Splatting + + +
+ In the realm of robot-assisted minimally invasive surgery, dynamic scene +reconstruction can significantly enhance downstream tasks and improve surgical +outcomes. Neural Radiance Fields (NeRF)-based methods have recently risen to +prominence for their exceptional ability to reconstruct scenes. Nonetheless, +these methods are hampered by slow inference, prolonged training, and +substantial computational demands. Additionally, some rely on stereo depth +estimation, which is often infeasible due to the high costs and logistical +challenges associated with stereo cameras. Moreover, the monocular +reconstruction quality for deformable scenes is currently inadequate. To +overcome these obstacles, we present Endo-4DGS, an innovative, real-time +endoscopic dynamic reconstruction approach that utilizes 4D Gaussian Splatting +(GS) and requires no ground truth depth data. This method extends 3D GS by +incorporating a temporal component and leverages a lightweight MLP to capture +temporal Gaussian deformations. This effectively facilitates the reconstruction +of dynamic surgical scenes with variable conditions. We also integrate +Depth-Anything to generate pseudo-depth maps from monocular views, enhancing +the depth-guided reconstruction process. Our approach has been validated on two +surgical datasets, where it can effectively render in real-time, compute +efficiently, and reconstruct with remarkable accuracy. These results underline +the vast potential of Endo-4DGS to improve surgical assistance. + +
+
+
+
+
+ + ♻ ☆ On the Generalizability of ECG-based Stress Detection Models ICML + + +
+ Stress is prevalent in many aspects of everyday life including work, +healthcare, and social interactions. Many works have studied handcrafted +features from various bio-signals that are indicators of stress. Recently, deep +learning models have also been proposed to detect stress. Typically, stress +models are trained and validated on the same dataset, often involving one +stressful scenario. However, it is not practical to collect stress data for +every scenario. So, it is crucial to study the generalizability of these models +and determine to what extent they can be used in other scenarios. In this +paper, we explore the generalization capabilities of Electrocardiogram +(ECG)-based deep learning models and models based on handcrafted ECG features, +i.e., Heart Rate Variability (HRV) features. To this end, we train three HRV +models and two deep learning models that use ECG signals as input. We use ECG +signals from two popular stress datasets - WESAD and SWELL-KW - differing in +terms of stressors and recording devices. First, we evaluate the models using +leave-one-subject-out (LOSO) cross-validation using training and validation +samples from the same dataset. Next, we perform a cross-dataset validation of +the models, that is, LOSO models trained on the WESAD dataset are validated +using SWELL-KW samples and vice versa. While deep learning models achieve the +best results on the same dataset, models based on HRV features considerably +outperform them on data from a different dataset. This trend is observed for +all the models on both datasets. Therefore, HRV models are a better choice for +stress recognition in applications that are different from the dataset +scenario. To the best of our knowledge, this is the first work to compare the +cross-dataset generalizability between ECG-based deep learning models and HRV +models. + +
+
+ comment: Published in Proceedings of 2022 21st IEEE International Conference + on Machine Learning and Applications (ICMLA) +
+
+
+
+
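+
+ Leave-one-subject-out (LOSO) cross-validation, as used in the paper, maps
+ directly onto scikit-learn's LeaveOneGroupOut; a sketch on synthetic
+ HRV-style features:
+
+ ```python
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import LeaveOneGroupOut
+
+ X = np.random.randn(200, 8)          # e.g. 8 HRV features per window
+ y = np.random.randint(0, 2, 200)     # stress / no-stress labels
+ subjects = np.random.randint(0, 15, 200)
+
+ scores = []
+ for tr, te in LeaveOneGroupOut().split(X, y, groups=subjects):
+     clf = LogisticRegression(max_iter=1000).fit(X[tr], y[tr])
+     scores.append(clf.score(X[te], y[te]))
+ print(f"LOSO accuracy: {np.mean(scores):.3f}")
+ ```
+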
+ + ♻ ☆ BlockFusion: Expandable 3D Scene Generation using Latent Tri-plane + Extrapolation + + +
+ We present BlockFusion, a diffusion-based model that generates 3D scenes as
+unit blocks and seamlessly incorporates new blocks to extend the scene.
+BlockFusion is trained using datasets of 3D blocks that are randomly cropped
+from complete 3D scene meshes. Through per-block fitting, all training blocks
+are converted into hybrid neural fields: a tri-plane containing the geometry
+features, followed by a Multi-Layer Perceptron (MLP) that decodes the signed
+distance values. A variational auto-encoder is employed to compress the
+tri-planes into a latent tri-plane space, on which the denoising diffusion
+process is performed. Diffusion applied to the latent representations allows
+for high-quality and diverse 3D scene generation. To expand a scene during
+generation, one only needs to append empty blocks that overlap with the
+current scene and extrapolate the existing latent tri-planes to populate the
+new blocks. The extrapolation is done by conditioning the generation process
+on feature samples from the overlapping tri-planes during the denoising
+iterations. Latent tri-plane extrapolation produces semantically and
+geometrically meaningful transitions that harmoniously blend with the existing
+scene. A 2D layout conditioning mechanism is used to control the placement and
+arrangement of scene elements. Experimental results indicate that BlockFusion
+is capable of generating diverse, geometrically consistent and unbounded large
+3D scenes with unprecedentedly high-quality shapes in both indoor and outdoor
+scenarios.
+ 

+
+ comment: Video: https://www.youtube.com/watch?v=PxIBtd6G0mA +
+
+
+
+
+ + ♻ ☆ CARPE-ID: Continuously Adaptable Re-identification for Personalized + Robot Assistance ICRA + + +
+ In today's Human-Robot Interaction (HRI) scenarios, a prevailing tendency
+exists to assume that the robot shall cooperate with the closest individual,
+or that the scene involves merely a single human actor. However, in realistic
+scenarios, such as shop floor operations, such an assumption may not hold, and
+personalized target recognition by the robot in crowded environments is
+required. To fulfil this requirement, in this work, we propose a person
+re-identification module based on continual visual adaptation techniques that
+ensures the robot's seamless cooperation with the appropriate individual, even
+subject to varying visual appearances or partial or complete occlusions. We
+test the framework on recorded videos in a laboratory environment as well as
+in an HRI scenario, i.e., a person-following task performed by a mobile robot.
+The targets are asked to change their appearance during tracking and to
+disappear from the camera field of view to test the challenging cases of
+occlusion and outfit variation. We compare our framework with one of the
+state-of-the-art Multi-Object Tracking (MOT) methods; the results show that
+CARPE-ID accurately tracks each selected target throughout the experiments in
+all cases (except two limit cases), whereas the state-of-the-art MOT method
+averages four tracking errors per video.
+ 

+
+ comment: Accepted to the International Conference on Robotics and Automation + (ICRA) 2024 +
+
+
+
+
+ + ♻ ☆ ConcatPlexer: Additional Dim1 Batching for Faster ViTs + + +
+ Transformers have demonstrated tremendous success not only in the natural
+language processing (NLP) domain but also in the field of computer vision,
+igniting various creative approaches and applications. Yet, the superior
+performance and modeling flexibility of transformers came with a severe
+increase in computation costs, and hence several works have proposed methods
+to reduce this burden. Inspired by a cost-cutting method originally proposed
+for language models, Data Multiplexing (DataMUX), we propose a novel approach
+for efficient visual recognition that employs additional dim1 batching (i.e.,
+concatenation along the token dimension), which greatly improves throughput
+with little compromise in accuracy. We first introduce a naive adaptation of
+DataMUX for vision models, the Image Multiplexer, and devise novel components
+to overcome its weaknesses, rendering our final model, ConcatPlexer, at the
+sweet spot between inference speed and accuracy. ConcatPlexer was trained on
+the ImageNet1K and CIFAR100 datasets, achieving 23.5% fewer GFLOPs than
+ViT-B/16 with 69.5% and 83.4% validation accuracy, respectively.
+ 

+
+
+
+
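+
+ A toy sketch of dim1 (token-axis) batching: patch tokens from two images are
+ concatenated into one sequence so that a single encoder forward pass serves
+ both inputs. ConcatPlexer additionally compresses tokens before
+ concatenation, which this sketch omits:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ embed = nn.Conv2d(3, 192, kernel_size=16, stride=16)  # patch embedding
+ encoder = nn.TransformerEncoder(
+     nn.TransformerEncoderLayer(d_model=192, nhead=4, batch_first=True),
+     num_layers=2,
+ )
+
+ img_a, img_b = torch.randn(1, 3, 224, 224), torch.randn(1, 3, 224, 224)
+ tok_a = embed(img_a).flatten(2).transpose(1, 2)   # (1, 196, 192)
+ tok_b = embed(img_b).flatten(2).transpose(1, 2)
+ out = encoder(torch.cat([tok_a, tok_b], dim=1))   # one pass, (1, 392, 192)
+ feat_a, feat_b = out[:, :196].mean(1), out[:, 196:].mean(1)  # demultiplex
+ ```
+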
+ + ♻ ☆ Domain generalization across tumor types, laboratories, and species -- + insights from the 2022 edition of the Mitosis Domain Generalization Challenge + + +
+ Recognition of mitotic figures in histologic tumor specimens is highly
+relevant to patient outcome assessment. This task is challenging for
+algorithms and human experts alike, with algorithmic performance deteriorating
+under shifts in image representations. Considerable covariate shifts occur
+when assessment is performed on different tumor types, when images are
+acquired using different digitization devices, or when specimens are produced
+in different laboratories. This observation motivated the inception of the
+2022 challenge on MItosis Domain Generalization (MIDOG 2022). The challenge
+provided annotated histologic tumor images from six different domains and
+evaluated the algorithmic approaches for mitotic figure detection provided by
+nine challenge participants on ten independent domains. Ground truth for
+mitotic figure detection was established in two ways: a three-expert consensus
+and an independent, immunohistochemistry-assisted set of labels. This work
+represents an overview of the challenge tasks, the algorithmic strategies
+employed by the participants, and potential factors contributing to their
+success. With an $F_1$ score of 0.764 for the top-performing team, we conclude
+that domain generalization across various tumor domains is possible with
+today's deep learning-based recognition pipelines. However, we also found that
+domain characteristics not present in the training set (a feline tumor as a
+new species, spindle cell shape as a new morphology, and a new scanner) led to
+small but significant decreases in performance. When assessed against the
+immunohistochemistry-assisted reference standard, all methods resulted in
+reduced recall scores, but with only minor changes in the order of
+participants in the ranking.
+ 

+
+
+
+
+ + ♻ ☆ Rectify the Regression Bias in Long-Tailed Object Detection + + +
+ Long-tailed object detection faces great challenges because of its extremely
+imbalanced class distribution. Recent methods mainly focus on the
+classification bias and its loss function design, while ignoring the subtle
+influence of the regression branch. This paper shows that a regression bias
+exists and adversely and substantially impacts detection accuracy. Existing
+methods fail to handle this regression bias, and we hypothesize that the
+class-specific regression head for rare classes is its main cause. We
+therefore propose three viable solutions catering to the rare categories:
+adding a class-agnostic branch, clustering heads, and merging heads. The
+proposed methods bring consistent and significant improvements over existing
+long-tailed detection methods, especially in rare and common classes. The
+proposed method achieves state-of-the-art performance on the large-vocabulary
+LVIS dataset with different backbones and architectures. It generalizes well
+to more difficult evaluation metrics, relatively balanced datasets, and the
+mask branch. This is the first attempt to reveal and rectify the regression
+bias in long-tailed object detection.
+ 

+
+
+
+
+ + ♻ ☆ Variational Autoencoding of Dental Point Clouds + + +
+ Digital dentistry has made significant advancements, yet numerous challenges +remain. This paper introduces the FDI 16 dataset, an extensive collection of +tooth meshes and point clouds. Additionally, we present a novel approach: +Variational FoldingNet (VF-Net), a fully probabilistic variational autoencoder +designed for point clouds. Notably, prior latent variable models for point +clouds lack a one-to-one correspondence between input and output points. +Instead, they rely on optimizing Chamfer distances, a metric that lacks a +normalized distributional counterpart, rendering it unsuitable for +probabilistic modeling. We replace the explicit minimization of Chamfer +distances with a suitable encoder, increasing computational efficiency while +simplifying the probabilistic extension. This allows for straightforward +application in various tasks, including mesh generation, shape completion, and +representation learning. Empirically, we provide evidence of lower +reconstruction error in dental reconstruction and interpolation, showcasing +state-of-the-art performance in dental sample generation while identifying +valuable latent representations. + +
+
+
+
+
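+
+ For reference, the Chamfer distance the authors move away from matches each
+ point to its nearest neighbour rather than enforcing a one-to-one
+ correspondence, as a standard implementation makes plain:
+
+ ```python
+ import torch
+
+ def chamfer_distance(p: torch.Tensor, q: torch.Tensor) -> torch.Tensor:
+     """Symmetric Chamfer distance between point sets p (N, 3) and q (M, 3)."""
+     d = torch.cdist(p, q)  # (N, M) pairwise distances
+     return d.min(dim=1).values.mean() + d.min(dim=0).values.mean()
+
+ p, q = torch.randn(1024, 3), torch.randn(2048, 3)
+ print(chamfer_distance(p, q))
+ ```
+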
+ + ♻ ☆ An Empirical Study of Scaling Law for OCR + + +
+ The laws relating model size, data volume, computation and model performance
+have been extensively studied in the field of Natural Language Processing
+(NLP). However, the scaling laws in Optical Character Recognition (OCR) have
+not yet been investigated. To address this, we conducted comprehensive studies
+examining the correlation between performance and the scale of models, data
+volume and computation in the field of text recognition. The study
+demonstrates smooth power laws between performance and model size, as well as
+training data volume, when other influencing factors are held constant.
+Additionally, we have constructed a large-scale dataset called REBU-Syn, which
+comprises 6 million real samples and 18 million synthetic samples. Based on
+our scaling law and new dataset, we have successfully trained a scene text
+recognition model, achieving a new state-of-the-art on 6 common test
+benchmarks with a top-1 average accuracy of 97.42%. The models and dataset are
+publicly available at
+https://github.com/large-ocr-model/large-ocr-model.github.io.
+ 

+
+
+
+
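+
+ A smooth power law of the kind reported, error ~ a * N^b, is conventionally
+ fitted by linear regression in log-log space; a sketch on synthetic
+ (model size, error) pairs:
+
+ ```python
+ import numpy as np
+
+ N = np.array([1e6, 3e6, 1e7, 3e7, 1e8])                   # parameter counts
+ E = 2.5 * N ** -0.15 * np.exp(np.random.randn(5) * 0.01)  # noisy errors
+
+ b, log_a = np.polyfit(np.log(N), np.log(E), deg=1)        # slope, intercept
+ print(f"error ~= {np.exp(log_a):.3f} * N^{b:.3f}")
+ ```
+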
+ + ♻ ☆ SAMF: Small-Area-Aware Multi-focus Image Fusion for Object Detection ICASSP + + +
+ Existing multi-focus image fusion (MFIF) methods often fail to preserve the +uncertain transition region and detect small focus areas within large defocused +regions accurately. To address this issue, this study proposes a new +small-area-aware MFIF algorithm for enhancing object detection capability. +First, we enhance the pixel attributes within the small focus and boundary +regions, which are subsequently combined with visual saliency detection to +obtain the pre-fusion results used to discriminate the distribution of focused +pixels. To accurately ensure pixel focus, we consider the source image as a +combination of focused, defocused, and uncertain regions and propose a +three-region segmentation strategy. Finally, we design an effective pixel +selection rule to generate segmentation decision maps and obtain the final +fusion results. Experiments demonstrated that the proposed method can +accurately detect small and smooth focus areas while improving object detection +performance, outperforming existing methods in both subjective and objective +evaluations. The source code is available at https://github.com/ixilai/SAMF. + +
+
+ comment: Accepted to International Conference on Acoustics, Speech and Signal + Processing (ICASSP) 2024 +
+
+
+
+
+ + ♻ ☆ Bridging the Gap between Multi-focus and Multi-modal: A Focused + Integration Framework for Multi-modal Image Fusion WACV + + +
+ Multi-modal image fusion (MMIF) integrates valuable information from
+different modality images into a fused one. However, the fusion of multiple
+visible images with different focal regions together with infrared images is
+an unprecedented challenge in real MMIF applications. This is because the
+limited depth of field of visible optical lenses impedes the simultaneous
+capture of all focal information within the same scene. To address this issue,
+in this paper, we propose an MMIF framework for joint focused integration and
+modality information extraction. Specifically, a semi-sparsity-based smoothing
+filter is introduced to decompose the images into structure and texture
+components. Subsequently, a novel multi-scale operator is proposed to fuse the
+texture components, capable of detecting significant information by
+considering the pixel focus attributes and relevant data from various modal
+images. Additionally, to achieve an effective capture of scene luminance and
+reasonable contrast maintenance, we consider the distribution of energy
+information in the structural components in terms of multi-directional
+frequency variance and information entropy. Extensive experiments on existing
+MMIF datasets, as well as on object detection and depth estimation tasks,
+consistently demonstrate that the proposed algorithm surpasses
+state-of-the-art methods in visual perception and quantitative evaluation. The
+code is available at https://github.com/ixilai/MFIF-MMIF.
+ 

+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2024 +
+
+
+
+
+ + ♻ ☆ PLVS: A SLAM System with Points, Lines, Volumetric Mapping, and 3D + Incremental Segmentation + + +
+ This document presents PLVS: a real-time system that leverages sparse SLAM, +volumetric mapping, and 3D unsupervised incremental segmentation. PLVS stands +for Points, Lines, Volumetric mapping, and Segmentation. It supports RGB-D and +Stereo cameras, which may be optionally equipped with IMUs. The SLAM module is +keyframe-based, and extracts and tracks sparse points and line segments as +features. Volumetric mapping runs in parallel with respect to the SLAM +front-end and generates a 3D reconstruction of the explored environment by +fusing point clouds backprojected from keyframes. Different volumetric mapping +methods are supported and integrated in PLVS. We use a novel reprojection error +to bundle-adjust line segments. This error exploits available depth information +to stabilize the position estimates of line segment endpoints. An incremental +and geometric-based segmentation method is implemented and integrated for RGB-D +cameras in the PLVS framework. We present qualitative and quantitative +evaluations of the PLVS framework on some publicly available datasets. The +appendix details the adopted stereo line triangulation method and provides a +derivation of the Jacobians we used for line error terms. The software is +available as open-source. + +
+
+
+
+
+ + ♻ ☆ Large Trajectory Models are Scalable Motion Predictors and Planners + + +
+ Motion prediction and planning are vital tasks in autonomous driving, and +recent efforts have shifted to machine learning-based approaches. The +challenges include understanding diverse road topologies, reasoning traffic +dynamics over a long time horizon, interpreting heterogeneous behaviors, and +generating policies in a large continuous state space. Inspired by the success +of large language models in addressing similar complexities through model +scaling, we introduce a scalable trajectory model called State Transformer +(STR). STR reformulates the motion prediction and motion planning problems by +arranging observations, states, and actions into one unified sequence modeling +task. Our approach unites trajectory generation problems with other sequence +modeling problems, powering rapid iterations with breakthroughs in neighbor +domains such as language modeling. Remarkably, experimental results reveal that +large trajectory models (LTMs), such as STR, adhere to the scaling laws by +presenting outstanding adaptability and learning efficiency. Qualitative +results further demonstrate that LTMs are capable of making plausible +predictions in scenarios that diverge significantly from the training data +distribution. LTMs also learn to make complex reasonings for long-term +planning, without explicit loss designs or costly high-level annotations. + +
+
+
+
+
+ + ♻ ☆ On Inference Stability for Diffusion Models AAAI 2024 + + +
+ Denoising Probabilistic Models (DPMs) represent an emerging domain of
+generative models that excel in generating diverse and high-quality images.
+However, most current training methods for DPMs neglect the correlation
+between timesteps, limiting the model's ability to generate images
+effectively. Notably, we theoretically point out that this issue can be caused
+by the cumulative estimation gap between the predicted and the actual
+trajectory. To minimize that gap, we propose a novel \textit{sequence-aware}
+loss that aims to reduce the estimation gap to enhance the sampling quality.
+Furthermore, we theoretically show that our proposed loss function is a
+tighter upper bound of the estimation loss than the conventional loss in DPMs.
+Experimental results on several benchmark datasets, including CIFAR10, CelebA,
+and CelebA-HQ, consistently show a remarkable improvement of our proposed
+method in image generation quality, measured by FID and Inception Score,
+compared to several DPM baselines. Our code and pre-trained checkpoints are
+available at \url{https://github.com/VinAIResearch/SA-DPM}.
+ 

+
+ comment: Oral presentation at AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Text2NeRF: Text-Driven 3D Scene Generation with Neural Radiance Fields + + +
+ Text-driven 3D scene generation is widely applicable to video gaming, the
+film industry, and metaverse applications that have a large demand for 3D
+scenes. However, existing text-to-3D generation methods are limited to
+producing 3D objects with simple geometries and dreamlike styles that lack
+realism. In this work, we present Text2NeRF, which is able to generate a wide
+range of 3D scenes with complicated geometric structures and high-fidelity
+textures purely from a text prompt. To this end, we adopt NeRF as the 3D
+representation and leverage a pre-trained text-to-image diffusion model to
+constrain the 3D reconstruction of the NeRF to reflect the scene description.
+Specifically, we employ the diffusion model to infer the text-related image as
+the content prior and use a monocular depth estimation method to offer the
+geometric prior. Both content and geometric priors are utilized to update the
+NeRF model. To guarantee textural and geometric consistency across different
+views, we introduce a progressive scene inpainting and updating strategy for
+novel view synthesis of the scene. Our method requires no additional training
+data but only a natural language description of the scene as input. Extensive
+experiments demonstrate that our Text2NeRF outperforms existing methods in
+producing photo-realistic, multi-view consistent, and diverse 3D scenes from a
+variety of natural language prompts. Our code is available at
+https://github.com/eckertzhang/Text2NeRF.
+ 

+
+ comment: Accepted by TVCG; Homepage: + https://eckertzhang.github.io/Text2NeRF.github.io/ + Code:https://github.com/eckertzhang/Text2NeRF +
+
+
+
+
+ + ♻ ☆ ResFields: Residual Neural Fields for Spatiotemporal Signals ICLR 2024 + + +
+ Neural fields, a category of neural networks trained to represent +high-frequency signals, have gained significant attention in recent years due +to their impressive performance in modeling complex 3D data, such as signed +distance (SDFs) or radiance fields (NeRFs), via a single multi-layer perceptron +(MLP). However, despite the power and simplicity of representing signals with +an MLP, these methods still face challenges when modeling large and complex +temporal signals due to the limited capacity of MLPs. In this paper, we propose +an effective approach to address this limitation by incorporating temporal +residual layers into neural fields, dubbed ResFields. It is a novel class of +networks specifically designed to effectively represent complex temporal +signals. We conduct a comprehensive analysis of the properties of ResFields and +propose a matrix factorization technique to reduce the number of trainable +parameters and enhance generalization capabilities. Importantly, our +formulation seamlessly integrates with existing MLP-based neural fields and +consistently improves results across various challenging tasks: 2D video +approximation, dynamic shape modeling via temporal SDFs, and dynamic NeRF +reconstruction. Lastly, we demonstrate the practical utility of ResFields by +showcasing its effectiveness in capturing dynamic 3D scenes from sparse RGBD +cameras of a lightweight capture system. + +
+
+ comment: [ICLR 2024 Spotlight] Project and code at: + https://markomih.github.io/ResFields/ +
+
+
+
+
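+
+ A hedged sketch of the core ResFields idea: a linear layer whose weights
+ receive a low-rank, time-indexed residual, with the factorization keeping
+ the number of trainable parameters small (shapes and names here are
+ illustrative, not the paper's exact formulation):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ResFieldLinear(nn.Module):
+     """W(t) = W + V[t] @ M: base weights plus a factorized temporal residual."""
+     def __init__(self, d_in, d_out, num_frames, rank=8):
+         super().__init__()
+         self.base = nn.Linear(d_in, d_out)
+         self.V = nn.Parameter(torch.zeros(num_frames, rank))  # per-frame codes
+         self.M = nn.Parameter(torch.randn(rank, d_out * d_in) * 1e-3)
+         self.d_in, self.d_out = d_in, d_out
+
+     def forward(self, x, t: int):
+         dW = (self.V[t] @ self.M).view(self.d_out, self.d_in)
+         return x @ (self.base.weight + dW).T + self.base.bias
+
+ layer = ResFieldLinear(64, 64, num_frames=100)
+ y = layer(torch.randn(32, 64), t=17)
+ ```
+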
+ + ♻ ☆ GMS-3DQA: Projection-based Grid Mini-patch Sampling for 3D Model Quality + Assessment + + +
+ Nowadays, most 3D model quality assessment (3DQA) methods have been aimed at +improving performance. However, little attention has been paid to the +computational cost and inference time required for practical applications. +Model-based 3DQA methods extract features directly from the 3D models, which +are characterized by their high degree of complexity. As a result, many +researchers are inclined towards utilizing projection-based 3DQA methods. +Nevertheless, previous projection-based 3DQA methods directly extract features +from multi-projections to ensure quality prediction accuracy, which calls for +more resource consumption and inevitably leads to inefficiency. Thus in this +paper, we address this challenge by proposing a no-reference (NR) +projection-based \textit{\underline{G}rid \underline{M}ini-patch +\underline{S}ampling \underline{3D} Model \underline{Q}uality +\underline{A}ssessment (GMS-3DQA)} method. The projection images are rendered +from six perpendicular viewpoints of the 3D model to cover sufficient quality +information. To reduce redundancy and inference resources, we propose a +multi-projection grid mini-patch sampling strategy (MP-GMS), which samples grid +mini-patches from the multi-projections and forms the sampled grid mini-patches +into one quality mini-patch map (QMM). The Swin-Transformer tiny backbone is +then used to extract quality-aware features from the QMMs. The experimental +results show that the proposed GMS-3DQA outperforms existing state-of-the-art +NR-3DQA methods on the point cloud quality assessment databases. The efficiency +analysis reveals that the proposed GMS-3DQA requires far less computational +resources and inference time than other 3DQA competitors. The code will be +available at https://github.com/zzc-1998/GMS-3DQA. + +
+
+
+
+
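+
+ An illustrative version of the grid mini-patch sampling step: one mini-patch
+ is taken per grid cell and the patches are tiled into a single small quality
+ mini-patch map (sampling details in the paper may differ):
+
+ ```python
+ import numpy as np
+
+ def grid_mini_patch_map(img: np.ndarray, grid: int = 7, patch: int = 32):
+     H, W, _ = img.shape
+     ch, cw = H // grid, W // grid
+     rows = []
+     for i in range(grid):
+         row = []
+         for j in range(grid):
+             y = i * ch + np.random.randint(0, max(1, ch - patch))
+             x = j * cw + np.random.randint(0, max(1, cw - patch))
+             row.append(img[y:y + patch, x:x + patch])
+         rows.append(np.concatenate(row, axis=1))
+     return np.concatenate(rows, axis=0)
+
+ qmm = grid_mini_patch_map(np.random.rand(896, 896, 3))
+ print(qmm.shape)  # (224, 224, 3)
+ ```
+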
+ + ♻ ☆ Motion-I2V: Consistent and Controllable Image-to-Video Generation with + Explicit Motion Modeling + + +
+ We introduce Motion-I2V, a novel framework for consistent and controllable
+image-to-video generation (I2V). In contrast to previous methods that directly
+learn the complicated image-to-video mapping, Motion-I2V factorizes I2V into
+two stages with explicit motion modeling. For the first stage, we propose a
+diffusion-based motion field predictor, which focuses on deducing the
+trajectories of the reference image's pixels. For the second stage, we propose
+motion-augmented temporal attention to enhance the limited 1-D temporal
+attention in video latent diffusion models. This module effectively propagates
+the reference image's features to synthesized frames under the guidance of the
+trajectories predicted in the first stage. Compared with existing methods,
+Motion-I2V can generate more consistent videos even in the presence of large
+motion and viewpoint variation. By training a sparse trajectory ControlNet for
+the first stage, Motion-I2V enables users to precisely control motion
+trajectories and motion regions with sparse trajectory and region annotations.
+This offers more controllability of the I2V process than relying solely on
+textual instructions. Additionally, Motion-I2V's second stage naturally
+supports zero-shot video-to-video translation. Both qualitative and
+quantitative comparisons demonstrate the advantages of Motion-I2V over prior
+approaches in consistent and controllable image-to-video generation. Please
+see our project page at https://xiaoyushi97.github.io/Motion-I2V/.
+ 

+
+ comment: Project page: https://xiaoyushi97.github.io/Motion-I2V/ +
+
+
+
+
+ + ♻ ☆ UniM-OV3D: Uni-Modality Open-Vocabulary 3D Scene Understanding with + Fine-Grained Feature Representation + + +
+ 3D open-vocabulary scene understanding aims to recognize arbitrary novel
+categories beyond the base label space. However, existing works not only fail
+to fully utilize all the available modal information in the 3D domain but also
+lack sufficient granularity in representing the features of each modality. In
+this paper, we propose a unified multimodal 3D open-vocabulary scene
+understanding network, namely UniM-OV3D, which aligns point clouds with image,
+language and depth. To better integrate global and local features of the point
+clouds, we design a hierarchical point cloud feature extraction module that
+learns comprehensive fine-grained feature representations. Further, to
+facilitate the learning of coarse-to-fine point-semantic representations from
+captions, we propose the utilization of hierarchical 3D caption pairs,
+capitalizing on geometric constraints across various viewpoints of 3D scenes.
+Extensive experimental results demonstrate the effectiveness and superiority
+of our method in open-vocabulary semantic and instance segmentation, which
+achieves state-of-the-art performance on both indoor and outdoor benchmarks
+such as ScanNet, ScanNet200, S3DIS and nuScenes. Code is available at
+https://github.com/hithqd/UniM-OV3D.
+ 

+
+
+
+
+ + ♻ ☆ Empowering CAM-Based Methods with Capability to Generate Fine-Grained + and High-Faithfulness Explanations AAAI2024 + + +
+ Recently, the explanation of neural network models has garnered considerable +research attention. In computer vision, CAM (Class Activation Map)-based +methods and LRP (Layer-wise Relevance Propagation) method are two common +explanation methods. However, since most CAM-based methods can only generate +global weights, they can only generate coarse-grained explanations at a deep +layer. LRP and its variants, on the other hand, can generate fine-grained +explanations. But the faithfulness of the explanations is too low. To address +these challenges, in this paper, we propose FG-CAM (Fine-Grained CAM), which +extends CAM-based methods to enable generating fine-grained and +high-faithfulness explanations. FG-CAM uses the relationship between two +adjacent layers of feature maps with resolution differences to gradually +increase the explanation resolution, while finding the contributing pixels and +filtering out the pixels that do not contribute. Our method not only solves the +shortcoming of CAM-based methods without changing their characteristics, but +also generates fine-grained explanations that have higher faithfulness than LRP +and its variants. We also present FG-CAM with denoising, which is a variant of +FG-CAM and is able to generate less noisy explanations with almost no change in +explanation faithfulness. Experimental results show that the performance of +FG-CAM is almost unaffected by the explanation resolution. FG-CAM outperforms +existing CAM-based methods significantly in both shallow and intermediate +layers, and outperforms LRP and its variants significantly in the input layer. +Our code is available at https://github.com/dongmo-qcq/FG-CAM. + +
+
+ comment: This paper has been accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ Dual Relation Alignment for Composed Image Retrieval + + +
+ Composed image retrieval, a task involving the search for a target image
+using a reference image and a complementary text as the query, has witnessed
+significant advancements owing to the progress made in cross-modal modeling.
+Unlike the general image-text retrieval problem, which has only one alignment
+relation, i.e., image-text, we argue for the existence of two types of
+relations in composed image retrieval. The explicit relation pertains to the
+reference image & complementary text-target image, which is commonly exploited
+by existing methods. Beyond this intuitive relation, our observations in
+practice have uncovered another implicit yet crucial relation, i.e., reference
+image & target image-complementary text, since we found that the complementary
+text can be inferred by studying the relation between the target image and the
+reference image. Regrettably, existing methods largely focus on leveraging the
+explicit relation to learn their networks, while overlooking the implicit
+relation. In response to this weakness, we propose a new framework for
+composed image retrieval, termed dual relation alignment, which integrates
+both explicit and implicit relations to fully exploit the correlations among
+the triplets. Specifically, we design a vision compositor to first fuse the
+reference image and target image; the resulting representation then serves two
+roles: (1) a counterpart for semantic alignment with the complementary text,
+and (2) a compensation for the complementary text to boost the explicit
+relation modeling, thereby implanting the implicit relation into the alignment
+learning. Our method is evaluated on two popular datasets, CIRR and FashionIQ,
+through extensive experiments. The results confirm the effectiveness of our
+dual-relation learning in substantially enhancing composed image retrieval
+performance.
+ 

+
+ comment: The architecture of our model changes, hence methodolgy and + experiments changes a lot, We have significantly revised the original + manuscript of the paper, so a withdraw of our original script is needed +
+
+
+
+
+ + ♻ ☆ $k$-$t$ CLAIR: Self-Consistency Guided Multi-Prior Learning for Dynamic + Parallel MR Image Reconstruction MICCAI 2023 + + +
+ Cardiac magnetic resonance imaging (CMR) has been widely used in clinical +practice for the medical diagnosis of cardiac diseases. However, the long +acquisition time hinders its development in real-time applications. Here, we +propose a novel self-consistency guided multi-prior learning framework named +$k$-$t$ CLAIR to exploit spatiotemporal correlations from highly undersampled +data for accelerated dynamic parallel MRI reconstruction. The $k$-$t$ CLAIR +progressively reconstructs faithful images by leveraging multiple complementary +priors learned in the $x$-$t$, $x$-$f$, and $k$-$t$ domains in an iterative +fashion, as dynamic MRI exhibits high spatiotemporal redundancy. Additionally, +$k$-$t$ CLAIR incorporates calibration information for prior learning, +resulting in a more consistent reconstruction. Experimental results on cardiac +cine and T1W/T2W images demonstrate that $k$-$t$ CLAIR achieves high-quality +dynamic MR reconstruction in terms of both quantitative and qualitative +performance. + +
+
+ comment: 12 pages, 3 figures, 4 tables. CMRxRecon Challenge, MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Variational Transfer Learning using Cross-Domain Latent Modulation WACV + + +
+ To successfully apply trained neural network models to new domains, powerful
+transfer learning solutions are essential. We propose to introduce a novel
+cross-domain latent modulation mechanism into a variational autoencoder
+framework so as to achieve effective transfer learning. Our key idea is to
+procure deep representations from one data domain and use them to influence
+the reparameterization of the latent variable of another domain. Specifically,
+deep representations of the source and target domains are first extracted by
+a unified inference model and aligned by employing gradient reversal. The
+learned deep representations are then cross-modulated into the latent encoding
+of the alternative domain, where consistency constraints are also applied. In
+an empirical validation that includes a number of transfer learning benchmark
+tasks for unsupervised domain adaptation and image-to-image translation, our
+model demonstrates competitive performance, which is also supported by
+evidence obtained from visualization.
+ 

+
+ comment: Under review. Extended version of a previous WACV paper + (arXiv:2012.11727). 13 pages, 8 figures +
+
+
+
+
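+
+ Gradient reversal, used above to align the source and target
+ representations, is a standard construct; a minimal PyTorch sketch:
+
+ ```python
+ import torch
+
+ class GradReverse(torch.autograd.Function):
+     """Identity on the forward pass; multiplies gradients by -lambda on the
+     backward pass, driving features toward domain invariance."""
+     @staticmethod
+     def forward(ctx, x, lam):
+         ctx.lam = lam
+         return x.view_as(x)
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         return -ctx.lam * grad_output, None
+
+ feats = torch.randn(8, 128, requires_grad=True)
+ rev = GradReverse.apply(feats, 1.0)  # would feed a domain classifier
+ rev.sum().backward()
+ print(feats.grad[0, 0])              # -1.0: gradient flipped
+ ```
+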
+ + ♻ ☆ Learning to Predict Gradients for Semi-Supervised Continual Learning + + +
+ A key challenge for machine intelligence is to learn new visual concepts
+without forgetting previously acquired knowledge. Continual learning aims to
+address this challenge. However, there is a gap between existing supervised
+continual learning and human-like intelligence, as humans are able to learn
+from both labeled and unlabeled data. How unlabeled data affects learning and
+catastrophic forgetting in the continual learning process remains unknown. To
+explore these issues, we formulate a new semi-supervised continual learning
+method, which can be generically applied to existing continual learning
+models. Specifically, a novel gradient learner learns from labeled data to
+predict gradients on unlabeled data. Hence, the unlabeled data can fit into
+the supervised continual learning method. Different from conventional
+semi-supervised settings, we do not assume that the underlying classes
+associated with the unlabeled data are known to the learning process. In other
+words, the unlabeled data could be very distinct from the labeled data. We
+evaluate the proposed method on mainstream continual learning, adversarial
+continual learning, and semi-supervised learning tasks. The proposed method
+achieves state-of-the-art performance on classification accuracy and backward
+transfer in the continual learning setting, while achieving the desired
+classification accuracy in the semi-supervised learning setting. This implies
+that unlabeled images can enhance the generalizability of continual learning
+models in terms of predictive ability on unseen data and significantly
+alleviate catastrophic forgetting. The code is available at
+\url{https://github.com/luoyan407/grad_prediction.git}.
+ 

+
+ comment: Accepted by IEEE Transactions on Neural Networks and Learning Systems + (TNNLS) +
+
+
+
+
+ + ♻ ☆ GazeGPT: Augmenting Human Capabilities using Gaze-contingent Contextual + AI for Smart Eyewear + + +
+ Multimodal large language models (LMMs) excel in world knowledge and +problem-solving abilities. Through the use of a world-facing camera and +contextual AI, emerging smart accessories aim to provide a seamless interface +between humans and LMMs. Yet, these wearable computing systems lack an +understanding of the user's attention. We introduce GazeGPT as a new user +interaction paradigm for contextual AI. GazeGPT uses eye tracking to help the +LMM understand which object in the world-facing camera view a user is paying +attention to. Using extensive user evaluations, we show that this +gaze-contingent mechanism is a faster and more accurate pointing mechanism than +alternatives; that it augments human capabilities by significantly improving +their accuracy in a dog-breed classification task; and that it is consistently +ranked as more natural than head- or body-driven selection mechanisms for +contextual AI. Moreover, we prototype a variety of application scenarios that +suggest GazeGPT could be of significant value to users as part of future +AI-driven personal assistants. + +
+
+ comment: Project video: https://youtu.be/AuDFHHTK_m8 +
+
+
+
+
+ + ♻ ☆ A Survey on Video Prediction: From Deterministic to Generative + Approaches + + +
+ Video prediction, a fundamental task in computer vision, aims to enable +models to generate sequences of future frames based on existing video content. +This task has garnered widespread application across various domains. In this +paper, we comprehensively survey both historical and contemporary works in this +field, encompassing the most widely used datasets and algorithms. Our survey +scrutinizes the challenges and evolving landscape of video prediction within +the realm of computer vision. We propose a novel taxonomy centered on the +stochastic nature of video prediction algorithms. This taxonomy accentuates the +gradual transition from deterministic to generative prediction methodologies, +underlining significant advancements and shifts in approach. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Moderately Supervised Learning: Definition, Framework and Generality + + +
+ Learning with supervision has achieved remarkable success in numerous
+artificial intelligence (AI) applications. In the current literature, by
+referring to the properties of the labels prepared for the training dataset,
+learning with supervision is categorized as supervised learning (SL) and
+weakly supervised learning (WSL). SL concerns the situation where the training
+data set is assigned ideal (complete, exact and accurate) labels, while WSL
+concerns the situation where the training data set is assigned non-ideal
+(incomplete, inexact or inaccurate) labels. However, various solutions for SL
+tasks have shown that the given labels are not always easy to learn, and the
+transformation from the given labels to easy-to-learn targets can
+significantly affect the performance of the final SL solutions. Without
+considering the properties of the transformation from the given labels to
+easy-to-learn targets, the definition of SL conceals details that can be
+critical to building appropriate solutions for specific SL tasks. Thus, for
+engineers in the AI application field, it is desirable to reveal these details
+systematically. This article attempts to achieve this goal by expanding the
+categorization of SL and investigating the sub-type moderately supervised
+learning (MSL), which concerns the situation where the given labels are ideal
+but, due to the simplicity in annotation, careful designs are required to
+transform the given labels into easy-to-learn targets. From the perspectives
+of definition, framework and generality, we conceptualize MSL to present a
+complete fundamental basis for systematically analysing MSL tasks. Meanwhile,
+by revealing the relation between the conceptualization of MSL and the
+mathematicians' vision, this paper also serves as a tutorial for AI
+application engineers on viewing a problem to be solved from a mathematician's
+perspective.
+ 

+
+ comment: This is the final published version (33 pages) +
+
+
+
+
+ + ♻ ☆ ECNR: Efficient Compressive Neural Representation of Time-Varying + Volumetric Datasets + + +
+ Due to its conceptual simplicity and generality, compressive neural +representation has emerged as a promising alternative to traditional +compression methods for managing massive volumetric datasets. The current +practice of neural compression utilizes a single large multilayer perceptron +(MLP) to encode the global volume, incurring slow training and inference. This +paper presents an efficient compressive neural representation (ECNR) solution +for time-varying data compression, utilizing the Laplacian pyramid for adaptive +signal fitting. Following a multiscale structure, we leverage multiple small +MLPs at each scale for fitting local content or residual blocks. By assigning +similar blocks to the same MLP via size uniformization, we enable balanced +parallelization among MLPs to significantly speed up training and inference. +Working in concert with the multiscale structure, we tailor a deep compression +strategy to compact the resulting model. We show the effectiveness of ECNR with +multiple datasets and compare it with state-of-the-art compression methods +(mainly SZ3, TTHRESH, and neurcomp). The results position ECNR as a promising +solution for volumetric data compression. + +
+
+ comment: Accepted by IEEE PacificVis 2024 (conference papers track) +
+
+
+
+
+ + ♻ ☆ Arbitrary-Scale Downscaling of Tidal Current Data Using Implicit + Continuous Representation + + +
+ Numerical models have long been used to understand geoscientific phenomena,
+including tidal currents, which are crucial for renewable energy production
+and coastal engineering. However, their computational cost hinders generating
+data at varying resolutions. As an alternative, deep learning-based
+downscaling methods have gained traction due to their faster inference speeds.
+However, most of them can only infer at a fixed scale and overlook important
+characteristics of the target geoscientific data. In this paper, we propose a
+novel downscaling framework for tidal current data that addresses its unique
+characteristics, which are dissimilar to those of images: heterogeneity and
+local dependency. Moreover, our framework can generate output at any arbitrary
+scale by utilizing a continuous representation model. Our proposed framework
+demonstrates significantly improved flow velocity predictions, by 93.21% (MSE)
+and 63.85% (MAE) compared to the baseline model, while achieving a remarkable
+33.2% reduction in FLOPs.
+ 

+
+
+
+
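+
+ The arbitrary-scale property follows from using a coordinate-based
+ continuous representation: once trained, the field can be queried on a grid
+ of any size. A sketch with an illustrative two-channel (u, v) current field:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ field = nn.Sequential(                 # coordinate MLP: (y, x) -> (u, v)
+     nn.Linear(2, 128), nn.ReLU(),
+     nn.Linear(128, 128), nn.ReLU(),
+     nn.Linear(128, 2),
+ )
+
+ def query(h: int, w: int) -> torch.Tensor:
+     ys, xs = torch.linspace(-1, 1, h), torch.linspace(-1, 1, w)
+     coords = torch.stack(torch.meshgrid(ys, xs, indexing="ij"), dim=-1)
+     return field(coords.view(-1, 2)).view(h, w, 2)
+
+ print(query(64, 64).shape, query(250, 333).shape)  # any output resolution
+ ```
+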
+ + ♻ ☆ BSED: Baseline Shapley-Based Explainable Detector + + +
+ Explainable artificial intelligence (XAI) has witnessed significant advances +in the field of object recognition, with saliency maps being used to highlight +image features relevant to the predictions of learned models. Although these +advances have made AI-based technology more interpretable to humans, several +issues have come to light. Some approaches present explanations irrelevant to +predictions, and cannot guarantee the validity of XAI (axioms). In this study, +we propose the Baseline Shapley-based Explainable Detector (BSED), which +extends the Shapley value to object detection, thereby enhancing the validity +of interpretation. The Shapley value can attribute the prediction of a learned +model to a baseline feature while satisfying the explainability axioms. The +processing cost for the BSED is within the reasonable range, while the original +Shapley value is prohibitively computationally expensive. Furthermore, BSED is +a generalizable method that can be applied to various detectors in a +model-agnostic manner, and interpret various detection targets without +fine-grained parameter tuning. These strengths can enable the practical +applicability of XAI. We present quantitative and qualitative comparisons with +existing methods to demonstrate the superior performance of our method in terms +of explanation validity. Moreover, we present some applications, such as +correcting detection based on explanations from our method. + +
+
+
+
+
+ + ♻ ☆ Towards Few-shot Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection is critical for ensuring the reliability
+of open-world intelligent systems. Despite the notable advancements in
+existing OOD detection methodologies, our study identifies a significant
+performance drop under the scarcity of training samples. In this context, we
+introduce a novel few-shot OOD detection benchmark, carefully constructed to
+address this gap. Our empirical analysis reveals the superiority of
+Parameter-Efficient Fine-Tuning (PEFT) strategies, such as visual prompt
+tuning and visual adapter tuning, over conventional techniques, including full
+fine-tuning and linear probing, in the few-shot OOD detection task.
+Recognizing that crucial information from the pre-trained model, pivotal for
+OOD detection, may be lost during the fine-tuning process, we propose a method
+termed Domain-Specific and General Knowledge Fusion (DSGF), designed to be
+compatible with diverse fine-tuning frameworks. Our experiments show that the
+integration of DSGF significantly enhances few-shot OOD detection capabilities
+across various methods and fine-tuning methodologies, including full
+fine-tuning, visual adapter tuning, and visual prompt tuning. The code will be
+released.
+ 

+
+
+
+
+ + ♻ ☆ Calibrating Segmentation Networks with Margin-based Label Smoothing + + +
+ Despite the undeniable progress in visual recognition tasks fueled by deep
+neural networks, there exists recent evidence showing that these models are
+poorly calibrated, resulting in over-confident predictions. The standard
+practice of minimizing the cross-entropy loss during training pushes the
+predicted softmax probabilities to match the one-hot label assignments.
+Nevertheless, this yields a pre-softmax activation of the correct class that
+is significantly larger than the remaining activations, which exacerbates the
+miscalibration problem. Recent observations from the classification literature
+suggest that loss functions that embed implicit or explicit maximization of
+the entropy of predictions yield state-of-the-art calibration performance.
+Despite these findings, the impact of these losses in the relevant task of
+calibrating medical image segmentation networks remains unexplored. In this
+work, we provide a unifying constrained-optimization perspective of current
+state-of-the-art calibration losses. Specifically, these losses can be viewed
+as approximations of a linear penalty (or a Lagrangian term) imposing equality
+constraints on logit distances. This points to an important limitation of such
+underlying equality constraints, whose ensuing gradients constantly push
+towards a non-informative solution, which might prevent the model from
+reaching the best compromise between discriminative performance and
+calibration during gradient-based optimization. Following our observations, we
+propose a simple and flexible generalization based on inequality constraints,
+which imposes a controllable margin on logit distances. Comprehensive
+experiments on a variety of public medical image segmentation benchmarks
+demonstrate that our method sets novel state-of-the-art results on these tasks
+in terms of network calibration, while also improving discriminative
+performance.
+
+
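The inequality-constrained generalization described above admits a very small sketch; the margin value and the way it combines with cross-entropy below are illustrative assumptions, not the paper's exact hyperparameters:

```python
import torch

def margin_logit_distance_penalty(logits, margin=10.0):
    """Penalize logit distances d_j = max_k z_k - z_j only where they exceed
    a margin, i.e. the inequality constraint d_j <= margin, instead of
    pushing every distance toward a fixed target as equality-constrained
    losses implicitly do. The margin value is an illustrative assumption."""
    max_logit = logits.max(dim=1, keepdim=True).values
    distances = max_logit - logits              # (batch, classes), all >= 0
    return torch.clamp(distances - margin, min=0.0).mean()

# usage sketch:
# loss = cross_entropy(logits, y) + lam * margin_logit_distance_penalty(logits)
```

Distances already inside the margin receive zero gradient, which is exactly how the inequality constraint avoids the constant push toward a non-informative solution.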
+
+ comment: MedIA 2023. The code is available at + https://github.com/Bala93/MarginLoss. arXiv admin note: substantial text + overlap with arXiv:2111.15430 +
+
+
+
+
+ + ♻ ☆ Gaussian Adaptive Attention is All You Need: Robust Contextual + Representations Across Multiple Modalities + + +
+ We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a
+novel probabilistic attention framework, and the Gaussian Adaptive Transformer
+(GAT), designed to enhance information aggregation across multiple modalities,
+including speech, text, and vision. GAAM integrates learnable mean and
+variance into its attention mechanism, implemented in a multi-headed
+framework, enabling it to collectively model any probability distribution for
+dynamic recalibration of feature significance. This method demonstrates
+significant improvements, especially with highly non-stationary data,
+surpassing state-of-the-art attention techniques in model performance (up to
+approximately +20% in accuracy) by identifying key elements within the feature
+space. GAAM's compatibility with dot-product-based attention models and its
+relatively low number of parameters showcase its adaptability and potential to
+boost existing attention frameworks. Empirically, GAAM exhibits superior
+adaptability and efficacy across a diverse range of tasks, including emotion
+recognition in speech, image classification, and text classification, thereby
+establishing its robustness and versatility in handling multi-modal data.
+Furthermore, we introduce the Importance Factor (IF), a new learning-based
+metric that enhances the explainability of models trained with GAAM-based
+methods. Overall, GAAM represents an advancement towards the development of
+better-performing and more explainable attention models across multiple
+modalities.
+
+
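Details of GAAM's heads are not reproduced in this summary; a stripped-down sketch of the core idea, a learnable Gaussian over normalized positions that recalibrates feature significance, might look like this (single head, illustrative only):

```python
import torch
import torch.nn as nn

class GaussianAdaptiveGate(nn.Module):
    """Single-head sketch: rescale token features by a Gaussian over
    normalized positions with learnable mean and variance. A simplified
    stand-in for the mechanism the abstract describes, not the authors'
    implementation."""
    def __init__(self):
        super().__init__()
        self.mu = nn.Parameter(torch.tensor(0.0))       # learnable mean
        self.log_var = nn.Parameter(torch.tensor(0.0))  # learnable log-variance

    def forward(self, x):                               # x: (B, T, D)
        pos = torch.linspace(-1.0, 1.0, x.size(1), device=x.device)
        weights = torch.exp(-0.5 * (pos - self.mu) ** 2 / self.log_var.exp())
        return x * weights.view(1, -1, 1)               # recalibrated features
```

Stacking several such heads with distinct (mu, variance) pairs is one way a multi-headed version could approximate a mixture over the positions it attends to.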
+
+
+
+
+ + ♻ ☆ Associative Transformer + + +
+ Beyond the pairwise attention of conventional Transformers, there is growing
+interest in sparse attention mechanisms that align more closely with
+localized, contextual learning in the biological brain. Existing studies such
+as the Coordination method employ iterative cross-attention mechanisms with a
+bottleneck to enable the sparse association of inputs. However, these methods
+are parameter-inefficient and fail in more complex relational reasoning tasks.
+To this end, we propose the Associative Transformer (AiT) to enhance the
+association among sparsely attended input patches, improving parameter
+efficiency and performance in relational reasoning tasks. AiT leverages a
+learnable explicit memory, comprised of various specialized priors, with
+bottleneck attention to facilitate the extraction of diverse localized
+features. Moreover, we propose a novel associative memory-enabled patch
+reconstruction with a Hopfield energy function. Extensive experiments on four
+image classification tasks with three different sizes of AiT demonstrate that
+AiT requires significantly fewer parameters and attention layers while
+outperforming Vision Transformers and a broad range of sparse Transformers.
+Additionally, AiT establishes new SOTA performance on the Sort-of-CLEVR
+dataset, outperforming the previous Coordination method.
+
+
+
+
+
+
+ + ♻ ☆ Hyperspectral Pixel Unmixing with Latent Dirichlet Variational + Autoencoder + + +
+ We present a method for hyperspectral pixel {\it unmixing}. The proposed
+method assumes that (1) {\it abundances} can be encoded as Dirichlet
+distributions and (2) spectra of {\it endmembers} can be represented as
+multivariate Normal distributions. The method solves the problem of abundance
+estimation and endmember extraction within a variational autoencoder setting,
+where a Dirichlet bottleneck layer models the abundances and the decoder
+performs endmember extraction. The proposed method can also leverage the
+transfer learning paradigm, where the model is only trained on synthetic data
+containing pixels that are linear combinations of one or more endmembers of
+interest. In this case, we retrieve endmembers (spectra) from the United
+States Geological Survey Spectral Library. The model thus trained can be
+subsequently used to perform pixel unmixing on "real data" that contains a
+subset of the endmembers used to generate the synthetic data. The model
+achieves state-of-the-art results on several benchmarks: Cuprite, Urban
+(HYDICE), and Samson. We also present a new synthetic dataset,
+OnTech-HSI-Syn-21, that can be used to study hyperspectral pixel unmixing
+methods. We showcase the transfer learning capabilities of the proposed model
+on the Cuprite and OnTech-HSI-Syn-21 datasets. In summary, the proposed method
+can be applied to pixel unmixing in a variety of domains, including
+agriculture, forestry, mineralogy, analysis of materials, and healthcare.
+Additionally, the proposed method eschews the need for labelled training data
+by leveraging the transfer learning paradigm, where the model is trained on
+synthetic data generated using the endmembers present in the "real" data.
+
+
+
+
+
+
+ + ♻ ☆ CATS v2: Hybrid encoders for robust medical segmentation + + +
+ Convolutional Neural Networks (CNNs) have exhibited strong performance in
+medical image segmentation tasks by capturing high-level (local) information,
+such as edges and textures. However, due to the limited field of view of the
+convolution kernel, it is hard for CNNs to fully represent global information.
+Recently, transformers have shown good performance for medical image
+segmentation due to their ability to better model long-range dependencies.
+Nevertheless, transformers struggle to capture high-level spatial features as
+effectively as CNNs. A good segmentation model should learn a better
+representation from local and global features to be both precise and
+semantically accurate. In our previous work, we proposed CATS, a U-shaped
+segmentation network augmented with a transformer encoder. In this work, we
+further extend this model and propose CATS v2 with hybrid encoders.
+Specifically, the hybrid encoders consist of a CNN-based encoder path
+paralleled by a transformer path with a shifted window, which better leverages
+both local and global information to produce robust 3D medical image
+segmentation. We fuse the information from the convolutional encoder and the
+transformer at the skip connections of different resolutions to form the final
+segmentation. The proposed method is evaluated on three public challenge
+datasets: Beyond the Cranial Vault (BTCV), Cross-Modality Domain Adaptation
+(CrossMoDA), and task 5 of the Medical Segmentation Decathlon (MSD-5), to
+segment abdominal organs, vestibular schwannoma (VS), and the prostate,
+respectively. Compared with state-of-the-art methods, our approach
+demonstrates superior performance in terms of higher Dice scores. Our code is
+publicly available at https://github.com/MedICL-VU/CATS.
+
+
+
+
+
+
+
+
+
+ + Information Retrieval 18 + +
+
+
+ + ☆ Neural Locality Sensitive Hashing for Entity Blocking + + +
+ Locality-sensitive hashing (LSH) is a fundamental algorithmic technique +widely employed in large-scale data processing applications, such as +nearest-neighbor search, entity resolution, and clustering. However, its +applicability in some real-world scenarios is limited due to the need for +careful design of hashing functions that align with specific metrics. Existing +LSH-based Entity Blocking solutions primarily rely on generic similarity +metrics such as Jaccard similarity, whereas practical use cases often demand +complex and customized similarity rules surpassing the capabilities of generic +similarity metrics. Consequently, designing LSH functions for these customized +similarity rules presents considerable challenges. In this research, we propose +a neuralization approach to enhance locality-sensitive hashing by training deep +neural networks to serve as hashing functions for complex metrics. We assess +the effectiveness of this approach within the context of the entity resolution +problem, which frequently involves the use of task-specific metrics in +real-world applications. Specifically, we introduce NLSHBlock (Neural-LSH +Block), a novel blocking methodology that leverages pre-trained language +models, fine-tuned with a novel LSH-based loss function. Through extensive +evaluations conducted on a diverse range of real-world datasets, we demonstrate +the superiority of NLSHBlock over existing methods, exhibiting significant +performance improvements. Furthermore, we showcase the efficacy of NLSHBlock in +enhancing the performance of the entity matching phase, particularly within the +semi-supervised setting. + +
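The NLSHBlock loss itself is not stated in this summary; a hedged sketch of the general neuralized-LSH idea, training an encoder to emit hash bits whose sign patterns bucket likely matches, could look like the following (all dimensions and the pairwise loss are assumptions, not the paper's design):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class NeuralHasher(nn.Module):
    """Encodes record embeddings into n_bits soft hash bits in (-1, 1);
    at blocking time the sign pattern serves as the bucket key.
    Dimensions are illustrative, not NLSHBlock's configuration."""
    def __init__(self, in_dim=768, n_bits=16):
        super().__init__()
        self.proj = nn.Linear(in_dim, n_bits)

    def forward(self, emb):
        return torch.tanh(self.proj(emb))

def lsh_style_loss(h_a, h_b, match):
    """Pull hash codes of matching pairs together and push non-matching
    pairs apart; `match` is a {0, 1} float tensor. A generic stand-in
    for the paper's LSH-based objective."""
    sim = F.cosine_similarity(h_a, h_b, dim=-1)      # in [-1, 1]
    return ((match - (sim + 1.0) / 2.0) ** 2).mean()

def bucket_key(code):
    """Records sharing a key land in the same block (candidate pair set)."""
    return tuple((code > 0).int().tolist())
```

The point of training the hasher, rather than hand-designing it, is that collisions can then track an arbitrary learned similarity rule instead of a fixed metric like Jaccard.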
+
+
+
+
+ + ☆ Error-Tolerant E-Discovery Protocols + + +
+ We consider the multi-party classification problem introduced by Dong, +Hartline, and Vijayaraghavan (2022) in the context of electronic discovery +(e-discovery). Based on a request for production from the requesting party, the +responding party is required to provide documents that are responsive to the +request except for those that are legally privileged. Our goal is to find a +protocol that verifies that the responding party sends almost all responsive +documents while minimizing the disclosure of non-responsive documents. We +provide protocols in the challenging non-realizable setting, where the instance +may not be perfectly separated by a linear classifier. We demonstrate +empirically that our protocol successfully manages to find almost all relevant +documents, while incurring only a small disclosure of non-responsive documents. +We complement this with a theoretical analysis of our protocol in the +single-dimensional setting, and other experiments on simulated data which +suggest that the non-responsive disclosure incurred by our protocol may be +unavoidable. + +
+
+ comment: 28 pages, 6 figures, CSLAW 2024 +
+
+
+
+
+ + ☆ A Survey on Data-Centric Recommender Systems + + +
+ Recommender systems (RS) have become essential tools for mitigating +information overload in a range of real-world scenarios. Recent trends in RS +have seen a paradigm shift, moving the spotlight from model-centric innovations +to the importance of data quality and quantity. This evolution has given rise +to the concept of data-centric recommender systems (Data-Centric RS), marking a +significant development in the field. This survey provides the first systematic +overview of Data-Centric RS, covering 1) the foundational concepts of +recommendation data and Data-Centric RS; 2) three primary issues in +recommendation data; 3) recent research developed to address these issues; and +4) several potential future directions in Data-Centric RS. + +
+
+
+
+
+ + ☆ Towards Semantic Consistency: Dirichlet Energy Driven Robust Multi-Modal + Entity Alignment + + +
+ In Multi-Modal Knowledge Graphs (MMKGs), Multi-Modal Entity Alignment (MMEA) +is crucial for identifying identical entities across diverse modal attributes. +However, semantic inconsistency, mainly due to missing modal attributes, poses +a significant challenge. Traditional approaches rely on attribute +interpolation, but this often introduces modality noise, distorting the +original semantics. Moreover, the lack of a universal theoretical framework +limits advancements in achieving semantic consistency. This study introduces a +novel approach, DESAlign, which addresses these issues by applying a +theoretical framework based on Dirichlet energy to ensure semantic consistency. +We discover that semantic inconsistency leads to model overfitting to modality +noise, causing performance fluctuations, particularly when modalities are +missing. DESAlign innovatively combats over-smoothing and interpolates absent +semantics using existing modalities. Our approach includes a multi-modal +knowledge graph learning strategy and a propagation technique that employs +existing semantic features to compensate for missing ones, providing explicit +Euler solutions. Comprehensive evaluations across 18 benchmarks, including +monolingual and bilingual scenarios, demonstrate that DESAlign surpasses +existing methods, setting a new standard in performance. Further testing on 42 +benchmarks with high rates of missing modalities confirms its robustness, +offering an effective solution to semantic inconsistency in real-world MMKGs. + +
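The Dirichlet energy underpinning the framework has a standard graph form, sketched below for node features; this is the textbook quantity, not the authors' code:

```python
import torch

def dirichlet_energy(x, edge_index):
    """Graph Dirichlet energy E(x) = 1/2 * sum_{(i,j)} ||x_i - x_j||^2 for
    node features x: (N, d) and edges given as a (2, M) index tensor with
    both directions stored. Low energy corresponds to smooth (and, in the
    limit, over-smoothed) features, which is the lens the paper uses to
    reason about semantic consistency under missing modalities."""
    src, dst = edge_index
    return 0.5 * (x[src] - x[dst]).pow(2).sum()
```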
+
+ comment: arXiv admin note: text overlap with arXiv:2307.16210 by other authors +
+
+
+
+
+ + ☆ Network-based Topic Structure Visualization + + +
+ In the real world, many topics are inter-correlated, making it challenging to +investigate their structure and relationships. Understanding the interplay +between topics and their relevance can provide valuable insights for +researchers, guiding their studies and informing the direction of research. In +this paper, we utilize the topic-words distribution, obtained from topic +models, as item-response data to model the structure of topics using a latent +space item response model. By estimating the latent positions of topics based +on their distances toward words, we can capture the underlying topic structure +and reveal their relationships. Visualizing the latent positions of topics in +Euclidean space allows for an intuitive understanding of their proximity and +associations. We interpret relationships among topics by characterizing each +topic based on representative words selected using a newly proposed scoring +scheme. Additionally, we assess the maturity of topics by tracking their latent +positions using different word sets, providing insights into the robustness of +topics. To demonstrate the effectiveness of our approach, we analyze the topic +composition of COVID-19 studies during the early stage of its emergence using +biomedical literature in the PubMed database. The software and data used in +this paper are publicly available at https://github.com/jeon9677/gViz . + +
+
+
+
+
+ + ☆ Global-Liar: Factuality of LLMs over Time and Geographic Regions + + +
+ The increasing reliance on AI-driven solutions, particularly Large Language
+Models (LLMs) like the GPT series, for information retrieval highlights the
+critical need for their factuality and fairness, especially amidst the rampant
+spread of misinformation and disinformation online. Our study evaluates the
+factual accuracy, stability, and biases of widely adopted GPT models,
+including GPT-3.5 and GPT-4, contributing to the reliability and integrity of
+AI-mediated information dissemination.
+ We introduce 'Global-Liar,' a dataset uniquely balanced in terms of
+geographic and temporal representation, facilitating a more nuanced evaluation
+of LLM biases. Our analysis reveals that newer iterations of GPT models do not
+always equate to improved performance. Notably, the GPT-4 version from March
+demonstrates higher factual accuracy than its subsequent June release.
+Furthermore, a concerning bias is observed, privileging statements from the
+Global North over the Global South, thus potentially exacerbating existing
+informational inequities. Regions such as Africa and the Middle East are at a
+disadvantage, with much lower factual accuracy. The performance fluctuations
+over time suggest that model updates may not consistently benefit all regions
+equally.
+ Our study also offers insights into the impact of various LLM configuration
+settings, such as binary decision forcing, model re-runs, and temperature, on
+a model's factuality. Models constrained to binary (true/false) choices
+exhibit reduced factuality compared to those allowing an 'unclear' option.
+Single inference at a low temperature setting matches the reliability of
+majority voting across various configurations. The insights gained highlight
+the need for culturally diverse and geographically inclusive model training
+and evaluation. This approach is key to achieving global equity in technology,
+distributing AI benefits fairly worldwide.
+
+
+
+ comment: 24 pages, 12 figures, 9 tables +
+
+
+
+
+ + ☆ LoRec: Large Language Model for Robust Sequential Recommendation against + Poisoning Attacks + + +
+ Sequential recommender systems stand out for their ability to capture users' +dynamic interests and the patterns of item-to-item transitions. However, the +inherent openness of sequential recommender systems renders them vulnerable to +poisoning attacks, where fraudulent users are injected into the training data +to manipulate learned patterns. Traditional defense strategies predominantly +depend on predefined assumptions or rules extracted from specific known +attacks, limiting their generalizability to unknown attack types. To solve the +above problems, considering the rich open-world knowledge encapsulated in Large +Language Models (LLMs), our research initially focuses on the capabilities of +LLMs in the detection of unknown fraudulent activities within recommender +systems, a strategy we denote as LLM4Dec. Empirical evaluations demonstrate the +substantial capability of LLMs in identifying unknown fraudsters, leveraging +their expansive, open-world knowledge. + Building upon this, we propose the integration of LLMs into defense +strategies to extend their effectiveness beyond the confines of known attacks. +We propose LoRec, an advanced framework that employs LLM-Enhanced Calibration +to strengthen the robustness of sequential recommender systems against +poisoning attacks. LoRec integrates an LLM-enhanced CalibraTor (LCT) that +refines the training process of sequential recommender systems with knowledge +derived from LLMs, applying a user-wise reweighting to diminish the impact of +fraudsters injected by attacks. By incorporating LLMs' open-world knowledge, +the LCT effectively converts the limited, specific priors or rules into a more +general pattern of fraudsters, offering improved defenses against poisoning +attacks. Our comprehensive experiments validate that LoRec, as a general +framework, significantly strengthens the robustness of sequential recommender +systems. + +
+
+
+
+
+ + ☆ ReSLLM: Large Language Models are Strong Resource Selectors for + Federated Search + + +
+ Federated search, which involves integrating results from multiple
+independent search engines, will become increasingly pivotal in the context of
+Retrieval-Augmented Generation pipelines empowering LLM-based applications
+such as chatbots. These systems often distribute queries among various search
+engines, ranging from specialized (e.g., PubMed) to general (e.g., Google),
+based on the nature of user utterances. A critical aspect of federated search
+is resource selection - the selection of appropriate resources prior to
+issuing the query, both to ensure high-quality and rapid responses and to
+contain the costs associated with calling the external search engines.
+However, current SOTA resource selection methodologies primarily rely on
+feature-based learning approaches. These methods often involve the
+labour-intensive and expensive creation of training labels for each resource.
+In contrast, LLMs have exhibited strong effectiveness as zero-shot methods
+across NLP and IR tasks. We hypothesise that, in the context of federated
+search, LLMs can assess the relevance of resources without the need for
+extensive predefined labels or features. In this paper, we propose ReSLLM. Our
+ReSLLM method exploits LLMs to drive the selection of resources in federated
+search in a zero-shot setting. In addition, we devise an unsupervised
+fine-tuning protocol, Synthetic Label Augmentation Tuning (SLAT), in which the
+relevance of previously logged queries and snippets from resources is
+predicted using an off-the-shelf LLM and then in turn used to fine-tune ReSLLM
+with respect to resource selection. Our empirical evaluation and analysis
+details the factors influencing the effectiveness of LLMs in this context. The
+results showcase the merits of ReSLLM for resource selection: not only
+competitive effectiveness in the zero-shot setting, but also large gains when
+fine-tuned using the SLAT protocol.
+
+
+
+
+
+
+ + ☆ Towards Personalized Privacy: User-Governed Data Contribution for + Federated Recommendation + + +
+ Federated recommender systems (FedRecs) have gained significant attention for
+their potential to protect users' privacy by keeping private data on local
+devices and only communicating model parameters/gradients to the server.
+Nevertheless, the currently existing architecture of FedRecs assumes that all
+users have the same zero privacy budget, i.e., they do not upload any data to
+the server, thus overlooking those users who are less concerned about privacy
+and are willing to upload data to get a better recommendation service. To
+bridge this gap, this paper explores a user-governed data contribution
+federated recommendation architecture where users are free to take control of
+whether they share data and the proportion of data they share with the server.
+To this end, this paper presents a cloud-device collaborative graph neural
+network federated recommendation model, named CDCGNNFed. It trains
+user-centric ego graphs locally and high-order graphs based on user-shared
+data on the server, in a collaborative manner via contrastive learning.
+Furthermore, a graph mending strategy is utilized to predict missing links in
+the graph on the server, thus leveraging the capabilities of graph neural
+networks over high-order graphs. Extensive experiments were conducted on two
+public datasets, and the results demonstrate the effectiveness of the proposed
+method.
+
+
+
+
+
+
+ + ☆ Fréchet Distance for Offline Evaluation of Information Retrieval + Systems with Sparse Labels + + +
+ The rapid advancement of natural language processing, information retrieval
+(IR), computer vision, and other technologies has presented significant
+challenges in evaluating the performance of these systems. One of the main
+challenges is the scarcity of human-labeled data, which hinders the fair and
+accurate assessment of these systems. In this work, we specifically focus on
+evaluating IR systems with sparse labels, borrowing from recent research on
+evaluating computer vision tasks and taking inspiration from the success of
+the Fr\'echet Inception Distance (FID) in assessing text-to-image generation
+systems. We propose leveraging the Fr\'echet Distance to measure the distance
+between the distributions of relevant judged items and retrieved results. Our
+experimental results on the MS MARCO V1 dataset and TREC Deep Learning Tracks
+query sets demonstrate the effectiveness of the Fr\'echet Distance as a metric
+for evaluating IR systems, particularly in settings where only a few labels
+are available. This approach contributes to the advancement of evaluation
+methodologies in real-world scenarios such as the assessment of generative IR
+systems.
+
+
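The metric has the same closed form as FID, the Fréchet distance between two Gaussians fitted to the two embedding sets; a minimal sketch:

```python
import numpy as np
from scipy import linalg

def frechet_distance(mu1, cov1, mu2, cov2):
    """Frechet distance between Gaussians fitted to two embedding sets
    (the FID closed form): ||mu1 - mu2||^2 + Tr(C1 + C2 - 2 (C1 C2)^{1/2}).
    Here one set would embed the relevant judged items, the other the
    retrieved results."""
    covmean = linalg.sqrtm(cov1 @ cov2)
    if np.iscomplexobj(covmean):
        covmean = covmean.real        # strip numerical imaginary noise
    diff = mu1 - mu2
    return float(diff @ diff + np.trace(cov1 + cov2 - 2.0 * covmean))
```

Each `mu`/`cov` pair would be the sample mean and covariance of the item embeddings from whatever encoder is used; the choice of encoder is not fixed by the metric itself.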
+
+
+
+
+ + ☆ RAG-Fusion: a New Take on Retrieval-Augmented Generation + + +
+ Infineon has identified a need for engineers, account managers, and customers
+to rapidly obtain product information. This problem is traditionally addressed
+with retrieval-augmented generation (RAG) chatbots, but in this study, I
+evaluated the use of the newly popularized RAG-Fusion method. RAG-Fusion
+combines RAG and reciprocal rank fusion (RRF) by generating multiple queries,
+reranking them with reciprocal scores, and fusing the documents and scores.
+Through manually evaluating answers on accuracy, relevance, and
+comprehensiveness, I found that RAG-Fusion was able to provide accurate and
+comprehensive answers because the generated queries contextualize the original
+query from various perspectives. However, some answers strayed off topic when
+the generated queries' relevance to the original query was insufficient. This
+research marks significant progress in artificial intelligence (AI) and
+natural language processing (NLP) applications and demonstrates their
+potential for transformation in a global, multi-industry context.
+
+
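Reciprocal rank fusion itself is a small, well-known procedure; a sketch of fusing the ranked lists produced for the generated queries (k=60 is the common default from the original RRF paper, not a value stated in this abstract):

```python
from collections import defaultdict

def reciprocal_rank_fusion(rankings, k=60):
    """Fuse several ranked lists (one per generated query) with RRF:
    score(d) = sum over lists of 1 / (k + rank_of_d_in_list). Documents
    ranked highly by many lists float to the top of the fused ranking."""
    scores = defaultdict(float)
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# usage sketch: fused = reciprocal_rank_fusion([hits_q1, hits_q2, hits_q3])
```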
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Uncertainty-Aware Explainable Recommendation with Large Language Models + + +
+ Providing explanations within a recommender system can boost user
+satisfaction and foster trust, especially when the reasons for recommending
+items tailored to the user are elaborated. The predominant approach in this
+domain revolves around generating text-based explanations, with a notable
+emphasis on applying large language models (LLMs). However, refining LLMs for
+explainable recommendations proves impractical due to time constraints and
+computing resource limitations. As an alternative, the current approach
+involves training the prompt rather than the LLM. In this study, we developed
+a model that utilizes the ID vectors of user and item inputs as prompts for
+GPT-2. We employed a joint training mechanism within a multi-task learning
+framework to optimize both the recommendation task and the explanation task.
+This strategy enables a more effective exploration of users' interests,
+improving recommendation effectiveness and user satisfaction. In our
+experiments, the method achieves 1.59 DIV, 0.57 USR, and 0.41 FCR on the Yelp,
+TripAdvisor, and Amazon datasets respectively, demonstrating superior
+performance over four SOTA methods in terms of explainability evaluation
+metrics. In addition, we found that the proposed model is able to ensure
+stable textual quality on the three public datasets.
+
+
+
+
+
+
+ + ☆ Heterophily-Aware Fair Recommendation using Graph Convolutional Networks + + +
+ In recent years, graph neural networks (GNNs) have become a popular tool to +improve the accuracy and performance of recommender systems. Modern recommender +systems are not only designed to serve the end users, but also to benefit other +participants, such as items and items providers. These participants may have +different or conflicting goals and interests, which raise the need for fairness +and popularity bias considerations. GNN-based recommendation methods also face +the challenges of unfairness and popularity bias and their normalization and +aggregation processes suffer from these challenges. In this paper, we propose a +fair GNN-based recommender system, called HetroFair, to improve items' side +fairness. HetroFair uses two separate components to generate fairness-aware +embeddings: i) fairness-aware attention which incorporates dot product in the +normalization process of GNNs, to decrease the effect of nodes' degrees, and +ii) heterophily feature weighting to assign distinct weights to different +features during the aggregation process. In order to evaluate the effectiveness +of HetroFair, we conduct extensive experiments over six real-world datasets. +Our experimental results reveal that HetroFair not only alleviates the +unfairness and popularity bias on the items' side, but also achieves superior +accuracy on the users' side. Our implementation is publicly available at +https://github.com/NematGH/HetroFair + +
+
+
+
+
+ + ♻ ☆ Future Impact Decomposition in Request-level Recommendations + + +
+ In recommender systems, reinforcement learning solutions have shown promising
+results in optimizing the interaction sequence between users and the system in
+terms of long-term performance. For practical reasons, the policy's actions
+are typically designed as recommending a list of items to handle users'
+frequent and continuous browsing requests more efficiently. In this list-wise
+recommendation scenario, the user state is updated upon every request in the
+corresponding MDP formulation. However, this request-level formulation is
+essentially inconsistent with the user's item-level behavior. In this study,
+we demonstrate that an item-level optimization approach can better utilize
+item characteristics and optimize the policy's performance even under the
+request-level MDP. We support this claim by comparing the performance of
+standard request-level methods with the proposed item-level actor-critic
+framework in both simulation and online experiments. Furthermore, we show that
+a reward-based future decomposition strategy can better express the item-wise
+future impact and improve recommendation accuracy in the long term. To achieve
+a more thorough understanding of the decomposition strategy, we propose a
+model-based re-weighting framework with adversarial learning that further
+boosts performance, and we investigate its correlation with the reward-based
+strategy.
+
+
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Prompt Performance Prediction for Image Generation + + +
+ The ability to predict the performance of a query before results are returned
+has been a longstanding challenge in Information Retrieval (IR) systems.
+Inspired by this task, we introduce, in this paper, a novel task called
+"Prompt Performance Prediction" (PPP) that aims to predict the performance of
+a prompt before the actual generated images are obtained. We demonstrate the
+plausibility of our task by measuring the correlation coefficient between
+predicted and actual performance scores across three datasets containing pairs
+of prompts and generated images, as well as three art-domain datasets of real
+images and real user appreciation ratings. Our results show promising
+performance prediction capabilities, suggesting potential applications for
+optimizing user prompts.
+
+
+
+
+
+
+ + ♻ ☆ Ada-Retrieval: An Adaptive Multi-Round Retrieval Paradigm for Sequential + Recommendations AAAI2024 + + +
+ Retrieval models aim at selecting a small set of item candidates which match
+the preference of a given user. They play a vital role in large-scale
+recommender systems since subsequent models such as rankers highly depend on
+the quality of item candidates. However, most existing retrieval models employ
+a single-round inference paradigm, which may not adequately capture the
+dynamic nature of user preferences and may get stuck in one area of the item
+space. In this paper, we propose Ada-Retrieval, an adaptive multi-round
+retrieval paradigm for recommender systems that iteratively refines user
+representations to better capture potential candidates in the full item space.
+Ada-Retrieval comprises two key modules: the item representation adapter and
+the user representation adapter, designed to inject context information into
+items' and users' representations. The framework maintains a model-agnostic
+design, allowing seamless integration with various backbone models such as
+RNNs or Transformers. We perform experiments on three widely used public
+datasets, incorporating five powerful sequential recommenders as backbone
+models. Our results demonstrate that Ada-Retrieval significantly enhances the
+performance of various base models, with consistent improvements observed
+across different datasets. Our code and data are publicly available at:
+https://github.com/ll0ruc/Ada-Retrieval.
+
+
+
+ comment: 9 pages, Accepted to AAAI2024 +
+
+
+
+
+ + ♻ ☆ Prompt-enhanced Federated Content Representation Learning for + Cross-domain Recommendation WWW 2024 + + +
+ Cross-domain Recommendation (CDR) as one of the effective techniques in +alleviating the data sparsity issues has been widely studied in recent years. +However, previous works may cause domain privacy leakage since they necessitate +the aggregation of diverse domain data into a centralized server during the +training process. Though several studies have conducted privacy preserving CDR +via Federated Learning (FL), they still have the following limitations: 1) They +need to upload users' personal information to the central server, posing the +risk of leaking user privacy. 2) Existing federated methods mainly rely on +atomic item IDs to represent items, which prevents them from modeling items in +a unified feature space, increasing the challenge of knowledge transfer among +domains. 3) They are all based on the premise of knowing overlapped users +between domains, which proves impractical in real-world applications. To +address the above limitations, we focus on Privacy-preserving Cross-domain +Recommendation (PCDR) and propose PFCR as our solution. For Limitation 1, we +develop a FL schema by exclusively utilizing users' interactions with local +clients and devising an encryption method for gradient encryption. For +Limitation 2, we model items in a universal feature space by their description +texts. For Limitation 3, we initially learn federated content representations, +harnessing the generality of natural language to establish bridges between +domains. Subsequently, we craft two prompt fine-tuning strategies to tailor the +pre-trained model to the target domain. Extensive experiments on two real-world +datasets demonstrate the superiority of our PFCR method compared to the SOTA +approaches. + +
+
+ comment: 11 pages, 3 figures, accepted by WWW 2024 +
+
+
+
+
+ + ♻ ☆ InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining + + +
+ Pretraining auto-regressive large language models (LLMs) with retrieval +demonstrates better perplexity and factual accuracy by leveraging external +databases. However, the size of existing pretrained retrieval-augmented LLM is +still limited (e.g., Retro has 7.5B parameters), which limits the effectiveness +of instruction tuning and zero-shot generalization. In this work, we introduce +Retro 48B, the largest LLM pretrained with retrieval. Specifically, we continue +to pretrain a 43B GPT model on additional 100 billion tokens using the Retro +augmentation method by retrieving from 1.2 trillion tokens. Notably, the +obtained foundation model, Retro 48B, largely outperforms the counterpart GPT +43B trained on 1.2T tokens in terms of perplexity with only 2.58% additional +GPU hours, demonstrating the significant scaling potential of the method. After +instruction tuning on Retro, InstructRetro demonstrates significant improvement +over the instruction tuned GPT on a wide range of zero-shot tasks. +Specifically, the average improvement of InstructRetro is 7% over its GPT +counterpart across 8 short-form QA and reading comprehension tasks, 10% over +GPT across 4 challenging long-form QA tasks, and 16% over GPT across 3 +summarization tasks. Surprisingly, we find that one can ablate the encoder from +InstructRetro architecture and directly use its decoder backbone, while +achieving comparable results. Our results highlight the promising direction to +obtain a better GPT decoder through continued pretraining with retrieval before +instruction tuning. Our code and checkpoints are publicly available at: +https://github.com/NVIDIA/Megatron-LM/tree/InstructRetro/tools/retro. + +
+
+
+
+
+
+
+
+ + Machine Learning 112 + +
+
+
+ + ☆ KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache + Quantization + + +
+ LLMs are seeing growing use for applications such as document analysis and +summarization which require large context windows, and with these large context +windows KV cache activations surface as the dominant contributor to memory +consumption during inference. Quantization is a promising approach for +compressing KV cache activations; however, existing solutions fail to represent +activations accurately in ultra-low precisions, such as sub-4-bit. In this +work, we present KVQuant, which addresses this problem by incorporating novel +methods for quantizing cached KV activations, including: (i) Per-Channel Key +Quantization, where we adjust the dimension along which we quantize the Key +activations to better match the distribution; (ii) Pre-RoPE Key Quantization, +where we quantize Key activations before the rotary positional embedding to +mitigate its impact on quantization; (iii) Non-Uniform KV Cache Quantization, +where we derive per-layer sensitivity-weighted non-uniform datatypes that +better represent the distributions; (iv) Per-Vector Dense-and-Sparse +Quantization, where we isolate outliers separately for each vector to minimize +skews in quantization ranges; and (v) Q-Norm, where we normalize quantization +centroids in order to mitigate distribution shift, providing additional +benefits for 2-bit quantization. By applying our method to the LLaMA, LLaMA-2, +and Mistral models, we achieve $<0.1$ perplexity degradation with 3-bit +quantization on both Wikitext-2 and C4, outperforming existing approaches. Our +method enables serving the LLaMA-7B model with a context length of up to 1 +million on a single A100-80GB GPU and up to 10 million on an 8-GPU system. + +
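As a point of reference for (i), a uniform per-channel quantizer for cached Keys can be sketched in a few lines; KVQuant's actual non-uniform datatypes, pre-RoPE placement, and outlier isolation are all omitted here, so this is a baseline illustration rather than the paper's method:

```python
import torch

def quantize_keys_per_channel(keys, n_bits=3):
    """Uniform symmetric per-channel quantization of cached Key activations:
    one scale per channel (computed over tokens), matching the observation
    that Key outliers concentrate in particular channels. Returns integer
    codes plus scales; dequantize with codes * scale."""
    qmax = 2 ** (n_bits - 1) - 1                          # e.g. 3 for 3-bit
    scale = keys.abs().amax(dim=0, keepdim=True).clamp(min=1e-8) / qmax
    codes = torch.clamp(torch.round(keys / scale), -qmax - 1, qmax)
    return codes.to(torch.int8), scale                    # keys: (tokens, channels)
```

Quantizing along the channel axis instead of per token is what lets a single scale absorb channel-concentrated outliers, the distributional property the abstract highlights.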
+
+
+
+
+ + ☆ Do Language Models Exhibit the Same Cognitive Biases in Problem Solving + as Human Learners? + + +
+ There is increasing interest in employing large language models (LLMs) as +cognitive models. For such purposes, it is central to understand which +cognitive properties are well-modeled by LLMs, and which are not. In this work, +we study the biases of LLMs in relation to those known in children when solving +arithmetic word problems. Surveying the learning science literature, we posit +that the problem-solving process can be split into three distinct steps: text +comprehension, solution planning and solution execution. We construct tests for +each one in order to understand which parts of this process can be faithfully +modeled by current state-of-the-art LLMs. We generate a novel set of word +problems for each of these tests, using a neuro-symbolic method that enables +fine-grained control over the problem features. We find evidence that LLMs, +with and without instruction-tuning, exhibit human-like biases in both the +text-comprehension and the solution-planning steps of the solving process, but +not during the final step which relies on the problem's arithmetic expressions +(solution execution). + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval + + +
+ Retrieval-augmented language models can better adapt to changes in world +state and incorporate long-tail knowledge. However, most existing methods +retrieve only short contiguous chunks from a retrieval corpus, limiting +holistic understanding of the overall document context. We introduce the novel +approach of recursively embedding, clustering, and summarizing chunks of text, +constructing a tree with differing levels of summarization from the bottom up. +At inference time, our RAPTOR model retrieves from this tree, integrating +information across lengthy documents at different levels of abstraction. +Controlled experiments show that retrieval with recursive summaries offers +significant improvements over traditional retrieval-augmented LMs on several +tasks. On question-answering tasks that involve complex, multi-step reasoning, +we show state-of-the-art results; for example, by coupling RAPTOR retrieval +with the use of GPT-4, we can improve the best performance on the QuALITY +benchmark by 20% in absolute accuracy. + +
+
+
+
+
+ + ☆ LongAlign: A Recipe for Long Context Alignment of Large Language Models + + +
+ Extending large language models to effectively handle long contexts requires +instruction fine-tuning on input sequences of similar length. To address this, +we present LongAlign -- a recipe of the instruction data, training, and +evaluation for long context alignment. First, we construct a long +instruction-following dataset using Self-Instruct. To ensure the data +diversity, it covers a broad range of tasks from various long context sources. +Second, we adopt the packing and sorted batching strategies to speed up +supervised fine-tuning on data with varied length distributions. Additionally, +we develop a loss weighting method to balance the contribution to the loss +across different sequences during packing training. Third, we introduce the +LongBench-Chat benchmark for evaluating instruction-following capabilities on +queries of 10k-100k in length. Experiments show that LongAlign outperforms +existing recipes for LLMs in long context tasks by up to 30\%, while also +maintaining their proficiency in handling short, generic tasks. The code, data, +and long-aligned models are open-sourced at https://github.com/THUDM/LongAlign. + +
+
+
+
+
+ + ☆ Rank Supervised Contrastive Learning for Time Series Classification + + +
+ Recently, various contrastive learning techniques have been developed to
+categorize time series data and have exhibited promising performance. A
+general paradigm is to utilize appropriate augmentations and construct
+feasible positive samples such that the encoder can yield robust and
+discriminative representations by mapping similar data points closer together
+in the feature space while pushing dissimilar data points farther apart.
+Despite its efficacy, the fine-grained relative similarity (e.g., rank)
+information of positive samples is largely ignored, especially when labeled
+samples are limited. To this end, we present Rank Supervised Contrastive
+Learning (RankSCL) to perform time series classification. Different from
+conventional contrastive learning frameworks, RankSCL augments raw data in a
+targeted way in the embedding space and adopts certain filtering rules to
+select more informative positive and negative pairs of samples. Moreover, a
+novel rank loss is developed to assign different weights to different levels
+of positive samples, enabling the encoder to extract the fine-grained
+information of the same class and to produce a clear boundary among different
+classes. Thorough empirical studies on 128 UCR datasets and 30 UEA datasets
+demonstrate that the proposed RankSCL can achieve state-of-the-art performance
+compared to existing baseline methods.
+
+
+
+
+
+
+ + ☆ Benchmarking Sensitivity of Continual Graph Learning for Skeleton-Based + Action Recognition + + +
+ Continual learning (CL) is the research field that aims to build machine +learning models that can accumulate knowledge continuously over different tasks +without retraining from scratch. Previous studies have shown that pre-training +graph neural networks (GNN) may lead to negative transfer (Hu et al., 2020) +after fine-tuning, a setting which is closely related to CL. Thus, we focus on +studying GNN in the continual graph learning (CGL) setting. We propose the +first continual graph learning benchmark for spatio-temporal graphs and use it +to benchmark well-known CGL methods in this novel setting. The benchmark is +based on the N-UCLA and NTU-RGB+D datasets for skeleton-based action +recognition. Beyond benchmarking for standard performance metrics, we study the +class and task-order sensitivity of CGL methods, i.e., the impact of learning +order on each class/task's performance, and the architectural sensitivity of +CGL methods with backbone GNN at various widths and depths. We reveal that +task-order robust methods can still be class-order sensitive and observe +results that contradict previous empirical observations on architectural +sensitivity in CL. + +
+
+ comment: This work is accepted at VISAPP 2024 as a short paper +
+
+
+
+
+ + ☆ Epidemic Modeling using Hybrid of Time-varying SIRD, Particle Swarm + Optimization, and Deep Learning + + +
+ Epidemiological models are best suited to modeling an epidemic if the spread
+pattern is stationary. To deal with non-stationary patterns and multiple waves
+of an epidemic, we develop a hybrid model encompassing epidemic modeling,
+particle swarm optimization, and deep learning. The model mainly caters to
+three objectives for better prediction: 1. periodic estimation of the model
+parameters; 2. incorporating the impact of all relevant aspects using data
+fitting and parameter optimization; 3. deep learning-based prediction of the
+model parameters. In our model, we use a system of ordinary differential
+equations (ODEs) for Susceptible-Infected-Recovered-Dead (SIRD) epidemic
+modeling, Particle Swarm Optimization (PSO) for model parameter optimization,
+and a stacked LSTM for forecasting the model parameters. An initial or
+one-time estimation of the model parameters cannot capture multiple waves of
+an epidemic, so we estimate the model parameters periodically (weekly). We use
+PSO to identify the optimum values of the model parameters. We next train the
+stacked LSTM on the optimized parameters and forecast the model parameters for
+the upcoming four weeks. Further, we feed the LSTM-forecasted parameters into
+the SIRD model to forecast the number of COVID-19 cases. We evaluate the model
+for three highly affected countries, namely the USA, India, and the UK. The
+proposed hybrid model is able to deal with multiple waves and has outperformed
+existing methods on all three datasets.
+
+
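The SIRD component is a standard ODE system; a sketch of its right-hand side and a four-week forecast follows, with fixed placeholder parameters (in the hybrid model, beta, gamma, and mu would instead be re-estimated weekly by PSO and forecast by the stacked LSTM):

```python
from scipy.integrate import solve_ivp

def sird(t, y, beta, gamma, mu, N):
    """Right-hand side of the SIRD ODE system for population size N."""
    S, I, R, D = y
    new_infections = beta * S * I / N
    return [-new_infections,                       # dS/dt
            new_infections - (gamma + mu) * I,     # dI/dt
            gamma * I,                             # dR/dt (recoveries)
            mu * I]                                # dD/dt (deaths)

N = 1_000_000
sol = solve_ivp(sird, (0, 28), [N - 100, 100, 0, 0],   # 4-week horizon
                args=(0.3, 0.1, 0.01, N), dense_output=True)
infected = sol.y[1]   # infected trajectory at the solver's time points
```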
+
+ comment: Accepted in ICCCNT 2023 +
+
+
+
+
+ + ☆ Variable selection for Naïve Bayes classification + + +
+ The Na\"ive Bayes has proven to be a tractable and efficient method for +classification in multivariate analysis. However, features are usually +correlated, a fact that violates the Na\"ive Bayes' assumption of conditional +independence, and may deteriorate the method's performance. Moreover, datasets +are often characterized by a large number of features, which may complicate the +interpretation of the results as well as slow down the method's execution. + In this paper we propose a sparse version of the Na\"ive Bayes classifier +that is characterized by three properties. First, the sparsity is achieved +taking into account the correlation structure of the covariates. Second, +different performance measures can be used to guide the selection of features. +Third, performance constraints on groups of higher interest can be included. +Our proposal leads to a smart search, which yields competitive running times, +whereas the flexibility in terms of performance measure for classification is +integrated. Our findings show that, when compared against well-referenced +feature selection approaches, the proposed sparse Na\"ive Bayes obtains +competitive results regarding accuracy, sparsity and running times for balanced +datasets. In the case of datasets with unbalanced (or with different +importance) classes, a better compromise between classification rates for the +different classes is achieved. + +
+
+
+
+
+ + ☆ Optimizing contrastive learning for cortical folding pattern detection SP + + +
+ The human cerebral cortex has many bumps and grooves called gyri and sulci. +Even though there is a high inter-individual consistency for the main cortical +folds, this is not the case when we examine the exact shapes and details of the +folding patterns. Because of this complexity, characterizing the cortical +folding variability and relating them to subjects' behavioral characteristics +or pathologies is still an open scientific problem. Classical approaches +include labeling a few specific patterns, either manually or +semi-automatically, based on geometric distances, but the recent availability +of MRI image datasets of tens of thousands of subjects makes modern +deep-learning techniques particularly attractive. Here, we build a +self-supervised deep-learning model to detect folding patterns in the cingulate +region. We train a contrastive self-supervised model (SimCLR) on both Human +Connectome Project (1101 subjects) and UKBioBank (21070 subjects) datasets with +topological-based augmentations on the cortical skeletons, which are +topological objects that capture the shape of the folds. We explore several +backbone architectures (convolutional network, DenseNet, and PointNet) for the +SimCLR. For evaluation and testing, we perform a linear classification task on +a database manually labeled for the presence of the "double-parallel" folding +pattern in the cingulate region, which is related to schizophrenia +characteristics. The best model, giving a test AUC of 0.76, is a convolutional +network with 6 layers, a 10-dimensional latent space, a linear projection head, +and using the branch-clipping augmentation. This is the first time that a +self-supervised deep learning model has been applied to cortical skeletons on +such a large dataset and quantitatively evaluated. We can now envisage the next +step: applying it to other brain regions to detect other biomarkers. + +
+
+ comment: 9 pages, 6 figures, 1 table, SPIE Imaging 2024 +
+
+
+
+
+ + ☆ Prompt-Driven LLM Safeguarding via Directed Representation Optimization + + +
+ Prepending model inputs with safety prompts is a common practice of +safeguarding large language models (LLMs) from complying with queries that +contain harmful intents. However, the working mechanisms of safety prompts have +not yet been fully understood, which hinders the potential for automatically +optimizing them for improved LLM safety. Motivated by this problem, we +investigate the impact of safety prompts from the perspective of model +representations. We find that in models' representation space, harmful and +harmless queries can be largely distinguished, but this is not noticeably +enhanced by safety prompts. Instead, the queries' representations are moved by +different safety prompts in similar directions, where models become more prone +to refusal (i.e., refusing to provide assistance) even when the queries are +harmless. Inspired by these findings, we propose a method called DRO (Directed +Representation Optimization) for automatic safety prompt optimization. DRO +treats safety prompts as continuous, trainable embeddings and learns to move +the representations of harmful/harmless queries along/opposite the direction in +which the model's refusal probability increases. We demonstrate that DRO +remarkably improves the safeguarding performance of human-crafted safety +prompts and outperforms strong baselines, as evaluated on out-of-domain +benchmarks, without compromising the general model capability. + +
+
+
+
+
+ + ☆ Causal Discovery by Kernel Deviance Measures with Heterogeneous + Transforms + + +
+ The discovery of causal relationships in a set of random variables is a
+fundamental objective of science and has also recently been argued to be an
+essential component of real machine intelligence. One class of causal
+discovery techniques is founded on the argument that there are inherent
+structural asymmetries between the causal and anti-causal direction which
+could be leveraged in determining the direction of causation. Capturing these
+discrepancies between cause and effect remains a challenge, and many current
+state-of-the-art algorithms propose to compare the norms of the kernel mean
+embeddings of the conditional distributions. In this work, we argue that such
+approaches based on RKHS embeddings are insufficient in capturing principal
+markers of cause-effect asymmetry involving higher-order structural
+variabilities of the conditional distributions. We propose the Kernel
+Intrinsic Invariance Measure with Heterogeneous Transform (KIIM-HT), which
+introduces a novel score measure based on heterogeneous transformation of RKHS
+embeddings to extract relevant higher-order moments of the conditional
+densities for causal discovery. Inference is made by comparing the score of
+each hypothetical cause-effect direction. Tests and comparisons on a synthetic
+dataset, a two-dimensional synthetic dataset, and the real-world benchmark
+dataset T\"ubingen Cause-Effect Pairs verify our approach. In addition, we
+conduct a sensitivity analysis of the regularization parameter to faithfully
+compare previous work to our method, and an experiment with trials on varied
+hyperparameter values to showcase the robustness of our algorithm.
+
+
+
+
+
+
+ + ☆ Causal Coordinated Concurrent Reinforcement Learning + + +
+ In this work, we propose a novel algorithmic framework for data sharing and
+coordinated exploration for the purpose of learning more data-efficient and
+better performing policies under a concurrent reinforcement learning (CRL)
+setting. In contrast to other work, which makes the assumption that all agents
+act under identical environments, we relax this restriction and instead
+consider the formulation where each agent acts within an environment which
+shares a global structure but also exhibits individual variations. Our
+algorithm leverages a causal inference algorithm in the form of the Additive
+Noise Model - Mixture Model (ANM-MM) to extract model parameters governing
+individual differentials via independence enforcement. We propose a new data
+sharing scheme based on a similarity measure of the extracted model parameters
+and demonstrate superior learning speeds on a set of autoregressive, pendulum,
+and cart-pole swing-up tasks. Finally, we show the effectiveness of diverse
+action selection between common agents under a sparse reward setting. To the
+best of our knowledge, this is the first work to consider non-identical
+environments in CRL and one of the few works which seek to integrate causal
+inference with reinforcement learning (RL).
+
+
+
+
+
+
+ + ☆ EEG-GPT: Exploring Capabilities of Large Language Models for EEG + Classification and Interpretation + + +
+ Conventional machine learning (ML) approaches applied to
+electroencephalography (EEG) often have a limited focus, isolating specific
+brain activities occurring across disparate temporal scales (from transient
+spikes in milliseconds to seizures lasting minutes) and spatial scales (from
+localized high-frequency oscillations to global sleep activity). This siloed
+approach limits the development of EEG ML models that exhibit multi-scale
+electrophysiological understanding and classification capabilities. Moreover,
+typical ML EEG approaches utilize black-box methods, limiting their
+interpretability and trustworthiness in clinical contexts. Thus, we propose
+EEG-GPT, a unifying approach to EEG classification that leverages advances in
+large language models (LLMs). EEG-GPT achieves excellent performance
+comparable to current state-of-the-art deep learning methods in classifying
+normal from abnormal EEG in a few-shot learning paradigm utilizing only 2% of
+the training data. Furthermore, it offers the distinct advantages of providing
+intermediate reasoning steps and coordinating specialist EEG tools across
+multiple scales in its operation, offering transparent and interpretable
+step-by-step verification, thereby promoting trustworthiness in clinical
+contexts.
+
+
+
+
+
+
+ + ☆ Multilinear Operator Networks + + +
+ Despite the remarkable capabilities of deep neural networks in image
+recognition, the dependence on activation functions remains a largely
+unexplored area and has yet to be eliminated. On the other hand, Polynomial
+Networks are a class of models that do not require activation functions but
+have yet to perform on par with modern architectures. In this work, we aim to
+close this gap and propose MONet, which relies solely on multilinear
+operators. The core layer of MONet, called Mu-Layer, captures multiplicative
+interactions of the elements of the input token. MONet captures high-degree
+interactions of the input elements, and we demonstrate the efficacy of our
+approach on a series of image recognition and scientific computing benchmarks.
+The proposed model outperforms prior polynomial networks and performs on par
+with modern architectures. We believe that MONet can inspire further research
+on models that use entirely multilinear operations.
+
+
+
+ comment: International Conference on Learning Representations, Poster (2024)
+
+
+
+
+ + ☆ Understanding polysemanticity in neural networks through coding theory + + +
+ Despite substantial efforts, neural network interpretability remains an
+elusive goal, with previous research failing to provide succinct explanations
+of most single neurons' impact on the network output. This limitation is due
+to the polysemantic nature of most neurons, whereby a given neuron is
+involved in multiple unrelated network states, complicating the
+interpretation of that neuron. In this paper, we apply tools developed in
+neuroscience and information theory to propose both a novel practical
+approach to network interpretability and theoretical insights into
+polysemanticity and the density of codes. We infer levels of redundancy in
+the network's code by inspecting the eigenspectrum of the activations'
+covariance matrix. Furthermore, we show how random projections can reveal
+whether a network exhibits a smooth or non-differentiable code and hence how
+interpretable the code is. This same framework explains the advantages of
+polysemantic neurons for learning performance and explains trends found in
+the recent results of Elhage et al. (2022). Our approach advances the pursuit
+of interpretability in neural networks, providing insights into their
+underlying structure and suggesting new avenues for circuit-level
+interpretability.
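+ As a concrete illustration of the covariance-eigenspectrum diagnostic
+mentioned above, the following NumPy sketch computes the participation ratio,
+a standard effective-dimensionality summary of an eigenspectrum; using it as
+the redundancy measure here is our assumption, not necessarily the paper's
+exact estimator:
+
+import numpy as np
+
+def participation_ratio(acts: np.ndarray) -> float:
+    # acts: (samples, neurons). PR = (sum_i l_i)^2 / sum_i l_i^2,
+    # where l_i are eigenvalues of the activation covariance matrix.
+    eig = np.linalg.eigvalsh(np.cov(acts, rowvar=False))
+    return float(eig.sum() ** 2 / (eig ** 2).sum())
+
+acts = np.random.randn(1000, 256)  # stand-in for recorded activations
+print(participation_ratio(acts))   # close to 256 -> little redundancy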
+
+
+
+
+ + ☆ MelNet: A Real-Time Deep Learning Algorithm for Object Detection + + +
+ In this study, a novel deep learning algorithm for object detection, named
+MelNet, is introduced. MelNet was trained on the KITTI dataset for object
+detection. After 300 training epochs, MelNet attained an mAP (mean average
+precision) score of 0.732. Additionally, three alternative models (YOLOv5,
+EfficientDet, and Faster-RCNN-MobileNetv3) were trained on the KITTI dataset
+and compared with MelNet for object detection.
+ The outcomes underscore the efficacy of employing transfer learning in
+certain instances. Notably, preexisting models trained on prominent datasets
+(e.g., ImageNet, COCO, and Pascal VOC) yield superior results. Another
+finding demonstrates the viability of creating a new model tailored to a
+specific scenario and training it on a specific dataset. This investigation
+shows that MelNet, trained exclusively on the KITTI dataset, surpasses
+EfficientDet after 150 epochs. Consequently, post-training, MelNet's
+performance closely aligns with that of other pre-trained models.
+
+ comment: 11 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ CONCORD: Towards a DSL for Configurable Graph Code Representation + + +
+ Deep learning is widely used to uncover hidden patterns in large code
+corpora. To achieve this, constructing a format that captures the relevant
+characteristics and features of source code is essential. Graph-based
+representations have gained attention for their ability to model structural
+and semantic information. However, existing tools lack flexibility in
+constructing graphs across different programming languages, limiting their
+use. Additionally, the output of these tools often lacks interoperability and
+results in excessively large graphs, making the training of graph-based
+neural networks slower and less scalable.
+ We introduce CONCORD, a domain-specific language for building customizable
+graph representations. It implements reduction heuristics to reduce graph
+size. We demonstrate its effectiveness in code smell detection as an
+illustrative use case and show that: first, CONCORD can automatically produce
+code representations according to the specified configuration, and second,
+our heuristics achieve comparable performance with significantly reduced
+graph size. CONCORD will help researchers a) create and experiment with
+customizable graph-based code representations for different software
+engineering tasks involving DL, b) reduce the engineering work needed to
+generate graph representations, c) address the issue of scalability in GNN
+models, and d) enhance the reproducibility of experiments through a
+standardized approach to code representation and analysis.
+
+
+
+
+ + ☆ Convergence Analysis for General Probability Flow ODEs of Diffusion + Models in Wasserstein Distances + + +
+ Score-based generative modeling with probability flow ordinary differential +equations (ODEs) has achieved remarkable success in a variety of applications. +While various fast ODE-based samplers have been proposed in the literature and +employed in practice, the theoretical understandings about convergence +properties of the probability flow ODE are still quite limited. In this paper, +we provide the first non-asymptotic convergence analysis for a general class of +probability flow ODE samplers in 2-Wasserstein distance, assuming accurate +score estimates. We then consider various examples and establish results on the +iteration complexity of the corresponding ODE-based samplers. + +
+
+ comment: 47 pages, 3 tables. arXiv admin note: text overlap with + arXiv:2311.11003 +
+
+
+
+
+ + ☆ LOCOST: State-Space Models for Long Document Abstractive Summarization EACL 2024 + + +
+ State-space models are a low-complexity alternative to transformers for
+encoding long sequences and capturing long-term dependencies. We propose
+LOCOST: an encoder-decoder architecture based on state-space models for
+conditional text generation with long context inputs. With a computational
+complexity of $O(L \log L)$, this architecture can handle significantly
+longer sequences than state-of-the-art models based on sparse attention
+patterns. We evaluate our model on a series of long document abstractive
+summarization tasks. The model reaches 93-96% of the performance of the
+top-performing sparse transformers of the same size while saving up to 50%
+memory during training and up to 87% during inference. Additionally, LOCOST
+effectively handles input texts exceeding 600K tokens at inference time,
+setting new state-of-the-art results on full-book summarization and opening
+new perspectives for long input processing.
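+ The $O(L \log L)$ complexity quoted above is what one gets when the long
+convolutions at the heart of state-space layers are computed with FFTs. A
+generic NumPy sketch of that trick follows; the decaying kernel is a
+stand-in, not LOCOST's learned kernel:
+
+import numpy as np
+
+def long_conv_fft(u: np.ndarray, k: np.ndarray) -> np.ndarray:
+    # Causal convolution of sequence u with kernel k in O(L log L).
+    L = len(u)
+    n = 2 * L  # zero-pad to avoid circular wrap-around
+    y = np.fft.irfft(np.fft.rfft(u, n) * np.fft.rfft(k, n), n)
+    return y[:L]
+
+u = np.random.randn(4096)
+k = np.exp(-0.01 * np.arange(4096))  # decaying, SSM-style kernel
+print(long_conv_fft(u, k).shape)     # (4096,)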
+
+ comment: 9 pages, 5 figures, 7 tables, EACL 2024 conference +
+
+
+
+
+ + ☆ Graph Attention-based Reinforcement Learning for Trajectory Design and + Resource Assignment in Multi-UAV Assisted Communication + + +
+ In multiple unmanned aerial vehicle (UAV)-assisted downlink communication,
+it is challenging for UAV base stations (UAV BSs) to realize trajectory
+design and resource assignment in unknown environments. The cooperation and
+competition between UAV BSs in the communication network lead to a Markov
+game problem. Multi-agent reinforcement learning is a promising solution for
+this decision-making problem. However, many common issues, such as system
+instability and low utilization of historical data, still limit its
+application. In this paper, a novel graph-attention multi-agent trust region
+(GA-MATR) reinforcement learning framework is proposed to solve the multi-UAV
+assisted communication problem. A graph recurrent network is introduced to
+process and analyze the complex topology of the communication network, so as
+to extract useful information and patterns from observations. The attention
+mechanism provides additional weighting for conveyed information, so that the
+critic network can accurately evaluate the value of behavior for UAV BSs.
+This provides more reliable feedback signals and helps the actor network
+update its strategy more effectively. Ablation simulations indicate that the
+proposed approach attains improved convergence over the baselines. UAV BSs
+learn the optimal communication strategies to achieve their maximum
+cumulative rewards. Additionally, the multi-agent trust region method with
+monotonic convergence provides an estimated Nash equilibrium for the
+multi-UAV assisted communication Markov game.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Efficient Subseasonal Weather Forecast using Teleconnection-informed + Transformers + + +
+ Subseasonal forecasting, which is pivotal for agriculture, water resource
+management, and early warning of disasters, faces challenges due to the
+chaotic nature of the atmosphere. Recent advances in machine learning (ML)
+have revolutionized weather forecasting by achieving predictive skill
+competitive with numerical models. However, training such foundation models
+requires thousands of GPU days, which causes substantial carbon emissions and
+limits their broader applicability. Moreover, ML models tend to game
+pixel-wise error scores by producing overly smoothed results that lack
+physical consistency and meteorological meaning. To deal with these problems,
+we propose a teleconnection-informed transformer. Our architecture leverages
+the pretrained Pangu model for good initial weights and integrates a
+teleconnection-informed temporal module to improve predictability over an
+extended temporal range. Remarkably, by adjusting only 1.1% of the Pangu
+model's parameters, our method enhances predictability on four surface and
+five upper-level atmospheric variables at a two-week lead time. Furthermore,
+the teleconnection-filtered features significantly improve the spatial
+granularity of outputs, indicating their potential physical consistency. Our
+research underscores the importance of atmospheric and oceanic
+teleconnections in driving future weather conditions. It also presents a
+resource-efficient pathway for researchers to leverage existing foundation
+models on versatile downstream tasks.
+
+ comment: Submitted to IGARSS 2024 +
+
+
+
+
+ + ☆ Convolution Meets LoRA: Parameter Efficient Finetuning for Segment + Anything Model ICLR 2024 + + +
+ The Segment Anything Model (SAM) stands as a foundational framework for
+image segmentation. While it exhibits remarkable zero-shot generalization in
+typical scenarios, its advantage diminishes when applied to specialized
+domains like medical imagery and remote sensing. To address this limitation,
+this paper introduces Conv-LoRA, a simple yet effective parameter-efficient
+fine-tuning approach. By integrating ultra-lightweight convolutional
+parameters into Low-Rank Adaptation (LoRA), Conv-LoRA can inject
+image-related inductive biases into the plain ViT encoder, further
+reinforcing SAM's local prior assumption. Notably, Conv-LoRA not only
+preserves SAM's extensive segmentation knowledge but also revives its
+capacity for learning high-level image semantics, which is constrained by
+SAM's foreground-background segmentation pretraining. Comprehensive
+experimentation across diverse benchmarks spanning multiple domains
+underscores Conv-LoRA's superiority in adapting SAM to real-world semantic
+segmentation tasks.
+
+ comment: Accepted at ICLR 2024 Conference +
+
+
+
+
+ + ☆ Manipulating Predictions over Discrete Inputs in Machine Teaching + + +
+ Machine teaching often involves the creation of an optimal (typically
+minimal) dataset to help a model (referred to as the `student') achieve
+specific goals given by a teacher. While abundant in the continuous domain,
+studies on the effectiveness of machine teaching in the discrete domain are
+relatively limited. This paper focuses on machine teaching in the discrete
+domain, specifically on manipulating student models' predictions according to
+the teacher's goals by changing the training data efficiently. We formulate
+this task as a combinatorial optimization problem and solve it by proposing
+an iterative searching algorithm. Our algorithm demonstrates significant
+numerical merit in scenarios where a teacher attempts to correct erroneous
+predictions to improve the student model, or maliciously manipulates the
+model to misclassify specific samples into a target class for the teacher's
+own benefit. Experimental results show that our proposed algorithm achieves
+superior performance in effectively and efficiently manipulating the model's
+predictions, surpassing conventional baselines.
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ A Cross-View Hierarchical Graph Learning Hypernetwork for Skill + Demand-Supply Joint Prediction AAAI24 + + +
+ The rapidly changing landscape of technology and industries leads to
+dynamic skill requirements, making it crucial for employees and employers to
+anticipate such shifts to maintain a competitive edge in the labor market.
+Existing efforts in this area either rely on domain-expert knowledge or
+regard skill evolution as a simplified time series forecasting problem.
+However, both approaches overlook the sophisticated relationships among
+different skills and the interconnection between skill demand and supply
+variations. In this paper, we propose a Cross-view Hierarchical Graph
+learning Hypernetwork (CHGH) framework for joint skill demand-supply
+prediction. Specifically, CHGH is an encoder-decoder network consisting of i)
+a cross-view graph encoder to capture the interconnection between skill
+demand and supply, ii) a hierarchical graph encoder to model the co-evolution
+of skills from a cluster-wise perspective, and iii) a conditional
+hyper-decoder to jointly predict demand and supply variations by
+incorporating historical demand-supply gaps. Extensive experiments on three
+real-world datasets demonstrate the superiority of the proposed framework
+over seven baselines and the effectiveness of the three modules.
+
+ comment: 11 pages, 7 figures, AAAI24 +
+
+
+
+
+ + ☆ Predicting the Future with Simple World Models + + +
+ World models can represent potentially high-dimensional pixel observations in +compact latent spaces, making it tractable to model the dynamics of the +environment. However, the latent dynamics inferred by these models may still be +highly complex. Abstracting the dynamics of the environment with simple models +can have several benefits. If the latent dynamics are simple, the model may +generalize better to novel transitions, and discover useful latent +representations of environment states. We propose a regularization scheme that +simplifies the world model's latent dynamics. Our model, the Parsimonious +Latent Space Model (PLSM), minimizes the mutual information between latent +states and the dynamics that arise between them. This makes the dynamics softly +state-invariant, and the effects of the agent's actions more predictable. We +combine the PLSM with three different model classes used for i) future latent +state prediction, ii) video prediction, and iii) planning. We find that our +regularization improves accuracy, generalization, and performance in downstream +tasks. + +
+
+
+
+
+ + ☆ Privacy-preserving data release leveraging optimal transport and + particle gradient descent + + +
+ We present a novel approach for differentially private data synthesis of +protected tabular datasets, a relevant task in highly sensitive domains such as +healthcare and government. Current state-of-the-art methods predominantly use +marginal-based approaches, where a dataset is generated from private estimates +of the marginals. In this paper, we introduce PrivPGD, a new generation method +for marginal-based private data synthesis, leveraging tools from optimal +transport and particle gradient descent. Our algorithm outperforms existing +methods on a large range of datasets while being highly scalable and offering +the flexibility to incorporate additional domain-specific constraints. + +
+
+
+
+
+ + ☆ SWEA: Changing Factual Knowledge in Large Language Models via Subject + Word Embedding Altering + + +
+ Model editing has recently gained widespread attention. Current model
+editing methods primarily involve modifying model parameters or adding extra
+modules to the existing model. However, the former causes irreversible damage
+to LLMs, while the latter incurs additional inference overhead, and its fuzzy
+vector matching is not always reliable. To address these issues, we propose
+an expandable Subject Word Embedding Altering (SWEA) framework, which
+modifies the representations of subjects and achieves the goal of editing
+knowledge during the inference stage. SWEA uses precise key matching outside
+the model and performs reliable subject word embedding altering, thus
+protecting the original weights of the model without increasing inference
+overhead. We then propose an optimizing-then-suppressing fusion method, which
+first optimizes the embedding vector for the editing target and then
+suppresses the Knowledge Embedding Dimension (KED) to obtain the final fused
+embedding. Combining these, we propose the SWEAOS method for editing factual
+knowledge in LLMs. We demonstrate the state-of-the-art performance of SWEAOS
+on the COUNTERFACT and zsRE datasets. To further validate the reasoning
+ability of SWEAOS in editing knowledge, we evaluate it on the more complex
+RIPPLEEDITS benchmark. The results on two subdatasets demonstrate that SWEAOS
+possesses state-of-the-art reasoning ability.
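+ To make the inference-time mechanism concrete, here is a minimal sketch of
+exact-key subject embedding substitution; the table name and lookup logic are
+our illustrative assumptions, not the SWEA implementation:
+
+import torch
+
+# Hypothetical table: exact subject token id -> fused replacement embedding.
+edit_table: dict[int, torch.Tensor] = {}
+
+def altered_embeddings(input_ids: torch.Tensor, embedding_layer):
+    # Swap in edited vectors only for exactly matched subject tokens;
+    # all model weights stay untouched, so no extra inference overhead.
+    embs = embedding_layer(input_ids).clone()
+    for pos, tok in enumerate(input_ids.tolist()):
+        if tok in edit_table:
+            embs[pos] = edit_table[tok]
+    return embs
+
+emb = torch.nn.Embedding(100, 16)
+edit_table[7] = torch.zeros(16)  # pretend this is a fused edit vector
+print(altered_embeddings(torch.tensor([3, 7, 9]), emb).shape)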
+
+ comment: Work in progress; Our code will be released +
+
+
+
+
+ + ☆ Distillation Enhanced Time Series Forecasting Network with Momentum + Contrastive Learning + + +
+ Contrastive representation learning is crucial in time series analysis, as
+it alleviates data noise, incompleteness, and the sparsity of supervision
+signals. However, existing contrastive learning frameworks usually focus on
+intra-temporal features, which fails to fully exploit the intricate nature of
+time series data. To address this issue, we propose DE-TSMCL, an innovative
+distillation-enhanced framework for long sequence time series forecasting.
+Specifically, we design a learnable data augmentation mechanism which
+adaptively learns whether to mask a timestamp to obtain optimized
+sub-sequences. Then, we propose a contrastive learning task with momentum
+update to explore inter-sample and intra-temporal correlations of time
+series, learning the underlying structural features of the unlabeled time
+series. Meanwhile, we design a supervised task to learn more robust
+representations and facilitate the contrastive learning process. Finally, we
+jointly optimize the two tasks. By developing the model loss from multiple
+tasks, we can learn effective representations for the downstream forecasting
+task. Extensive experiments, in comparison with state-of-the-art methods,
+demonstrate the effectiveness of DE-TSMCL, with a maximum improvement of
+27.3%.
+
+
+
+
+ + ☆ Graph Transformers without Positional Encodings + + +
+ Recently, Transformers for graph representation learning have become
+increasingly popular, achieving state-of-the-art performance on a wide
+variety of datasets, either alone or in combination with message-passing
+graph neural networks (MP-GNNs). Infusing graph inductive biases into the
+innately structure-agnostic transformer architecture in the form of
+structural or positional encodings (PEs) is key to achieving these impressive
+results. However, designing such encodings is tricky, and disparate attempts
+have been made to engineer them, including Laplacian eigenvectors, relative
+random-walk probabilities (RRWP), spatial encodings, centrality encodings,
+edge encodings, etc. In this work, we argue that such encodings may not be
+required at all, provided the attention mechanism itself incorporates
+information about the graph structure. We introduce Eigenformer, which uses a
+novel spectrum-aware attention mechanism cognizant of the Laplacian spectrum
+of the graph, and empirically show that it achieves performance comparable to
+SOTA MP-GNN architectures and Graph Transformers on a number of standard GNN
+benchmark datasets, even surpassing the SOTA on some. We also find that our
+architecture trains much faster in terms of the number of epochs, presumably
+due to the innate graph inductive biases.
+
+ comment: Independent Research +
+
+
+
+
+ + ☆ RADIN: Souping on a Budget + + +
+ Model Soups, extending Stochastic Weight Averaging (SWA), combine models
+fine-tuned with different hyperparameters. Yet, their adoption is hindered by
+the computational challenges of subset selection. In this paper, we propose
+to speed up model soups by approximating soup performance with the
+performance of averaged ensemble logits. Theoretical insights validate the
+congruence between ensemble logits and weight-averaging soups across any
+mixing ratio. Our Resource ADjusted soups craftINg (RADIN) procedure stands
+out by allowing flexible evaluation budgets, enabling users to adapt their
+exploration budget to their available resources while increasing performance
+at lower budgets compared to the previous greedy approach (up to 4% on
+ImageNet).
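+ The approximation at the heart of this idea is cheap to demonstrate: cache
+each fine-tuned model's logits once, then score any candidate mixing ratio by
+blending logits instead of re-evaluating a weight-averaged model. A NumPy
+sketch with random stand-ins for real logits and labels:
+
+import numpy as np
+
+def approx_soup_accuracy(mix, logits, labels):
+    # Proxy for a soup at mixing weights `mix`: average the cached
+    # logits rather than averaging weights and re-running inference.
+    blended = sum(w * l for w, l in zip(mix, logits))
+    return (blended.argmax(axis=1) == labels).mean()
+
+logits = [np.random.randn(512, 10) for _ in range(4)]  # 4 fine-tunes
+labels = np.random.randint(0, 10, size=512)
+print(approx_soup_accuracy([0.25] * 4, logits, labels))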
+
+
+
+
+ + ☆ Robustly overfitting latents for flexible neural image compression + + +
+ Neural image compression has made a great deal of progress. State-of-the-art +models are based on variational autoencoders and are outperforming classical +models. Neural compression models learn to encode an image into a quantized +latent representation that can be efficiently sent to the decoder, which +decodes the quantized latent into a reconstructed image. While these models +have proven successful in practice, they lead to sub-optimal results due to +imperfect optimization and limitations in the encoder and decoder capacity. +Recent work shows how to use stochastic Gumbel annealing (SGA) to refine the +latents of pre-trained neural image compression models. We extend this idea by +introducing SGA+, which contains three different methods that build upon SGA. +Further, we give a detailed analysis of our proposed methods, show how they +improve performance, and show that they are less sensitive to hyperparameter +choices. Besides, we show how each method can be extended to three- instead of +two-class rounding. Finally, we show how refinement of the latents with our +best-performing method improves the compression performance on the Tecnick +dataset and how it can be deployed to partly move along the rate-distortion +curve. + +
+
+
+
+
+ + ☆ Vision-Assisted Digital Twin Creation for mmWave Beam Management + + +
+ In the context of communication networks, digital twin technology provides
+a means to replicate the radio frequency (RF) propagation environment as well
+as the system behaviour, allowing the performance of a deployed system to be
+optimized through simulation. One of the key challenges in applying digital
+twin technology to mmWave systems is prevalent channel simulators' stringent
+requirements on the accuracy of the 3D digital twin, which reduce the
+feasibility of the technology in real applications. We propose a practical
+digital twin creation pipeline and a channel simulator that rely only on a
+single mounted camera and position information. We demonstrate the
+performance benefits compared to methods that do not explicitly model the 3D
+environment, on downstream sub-tasks in beam acquisition, using the
+real-world dataset of the DeepSense6G challenge.
+
+ comment: ICC2024 accepted paper. Copyright IEEE +
+
+
+
+
+ + ☆ A Policy Gradient Primal-Dual Algorithm for Constrained MDPs with + Uniform PAC Guarantees + + +
+ We study a primal-dual reinforcement learning (RL) algorithm for the online +constrained Markov decision processes (CMDP) problem, wherein the agent +explores an optimal policy that maximizes return while satisfying constraints. +Despite its widespread practical use, the existing theoretical literature on +primal-dual RL algorithms for this problem only provides sublinear regret +guarantees and fails to ensure convergence to optimal policies. In this paper, +we introduce a novel policy gradient primal-dual algorithm with uniform +probably approximate correctness (Uniform-PAC) guarantees, simultaneously +ensuring convergence to optimal policies, sublinear regret, and polynomial +sample complexity for any target accuracy. Notably, this represents the first +Uniform-PAC algorithm for the online CMDP problem. In addition to the +theoretical guarantees, we empirically demonstrate in a simple CMDP that our +algorithm converges to optimal policies, while an existing algorithm exhibits +oscillatory performance and constraint violation. + +
+
+
+
+
+ + ☆ Regularized Linear Discriminant Analysis Using a Nonlinear Covariance + Matrix Estimator + + +
+ Linear discriminant analysis (LDA) is a widely used technique for data
+classification. The method offers adequate performance in many classification
+problems, but it becomes inefficient when the data covariance matrix is
+ill-conditioned. This often occurs when the feature space's dimensionality is
+higher than or comparable to the training data size. Regularized LDA (RLDA)
+methods based on regularized linear estimators of the data covariance matrix
+have been proposed to cope with such situations. The performance of RLDA
+methods is well studied, with optimal regularization schemes already
+proposed. In this paper, we investigate the capability of a positive
+semidefinite ridge-type estimator of the inverse covariance matrix that
+coincides with a nonlinear (NL) covariance matrix estimator. The estimator is
+derived by reformulating the score function of the optimal classifier
+utilizing linear estimation methods, which eventually results in the proposed
+NL-RLDA classifier. We derive asymptotic and consistent estimators of the
+proposed technique's misclassification rate under the assumptions of a
+double-asymptotic regime and a multivariate Gaussian model for the classes.
+The consistent estimator, coupled with a one-dimensional grid search, is used
+to set the value of the regularization parameter required by the proposed
+NL-RLDA classifier. Performance evaluations on both synthetic and real data
+demonstrate the effectiveness of the proposed classifier, which outperforms
+state-of-the-art methods across multiple datasets.
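+ For readers unfamiliar with the RLDA family this builds on, a minimal NumPy
+sketch of the baseline ridge-regularized LDA discriminant is below; the
+paper's contribution is to replace the ridge inverse (S + gamma*I)^-1 with a
+nonlinear covariance estimator and to tune gamma via a consistent risk
+estimate, neither of which is reproduced here:
+
+import numpy as np
+
+def rlda_score(x, X0, X1, gamma):
+    # Baseline ridge-regularized LDA: score > 0 assigns class 1.
+    mu0, mu1 = X0.mean(axis=0), X1.mean(axis=0)
+    pooled = np.vstack([X0 - mu0, X1 - mu1])
+    S = pooled.T @ pooled / (len(pooled) - 2)   # pooled covariance
+    w = np.linalg.solve(S + gamma * np.eye(S.shape[0]), mu1 - mu0)
+    return x @ w - 0.5 * w @ (mu0 + mu1)
+
+rng = np.random.default_rng(0)
+X0 = rng.normal(0.0, 1.0, (40, 100))   # d=100 > n: ill-conditioned case
+X1 = rng.normal(0.3, 1.0, (40, 100))
+print(rlda_score(X1[:5], X0, X1, gamma=1.0))  # mostly positive scores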
+
+
+
+
+ + ☆ PF-GNN: Differentiable particle filtering based approximation of + universal graph representations ICLR 2022 + + +
+ Message passing Graph Neural Networks (GNNs) are known to be limited in
+expressive power by the 1-WL color-refinement test for graph isomorphism.
+Other, more expressive models are either computationally expensive or require
+preprocessing to extract structural features from the graph. In this work, we
+propose to make GNNs universal by guiding the learning process with exact
+isomorphism-solver techniques, which operate on the paradigm of
+Individualization and Refinement (IR), a method to artificially introduce
+asymmetry and further refine the coloring when 1-WL stops. Isomorphism
+solvers generate a search tree of colorings whose leaves uniquely identify
+the graph. However, the tree grows exponentially large and needs hand-crafted
+pruning techniques, which are not desirable from a learning perspective. We
+take a probabilistic view and approximate the search tree of colorings (i.e.,
+embeddings) by sampling multiple paths from the root to the leaves of the
+search tree. To learn more discriminative representations, we guide the
+sampling process with particle filter updates, a principled approach for
+sequential state estimation. Our algorithm is end-to-end differentiable, can
+be applied with any GNN as a backbone, and learns richer graph
+representations with only a linear increase in runtime. Experimental
+evaluation shows that our approach consistently outperforms leading GNN
+models on both synthetic benchmarks for isomorphism detection and real-world
+datasets.
+
+ comment: Published as a conference paper at ICLR 2022 +
+
+
+
+
+ + ☆ Algorithmic Robust Forecast Aggregation + + +
+ Forecast aggregation combines the predictions of multiple forecasters to +improve accuracy. However, the lack of knowledge about forecasters' information +structure hinders optimal aggregation. Given a family of information +structures, robust forecast aggregation aims to find the aggregator with +minimal worst-case regret compared to the omniscient aggregator. Previous +approaches for robust forecast aggregation rely on heuristic observations and +parameter tuning. We propose an algorithmic framework for robust forecast +aggregation. Our framework provides efficient approximation schemes for general +information aggregation with a finite family of possible information +structures. In the setting considered by Arieli et al. (2018) where two agents +receive independent signals conditioned on a binary state, our framework also +provides efficient approximation schemes by imposing Lipschitz conditions on +the aggregator or discrete conditions on agents' reports. Numerical experiments +demonstrate the effectiveness of our method by providing a nearly optimal +aggregator in the setting considered by Arieli et al. (2018). + +
+
+
+
+
+ + ☆ Operator learning without the adjoint + + +
+ There is a mystery at the heart of operator learning: how can one recover a +non-self-adjoint operator from data without probing the adjoint? Current +practical approaches suggest that one can accurately recover an operator while +only using data generated by the forward action of the operator without access +to the adjoint. However, naively, it seems essential to sample the action of +the adjoint. In this paper, we partially explain this mystery by proving that +without querying the adjoint, one can approximate a family of non-self-adjoint +infinite-dimensional compact operators via projection onto a Fourier basis. We +then apply the result to recovering Green's functions of elliptic partial +differential operators and derive an adjoint-free sample complexity bound. +While existing theory justifies low sample complexity in operator learning, +ours is the first adjoint-free analysis that attempts to close the gap between +theory and practice. + +
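+ The flavor of the result can be seen in a toy discretization: probe the
+operator only with forward actions on Fourier modes and assemble a projected
+approximation, never touching the transpose/adjoint. A NumPy sketch under our
+own choice of stand-in operator:
+
+import numpy as np
+
+n, m = 128, 32                 # grid size, number of Fourier modes probed
+rng = np.random.default_rng(0)
+# Stand-in non-self-adjoint operator with off-diagonal decay (compact-ish).
+decay = np.exp(-np.abs(np.subtract.outer(np.arange(n), np.arange(n))) / 8.0)
+A = rng.standard_normal((n, n)) * decay
+
+F = np.fft.fft(np.eye(n)) / np.sqrt(n)   # unitary DFT matrix
+Phi = F[:, :m]                           # first m Fourier modes
+
+# Only forward actions A @ phi_k are used -- the adjoint never appears.
+A_hat = (A @ Phi) @ Phi.conj().T
+print(np.linalg.norm(A - A_hat) / np.linalg.norm(A))  # projection error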
+
+ comment: 49 pages, 5 figures +
+
+
+
+
+ + ☆ Harnessing Smartwatch Microphone Sensors for Cough Detection and + Classification + + +
+ This study investigates the potential of using smartwatches with built-in
+microphone sensors for monitoring coughs and detecting various cough types.
+We conducted a study involving 32 participants and collected 9 hours of audio
+data in a controlled manner. We then processed this data using a structured
+approach, resulting in 223 positive cough samples. We further improved the
+dataset through augmentation techniques and employed a specialized 1D CNN
+model. This model achieved an accuracy of 98.49% when participants were not
+walking and 98.2% while walking, showing that smartwatches can detect coughs.
+Moreover, our research successfully identified four distinct types of coughs
+using clustering techniques.
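+ As a shape-level illustration of such a model, here is a small PyTorch 1D
+CNN over raw audio windows; the layer sizes, window length, and 16 kHz sample
+rate are our assumptions for the sketch, as the abstract does not specify the
+architecture:
+
+import torch
+import torch.nn as nn
+
+# Hypothetical cough-vs-non-cough classifier over 1 s windows at 16 kHz.
+model = nn.Sequential(
+    nn.Conv1d(1, 16, kernel_size=64, stride=4), nn.ReLU(),
+    nn.MaxPool1d(4),
+    nn.Conv1d(16, 32, kernel_size=32, stride=2), nn.ReLU(),
+    nn.AdaptiveAvgPool1d(1), nn.Flatten(),
+    nn.Linear(32, 2),
+)
+print(model(torch.randn(8, 1, 16000)).shape)  # torch.Size([8, 2])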
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Hierarchical Bias-Driven Stratification for Interpretable Causal Effect + Estimation + + +
+ Interpretability and transparency are essential for incorporating causal +effect models from observational data into policy decision-making. They can +provide trust for the model in the absence of ground truth labels to evaluate +the accuracy of such models. To date, attempts at transparent causal effect +estimation consist of applying post hoc explanation methods to black-box +models, which are not interpretable. Here, we present BICauseTree: an +interpretable balancing method that identifies clusters where natural +experiments occur locally. Our approach builds on decision trees with a +customized objective function to improve balancing and reduce treatment +allocation bias. Consequently, it can additionally detect subgroups presenting +positivity violations, exclude them, and provide a covariate-based definition +of the target population we can infer from and generalize to. We evaluate the +method's performance using synthetic and realistic datasets, explore its +bias-interpretability tradeoff, and show that it is comparable with existing +approaches. + +
+
+
+
+
+ + ☆ Towards Physical Plausibility in Neuroevolution Systems + + +
+ The increasing usage of Artificial Intelligence (AI) models, especially
+Deep Neural Networks (DNNs), is increasing the power consumed during training
+and inference, posing environmental concerns and driving the need for more
+energy-efficient algorithms and hardware solutions. This work addresses the
+growing energy consumption problem in Machine Learning (ML), particularly
+during the inference phase. Even a slight reduction in power usage can lead
+to significant energy savings, benefiting users, companies, and the
+environment. Our approach focuses on maximizing the accuracy of Artificial
+Neural Network (ANN) models using a neuroevolutionary framework whilst
+minimizing their power consumption. To do so, power consumption is included
+in the fitness function. We introduce a new mutation strategy that
+stochastically reintroduces modules of layers, with power-efficient modules
+having a higher chance of being chosen. We also introduce a novel technique
+that allows two separate models to be trained in a single training step,
+promoting one of them to be more power-efficient than the other while
+maintaining similar accuracy. The results demonstrate a reduction in the
+power consumption of ANN models of up to 29.2% without a significant decrease
+in predictive performance.
+
+
+
+
+ + ☆ Predicting suicidal behavior among Indian adults using childhood trauma, + mental health questionnaires and machine learning cascade ensembles + + +
+ Among young adults, suicide is India's leading cause of death, accounting
+for an alarming national suicide rate of around 16%. In recent years, machine
+learning algorithms have emerged to predict suicidal behavior using various
+behavioral traits. But to date, the efficacy of machine learning algorithms
+in predicting suicidal behavior in the Indian context has not been explored
+in the literature. In this study, different machine learning algorithms and
+ensembles were developed to predict suicidal behavior based on childhood
+trauma, different mental health parameters, and other behavioral factors. The
+dataset was acquired from 391 individuals from a wellness center in India.
+Information regarding their childhood trauma, psychological wellness, and
+other mental health issues was acquired through standardized questionnaires.
+Results revealed that cascade ensemble learning methods using a support
+vector machine, decision trees, and random forest were able to classify
+suicidal behavior with an accuracy of 95.04% using data from childhood trauma
+and mental health questionnaires. The study highlights the potential of using
+these machine learning ensembles to identify individuals with suicidal
+tendencies so that targeted interventions can be provided efficiently.
+
+ comment: 11 pages, presented at the 4th International Conference on
+ Frontiers in Computing and Systems (COMSYS 2023), Himachal Pradesh, October
+ 2023
+
+
+
+
+ + ☆ Datacube segmentation via Deep Spectral Clustering + + +
+ Extended Vision techniques are ubiquitous in physics. However, the data
+cubes stemming from such analyses often pose a challenge in their
+interpretation, due to the intrinsic difficulty in discerning the relevant
+information from the spectra composing the data cube.
+ Furthermore, the huge dimensionality of data cube spectra makes their
+statistical interpretation a complex task; nevertheless, this complexity
+contains a massive amount of statistical information that can be exploited in
+an unsupervised manner to outline some essential properties of the case study
+at hand, e.g., it is possible to obtain an image segmentation via (deep)
+clustering of the data cube's spectra, performed in a suitably defined
+low-dimensional embedding space.
+ To tackle this topic, we explore the possibility of applying unsupervised
+clustering methods in encoded space, i.e., performing deep clustering on the
+spectral properties of datacube pixels. A statistical dimensionality
+reduction is performed by an ad hoc trained (Variational) AutoEncoder, which
+maps spectra into lower-dimensional metric spaces, while the clustering
+process is performed by a (learnable) iterative K-Means clustering algorithm.
+ We apply this technique to two use cases of different physical origins: a
+set of Macro mapping X-Ray Fluorescence (MA-XRF) synthetic data on pictorial
+artworks, and a dataset of simulated astrophysical observations.
+
+ comment: 20 pages, 10 figures, doi for code repository, dataset and trained + model available and reported in the paper +
+
+
+
+
+ + ☆ Convergence analysis of t-SNE as a gradient flow for point cloud on a + manifold + + +
+ We present a theoretical foundation regarding the boundedness of the t-SNE +algorithm. t-SNE employs gradient descent iteration with Kullback-Leibler (KL) +divergence as the objective function, aiming to identify a set of points that +closely resemble the original data points in a high-dimensional space, +minimizing KL divergence. Investigating t-SNE properties such as perplexity and +affinity under a weak convergence assumption on the sampled dataset, we examine +the behavior of points generated by t-SNE under continuous gradient flow. +Demonstrating that points generated by t-SNE remain bounded, we leverage this +insight to establish the existence of a minimizer for KL divergence. + +
+
+
+
+
+ + ☆ An attempt to generate new bridge types from latent space of + energy-based model + + +
+ We use an energy-based model for bridge-type innovation. The loss function
+is explained through game theory, which keeps the logic clear and the
+formulas simple; this avoids invoking maximum likelihood estimation to
+explain the loss function and eliminates the need for Monte Carlo methods to
+estimate the normalizing denominator. Assuming that the population of bridge
+types follows a Boltzmann distribution, a neural network is constructed to
+represent the energy function. Langevin dynamics is then used to generate new
+samples with low energy values, establishing an energy-based generative model
+of bridge types. The energy function is trained on a dataset of symmetric
+structured images of three-span beam, arch, cable-stayed, and suspension
+bridges, so that it accurately scores the energy of real and fake samples.
+Sampling from the latent space and using a gradient descent algorithm, the
+energy function transforms the sampled points into low-energy samples,
+thereby generating new bridge types that differ from the dataset. Because
+training in this attempt was unstable and slow, new bridge types are
+generated only rarely and the generated images have low definition.
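+ The sampling step described above is standard Langevin dynamics on a
+learned energy function; a generic PyTorch sketch follows (the toy quadratic
+energy and step sizes are stand-ins, not the paper's trained network):
+
+import torch
+
+def langevin_sample(energy_fn, x, steps=100, step_size=0.01, noise=0.005):
+    # Gradient descent on the energy plus Gaussian noise pulls the
+    # samples toward low-energy (high-probability) regions.
+    x = x.clone().requires_grad_(True)
+    for _ in range(steps):
+        grad, = torch.autograd.grad(energy_fn(x).sum(), x)
+        x = (x - step_size * grad
+             + noise * torch.randn_like(x)).detach().requires_grad_(True)
+    return x.detach()
+
+x0 = torch.randn(16, 1, 64, 64)                   # latent starting points
+out = langevin_sample(lambda z: (z ** 2).flatten(1).sum(1), x0)
+print(out.pow(2).mean() < x0.pow(2).mean())       # True: energy decreased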
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ A primer on synthetic health data + + +
+ Recent advances in deep generative models have greatly expanded the potential +to create realistic synthetic health datasets. These synthetic datasets aim to +preserve the characteristics, patterns, and overall scientific conclusions +derived from sensitive health datasets without disclosing patient identity or +sensitive information. Thus, synthetic data can facilitate safe data sharing +that supports a range of initiatives including the development of new +predictive models, advanced health IT platforms, and general project ideation +and hypothesis development. However, many questions and challenges remain, +including how to consistently evaluate a synthetic dataset's similarity and +predictive utility in comparison to the original real dataset and risk to +privacy when shared. Additional regulatory and governance issues have not been +widely addressed. In this primer, we map the state of synthetic health data, +including generation and evaluation methods and tools, existing examples of +deployment, the regulatory and ethical landscape, access and governance +options, and opportunities for further development. + +
+
+
+
+
+ + ☆ Spatial-and-Frequency-aware Restoration method for Images based on + Diffusion Models + + +
+ Diffusion models have recently emerged as a promising framework for Image
+Restoration (IR), owing to their ability to produce high-quality
+reconstructions and their compatibility with established methods. Existing
+methods for solving noisy inverse problems in IR consider only pixel-wise
+data fidelity. In this paper, we propose SaFaRI, a
+spatial-and-frequency-aware diffusion model for IR with Gaussian noise. Our
+model encourages images to preserve data fidelity in both the spatial and
+frequency domains, resulting in enhanced reconstruction quality. We
+comprehensively evaluate the performance of our model on a variety of noisy
+inverse problems, including inpainting, denoising, and super-resolution. Our
+thorough evaluation demonstrates that SaFaRI achieves state-of-the-art
+performance on both the ImageNet and FFHQ datasets, outperforming existing
+zero-shot IR methods in terms of LPIPS and FID metrics.
+
+
+
+
+ + ☆ Generative AI to Generate Test Data Generators + + +
+ Generating fake data is an essential dimension of modern software testing, as +demonstrated by the number and significance of data faking libraries. Yet, +developers of faking libraries cannot keep up with the wide range of data to be +generated for different natural languages and domains. In this paper, we assess +the ability of generative AI for generating test data in different domains. We +design three types of prompts for Large Language Models (LLMs), which perform +test data generation tasks at different levels of integrability: 1) raw test +data generation, 2) synthesizing programs in a specific language that generate +useful test data, and 3) producing programs that use state-of-the-art faker +libraries. We evaluate our approach by prompting LLMs to generate test data for +11 domains. The results show that LLMs can successfully generate realistic test +data generators in a wide range of domains at all three levels of +integrability. + +
+
+
+
+
+ + ☆ Graph Multi-Similarity Learning for Molecular Property Prediction + + +
+ Effective molecular representation learning is essential for molecular
+property prediction. Contrastive learning, a prominent self-supervised
+approach for molecular representation learning, relies on establishing
+positive and negative pairs. However, this binary similarity categorization
+oversimplifies the nature of complex molecular relationships and overlooks
+the degree of relative similarity among molecules, posing challenges to the
+effectiveness and generality of representation learning. In response to this
+challenge, we propose the Graph Multi-Similarity Learning for Molecular
+Property Prediction (GraphMSL) framework. GraphMSL incorporates a generalized
+multi-similarity metric on a continuous scale, capturing self-similarity and
+relative similarities. The unimodal multi-similarity metrics are derived from
+various chemical modalities, and fusing these metrics into a multimodal form
+significantly enhances the effectiveness of GraphMSL. In addition, the
+flexibility of the fusion function can reshape the focus of the model to
+convey different chemical semantics. GraphMSL proves effective in drug
+discovery evaluations through various downstream tasks and post-hoc analysis
+of the learnt representations. Its notable performance suggests significant
+potential for the exploration of new drug candidates.
+
+
+
+
+ + ☆ IGCN: Integrative Graph Convolutional Networks for Multi-modal Data + + +
+ Recent advances in Graph Neural Networks (GNN) have led to a considerable +growth in graph data modeling for multi-modal data which contains various types +of nodes and edges. Although some integrative prediction solutions have been +developed recently for network-structured data, these methods have some +restrictions. For a node classification task involving multi-modal data, +certain data modalities may perform better when predicting one class, while +others might excel in predicting a different class. Thus, to obtain a better +learning representation, advanced computational methodologies are required for +the integrative analysis of multi-modal data. Moreover, existing integrative +tools lack a comprehensive and cohesive understanding of the rationale behind +their specific predictions, making them unsuitable for enhancing model +interpretability. Addressing these restrictions, we introduce a novel +integrative neural network approach for multi-modal data networks, named +Integrative Graph Convolutional Networks (IGCN). IGCN learns node embeddings +from multiple topologies and fuses the multiple node embeddings into a weighted +form by assigning attention coefficients to the node embeddings. Our proposed +attention mechanism helps identify which types of data receive more emphasis +for each sample to predict a certain class. Therefore, IGCN has the potential +to unravel previously unknown characteristics within different node +classification tasks. We benchmarked IGCN on several datasets from different +domains, including a multi-omics dataset to predict cancer subtypes and a +multi-modal clinical dataset to predict the progression of Alzheimer's disease. +Experimental results show that IGCN outperforms or is on par with the +state-of-the-art and baseline methods. + +
+
+
+
+
+ + ☆ Propagation and Pitfalls: Reasoning-based Assessment of Knowledge + Editing through Counterfactual Tasks + + +
+ Current approaches to knowledge editing struggle to effectively propagate
+updates to interconnected facts. In this work, we delve into the barriers
+that hinder the appropriate propagation of updated knowledge within these
+models for accurate reasoning. To support our analysis, we introduce a novel
+reasoning-based benchmark -- ReCoE (Reasoning-based Counterfactual Editing
+dataset) -- which covers six common reasoning schemes found in the real
+world. We conduct a thorough analysis of existing knowledge editing
+techniques, including input augmentation, finetuning, and locate-and-edit. We
+find that all model editing methods show notably low performance on this
+dataset, especially within certain reasoning schemes. Our analysis of the
+chain-of-thought generation of edited models further uncovers key reasons for
+the inadequacy of existing knowledge editing methods from a reasoning
+standpoint, involving fact-wise editing, fact recall ability, and coherence
+in generation. We will make our benchmark publicly available.
+
+ comment: 22 pages, 14 figures, 5 tables +
+
+
+
+
+ + ☆ Agile But Safe: Learning Collision-Free High-Speed Legged Locomotion + + +
+ Legged robots navigating cluttered environments must be jointly agile for +efficient task execution and safe to avoid collisions with obstacles or humans. +Existing studies either develop conservative controllers (< 1.0 m/s) to ensure +safety, or focus on agility without considering potentially fatal collisions. +This paper introduces Agile But Safe (ABS), a learning-based control framework +that enables agile and collision-free locomotion for quadrupedal robots. ABS +involves an agile policy to execute agile motor skills amidst obstacles and a +recovery policy to prevent failures, collaboratively achieving high-speed and +collision-free navigation. The policy switch in ABS is governed by a learned +control-theoretic reach-avoid value network, which also guides the recovery +policy as an objective function, thereby safeguarding the robot in a closed +loop. The training process involves the learning of the agile policy, the +reach-avoid value network, the recovery policy, and an exteroception +representation network, all in simulation. These trained modules can be +directly deployed in the real world with onboard sensing and computation, +leading to high-speed and collision-free navigation in confined indoor and +outdoor spaces with both static and dynamic obstacles. + +
+
+ comment: Project website: https://agile-but-safe.github.io/ +
+
+
+
+
+ + ☆ Graph Contrastive Learning with Cohesive Subgraph Awareness + + +
+ Graph contrastive learning (GCL) has emerged as a state-of-the-art strategy +for learning representations of diverse graphs including social and biomedical +networks. GCL widely uses stochastic graph topology augmentation, such as +uniform node dropping, to generate augmented graphs. However, such stochastic +augmentations may severely damage the intrinsic properties of a graph and +deteriorate the following representation learning process. We argue that +incorporating an awareness of cohesive subgraphs during the graph augmentation +and learning processes has the potential to enhance GCL performance. To this +end, we propose a novel unified framework called CTAug, to seamlessly integrate +cohesion awareness into various existing GCL mechanisms. In particular, CTAug +comprises two specialized modules: topology augmentation enhancement and graph +learning enhancement. The former module generates augmented graphs that +carefully preserve cohesion properties, while the latter module bolsters the +graph encoder's ability to discern subgraph patterns. Theoretical analysis +shows that CTAug can strictly improve existing GCL mechanisms. Empirical +experiments verify that CTAug can achieve state-of-the-art performance for +graph representation learning, especially for graphs with high degrees. The +code is available at https://doi.org/10.5281/zenodo.10594093, or +https://github.com/wuyucheng2002/CTAug. + +
+
+
+
+
+ + ☆ Scavenging Hyena: Distilling Transformers into Long Convolution Models + + +
+ The rapid evolution of Large Language Models (LLMs), epitomized by
+architectures like GPT-4, has reshaped the landscape of natural language
+processing. This paper introduces a pioneering approach to address the
+efficiency concerns associated with LLM pre-training, proposing the use of
+knowledge distillation for cross-architecture transfer. Leveraging insights
+from the efficient Hyena mechanism, our method replaces attention heads in
+transformer models with Hyena operators, offering a cost-effective
+alternative to traditional pre-training while confronting the challenge of
+processing long contextual information inherent in quadratic attention
+mechanisms. Unlike conventional compression-focused methods, our technique
+not only enhances inference speed but also surpasses pre-training in terms of
+both accuracy and efficiency. In the era of evolving LLMs, our work
+contributes to the pursuit of sustainable AI solutions, striking a balance
+between computational power and environmental impact.
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ☆ Tensor-based process control and monitoring for semiconductor + manufacturing with unstable disturbances + + +
+ With the development and popularity of sensors installed in manufacturing
+systems, complex data are collected during manufacturing processes, which
+poses challenges for traditional process control methods. This paper proposes
+a novel process control and monitoring method for the complex structure of
+high-dimensional, image-based overlay errors (modeled in tensor form) that
+are collected in semiconductor manufacturing processes. The proposed method
+aims to reduce overlay errors using limited control recipes. We first build a
+high-dimensional process model and propose different tensor-on-vector
+regression algorithms to estimate its parameters and alleviate the curse of
+dimensionality. Then, based on the estimated tensor parameters, an
+exponentially weighted moving average (EWMA) controller for tensor data is
+designed, whose stability is theoretically guaranteed. Because
+low-dimensional control recipes cannot compensate for all high-dimensional
+disturbances on the image, control residuals are monitored to prevent
+significant drifts of uncontrollable high-dimensional disturbances. Through
+extensive simulations and real case studies, the performance of the parameter
+estimation algorithms and the EWMA controller in tensor space is evaluated.
+Compared with existing image-based feedback controllers, the superiority of
+our method is verified, especially when disturbances are not stable.
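+ For intuition, the scalar run-to-run EWMA controller that this work lifts
+to tensor-valued data looks as follows; the process gain, smoothing constant,
+and disturbance model below are illustrative stand-ins:
+
+import numpy as np
+
+def ewma_control(observe, beta, lam, target=0.0, runs=50):
+    # a: EWMA estimate of the disturbance; u: control recipe chosen
+    # so that beta * u cancels the estimated disturbance.
+    a, u, ys = 0.0, 0.0, []
+    for _ in range(runs):
+        y = observe(u)                            # measured overlay error
+        a = lam * (y - beta * u) + (1 - lam) * a  # update disturbance estimate
+        u = (target - a) / beta                   # next recipe
+        ys.append(y)
+    return ys
+
+rng = np.random.default_rng(0)
+ys = ewma_control(lambda u: 2.0 * u + 1.0 + 0.1 * rng.standard_normal(),
+                  beta=2.0, lam=0.3)
+print(round(ys[0], 2), round(ys[-1], 2))  # error shrinks toward target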
+
+ comment: 30 pages, 5 figures +
+
+
+
+
+ + ☆ Rethinking Channel Dependence for Multivariate Time Series Forecasting: + Learning from Leading Indicators ICLR 2024 + + +
+ Recently, channel-independent methods have achieved state-of-the-art
+performance in multivariate time series (MTS) forecasting. Despite reducing
+overfitting risks, these methods miss potential opportunities to exploit
+channel dependence for accurate predictions. We argue that there exist
+locally stationary lead-lag relationships between variates, i.e., some lagged
+variates may follow the leading indicators within a short time period.
+Exploiting such channel dependence is beneficial, since leading indicators
+offer advance information that can be used to reduce the forecasting
+difficulty of the lagged variates. In this paper, we propose a new method
+named LIFT that first efficiently estimates leading indicators and their
+leading steps at each time step and then judiciously allows the lagged
+variates to utilize the advance information from the leading indicators. LIFT
+serves as a plugin that can be seamlessly combined with arbitrary time series
+forecasting methods. Extensive experiments on six real-world datasets
+demonstrate that LIFT improves the state-of-the-art methods by 5.5% in
+average forecasting performance.
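+ A simple way to picture the lead-lag estimation step is a scan over lagged
+cross-correlations; the sketch below is a generic stand-in for LIFT's (more
+efficient) estimator:
+
+import numpy as np
+
+def best_lead_steps(x_lead, x_lag, max_steps=16):
+    # Return the shift s maximizing |corr(x_lead[t], x_lag[t+s])|,
+    # i.e., how many steps x_lead appears to lead x_lag.
+    return max(range(1, max_steps + 1),
+               key=lambda s: abs(np.corrcoef(x_lead[:-s], x_lag[s:])[0, 1]))
+
+t = np.arange(500)
+lead = np.sin(0.1 * t)
+lag = np.sin(0.1 * (t - 7)) + 0.05 * np.random.randn(500)  # trails by 7
+print(best_lead_steps(lead, lag))  # -> 7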
+
+ comment: Accepted to ICLR 2024. Preprint version +
+
+
+
+
+ + ☆ Effective Multi-Stage Training Model For Edge Computing Devices In + Intrusion Detection + + +
+ Intrusion detection poses a significant challenge within expansive and +persistently interconnected environments. As malicious code continues to +advance and sophisticated attack methodologies proliferate, various advanced +deep learning-based detection approaches have been proposed. Nevertheless, the +complexity and accuracy of intrusion detection models still need further +enhancement to render them more adaptable to diverse system categories, +particularly within resource-constrained devices, such as those embedded in +edge computing systems. This research introduces a three-stage training +paradigm, augmented by an enhanced pruning methodology and model compression +techniques. The objective is to elevate the system's effectiveness, +concurrently maintaining a high level of accuracy for intrusion detection. +Empirical assessments conducted on the UNSW-NB15 dataset evince that this +solution notably reduces the model's dimensions, while upholding accuracy +levels equivalent to similar proposals. + +
+
+
+
+
+ + ☆ Trainable Fixed-Point Quantization for Deep Learning Acceleration on + FPGAs + + +
+ Quantization is a crucial technique for deploying deep learning models on
+resource-constrained devices, such as embedded FPGAs. Prior efforts mostly
+focus on quantizing matrix multiplications, leaving other layers such as
+BatchNorm or shortcuts in floating-point form, even though fixed-point
+arithmetic is more efficient on FPGAs. A common practice is to fine-tune a
+pre-trained model to fixed point for FPGA deployment, but this can degrade
+accuracy.
+ This work presents QFX, a novel trainable fixed-point quantization approach
+that automatically learns the binary-point position during model training.
+Additionally, we introduce a multiplier-free quantization strategy within QFX
+to minimize DSP usage. QFX is implemented as a PyTorch-based library that
+efficiently emulates fixed-point arithmetic, supported by FPGA HLS, in a
+differentiable manner during backpropagation. With minimal effort, models
+trained with QFX can readily be deployed through HLS, producing the same
+numerical results as their software counterparts. Our evaluation shows that,
+compared to post-training quantization, QFX can quantize element-wise layers
+to fewer bits while achieving higher accuracy on both the CIFAR-10 and
+ImageNet datasets. We further demonstrate the efficacy of multiplier-free
+quantization using a state-of-the-art binarized neural network accelerator
+designed for an embedded FPGA (AMD Xilinx Ultra96 v2). We plan to release QFX
+in open-source form.
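+ To illustrate the core idea of a learnable binary-point position, here is a
+small PyTorch sketch of fake fixed-point quantization with a straight-through
+estimator; this is our own schematic, not the QFX library's API:
+
+import torch
+import torch.nn as nn
+
+class FixedPointQuantSketch(nn.Module):
+    # Fake-quantize to `word_bits` fixed point; the number of fractional
+    # bits (i.e., the binary-point position) is a trainable parameter.
+    def __init__(self, word_bits=8):
+        super().__init__()
+        self.word_bits = word_bits
+        self.frac_bits = nn.Parameter(torch.tensor(4.0))
+
+    def forward(self, x):
+        scale = 2.0 ** self.frac_bits
+        xs = x * scale
+        rounded = xs + (torch.round(xs) - xs).detach()  # STE through round()
+        qmax = 2 ** (self.word_bits - 1) - 1
+        return torch.clamp(rounded, -qmax - 1, qmax) / scale
+
+q = FixedPointQuantSketch()
+q(torch.randn(4, 4)).sum().backward()
+print(q.frac_bits.grad is not None)  # True: gradient reaches the binary point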
+
+
+
+
+ + ☆ Data-Effective Learning: A Comprehensive Medical Benchmark + + +
+ Data-effective learning aims to use data in the most impactful way to train
+AI models: it involves strategies that focus on data quality rather than
+quantity, ensuring that the data used for training has high informational
+value. Data-effective learning plays a profound role in accelerating AI
+training, reducing computational costs, and saving data storage, which is
+especially important as the volume of medical data in recent years has grown
+beyond many people's expectations. However, due to the lack of standards and
+a comprehensive benchmark, research on medical data-effective learning
+remains understudied. To address this gap, our paper introduces a
+comprehensive benchmark specifically for evaluating data-effective learning
+in the medical field. This benchmark includes a dataset with millions of data
+samples from 31 medical centers (DataDEL), a baseline method for comparison
+(MedDEL), and a new evaluation metric (NormDEL) to objectively measure
+data-effective learning performance. Our extensive experimental results show
+that the baseline MedDEL can achieve performance comparable to the original
+large dataset with only 5% of the data. Establishing such an open
+data-effective learning benchmark is crucial for the medical AI research
+community, because it facilitates efficient data use, promotes collaborative
+breakthroughs, and fosters the development of cost-effective, scalable, and
+impactful healthcare solutions. The project can be accessed at
+https://github.com/shadow2469/Data-Effective-Learning-A-Comprehensive-Medical-Benchmark.git.
+
+
+
+
+ + ☆ Towards Understanding Variants of Invariant Risk Minimization through + the Lens of Calibration + + +
+ Machine learning models traditionally assume that training and test data are
+independently and identically distributed. However, in real-world applications,
+the test distribution often differs from training. This problem, known as
+out-of-distribution generalization, challenges conventional models. Invariant
+Risk Minimization (IRM) emerges as a solution, aiming to identify features
+invariant across different environments to enhance out-of-distribution
+robustness. However, IRM's complexity, particularly its bi-level optimization,
+has led to the development of various approximate methods. Our study
+investigates these approximate IRM techniques, employing the Expected
+Calibration Error (ECE) as a key metric. ECE, which measures the reliability of
+model predictions, serves as an indicator of whether models effectively capture
+environment-invariant features. Through a comparative analysis of datasets with
+distributional shifts, we observe that Information Bottleneck-based IRM, which
+condenses representational information, strikes a balance, improving ECE while
+largely preserving accuracy. This finding is pivotal, as it demonstrates a
+feasible path to maintaining robustness without compromising accuracy.
+Nonetheless, our experiments also caution against over-regularization, which
+can diminish accuracy. This underscores the necessity for a systematic approach
+in evaluating out-of-distribution generalization metrics, one that goes beyond
+mere accuracy to address the nuanced interplay between accuracy and
+calibration.
+
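+For reference, a sketch of ECE with equal-width confidence bins (the standard
+definition; the paper's exact binning may differ):
+
+```python
+import numpy as np
+
+def ece(confidences, predictions, labels, n_bins=10):
+    """Weighted average over bins of |accuracy - mean confidence|."""
+    edges = np.linspace(0.0, 1.0, n_bins + 1)
+    total = 0.0
+    for lo, hi in zip(edges[:-1], edges[1:]):
+        mask = (confidences > lo) & (confidences <= hi)
+        if mask.any():
+            acc = (predictions[mask] == labels[mask]).mean()
+            conf = confidences[mask].mean()
+            total += mask.mean() * abs(acc - conf)
+    return total
+
+rng = np.random.default_rng(0)
+conf = rng.uniform(0.5, 1.0, 1000)
+pred = rng.integers(0, 2, 1000)
+# Labels agree with predictions at roughly the stated confidence.
+lab = np.where(rng.uniform(size=1000) < conf, pred, 1 - pred)
+print(ece(conf, pred, lab))   # small value -> well calibrated
+```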
+
+
+
+
+ + ☆ Enhancing Score-Based Sampling Methods with Ensembles + + +
+ We introduce ensembles within score-based sampling methods to develop +gradient-free approximate sampling techniques that leverage the collective +dynamics of particle ensembles to compute approximate reverse diffusion drifts. +We introduce the underlying methodology, emphasizing its relationship with +generative diffusion models and the previously introduced F\"ollmer sampler. We +demonstrate the efficacy of ensemble strategies through various examples, +ranging from low- to medium-dimensionality sampling problems, including +multi-modal and highly non-Gaussian probability distributions, and provide +comparisons to traditional methods like NUTS. Our findings highlight the +potential of ensemble strategies for modeling complex probability distributions +in situations where gradients are unavailable. Finally, we showcase its +application in the context of Bayesian inversion problems within the +geophysical sciences. + +
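+A gradient-free drift needs a score estimate from the particles alone. One
+crude stand-in, sketched here under a Gaussian assumption (the paper's actual
+drift construction is more elaborate):
+
+```python
+import numpy as np
+
+def ensemble_score(particles, x, eps=1e-6):
+    """Score of a Gaussian fitted to the ensemble, evaluated at x."""
+    mu = particles.mean(axis=0)
+    cov = np.cov(particles.T) + eps * np.eye(particles.shape[1])
+    return -np.linalg.solve(cov, x - mu)   # -cov^{-1} (x - mu)
+
+rng = np.random.default_rng(0)
+ens = rng.normal(loc=[2.0, -1.0], scale=0.5, size=(256, 2))
+print(ensemble_score(ens, np.zeros(2)))    # points toward the ensemble mean
+```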
+
+
+
+
+ + ☆ Game-Theoretic Unlearnable Example Generator + + +
+ Unlearnable example attacks are data poisoning attacks aiming to degrade the
+clean test accuracy of deep learning by adding imperceptible perturbations to
+the training samples, which can be formulated as a bi-level optimization
+problem. However, directly solving this optimization problem is intractable for
+deep neural networks. In this paper, we investigate unlearnable example attacks
+from a game-theoretic perspective, by formulating the attack as a non-zero-sum
+Stackelberg game. First, the existence of game equilibria is proved under the
+normal setting and the adversarial training setting. It is shown that the game
+equilibrium gives the most powerful poison attack in that the victim has the
+lowest test accuracy among all networks within the same hypothesis space, when
+certain loss functions are used. Second, we propose a novel attack method,
+called the Game Unlearnable Example (GUE), which has three main ingredients.
+(1) The poisons are obtained by directly solving the equilibrium of the
+Stackelberg game with a first-order algorithm. (2) We employ an
+autoencoder-like generative network model as the poison attacker. (3) A novel
+payoff function is introduced to evaluate the performance of the poison.
+Comprehensive experiments demonstrate that GUE can effectively poison the
+model in various scenarios. Furthermore, GUE still works when using a
+relatively small percentage of the training data to train the generator, and
+the poison generator can generalize well to unseen data. Our implementation
+code can be found at https://github.com/hong-xian/gue.
+
+
+
+
+
+ + ♻ ☆ High-Quality Image Restoration Following Human Instructions + + +
+ Image restoration is a fundamental problem that involves recovering a +high-quality clean image from its degraded observation. All-In-One image +restoration models can effectively restore images from various types and levels +of degradation using degradation-specific information as prompts to guide the +restoration model. In this work, we present the first approach that uses +human-written instructions to guide the image restoration model. Given natural +language prompts, our model can recover high-quality images from their degraded +counterparts, considering multiple degradation types. Our method, InstructIR, +achieves state-of-the-art results on several restoration tasks including image +denoising, deraining, deblurring, dehazing, and (low-light) image enhancement. +InstructIR improves +1dB over previous all-in-one restoration methods. +Moreover, our dataset and results represent a novel benchmark for new research +on text-guided image restoration and enhancement. Our code, datasets and models +are available at: https://github.com/mv-lab/InstructIR + +
+
+
+
+
+ + ♻ ☆ Privacy Risks Analysis and Mitigation in Federated Learning for Medical + Images + + +
+ Federated learning (FL) is gaining increasing popularity in the medical +domain for analyzing medical images, which is considered an effective technique +to safeguard sensitive patient data and comply with privacy regulations. +However, several recent studies have revealed that the default settings of FL +may leak private training data under privacy attacks. Thus, it is still unclear +whether and to what extent such privacy risks of FL exist in the medical +domain, and if so, "how to mitigate such risks?". In this paper, first, we +propose a holistic framework for Medical data Privacy risk analysis and +mitigation in Federated Learning (MedPFL) to analyze privacy risks and develop +effective mitigation strategies in FL for protecting private medical data. +Second, we demonstrate the substantial privacy risks of using FL to process +medical images, where adversaries can easily perform privacy attacks to +reconstruct private medical images accurately. Third, we show that the defense +approach of adding random noises may not always work effectively to protect +medical images against privacy attacks in FL, which poses unique and pressing +challenges associated with medical data for privacy protection. + +
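+The simple defense examined in the paper amounts to perturbing what a client
+shares; a minimal sketch (the noise scale is an arbitrary placeholder):
+
+```python
+import torch
+
+def noisy_gradients(grads, sigma=0.05):
+    """Add i.i.d. Gaussian noise to each gradient tensor before sharing."""
+    return [g + sigma * torch.randn_like(g) for g in grads]
+
+w = torch.randn(5, 3, requires_grad=True)
+loss = (w ** 2).sum()
+loss.backward()
+shared = noisy_gradients([w.grad])   # what would leave the client
+print(shared[0])
+```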
+
+ comment: V1 +
+
+
+
+
+ + ♻ ☆ Deep Network Approximation: Beyond ReLU to Diverse Activation Functions + + +
+ This paper explores the expressive power of deep neural networks for a +diverse range of activation functions. An activation function set $\mathscr{A}$ +is defined to encompass the majority of commonly used activation functions, +such as $\mathtt{ReLU}$, $\mathtt{LeakyReLU}$, $\mathtt{ReLU}^2$, +$\mathtt{ELU}$, $\mathtt{CELU}$, $\mathtt{SELU}$, $\mathtt{Softplus}$, +$\mathtt{GELU}$, $\mathtt{SiLU}$, $\mathtt{Swish}$, $\mathtt{Mish}$, +$\mathtt{Sigmoid}$, $\mathtt{Tanh}$, $\mathtt{Arctan}$, $\mathtt{Softsign}$, +$\mathtt{dSiLU}$, and $\mathtt{SRS}$. We demonstrate that for any activation +function $\varrho\in \mathscr{A}$, a $\mathtt{ReLU}$ network of width $N$ and +depth $L$ can be approximated to arbitrary precision by a $\varrho$-activated +network of width $3N$ and depth $2L$ on any bounded set. This finding enables +the extension of most approximation results achieved with $\mathtt{ReLU}$ +networks to a wide variety of other activation functions, albeit with slightly +increased constants. Significantly, we establish that the (width,$\,$depth) +scaling factors can be further reduced from $(3,2)$ to $(1,1)$ if $\varrho$ +falls within a specific subset of $\mathscr{A}$. This subset includes +activation functions such as $\mathtt{ELU}$, $\mathtt{CELU}$, $\mathtt{SELU}$, +$\mathtt{Softplus}$, $\mathtt{GELU}$, $\mathtt{SiLU}$, $\mathtt{Swish}$, and +$\mathtt{Mish}$. + +
+
+
+
+
+ + ♻ ☆ Baichuan2-Sum: Instruction Finetune Baichuan2-7B Model for Dialogue + Summarization + + +
+ Large language models (LLMs) like Llama, Baichuan and Bloom show remarkable
+ability with instruction fine-tuning in many natural language tasks.
+Nevertheless, for the dialogue summarization task, which aims to generate
+summaries for different roles in a dialogue, most state-of-the-art methods are
+built on small models (e.g., BART and BERT). Existing methods add
+task-specific optimizations to these small models, such as global-local
+centrality scores. In this paper, we propose an instruction fine-tuned model,
+Baichuan2-Sum, for role-oriented dialogue summarization. By setting different
+instructions for different roles, the model can learn from the dialogue
+interactions and output the expected summaries. Furthermore, we apply the
+NEFTune technique to add suitable noise during training to improve the
+results. The experiments demonstrate that the proposed model achieves new
+state-of-the-art results on two public dialogue summarization datasets: CSDS
+and SAMSUM. We release our model and related code to facilitate future studies
+on the dialogue summarization task.
+
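+NEFTune itself is a one-line idea: add uniform noise of magnitude
+alpha / sqrt(seq_len * dim) to token embeddings during fine-tuning. A sketch
+(standalone function, not the Baichuan2-Sum training code):
+
+```python
+import torch
+
+def neftune_noise(embeds: torch.Tensor, alpha: float = 5.0) -> torch.Tensor:
+    """embeds: (batch, seq_len, dim) token embeddings; training time only."""
+    _, seq_len, dim = embeds.shape
+    mag = alpha / (seq_len * dim) ** 0.5
+    return embeds + torch.empty_like(embeds).uniform_(-mag, mag)
+
+x = torch.randn(2, 16, 64)
+print(neftune_noise(x).shape)   # torch.Size([2, 16, 64])
+```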
+
+
+
+
+ + ♻ ☆ Domain-Generalizable Multiple-Domain Clustering + + +
+ This work generalizes the problem of unsupervised domain generalization to +the case in which no labeled samples are available (completely unsupervised). +We are given unlabeled samples from multiple source domains, and we aim to +learn a shared predictor that assigns examples to semantically related +clusters. Evaluation is done by predicting cluster assignments in previously +unseen domains. Towards this goal, we propose a two-stage training framework: +(1) self-supervised pre-training for extracting domain invariant semantic +features. (2) multi-head cluster prediction with pseudo labels, which rely on +both the feature space and cluster head prediction, further leveraging a novel +prediction-based label smoothing scheme. We demonstrate empirically that our +model is more accurate than baselines that require fine-tuning using samples +from the target domain or some level of supervision. Our code is available at +https://github.com/AmitRozner/domain-generalizable-multiple-domain-clustering. + +
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Extracting Dynamical Models from Data + + +
+ The problem of determining the underlying dynamics of a system when only
+given data of its state over time has challenged scientists for decades. In
+this paper, the approach of using machine learning to model the updates of the
+phase space variables as functions of those same variables is introduced.
+(More generally, the modeling is done over functions of the jet space.) This
+approach (named FJet) allows one to accurately replicate the dynamics, and is
+demonstrated on the examples of the damped harmonic oscillator, the damped
+pendulum, and the Duffing oscillator; the underlying differential equation is
+also accurately recovered for each example. In addition, the results in no way
+depend on how the data is sampled over time (i.e., regularly or irregularly).
+It is demonstrated that a regression implementation of FJet is similar to the
+model resulting from a Taylor series expansion of the Runge-Kutta (RK)
+numerical integration scheme. This identification confers the advantage of
+explicitly revealing the function space to use in the modeling, as well as the
+associated uncertainty quantification for the updates. Finally, it is shown in
+the undamped harmonic oscillator example that the updates remain stable $10^9$
+times longer than with $4$th-order RK (with time step $0.1$).
+
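+A toy version of this regression view, sketched for the damped harmonic
+oscillator (the feature set and step size are assumptions, not the paper's
+setup): learn the map from a state (x, v) to its update (dx, dv), then roll
+the learned map forward.
+
+```python
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures
+
+dt, gamma = 0.1, 0.1
+def step(x, v):                       # ground truth used to generate data
+    return x + v * dt, v + (-x - gamma * v) * dt
+
+rng = np.random.default_rng(0)
+states = rng.uniform(-2, 2, size=(2000, 2))          # random (x, v) pairs
+nxt = np.array([step(x, v) for x, v in states])
+updates = nxt - states                                # (dx, dv) targets
+
+model = make_pipeline(PolynomialFeatures(2), LinearRegression())
+model.fit(states, updates)
+
+s = np.array([[1.0, 0.0]])
+for _ in range(5):                                    # roll the model forward
+    s = s + model.predict(s)
+print(s)
+```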
+
+ comment: 19 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Predicting small molecules solubilities on endpoint devices using deep + ensemble neural networks + + +
+ Aqueous solubility is a valuable yet challenging property to predict. +Computing solubility using first-principles methods requires accounting for the +competing effects of entropy and enthalpy, resulting in long computations for +relatively poor accuracy. Data-driven approaches, such as deep learning, offer +improved accuracy and computational efficiency but typically lack uncertainty +quantification. Additionally, ease of use remains a concern for any +computational technique, resulting in the sustained popularity of group-based +contribution methods. In this work, we addressed these problems with a deep +learning model with predictive uncertainty that runs on a static website +(without a server). This approach moves computing needs onto the website +visitor without requiring installation, removing the need to pay for and +maintain servers. Our model achieves satisfactory results in solubility +prediction. Furthermore, we demonstrate how to create molecular property +prediction models that balance uncertainty and ease of use. The code is +available at https://github.com/ur-whitelab/mol.dev, and the model is usable at +https://mol.dev. + +
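+A deep ensemble is the simplest route to the predictive uncertainty mentioned
+above: train several independently seeded models and read off the spread of
+their outputs. A sketch with stand-in features, not the authors' solubility
+model:
+
+```python
+import numpy as np
+from sklearn.neural_network import MLPRegressor
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(500, 8))                  # stand-in molecular features
+y = X[:, 0] ** 2 + rng.normal(scale=0.1, size=500)
+
+ensemble = [
+    MLPRegressor(hidden_layer_sizes=(32,), random_state=s,
+                 max_iter=2000).fit(X, y)
+    for s in range(5)
+]
+preds = np.stack([m.predict(X[:3]) for m in ensemble])
+print("mean:", preds.mean(axis=0))
+print("std (uncertainty):", preds.std(axis=0))
+```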
+
+
+
+
+ + ♻ ☆ Try with Simpler -- An Evaluation of Improved Principal Component + Analysis in Log-based Anomaly Detection + + +
+ The rapid growth of deep learning (DL) has spurred interest in enhancing +log-based anomaly detection. This approach aims to extract meaning from log +events (log message templates) and develop advanced DL models for anomaly +detection. However, these DL methods face challenges like heavy reliance on +training data, labels, and computational resources due to model complexity. In +contrast, traditional machine learning and data mining techniques are less +data-dependent and more efficient but less effective than DL. To make log-based +anomaly detection more practical, the goal is to enhance traditional techniques +to match DL's effectiveness. Previous research in a different domain (linking +questions on Stack Overflow) suggests that optimized traditional techniques can +rival state-of-the-art DL methods. Drawing inspiration from this concept, we +conducted an empirical study. We optimized the unsupervised PCA (Principal +Component Analysis), a traditional technique, by incorporating lightweight +semantic-based log representation. This addresses the issue of unseen log +events in training data, enhancing log representation. Our study compared seven +log-based anomaly detection methods, including four DL-based, two traditional, +and the optimized PCA technique, using public and industrial datasets. Results +indicate that the optimized unsupervised PCA technique achieves similar +effectiveness to advanced supervised/semi-supervised DL methods while being +more stable with limited training data and resource-efficient. This +demonstrates the adaptability and strength of traditional techniques through +small yet impactful adaptations. + +
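+The PCA baseline is easy to state: project onto the principal subspace of
+normal data and flag samples with a large reconstruction residual. A sketch
+with synthetic features (the paper's optimized variant additionally uses
+semantic log representations):
+
+```python
+import numpy as np
+from sklearn.decomposition import PCA
+
+rng = np.random.default_rng(0)
+normal = rng.normal(size=(1000, 20))           # stand-in log-event features
+anomalies = rng.normal(loc=4.0, size=(10, 20))
+
+pca = PCA(n_components=5).fit(normal)
+
+def residual(X):
+    recon = pca.inverse_transform(pca.transform(X))
+    return np.linalg.norm(X - recon, axis=1)   # residual = anomaly score
+
+thr = np.quantile(residual(normal), 0.99)
+print("flagged:", (residual(anomalies) > thr).mean())
+```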
+
+ comment: Accepted by TOSEM +
+
+
+
+
+ + ♻ ☆ On the Generalizability of ECG-based Stress Detection Models ICML + + +
+ Stress is prevalent in many aspects of everyday life including work, +healthcare, and social interactions. Many works have studied handcrafted +features from various bio-signals that are indicators of stress. Recently, deep +learning models have also been proposed to detect stress. Typically, stress +models are trained and validated on the same dataset, often involving one +stressful scenario. However, it is not practical to collect stress data for +every scenario. So, it is crucial to study the generalizability of these models +and determine to what extent they can be used in other scenarios. In this +paper, we explore the generalization capabilities of Electrocardiogram +(ECG)-based deep learning models and models based on handcrafted ECG features, +i.e., Heart Rate Variability (HRV) features. To this end, we train three HRV +models and two deep learning models that use ECG signals as input. We use ECG +signals from two popular stress datasets - WESAD and SWELL-KW - differing in +terms of stressors and recording devices. First, we evaluate the models using +leave-one-subject-out (LOSO) cross-validation using training and validation +samples from the same dataset. Next, we perform a cross-dataset validation of +the models, that is, LOSO models trained on the WESAD dataset are validated +using SWELL-KW samples and vice versa. While deep learning models achieve the +best results on the same dataset, models based on HRV features considerably +outperform them on data from a different dataset. This trend is observed for +all the models on both datasets. Therefore, HRV models are a better choice for +stress recognition in applications that are different from the dataset +scenario. To the best of our knowledge, this is the first work to compare the +cross-dataset generalizability between ECG-based deep learning models and HRV +models. + +
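+Leave-one-subject-out evaluation, the protocol used throughout, is sketched
+below with placeholder HRV-style features and labels:
+
+```python
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import LeaveOneGroupOut
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(300, 10))              # stand-in HRV features
+subjects = np.repeat(np.arange(15), 20)     # 15 subjects, 20 windows each
+y = rng.integers(0, 2, size=300)            # stress / no-stress labels
+
+scores = []
+for tr, te in LeaveOneGroupOut().split(X, y, groups=subjects):
+    clf = RandomForestClassifier(random_state=0).fit(X[tr], y[tr])
+    scores.append(clf.score(X[te], y[te]))
+print("LOSO accuracy: %.3f +/- %.3f" % (np.mean(scores), np.std(scores)))
+```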
+
+ comment: Published in Proceedings of 2022 21st IEEE International Conference + on Machine Learning and Applications (ICMLA) +
+
+
+
+
+ + ♻ ☆ An Empathetic AI Coach for Self-Attachment Therapy + + +
+ In this work, we present a new dataset and a computational strategy for a +digital coach that aims to guide users in practicing the protocols of +self-attachment therapy. Our framework augments a rule-based conversational +agent with a deep-learning classifier for identifying the underlying emotion in +a user's text response, as well as a deep-learning assisted retrieval method +for producing novel, fluent and empathetic utterances. We also craft a set of +human-like personas that users can choose to interact with. Our goal is to +achieve a high level of engagement during virtual therapy sessions. We evaluate +the effectiveness of our framework in a non-clinical trial with N=16 +participants, all of whom have had at least four interactions with the agent +over the course of five days. We find that our platform is consistently rated +higher for empathy, user engagement and usefulness than the simple rule-based +framework. Finally, we provide guidelines to further improve the design and +performance of the application, in accordance with the feedback received. + +
+
+
+
+
+ + ♻ ☆ A Generic Machine Learning Framework for Fully-Unsupervised Anomaly + Detection with Contaminated Data + + +
+ Anomaly detection (AD) tasks have been solved using machine learning +algorithms in various domains and applications. The great majority of these +algorithms use normal data to train a residual-based model and assign anomaly +scores to unseen samples based on their dissimilarity with the learned normal +regime. The underlying assumption of these approaches is that anomaly-free data +is available for training. This is, however, often not the case in real-world +operational settings, where the training data may be contaminated with an +unknown fraction of abnormal samples. Training with contaminated data, in turn, +inevitably leads to a deteriorated AD performance of the residual-based +algorithms. + In this paper we introduce a framework for a fully unsupervised refinement of +contaminated training data for AD tasks. The framework is generic and can be +applied to any residual-based machine learning model. We demonstrate the +application of the framework to two public datasets of multivariate time series +machine data from different application fields. We show its clear superiority +over the naive approach of training with contaminated data without refinement. +Moreover, we compare it to the ideal, unrealistic reference in which +anomaly-free data would be available for training. The method is based on +evaluating the contribution of individual samples to the generalization ability +of a given model, and contrasting the contribution of anomalies with the one of +normal samples. As a result, the proposed approach is comparable to, and often +outperforms training with normal samples only. + +
+
+
+
+
+ + ♻ ☆ Efficiently Solving High-Order and Nonlinear ODEs with Rational Fraction + Polynomial: the Ratio Net + + +
+ Recent advances in solving ordinary differential equations (ODEs) with neural
+networks have been remarkable. Neural networks excel at serving as trial
+functions and approximating solutions within functional spaces, aided by
+gradient backpropagation algorithms. However, challenges remain in solving
+complex ODEs, including high-order and nonlinear cases, emphasizing the need
+for improved efficiency and effectiveness. Traditional methods have typically
+relied on established knowledge integration to improve problem-solving
+efficiency. In contrast, this study takes a different approach by introducing a
+new neural network architecture for constructing trial functions, known as
+ratio net. This architecture draws inspiration from rational fraction
+polynomial approximation functions, specifically the Padé approximant.
+Empirical trials demonstrate that the proposed method exhibits higher
+efficiency compared to existing approaches, including polynomial-based and
+multilayer perceptron (MLP) neural network-based methods. The ratio net holds
+promise for advancing the efficiency and effectiveness of solving differential
+equations.
+
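+A minimal reading of the ratio-net idea: output a ratio of two learned maps,
+with the denominator kept away from zero. The architecture below is a sketch
+under that assumption, not the authors' implementation:
+
+```python
+import torch
+import torch.nn as nn
+
+class RatioNet(nn.Module):
+    def __init__(self, dim_in=1, hidden=16):
+        super().__init__()
+        self.p = nn.Sequential(nn.Linear(dim_in, hidden), nn.Tanh(),
+                               nn.Linear(hidden, 1))
+        self.q = nn.Sequential(nn.Linear(dim_in, hidden), nn.Tanh(),
+                               nn.Linear(hidden, 1))
+
+    def forward(self, x):
+        # 1 + |q| keeps the denominator bounded away from zero.
+        return self.p(x) / (1.0 + self.q(x).abs())
+
+net = RatioNet()
+x = torch.linspace(0, 1, 32).unsqueeze(1).requires_grad_(True)
+u = net(x)
+# du/dx via autograd, as needed inside an ODE residual loss.
+du = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
+print(u.shape, du.shape)
+```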
+
+
+
+
+ + ♻ ☆ ConcatPlexer: Additional Dim1 Batching for Faster ViTs + + +
+ Transformers have demonstrated tremendous success not only in the natural
+language processing (NLP) domain but also in the field of computer vision,
+igniting various creative approaches and applications. Yet, the superior
+performance and modeling flexibility of transformers came with a severe
+increase in computation costs, and hence several works have proposed methods to
+reduce this burden. Inspired by a cost-cutting method originally proposed for
+language models, Data Multiplexing (DataMUX), we propose a novel approach for
+efficient visual recognition that employs additional dim1 batching (i.e.,
+concatenation) that greatly improves the throughput with little compromise in
+the accuracy. We first introduce a naive adaptation of DataMux for vision
+models, Image Multiplexer, and devise novel components to overcome its
+weaknesses, rendering our final model, ConcatPlexer, at the sweet spot between
+inference speed and accuracy. ConcatPlexer was trained on the ImageNet1K and
+CIFAR100 datasets, achieving 23.5% fewer GFLOPs than ViT-B/16 with 69.5% and
+83.4% validation accuracy, respectively.
+
+
+
+
+
+ + ♻ ☆ Efficient Learning of Long-Range and Equivariant Quantum Systems + + +
+ In this work, we consider a fundamental task in quantum many-body physics - +finding and learning ground states of quantum Hamiltonians and their +properties. Recent works have studied the task of predicting the ground state +expectation value of sums of geometrically local observables by learning from +data. For short-range gapped Hamiltonians, a sample complexity that is +logarithmic in the number of qubits and quasipolynomial in the error was +obtained. Here we extend these results beyond the local requirements on both +Hamiltonians and observables, motivated by the relevance of long-range +interactions in molecular and atomic systems. For interactions decaying as a +power law with exponent greater than twice the dimension of the system, we +recover the same efficient logarithmic scaling with respect to the number of +qubits, but the dependence on the error worsens to exponential. Further, we +show that learning algorithms equivariant under the automorphism group of the +interaction hypergraph achieve a sample complexity reduction, leading in +particular to a constant number of samples for learning sums of local +observables in systems with periodic boundary conditions. We demonstrate the +efficient scaling in practice by learning from DMRG simulations of $1$D +long-range and disordered systems with up to $128$ qubits. Finally, we provide +an analysis of the concentration of expectation values of global observables +stemming from the central limit theorem, resulting in increased prediction +accuracy. + +
+
+ comment: 51 pages +
+
+
+
+
+ + ♻ ☆ Fundamental Limits of Membership Inference Attacks on Machine Learning + Models + + +
+ Membership inference attacks (MIA) can reveal whether a particular data point +was part of the training dataset, potentially exposing sensitive information +about individuals. This article provides theoretical guarantees by exploring +the fundamental statistical limitations associated with MIAs on machine +learning models. More precisely, we first derive the statistical quantity that +governs the effectiveness and success of such attacks. We then deduce that in a +very general regression setting with overfitting algorithms, attacks may have a +high probability of success. Finally, we investigate several situations for +which we provide bounds on this quantity of interest. Our results enable us to +deduce the accuracy of potential attacks based on the number of samples and +other structural parameters of learning models. In certain instances, these +parameters can be directly estimated from the dataset. + +
+
+
+
+
+ + ♻ ☆ CARD: Channel Aligned Robust Blend Transformer for Time Series + Forecasting ICLR 2024 + + +
+ Recent studies have demonstrated the great power of Transformer models for
+time series forecasting. One of the key elements that lead to the transformer's
+success is the channel-independent (CI) strategy to improve the training
+robustness. However, ignoring the correlation among different channels limits
+the model's forecasting capacity. In this work, we design a special
+Transformer, i.e., {\bf C}hannel {\bf A}ligned {\bf R}obust Blen{\bf d}
+Transformer (CARD for short), that addresses key shortcomings of CI type
+Transformer in time series forecasting. First, CARD introduces a
+channel-aligned attention structure that allows it to capture both temporal
+correlations among signals and dynamical dependence among multiple variables
+over time. Second, in order to efficiently utilize the multi-scale knowledge,
+we design a token blend module to generate tokens with different resolutions.
+Third, we introduce a robust loss function for time series forecasting to
+alleviate the potential overfitting issue. This new loss function weights the
+importance of forecasting over a finite horizon based on prediction
+uncertainties. Our evaluation of multiple long-term and short-term forecasting
+datasets demonstrates that CARD significantly outperforms state-of-the-art time
+series forecasting methods. The code is available at the following anonymous
+repository: \url{https://anonymous.4open.science/r/CARD-6EEC}
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Variational Autoencoding of Dental Point Clouds + + +
+ Digital dentistry has made significant advancements, yet numerous challenges +remain. This paper introduces the FDI 16 dataset, an extensive collection of +tooth meshes and point clouds. Additionally, we present a novel approach: +Variational FoldingNet (VF-Net), a fully probabilistic variational autoencoder +designed for point clouds. Notably, prior latent variable models for point +clouds lack a one-to-one correspondence between input and output points. +Instead, they rely on optimizing Chamfer distances, a metric that lacks a +normalized distributional counterpart, rendering it unsuitable for +probabilistic modeling. We replace the explicit minimization of Chamfer +distances with a suitable encoder, increasing computational efficiency while +simplifying the probabilistic extension. This allows for straightforward +application in various tasks, including mesh generation, shape completion, and +representation learning. Empirically, we provide evidence of lower +reconstruction error in dental reconstruction and interpolation, showcasing +state-of-the-art performance in dental sample generation while identifying +valuable latent representations. + +
+
+
+
+
+ + ♻ ☆ RCT Rejection Sampling for Causal Estimation Evaluation + + +
+ Confounding is a significant obstacle to unbiased estimation of causal +effects from observational data. For settings with high-dimensional covariates +-- such as text data, genomics, or the behavioral social sciences -- +researchers have proposed methods to adjust for confounding by adapting machine +learning methods to the goal of causal estimation. However, empirical +evaluation of these adjustment methods has been challenging and limited. In +this work, we build on a promising empirical evaluation strategy that +simplifies evaluation design and uses real data: subsampling randomized +controlled trials (RCTs) to create confounded observational datasets while +using the average causal effects from the RCTs as ground-truth. We contribute a +new sampling algorithm, which we call RCT rejection sampling, and provide +theoretical guarantees that causal identification holds in the observational +data to allow for valid comparisons to the ground-truth RCT. Using synthetic +data, we show our algorithm indeed results in low bias when oracle estimators +are evaluated on the confounded samples, which is not always the case for a +previously proposed algorithm. In addition to this identification result, we +highlight several finite data considerations for evaluation designers who plan +to use RCT rejection sampling on their own datasets. As a proof of concept, we +implement an example evaluation pipeline and walk through these finite data +considerations with a novel, real-world RCT -- which we release publicly -- +consisting of approximately 70k observations and text data as high-dimensional +covariates. Together, these contributions build towards a broader agenda of +improved empirical evaluation for causal estimation. + +
+
+ comment: Code and data at https://github.com/kakeith/rct_rejection_sampling +
+
+
+
+
+ + ♻ ☆ Vanishing Gradients in Reinforcement Finetuning of Language Models ICLR 2024 + + +
+ Pretrained language models are commonly aligned with human preferences and
+downstream tasks via reinforcement finetuning (RFT), which refers to maximizing
+a (possibly learned) reward function using policy gradient algorithms. This
+work identifies a fundamental optimization obstacle in RFT: we prove that the
+expected gradient for an input vanishes when its reward standard deviation
+under the model is small, even if the expected reward is far from optimal.
+Through experiments on an RFT benchmark and controlled environments, as well as
+a theoretical analysis, we then demonstrate that vanishing gradients due to
+small reward standard deviation are prevalent and detrimental, leading to
+extremely slow reward maximization. Lastly, we explore ways to overcome
+vanishing gradients in RFT. We find the common practice of an initial
+supervised finetuning (SFT) phase to be the most promising candidate, which
+sheds light on its importance in an RFT pipeline. Moreover, we show that a
+relatively small number of SFT optimization steps on as few as 1% of the input
+samples can suffice, indicating that the initial SFT phase need not be
+expensive in terms of compute and data labeling efforts. Overall, our results
+emphasize that being mindful of inputs whose expected gradient vanishes, as
+measured by the reward standard deviation, is crucial for successful execution
+of RFT.
+
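+The governing quantity is easy to probe numerically. A toy Monte Carlo sketch
+(an assumed three-token "policy" and reward, not the paper's benchmark):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+probs = np.array([0.98, 0.01, 0.01])   # near-deterministic policy output
+reward = np.array([0.2, 1.0, 0.0])     # reward per sampled completion
+
+r = reward[rng.choice(3, size=10_000, p=probs)]
+print("E[r] = %.3f, std[r] = %.3f" % (r.mean(), r.std()))
+# Small std[r] -> near-vanishing expected gradient for this input, even
+# though E[r] = 0.2 is far from the optimal reward of 1.0.
+```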
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Consistent Signal Reconstruction from Streaming Multivariate Time Series + + +
+ Digitalizing real-world analog signals typically involves sampling in time
+and discretizing in amplitude. Subsequent signal reconstructions inevitably
+incur an error that depends on the amplitude resolution and the temporal
+density of the acquired samples. From an implementation viewpoint, consistent
+signal reconstruction methods have been shown to achieve a favorable
+error-rate decay as the sampling rate increases. However, these results were
+obtained in offline settings. Therefore, a research gap exists regarding
+methods for consistent signal reconstruction from data streams. Solving this
+problem is of great importance because such methods could run at a lower
+computational cost than the existing offline ones or be used under real-time
+requirements without losing the benefits of ensuring consistency. In this
+paper, we formalize for the first time the concept of consistent signal
+reconstruction from streaming time-series data. Then, we present a signal
+reconstruction method able to enforce consistency and also exploit the
+spatiotemporal dependencies of streaming multivariate time-series data to
+further reduce the signal reconstruction error. Our experiments show that our
+proposed method achieves a favorable error-rate decay with the sampling rate
+compared to a similar but non-consistent reconstruction.
+
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Federated Full-Parameter Tuning of Billion-Sized Language Models with + Communication Cost under 18 Kilobytes + + +
+ Pre-trained large language models (LLMs) need fine-tuning to improve their +responsiveness to natural language instructions. Federated learning offers a +way to fine-tune LLMs using the abundant data on end devices without +compromising data privacy. Most existing federated fine-tuning methods for LLMs +rely on parameter-efficient fine-tuning techniques, which may not reach the +performance height possible with full-parameter tuning. However, federated +full-parameter tuning of LLMs is a non-trivial problem due to the immense +communication cost. This work introduces FedKSeed that employs zeroth-order +optimization with a finite set of random seeds. It significantly reduces +transmission requirements between the server and clients to just a few random +seeds and scalar gradients, amounting to only a few thousand bytes, making +federated full-parameter tuning of billion-sized LLMs possible on devices. +Building on it, we develop a strategy enabling probability-differentiated seed +sampling, prioritizing perturbations with greater impact on model accuracy. +Experiments across six scenarios with various LLMs, datasets and data +partitions demonstrate that our approach outperforms existing federated LLM +fine-tuning methods in both communication efficiency and new task +generalization. + +
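+The communication trick can be illustrated in a few lines: with a shared
+finite seed pool, a client sends only (seed, scalar) pairs, and any party can
+regenerate the perturbation and replay the update. A toy quadratic stands in
+for the LLM objective; all constants below are assumptions:
+
+```python
+import numpy as np
+
+DIM, SEEDS, EPS, LR = 100, 64, 1e-3, 5e-3
+
+def loss(w):                                  # stand-in objective
+    return 0.5 * np.sum(w ** 2)
+
+def perturbation(seed):                       # regenerable from the seed alone
+    return np.random.default_rng(seed).standard_normal(DIM)
+
+w = np.ones(DIM)
+for step in range(200):
+    seed = step % SEEDS                       # pick a seed from the pool
+    z = perturbation(seed)
+    # Two-point zeroth-order estimate; (seed, g) is all that is transmitted.
+    g = (loss(w + EPS * z) - loss(w - EPS * z)) / (2 * EPS)
+    w -= LR * g * perturbation(seed)          # receiver replays the update
+print("final loss:", loss(w))
+```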
+
+ comment: Codes are available at + https://github.com/alibaba/FederatedScope/tree/FedKSeed. We will continuously + update the codebase and arXiv version +
+
+
+
+
+ + ♻ ☆ Causal Machine Learning for Cost-Effective Allocation of Development Aid + + +
+ The Sustainable Development Goals (SDGs) of the United Nations provide a +blueprint of a better future by 'leaving no one behind', and, to achieve the +SDGs by 2030, poor countries require immense volumes of development aid. In +this paper, we develop a causal machine learning framework for predicting +heterogeneous treatment effects of aid disbursements to inform effective aid +allocation. Specifically, our framework comprises three components: (i) a +balancing autoencoder that uses representation learning to embed +high-dimensional country characteristics while addressing treatment selection +bias; (ii) a counterfactual generator to compute counterfactual outcomes for +varying aid volumes to address small sample-size settings; and (iii) an +inference model that is used to predict heterogeneous treatment-response +curves. We demonstrate the effectiveness of our framework using data with +official development aid earmarked to end HIV/AIDS in 105 countries, amounting +to more than USD 5.2 billion. For this, we first show that our framework +successfully computes heterogeneous treatment-response curves using +semi-synthetic data. Then, we demonstrate our framework using real-world HIV +data. Our framework points to large opportunities for a more effective aid +allocation, suggesting that the total number of new HIV infections could be +reduced by up to 3.3% (~50,000 cases) compared to the current allocation +practice. + +
+
+
+
+
+ + ♻ ☆ Semantic Sensitivities and Inconsistent Predictions: Measuring the + Fragility of NLI Models EACL 2024 + + +
+ Recent studies of the emergent capabilities of transformer-based Natural
+Language Understanding (NLU) models have indicated that they have an
+understanding of lexical and compositional semantics. We provide evidence that
+suggests these claims should be taken with a grain of salt: we find that
+state-of-the-art Natural Language Inference (NLI) models are sensitive to
+minor semantics-preserving surface-form variations, which lead to sizable
+inconsistencies in model decisions during inference. Notably, this behaviour
+differs from valid and in-depth comprehension of compositional semantics, yet
+it emerges neither when evaluating model accuracy on standard benchmarks nor
+when probing for syntactic, monotonic, and logically robust reasoning. We
+propose a novel framework to measure the extent of semantic sensitivity. To
+this end, we evaluate NLI models on adversarially generated examples containing
+minor semantics-preserving surface-form input noise. This is achieved using
+conditional text generation, with the explicit condition that the NLI model
+predicts the relationship between the original and adversarial inputs as a
+symmetric equivalence entailment. We systematically study the effects of the
+phenomenon across NLI models for $\textbf{in-}$ and $\textbf{out-of-}$ domain
+settings. Our experiments show that semantic sensitivity causes performance
+degradations of $12.92\%$ and $23.71\%$ on average over $\textbf{in-}$ and
+$\textbf{out-of-}$ domain settings, respectively. We further perform ablation
+studies, analysing this phenomenon across models, datasets, and variations in
+inference and show that semantic sensitivity can lead to major inconsistency
+within model predictions.
+
+
+ comment: EACL 2024 +
+
+
+
+
+ + ♻ ☆ Wind speed super-resolution and validation: from ERA5 to CERRA via + diffusion models + + +
+ The Copernicus Regional Reanalysis for Europe, CERRA, is a high-resolution
+regional reanalysis dataset for the European domain. In recent years it has
+shown significant utility across various climate-related tasks, ranging from
+forecasting and climate change research to renewable energy prediction,
+resource management, air quality risk assessment, and the forecasting of rare
+events, among others. Unfortunately, the availability of CERRA lags two years
+behind the current date, due to constraints in acquiring the requisite
+external data and the intensive computational demands inherent in its
+generation. As a solution, this paper introduces a novel method using diffusion
+models to approximate CERRA downscaling in a data-driven manner, without
+additional information. By leveraging the lower resolution ERA5 dataset, which
+provides boundary conditions for CERRA, we approach this as a super-resolution
+task. Focusing on wind speed around Italy, our model, trained on existing CERRA
+data, shows promising results, closely mirroring original CERRA data.
+Validation with in-situ observations further confirms the model's accuracy in
+approximating ground measurements.
+
+
+
+
+
+ + ♻ ☆ What Is Fairness? On the Role of Protected Attributes and Fictitious + Worlds + + +
+ A growing body of literature in fairness-aware ML (fairML) aspires to +mitigate machine learning (ML)-related unfairness in automated decision-making +(ADM) by defining metrics that measure fairness of an ML model and by proposing +methods that ensure that trained ML models achieve low values in those metrics. +However, the underlying concept of fairness, i.e., the question of what +fairness is, is rarely discussed, leaving a considerable gap between centuries +of philosophical discussion and recent adoption of the concept in the ML +community. In this work, we try to bridge this gap by formalizing a consistent +concept of fairness and by translating the philosophical considerations into a +formal framework for the training and evaluation of ML models in ADM systems. +We derive that fairness problems can already arise without the presence of +protected attributes (PAs), pointing out that fairness and predictive +performance are not irreconcilable counterparts, but rather that the latter is +necessary to achieve the former. Moreover, we argue why and how causal +considerations are necessary when assessing fairness in the presence of PAs by +proposing a fictitious, normatively desired (FiND) world where the PAs have no +causal effects. In practice, this FiND world must be approximated by a warped +world, for which the causal effects of the PAs must be removed from the +real-world data. Eventually, we achieve greater linguistic clarity for the +discussion of fairML. We propose first algorithms for practical applications +and present illustrative experiments on COMPAS data. + +
+
+
+
+
+ + ♻ ☆ Multilingual Text-to-Image Generation Magnifies Gender Stereotypes and + Prompt Engineering May Not Help You + + +
+ Text-to-image generation models have recently achieved astonishing results in
+image quality, flexibility, and text alignment and are consequently employed in
+a fast-growing number of applications. Through improvements in multilingual
+abilities, a larger community now has access to this kind of technology. Yet,
+as we will show, multilingual models suffer similarly from (gender) biases as
+monolingual models. Furthermore, the natural expectation is that these models
+will provide similar results across languages, but this is not the case and
+there are important differences between languages. Thus, we propose a novel
+benchmark, MAGBIG, intended to foster research on multilingual models without
+gender bias. We investigate whether multilingual T2I models magnify gender bias
+with MAGBIG. To this end, we use multilingual prompts requesting portrait
+images of persons of a certain occupation or trait (using adjectives). Our
+results show not only that models deviate from the normative assumption that
+each gender should be equally likely to be generated, but that there are also
+big differences across languages. Furthermore, we investigate prompt
+engineering strategies, i.e., the use of indirect, neutral formulations, as a
+possible remedy for these biases. Unfortunately, they help only to a limited
+extent and result in worse text-to-image alignment. Consequently, this work
+calls for more research into diverse representations across languages in image
+generators.
+
+
+
+
+
+ + ♻ ☆ Fast Cell Library Characterization for Design Technology Co-Optimization + Based on Graph Neural Networks + + +
+ Design technology co-optimization (DTCO) plays a critical role in achieving
+optimal power, performance, and area (PPA) for advanced semiconductor process
+development. Cell library characterization is essential in DTCO flow, but
+traditional methods are time-consuming and costly. To overcome these
+challenges, we propose a graph neural network (GNN)-based machine learning
+model for rapid and accurate cell library characterization. Our model
+incorporates cell structures and demonstrates high prediction accuracy across
+various process-voltage-temperature (PVT) corners and technology parameters.
+Validation with 512 unseen technology corners and over one million test data
+points shows accurate predictions of delay, power, and input pin capacitance
+for 33 types of cells, with a mean absolute percentage error (MAPE) $\le$ 0.95%
+and a speed-up of 100X compared with SPICE simulations. Additionally, we
+investigate system-level metrics such as worst negative slack (WNS), leakage
+power, and dynamic power using predictions obtained from the GNN-based model on
+unseen corners. Our model achieves precise predictions, with absolute error
+$\le$3.0 ps for WNS, percentage errors $\le$0.60% for leakage power, and
+$\le$0.99% for dynamic power, when compared to the golden reference. With the
+developed model, we further propose a fine-grained drive strength
+interpolation methodology to enhance PPA for small-to-medium-scale designs,
+resulting in an approximate 1-3% improvement.
+
+
+
+
+
+ + ♻ ☆ Improving Antibody Humanness Prediction using Patent Data + + +
+ We investigate the potential of patent data for improving the antibody +humanness prediction using a multi-stage, multi-loss training process. +Humanness serves as a proxy for the immunogenic response to antibody +therapeutics, one of the major causes of attrition in drug discovery and a +challenging obstacle for their use in clinical settings. We pose the initial +learning stage as a weakly-supervised contrastive-learning problem, where each +antibody sequence is associated with possibly multiple identifiers of function +and the objective is to learn an encoder that groups them according to their +patented properties. We then freeze a part of the contrastive encoder and +continue training it on the patent data using the cross-entropy loss to predict +the humanness score of a given antibody sequence. We illustrate the utility of +the patent data and our approach by performing inference on three different +immunogenicity datasets, unseen during training. Our empirical results +demonstrate that the learned model consistently outperforms the alternative +baselines and establishes new state-of-the-art on five out of six inference +tasks, irrespective of the used metric. + +
+
+ comment: 13 pages, 6 figures, Code: https://github.com/AstraZeneca/SelfPAD +
+
+
+
+
+ + ♻ ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +processes lack transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +necessitate annotations or additional training data. The injection of the +extracted knowledge necessitates the addition of only simple neural modules. We +employ the Convex Polytopic Model (CPM) as a feature extraction tool for DST +tasks and illustrate that the acquired features correlate with the syntactic +and semantic patterns in the dialogues. This correlation facilitates a +comprehensive understanding of the linguistic features influencing the DST +model's decision-making process. We benchmark this framework on various DST +tasks and observe a notable improvement in accuracy. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ HAAQI-Net: A non-intrusive neural music quality assessment model for + hearing aids + + +
+ This paper introduces HAAQI-Net, a non-intrusive deep learning model for +music quality assessment tailored to hearing aid users. In contrast to +traditional methods like the Hearing Aid Audio Quality Index (HAAQI), HAAQI-Net +utilizes a Bidirectional Long Short-Term Memory (BLSTM) with attention. It +takes an assessed music sample and a hearing loss pattern as input, generating +a predicted HAAQI score. The model employs the pre-trained Bidirectional +Encoder representation from Audio Transformers (BEATs) for acoustic feature +extraction. Comparing predicted scores with ground truth, HAAQI-Net achieves a +Longitudinal Concordance Correlation (LCC) of 0.9368, Spearman's Rank +Correlation Coefficient (SRCC) of 0.9486, and Mean Squared Error (MSE) of +0.0064. Notably, this high performance comes with a substantial reduction in +inference time: from 62.52 seconds (by HAAQI) to 2.54 seconds (by HAAQI-Net), +serving as an efficient music quality assessment model for hearing aid users. + +
+
+
+
+
+ + ♻ ☆ Variational Transfer Learning using Cross-Domain Latent Modulation WACV + + +
+ To successfully apply trained neural network models to new domains, powerful
+transfer learning solutions are essential. We propose to introduce a novel
+cross-domain latent modulation mechanism to a variational autoencoder framework
+so as to achieve effective transfer learning. Our key idea is to procure deep
+representations from one data domain and use them to influence the
+reparameterization of the latent variable of another domain. Specifically, deep
+representations of the source and target domains are first extracted by a
+unified inference model and aligned by employing gradient reversal. The learned
+deep representations are then cross-modulated to the latent encoding of the
+alternative domain, where consistency constraints are also applied. In the
+empirical validation that includes a number of transfer learning benchmark
+tasks for unsupervised domain adaptation and image-to-image translation, our
+model demonstrates competitive performance, which is also supported by evidence
+obtained from visualization.
+
+
+ comment: Under review. Extended version of a previous WACV paper + (arXiv:2012.11727). 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Learning to Predict Gradients for Semi-Supervised Continual Learning + + +
+ A key challenge for machine intelligence is to learn new visual concepts
+without forgetting the previously acquired knowledge. Continual learning is
+aimed towards addressing this challenge. However, there is a gap between
+existing supervised continual learning and human-like intelligence, where
+humans are able to learn from both labeled and unlabeled data. How unlabeled
+data affects learning and catastrophic forgetting in the continual learning
+process remains unknown. To explore these issues, we formulate a new
+semi-supervised continual learning method, which can be generically applied to
+existing continual learning models. Specifically, a novel gradient learner
+learns from labeled data to predict gradients on unlabeled data. Hence, the
+unlabeled data can fit into the supervised continual learning method. Different
+from conventional semi-supervised settings, we do not hypothesize that the
+underlying classes, which are associated with the unlabeled data, are known to
+the learning process. In other words, the unlabeled data could be very distinct
+from the labeled data. We evaluate the proposed method on mainstream continual
+learning, adversarial continual learning, and semi-supervised learning tasks.
+The proposed method achieves state-of-the-art performance on classification
+accuracy and backward transfer in the continual learning setting while
+achieving desired performance on classification accuracy in the semi-supervised
+learning setting. This implies that the unlabeled images can enhance the
+generalizability of continual learning models on the predictive ability on
+unseen data and significantly alleviate catastrophic forgetting. The code is
+available at \url{https://github.com/luoyan407/grad_prediction.git}.
+
+
+ comment: Accepted by IEEE Transactions on Neural Networks and Learning Systems + (TNNLS) +
+
+
+
+
+ + ♻ ☆ Integral Operator Approaches for Scattered Data Fitting on Spheres + + +
+ This paper focuses on scattered data fitting problems on spheres. We study
+the approximation performance of a class of weighted spectral filter
+algorithms, including Tikhonov regularization, Landweber iteration, spectral
+cut-off, and iterated Tikhonov, in fitting noisy data with possibly unbounded
+random noise. For the analysis, we develop an integral operator approach that
+can be regarded as an extension of the widely used sampling inequality approach
+and norming set method in the community of scattered data fitting. After
+providing an equivalence between the operator differences and quadrature rules,
+we succeed in deriving optimal Sobolev-type error estimates of weighted
+spectral filter algorithms. Our derived error estimates do not suffer from the
+saturation phenomenon of Tikhonov regularization reported in the literature or
+the native-space barrier of existing error analyses, and they adapt to
+different embedding spaces. We also propose a divide-and-conquer scheme for
+weighted spectral filter algorithms to reduce their computational burden, and
+present the optimal approximation error bounds.
+
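+The filters in question share one template: damp the kernel eigenvalues with a
+method-specific factor. A toy 1-D sketch (spherical scattered data replaced by
+an interval, and the Landweber step count chosen ad hoc):
+
+```python
+import numpy as np
+
+def filter_factor(sigma, lam, method):
+    if method == "tikhonov":
+        return sigma / (sigma + lam)
+    if method == "cutoff":
+        return (sigma >= lam).astype(float)
+    if method == "landweber":                  # ~1/lam iterations
+        t = max(int(1 / lam), 1)
+        return 1 - (1 - sigma / sigma.max()) ** t
+    raise ValueError(method)
+
+rng = np.random.default_rng(0)
+x = np.sort(rng.uniform(-1, 1, 60))
+y = np.cos(np.pi * x) + 0.1 * rng.normal(size=60)   # noisy samples
+K = np.exp(-(x[:, None] - x[None, :]) ** 2 / 0.1)   # Gaussian kernel
+
+sigma, U = np.linalg.eigh(K)
+for m in ("tikhonov", "cutoff", "landweber"):
+    f = filter_factor(sigma, lam=1e-2, method=m)
+    fitted = U @ (f * (U.T @ y))                    # filtered fit at nodes
+    print(m, "MSE vs truth:", np.mean((fitted - np.cos(np.pi * x)) ** 2))
+```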
+
+
+
+
+ + ♻ ☆ Widely Linear Matched Filter: A Lynchpin towards the Interpretability of + Complex-valued CNNs + + +
+ A recent study on the interpretability of real-valued convolutional neural
+networks (CNNs) [Stankovic_Mandic_2023CNN] has revealed a direct and physically
+meaningful link with the task of finding features in data through matched
+filters. However, applying this paradigm to illuminate the interpretability of
+complex-valued CNNs meets a formidable obstacle: the extension of matched
+filtering to a general class of noncircular complex-valued data, referred to
+here as the widely linear matched filter (WLMF), has been only implicit in the
+literature. To this end, to establish the interpretability of the operation of
+complex-valued CNNs, we introduce a general WLMF paradigm, provide its solution
+and undertake analysis of its performance. For rigor, our WLMF solution is
+derived without imposing any assumption on the probability density of noise.
+The theoretical advantages of the WLMF over its standard strictly linear
+counterpart (SLMF) are provided in terms of their output signal-to-noise-ratios
+(SNRs), with WLMF consistently exhibiting enhanced SNR. Moreover, the lower
+bound on the SNR gain of WLMF is derived, together with the condition to attain
+this bound. This serves to revisit the convolution-activation-pooling chain in
+complex-valued CNNs through the lens of matched filtering, which reveals the
+potential of WLMFs to provide physical interpretability and enhance
+explainability of general complex-valued CNNs. Simulations demonstrate the
+agreement between the theoretical and numerical results.
+
+
+
+
+
+ + ♻ ☆ What Do Self-Supervised Speech Models Know About Words? + + +
+ Many self-supervised speech models (S3Ms) have been introduced over the last +few years, improving performance and data efficiency on various speech tasks. +However, these empirical successes alone do not give a complete picture of what +is learned during pre-training. Recent work has begun analyzing how S3Ms encode +certain properties, such as phonetic and speaker information, but we still lack +a proper understanding of knowledge encoded at the word level and beyond. In +this work, we use lightweight analysis methods to study segment-level +linguistic properties -- word identity, boundaries, pronunciation, syntactic +features, and semantic features -- encoded in S3Ms. We present a comparative +study of layer-wise representations from ten S3Ms and find that (i) the +frame-level representations within each word segment are not all equally +informative, and (ii) the pre-training objective and model size heavily +influence the accessibility and distribution of linguistic information across +layers. We also find that on several tasks -- word discrimination, word +segmentation, and semantic sentence similarity -- S3Ms trained with visual +grounding outperform their speech-only counterparts. Finally, our task-based +analyses demonstrate improved performance on word segmentation and acoustic +word discrimination while using simpler methods than prior work. + +
+
+ comment: Pre-MIT Press publication version +
+
+
+
+
+ + ♻ ☆ Moderately Supervised Learning: Definition, Framework and Generality + + +
+ Learning with supervision has achieved remarkable success in numerous
+artificial intelligence (AI) applications. In the current literature, by
+referring to the properties of the labels prepared for the training dataset,
+learning with supervision is categorized as supervised learning (SL) and weakly
+supervised learning (WSL). SL concerns the situation where the training data
+set is assigned with ideal (complete, exact and accurate) labels, while WSL
+concerns the situation where the training data set is assigned with non-ideal
+(incomplete, inexact or inaccurate) labels. However, various solutions for SL
+tasks have shown that the given labels are not always easy to learn, and the
+transformation from the given labels to easy-to-learn targets can significantly
+affect the performance of the final SL solutions. Without considering the
+properties of the transformation from the given labels to easy-to-learn
+targets, the definition of SL conceals some details that can be critical to
+building the appropriate solutions for specific SL tasks. Thus, for engineers
+in the AI application field, it is desirable to reveal these details
+systematically. This article attempts to achieve this goal by expanding the
+categorization of SL and investigating the sub-type moderately supervised
+learning (MSL) that concerns the situation where the given labels are ideal,
+but due to the simplicity in annotation, careful designs are required to
+transform the given labels into easy-to-learn targets. From the perspectives of
+the definition, framework and generality, we conceptualize MSL to present a
+complete fundamental basis for systematically analysing MSL tasks. Meanwhile,
+by revealing the relation between the conceptualization of MSL and the
+mathematicians' vision, this paper also serves as a tutorial that shows AI
+application engineers how to view a problem to be solved from the
+mathematicians' perspective.
+
+
+ comment: This is the final published version (33 pages) +
+
+
+
+
+ + ♻ ☆ A RelEntLess Benchmark for Modelling Graded Relations between Named + Entities EACL 2024 + + +
+ Relations such as "is influenced by", "is known for" or "is a competitor of"
+are inherently graded: we can rank entity pairs based on how well they satisfy
+these relations, but it is hard to draw a line between those pairs that satisfy
+them and those that do not. Such graded relations play a central role in many
+applications, yet they are typically not covered by existing knowledge graphs.
+In this paper, we consider the possibility of using Large Language Models
+(LLMs) to fill this gap. To this end, we introduce a new benchmark, in which
+entity pairs have to be ranked according to how much they satisfy a given
+graded relation. The task is formulated as a few-shot ranking problem, where
+models only have access to a description of the relation and five prototypical
+instances. We use the proposed benchmark to evaluate state-of-the-art relation
+embedding strategies as well as several recent LLMs, covering both publicly
+available LLMs and closed models such as GPT-4. Overall, we find a strong
+correlation between model size and performance, with smaller language models
+struggling to outperform a naive baseline. The results of the largest Flan-T5
+and OPT models are remarkably strong, although a clear gap with human
+performance remains.
+
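+ A hedged sketch of one way to realize such few-shot ranking (not necessarily
+the paper's protocol): score each candidate pair by the log-likelihood a
+causal LM assigns to a templated statement after a short prompt of
+prototypical instances, then rank by score. The model choice and templates
+are illustrative assumptions.
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+fewshot = ("Statements about the relation 'is influenced by':\n"
+           "The Beatles is influenced by Chuck Berry.\n"
+           "Impressionism is influenced by Japanese prints.\n")
+
+def score(head, tail):
+    ids = tok(fewshot + f"{head} is influenced by {tail}.",
+              return_tensors="pt").input_ids
+    with torch.no_grad():
+        loss = lm(ids, labels=ids).loss  # mean NLL over the whole text
+    return -loss.item()                  # crude, but the constant prompt cancels
+
+pairs = [("Radiohead", "Pink Floyd"), ("Radiohead", "basketball")]
+print(sorted(pairs, key=lambda p: score(*p), reverse=True))
+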
+
+ comment: EACL 2024 main conference +
+
+
+
+
+ + ♻ ☆ ECNR: Efficient Compressive Neural Representation of Time-Varying + Volumetric Datasets + + +
+ Due to its conceptual simplicity and generality, compressive neural +representation has emerged as a promising alternative to traditional +compression methods for managing massive volumetric datasets. The current +practice of neural compression utilizes a single large multilayer perceptron +(MLP) to encode the global volume, incurring slow training and inference. This +paper presents an efficient compressive neural representation (ECNR) solution +for time-varying data compression, utilizing the Laplacian pyramid for adaptive +signal fitting. Following a multiscale structure, we leverage multiple small +MLPs at each scale for fitting local content or residual blocks. By assigning +similar blocks to the same MLP via size uniformization, we enable balanced +parallelization among MLPs to significantly speed up training and inference. +Working in concert with the multiscale structure, we tailor a deep compression +strategy to compact the resulting model. We show the effectiveness of ECNR with +multiple datasets and compare it with state-of-the-art compression methods +(mainly SZ3, TTHRESH, and neurcomp). The results position ECNR as a promising +solution for volumetric data compression. + +
+
+ comment: Accepted by IEEE PacificVis 2024 (conference papers track) +
+
+
+
+
+ + ♻ ☆ Some Primal-Dual Theory for Subgradient Methods for Strongly Convex + Optimization + + +
+ We consider (stochastic) subgradient methods for strongly convex but
+potentially nonsmooth, non-Lipschitz optimization. We provide new equivalent
+dual descriptions (in the style of dual averaging) for the classic subgradient
+method, the proximal subgradient method, and the switching subgradient method.
+These equivalences enable $O(1/T)$ convergence guarantees in terms of both
+their classic primal gap and a previously unanalyzed dual gap for strongly
+convex optimization. Consequently, our theory provides these classic methods
+with simple, optimal stopping criteria and optimality certificates at no added
+computational cost. Our results apply to a wide range of stepsize selections
+and to non-Lipschitz, ill-conditioned problems where the early iterations of
+the subgradient method may diverge exponentially quickly (a phenomenon which,
+to the best of our knowledge, no prior works address). Even in the presence of
+such undesirable behaviors, our theory still ensures and bounds eventual
+convergence.
+
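+ For reference, the classic strongly convex subgradient step covered by this
+style of analysis is (textbook form, stated under the bounded-subgradient
+assumption $\|g_k\| \le G$ that the paper notably relaxes):
+$$x_{k+1} = x_k - \eta_k g_k, \qquad g_k \in \partial f(x_k), \qquad \eta_k = \tfrac{2}{\mu (k+1)},$$
+for $\mu$-strongly convex $f$; with the weighted average
+$\bar{x}_T = \sum_{k=1}^{T} \tfrac{2k}{T(T+1)}\, x_k$ this yields the classic
+primal guarantee $f(\bar{x}_T) - f(x^\star) \le \tfrac{2G^2}{\mu (T+1)}$.
+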
+
+ comment: 22 pages, major revision shortened the write-up and unified the + analysis to be done just once in a single "super" setting +
+
+
+
+
+ + ♻ ☆ Beyond Surprise: Improving Exploration Through Surprise Novelty + + +
+ We present a new computing model for intrinsic rewards in reinforcement +learning that addresses the limitations of existing surprise-driven +explorations. The reward is the novelty of the surprise rather than the +surprise norm. We estimate the surprise novelty as retrieval errors of a memory +network wherein the memory stores and reconstructs surprises. Our surprise +memory (SM) augments the capability of surprise-based intrinsic motivators, +maintaining the agent's interest in exciting exploration while reducing +unwanted attraction to unpredictable or noisy observations. Our experiments +demonstrate that the SM combined with various surprise predictors exhibits +efficient exploring behaviors and significantly boosts the final performance in +sparse reward environments, including Noisy-TV, navigation and challenging +Atari games. + +
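+ The core mechanism, as we read it, fits in a few lines (sizes and wiring are
+invented for illustration): the intrinsic reward is not the surprise
+(prediction error) itself but the memory's failure to reconstruct it, so
+familiar surprises -- e.g., a noisy TV -- earn little reward.
+
+import torch
+import torch.nn as nn
+
+class SurpriseMemory(nn.Module):
+    def __init__(self, dim=32):
+        super().__init__()
+        # small autoencoder standing in for the surprise-reconstruction memory
+        self.net = nn.Sequential(nn.Linear(dim, 8), nn.ReLU(), nn.Linear(8, dim))
+
+    def intrinsic_reward(self, surprise):        # surprise: (dim,)
+        recon = self.net(surprise)
+        return (recon - surprise).pow(2).mean()  # retrieval error = novelty
+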
+
+ comment: 17 pages including Appendix +
+
+
+
+
+ + ♻ ☆ Bayesian Self-Supervised Contrastive Learning + + +
+ Recent years have witnessed many successful applications of contrastive
+learning in diverse domains, yet its self-supervised version still faces many
+open challenges. As the negative samples are drawn from unlabeled datasets, a
+randomly selected sample may actually be a false negative for an anchor,
+leading to incorrect encoder training. This paper proposes a new
+self-supervised contrastive loss called the BCL loss that still uses random
+samples from the unlabeled data while correcting the resulting bias with
+importance weights. The key idea is to design the desired sampling distribution
+for sampling hard true negative samples under the Bayesian framework. The
+prominent advantage is that the desired sampling distribution has a parametric
+structure, with a location parameter for debiasing false negatives and a
+concentration parameter for mining hard negatives, respectively. Experiments
+validate the effectiveness and superiority of the BCL loss.
+
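+ The idea reduces, in sketch form, to an importance-weighted InfoNCE-style
+loss; the generic per-negative weights below only illustrate the mechanism,
+whereas BCL derives them from a Bayesian parametric posterior.
+
+import torch
+import torch.nn.functional as F
+
+def weighted_contrastive_loss(anchor, positive, negatives, weights, tau=0.5):
+    # anchor, positive: (d,); negatives: (n, d); weights: (n,)
+    pos = torch.exp(F.cosine_similarity(anchor, positive, dim=0) / tau)
+    neg = torch.exp(F.cosine_similarity(anchor.unsqueeze(0), negatives) / tau)
+    # weights < 1 down-weight likely false negatives; > 1 emphasize hard ones
+    return -torch.log(pos / (pos + (weights * neg).sum()))
+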
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Over-the-air Federated Policy Gradient + + +
+ In recent years, over-the-air aggregation has been widely considered in +large-scale distributed learning, optimization, and sensing. In this paper, we +propose the over-the-air federated policy gradient algorithm, where all agents +simultaneously broadcast an analog signal carrying local information to a +common wireless channel, and a central controller uses the received aggregated +waveform to update the policy parameters. We investigate the effect of noise +and channel distortion on the convergence of the proposed algorithm, and +establish the complexities of communication and sampling for finding an +$\epsilon$-approximate stationary point. Finally, we present some simulation +results to show the effectiveness of the algorithm. + +
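+ The transmission model is easy to picture with a toy simulation (our
+illustration; the paper's channel and policy-gradient details differ):
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_agents, dim = 10, 8
+local_grads = rng.standard_normal((n_agents, dim))  # local policy gradients
+gains = rng.uniform(0.8, 1.2, size=(n_agents, 1))   # per-agent channel fading
+noise = 0.1 * rng.standard_normal(dim)              # additive channel noise
+
+# All agents broadcast simultaneously; the channel itself sums the signals,
+# so the controller receives one aggregated, distorted gradient "for free".
+received = (gains * local_grads).sum(axis=0) + noise
+theta = np.zeros(dim)
+theta += 0.01 * received / n_agents                 # central policy update
+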
+
+ comment: To appear at IEEE ICC 2024 +
+
+
+
+
+ + ♻ ☆ Rademacher Complexity of Neural ODEs via Chen-Fliess Series + + +
+ We show how continuous-depth neural ODE models can be framed as single-layer, +infinite-width nets using the Chen--Fliess series expansion for nonlinear ODEs. +In this net, the output "weights" are taken from the signature of the control +input -- a tool used to represent infinite-dimensional paths as a sequence of +tensors -- which comprises iterated integrals of the control input over a +simplex. The "features" are taken to be iterated Lie derivatives of the output +function with respect to the vector fields in the controlled ODE model. The +main result of this work applies this framework to derive compact expressions +for the Rademacher complexity of ODE models that map an initial condition to a +scalar output at some terminal time. The result leverages the straightforward +analysis afforded by single-layer architectures. We conclude with some examples +instantiating the bound for some specific systems and discuss potential +follow-up work. + +
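+ In symbols (a standard form from the literature, paraphrased rather than
+taken from the paper): the output of the control-affine system admits the
+Chen--Fliess expansion
+$$y(t) \;=\; \sum_{\eta \in X^{*}} L_{f_\eta} h(x_0)\, E_\eta[u](t),$$
+where the iterated integrals $E_\eta[u]$ (entries of the signature of the
+input) play the role of the "weights" and the iterated Lie derivatives
+$L_{f_\eta} h(x_0)$ the role of the "features" in the single-layer view.
+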
+
+ comment: 14 pages; submitted to L4DC 2024 +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks with polynomial activations have limited + expressivity + + +
+ The expressivity of Graph Neural Networks (GNNs) can be entirely
+characterized by appropriate fragments of first-order logic. Namely, any query
+of the two-variable fragment of graded modal logic (GC2) interpreted over
+labeled graphs can be expressed using a GNN whose size depends only on the
+depth of the query. As pointed out by [Barceló et al., 2020; Grohe, 2021],
+this description holds for a family of activation functions, leaving the
+possibility of a hierarchy of logics expressible by GNNs depending on the
+chosen activation function. In this article, we show that such a hierarchy
+indeed exists by proving that GC2 queries cannot be expressed by GNNs with
+polynomial activation functions. This implies a separation between polynomial
+and popular non-polynomial activations (such as Rectified Linear Units) and
+answers an open question formulated by [Grohe, 2021].
+
+
+
+
+
+ + ♻ ☆ An Algorithm for Streaming Differentially Private Data + + +
+ Much of the research in differential privacy has focused on offline +applications with the assumption that all data is available at once. When these +algorithms are applied in practice to streams where data is collected over +time, this either violates the privacy guarantees or results in poor utility. +We derive an algorithm for differentially private synthetic streaming data +generation, especially curated towards spatial datasets. Furthermore, we +provide a general framework for online selective counting among a collection of +queries which forms a basis for many tasks such as query answering and +synthetic data generation. The utility of our algorithm is verified on both +real-world and simulated datasets. + +
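+ For background, the classical binary-tree mechanism is the standard building
+block for continual counting in this streaming setting (shown here as
+context; it is not claimed to be the paper's exact construction):
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def private_prefix_sums(stream, epsilon):
+    T = len(stream)
+    L = int(np.ceil(np.log2(T + 1)))     # tree depth
+    scale = (L + 1) / epsilon            # each item touches <= L+1 nodes
+    noisy = {}
+
+    def node(level, idx):                # noisy sum of one dyadic block
+        if (level, idx) not in noisy:
+            lo = idx * 2 ** level
+            noisy[(level, idx)] = (sum(stream[lo:lo + 2 ** level])
+                                   + rng.laplace(0.0, scale))
+        return noisy[(level, idx)]
+
+    out = []
+    for t in range(1, T + 1):            # release a count after each item
+        total, start, rem = 0.0, 0, t
+        for level in range(L, -1, -1):   # greedy dyadic cover of [0, t)
+            if rem >= 2 ** level:
+                total += node(level, start // 2 ** level)
+                start, rem = start + 2 ** level, rem - 2 ** level
+        out.append(total)
+    return out
+
+print(private_prefix_sums([1, 0, 1, 1, 0, 1], epsilon=1.0))
+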
+
+
+
+
+ + ♻ ☆ Adaptive Block Sparse Regularization under Arbitrary Linear Transform + + +
+ We propose a convex signal reconstruction method for block sparsity under an
+arbitrary linear transform with unknown block structure. The proposed method is
+a generalization of the existing method LOP-$\ell_2$/$\ell_1$ and, unlike
+LOP-$\ell_2$/$\ell_1$, can reconstruct signals with block sparsity under
+non-invertible transforms. Our work broadens the scope of block sparse
+regularization, enabling more versatile and powerful applications across
+various signal processing domains. We derive an iterative algorithm for solving
+the proposed method and provide conditions for its convergence to the optimal
+solution. Numerical experiments demonstrate the effectiveness of the proposed
+method.
+
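+ To fix ideas, a generic convex block-sparse problem under a linear transform
+$\mathbf{D}$ reads
+$$\min_{\mathbf{x}}\; \tfrac{1}{2}\|\mathbf{y}-\mathbf{A}\mathbf{x}\|_2^2 + \lambda \sum_{g \in \mathcal{G}} \|(\mathbf{D}\mathbf{x})_g\|_2,$$
+a sketch of the problem family only: the distinguishing feature of the
+proposed method is that the block structure $\mathcal{G}$ is unknown and
+adapted rather than fixed in advance, and $\mathbf{D}$ need not be invertible.
+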
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Uncertainty Quantification via Spatial-Temporal Tweedie Model for + Zero-inflated and Long-tail Travel Demand Prediction CIKM 2023 + + +
+ Understanding Origin-Destination (O-D) travel demand is crucial for +transportation management. However, traditional spatial-temporal deep learning +models grapple with addressing the sparse and long-tail characteristics in +high-resolution O-D matrices and quantifying prediction uncertainty. This +dilemma arises from the numerous zeros and over-dispersed demand patterns +within these matrices, which challenge the Gaussian assumption inherent to +deterministic deep learning models. To address these challenges, we propose a +novel approach: the Spatial-Temporal Tweedie Graph Neural Network (STTD). The +STTD introduces the Tweedie distribution as a compelling alternative to the +traditional 'zero-inflated' model and leverages spatial and temporal embeddings +to parameterize travel demand distributions. Our evaluations using real-world +datasets highlight STTD's superiority in providing accurate predictions and +precise confidence intervals, particularly in high-resolution scenarios. + +
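+ The distributional choice has a compact loss-function form. Below is a
+sketch of the Tweedie negative log-likelihood (up to constants) commonly used
+for zero-inflated, over-dispersed targets; STTD's contribution is to
+parameterize the distribution with spatial-temporal embeddings, which this
+snippet does not attempt:
+
+import torch
+
+def tweedie_loss(y, mu, p=1.5):
+    # power p in (1, 2) interpolates between Poisson (1) and Gamma (2);
+    # mass at zero plus a heavy right tail suits sparse O-D demand counts
+    return torch.mean(-y * mu.pow(1 - p) / (1 - p) + mu.pow(2 - p) / (2 - p))
+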
+
+ comment: In proceeding of CIKM 2023. Doi: + https://dl.acm.org/doi/10.1145/3583780.3615215 +
+
+
+
+
+ + ♻ ☆ Gaussian Adaptive Attention is All You Need: Robust Contextual + Representations Across Multiple Modalities + + +
+ We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a
+novel probabilistic attention framework, and the Gaussian Adaptive Transformer
+(GAT), designed to enhance information aggregation across multiple modalities,
+including speech, text and vision. GAAM integrates learnable mean and variance
+into its attention mechanism, implemented in a multi-head framework, enabling
+it to collectively model any probability distribution for dynamic recalibration
+of feature significance. This method demonstrates significant improvements,
+especially with highly non-stationary data, surpassing state-of-the-art
+attention techniques in model performance (up to approximately 20% in
+accuracy) by identifying key elements within the feature space. GAAM's
+compatibility with dot-product-based attention models and relatively low number
+of parameters showcases its adaptability and potential to boost existing
+attention frameworks. Empirically, GAAM exhibits superior adaptability and
+efficacy across a diverse range of tasks, including emotion recognition in
+speech, image classification, and text classification, thereby establishing its
+robustness and versatility in handling multi-modal data. Furthermore, we
+introduce the Importance Factor (IF), a new learning-based metric that enhances
+the explainability of models trained with GAAM-based methods. Overall, GAAM
+represents an advancement towards the development of better-performing and more
+explainable attention models across multiple modalities.
+
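+ A rough sketch of the stated mechanism (our reading; the norms, shapes and
+multi-head plumbing are guesses, not the authors' code): attention weights are
+shaped by a Gaussian with learnable mean and variance over normalized
+per-position scores.
+
+import torch
+import torch.nn as nn
+
+class GaussianAdaptiveAttention(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.mu = nn.Parameter(torch.zeros(1))       # learnable mean
+        self.log_var = nn.Parameter(torch.zeros(1))  # learnable (log) variance
+
+    def forward(self, x):                     # x: (batch, seq, dim)
+        s = x.mean(dim=-1, keepdim=True)      # scalar score per position
+        s = (s - s.mean(1, keepdim=True)) / (s.std(1, keepdim=True) + 1e-6)
+        w = torch.exp(-(s - self.mu) ** 2 / (2 * self.log_var.exp()))
+        return x * (w / (w.sum(1, keepdim=True) + 1e-6))
+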
+
+
+
+
+ + ♻ ☆ Generative Design of Crystal Structures by Point Cloud Representations + and Diffusion Model + + +
+ Efficiently generating energetically stable crystal structures has long been
+a challenge in material design, primarily due to the immense number of possible
+atomic arrangements in a crystal lattice. To facilitate the discovery of stable
+materials, we present a framework for the generation of synthesizable
+materials, leveraging a point cloud representation to encode intricate
+structural information. At the heart of this framework lies the introduction of
+a diffusion model as its foundational pillar. To gauge the efficacy of our
+approach, we employ it to reconstruct input structures from our training
+datasets, rigorously validating its high reconstruction performance.
+Furthermore, we demonstrate the profound potential of Point Cloud-Based Crystal
+Diffusion (PCCD) by generating entirely new materials, emphasizing their
+synthesizability. Our research stands as a noteworthy contribution to the
+advancement of materials design and synthesis through the cutting-edge avenue
+of generative design, rather than conventional substitution or experience-based
+discovery.
+
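+ The diffusion backbone follows the usual recipe; below is a generic
+DDPM-style forward (noising) step on a point cloud of atomic coordinates, as a
+stand-in for the paper's training signal (the schedule and encoding are
+assumptions):
+
+import torch
+
+def q_sample(x0, t, betas):
+    # closed-form jump to step t: x_t = sqrt(a_bar_t) x0 + sqrt(1 - a_bar_t) eps
+    alphas_bar = torch.cumprod(1.0 - betas, dim=0)
+    noise = torch.randn_like(x0)
+    xt = alphas_bar[t].sqrt() * x0 + (1 - alphas_bar[t]).sqrt() * noise
+    return xt, noise                      # the model learns to predict noise
+
+x0 = torch.randn(16, 3)                   # 16 atoms, 3-D coordinates
+betas = torch.linspace(1e-4, 0.02, 1000)
+xt, eps = q_sample(x0, t=500, betas=betas)
+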
+
+ comment: I have submitted to a journal +
+
+
+
+
+ + ♻ ☆ Associative Transformer + + +
+ Emerging from the pairwise attention in conventional Transformers, there is a +growing interest in sparse attention mechanisms that align more closely with +localized, contextual learning in the biological brain. Existing studies such +as the Coordination method employ iterative cross-attention mechanisms with a +bottleneck to enable the sparse association of inputs. However, these methods +are parameter inefficient and fail in more complex relational reasoning tasks. +To this end, we propose Associative Transformer (AiT) to enhance the +association among sparsely attended input patches, improving parameter +efficiency and performance in relational reasoning tasks. AiT leverages a +learnable explicit memory, comprised of various specialized priors, with a +bottleneck attention to facilitate the extraction of diverse localized +features. Moreover, we propose a novel associative memory-enabled patch +reconstruction with a Hopfield energy function. The extensive experiments in +four image classification tasks with three different sizes of AiT demonstrate +that AiT requires significantly fewer parameters and attention layers while +outperforming Vision Transformers and a broad range of sparse Transformers. +Additionally, AiT establishes new SOTA performance in the Sort-of-CLEVR +dataset, outperforming the previous Coordination method. + +
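+ The bottleneck idea can be sketched in a few lines (slot counts and sizes
+are illustrative; this is not the AiT architecture itself): a small set of
+learnable memory slots cross-attends to the input patches, forcing a sparse,
+low-rank association.
+
+import torch
+import torch.nn as nn
+
+class BottleneckAttention(nn.Module):
+    def __init__(self, dim=64, n_slots=8, n_heads=4):
+        super().__init__()
+        self.memory = nn.Parameter(torch.randn(n_slots, dim))  # learnable priors
+        self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
+
+    def forward(self, patches):            # patches: (batch, n_patches, dim)
+        q = self.memory.unsqueeze(0).expand(patches.size(0), -1, -1)
+        slots, _ = self.attn(q, patches, patches)
+        return slots                       # (batch, n_slots, dim) bottleneck
+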
+
+
+
+
+ + ♻ ☆ Hyperspectral Pixel Unmixing with Latent Dirichlet Variational + Autoencoder + + +
+ We present a method for hyperspectral pixel {\it unmixing}. The proposed
+method assumes that (1) {\it abundances} can be encoded as Dirichlet
+distributions and (2) spectra of {\it endmembers} can be represented as
+multivariate Normal distributions. The method solves the problem of abundance
+estimation and endmember extraction within a variational autoencoder setting,
+where a Dirichlet bottleneck layer models the abundances and the decoder
+performs endmember extraction. The proposed method can also leverage the
+transfer learning paradigm, where the model is trained only on synthetic data
+containing pixels that are linear combinations of one or more endmembers of
+interest. In this case, we retrieve endmembers (spectra) from the United States
+Geological Survey Spectral Library. The model thus trained can be subsequently
+used to perform pixel unmixing on "real data" that contains a subset of the
+endmembers used to generate the synthetic data. The model achieves
+state-of-the-art results on several benchmarks: Cuprite, Urban (HYDICE), and
+Samson. We also present a new synthetic dataset, OnTech-HSI-Syn-21, that can be
+used to study hyperspectral pixel unmixing methods. We showcase the transfer
+learning capabilities of the proposed model on the Cuprite and
+OnTech-HSI-Syn-21 datasets. In summary, the proposed method can be applied to
+pixel unmixing in a variety of domains, including agriculture, forestry,
+mineralogy, analysis of materials, healthcare, etc. Additionally, the proposed
+method eschews the need for labelled training data by leveraging the transfer
+learning paradigm, where the model is trained on synthetic data generated using
+the endmembers present in the "real" data.
+
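+ The decoder-as-endmembers view sketches cleanly (a simplification: we use a
+softmax in place of the Dirichlet bottleneck, and all sizes are invented):
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class UnmixingAutoencoder(nn.Module):
+    def __init__(self, n_bands=200, n_endmembers=4):
+        super().__init__()
+        self.encoder = nn.Sequential(nn.Linear(n_bands, 64), nn.ReLU(),
+                                     nn.Linear(64, n_endmembers))
+        # rows of the decoder act as the endmember spectra
+        self.endmembers = nn.Parameter(torch.rand(n_endmembers, n_bands))
+
+    def forward(self, pixel):                            # pixel: (n_bands,)
+        abundances = F.softmax(self.encoder(pixel), dim=-1)  # sum to one
+        recon = abundances @ self.endmembers             # linear mixing model
+        return recon, abundances
+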
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Dance-to-Music Generation with Encoder-based Textual Inversion of + Diffusion Models + + +
+ The harmonious integration of music with dance movements is pivotal in +vividly conveying the artistic essence of dance. This alignment also +significantly elevates the immersive quality of gaming experiences and +animation productions. While there has been remarkable advancement in creating +high-fidelity music from textual descriptions, current methodologies mainly +concentrate on modulating overarching characteristics such as genre and +emotional tone. They often overlook the nuanced management of temporal rhythm, +which is indispensable in crafting music for dance, since it intricately aligns +the musical beats with the dancers' movements. Recognizing this gap, we propose +an encoder-based textual inversion technique for augmenting text-to-music +models with visual control, facilitating personalized music generation. +Specifically, we develop dual-path rhythm-genre inversion to effectively +integrate the rhythm and genre of a dance motion sequence into the textual +space of a text-to-music model. Contrary to the classical textual inversion +method, which directly updates text embeddings to reconstruct a single target +object, our approach utilizes separate rhythm and genre encoders to obtain text +embeddings for two pseudo-words, adapting to the varying rhythms and genres. To +achieve a more accurate evaluation, we propose improved evaluation metrics for +rhythm alignment. We demonstrate that our approach outperforms state-of-the-art +methods across multiple evaluation metrics. Furthermore, our method seamlessly +adapts to in-the-wild data and effectively integrates with the inherent +text-guided generation capability of the pre-trained model. Samples are +available at \url{https://youtu.be/D7XDwtH1YwE}. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ SNP-S3: Shared Network Pre-training and Significant Semantic + Strengthening for Various Video-Text Tasks + + +
+ We present a framework for learning cross-modal video representations by
+directly pre-training on raw data to facilitate various downstream video-text
+tasks. Our main contributions lie in the pre-training framework and proxy
+tasks. First, based on the shortcomings of two mainstream pixel-level
+pre-training architectures (limited applicability or low efficiency), we
+propose Shared Network Pre-training (SNP). By employing one shared BERT-type
+network to refine textual and cross-modal features simultaneously, SNP is
+lightweight and can support various downstream applications. Second, based on
+the intuition that people always pay attention to several "significant words"
+when understanding a sentence, we propose the Significant Semantic
+Strengthening (S3) strategy, which includes a novel masking and matching proxy
+task to promote the pre-training performance. Experiments conducted on three
+downstream video-text tasks and six datasets demonstrate that we establish a
+new state of the art in pixel-level video-text pre-training and achieve a
+satisfactory balance between pre-training efficiency and fine-tuning
+performance. The codebase is available at
+https://github.com/alipay/Ant-Multi-Modal-Framework/tree/main/prj/snps3_vtp.
+
+
+ comment: Accepted by TCSVT (IEEE Transactions on Circuits and Systems for + Video Technology) +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all <details> entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) { // TAB
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false; // suppress the default focus change
+    }
+};
+
+/* Switch theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme'); // getItem already returns null when unset
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`